compiler: only build thunk struct type when it is needed
[official-gcc.git] / gcc / tree-vect-loop.cc
blob2536cc3cf4903f898245a9012b097a91d192e6b3
1 /* Loop Vectorization
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;       STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *, bool);
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
171 gimple *stmt = stmt_info->stmt;
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
189 if (stmt_vectype)
191 if (STMT_VINFO_VECTYPE (stmt_info))
192 /* The only case when a vectype had been already set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
205 return opt_result::success ();
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
253 return opt_result::success ();
256 /* Function vect_determine_vectorization_factor
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4byte elements,
261 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
262 elements can fit in a single vector register.
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296 for (i = 0; i < nbbs; i++)
298 basic_block bb = bbs[i];
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 (gimple *) phi);
309 gcc_assert (stmt_info);
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
334 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
341 vect_update_max_nunits (&vectorization_factor, vectype);
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
359 /* TODO: Analyze cost. Decide if worth while to vectorize. */
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
375 /* Function vect_is_simple_iv_evolution.
377 FORNOW: A simple evolution of an induction variables in the loop is
378 considered a polynomial evolution. */
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
406 *init = init_expr;
407 *step = step_expr;
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
425 return true;
428 /* Function vect_is_nonlinear_iv_evolution
430 Only support nonlinear induction for integer type
431 1. neg
432 2. mul by constant
433 3. lshift/rshift by constant.
435 For neg induction, return a fake step as integer -1. */
436 static bool
437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
438 gphi* loop_phi_node, tree *init, tree *step)
440 tree init_expr, ev_expr, result, op1, op2;
441 gimple* def;
443 if (gimple_phi_num_args (loop_phi_node) != 2)
444 return false;
446 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
447 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449 /* Support nonlinear induction only for integer type. */
450 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
451 return false;
453 *init = init_expr;
454 result = PHI_RESULT (loop_phi_node);
456 if (TREE_CODE (ev_expr) != SSA_NAME
457 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
458 || !is_gimple_assign (def))
459 return false;
461 enum tree_code t_code = gimple_assign_rhs_code (def);
462 switch (t_code)
464 case NEGATE_EXPR:
465 if (gimple_assign_rhs1 (def) != result)
466 return false;
467 *step = build_int_cst (TREE_TYPE (init_expr), -1);
468 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
469 break;
471 case RSHIFT_EXPR:
472 case LSHIFT_EXPR:
473 case MULT_EXPR:
474 op1 = gimple_assign_rhs1 (def);
475 op2 = gimple_assign_rhs2 (def);
476 if (TREE_CODE (op2) != INTEGER_CST
477 || op1 != result)
478 return false;
479 *step = op2;
480 if (t_code == LSHIFT_EXPR)
481 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
482 else if (t_code == RSHIFT_EXPR)
483 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
484 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
485 else
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
487 break;
489 default:
490 return false;
493 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496 return true;
499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
500 what we are assuming is a double reduction. For example, given
501 a structure like this:
503 outer1:
504 x_1 = PHI <x_4(outer2), ...>;
507 inner:
508 x_2 = PHI <x_1(outer1), ...>;
510 x_3 = ...;
513 outer2:
514 x_4 = PHI <x_3(inner)>;
517 outer loop analysis would treat x_1 as a double reduction phi and
518 this function would then return true for x_2. */
520 static bool
521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 use_operand_p use_p;
524 ssa_op_iter op_iter;
525 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
526 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
527 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
528 return true;
529 return false;
532 /* Function vect_analyze_scalar_cycles_1.
534 Examine the cross iteration def-use cycles of scalar variables
535 in LOOP. LOOP_VINFO represents the loop that is now being
536 considered for vectorization (can be LOOP, or an outer-loop
537 enclosing LOOP). SLP indicates there will be some subsequent
538 slp analyses or not. */
540 static void
541 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
542 bool slp)
544 basic_block bb = loop->header;
545 tree init, step;
546 auto_vec<stmt_vec_info, 64> worklist;
547 gphi_iterator gsi;
548 bool double_reduc, reduc_chain;
550 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
552 /* First - identify all inductions. Reduction detection assumes that all the
553 inductions have been identified, therefore, this order must not be
554 changed. */
555 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
557 gphi *phi = gsi.phi ();
558 tree access_fn = NULL;
559 tree def = PHI_RESULT (phi);
560 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
562 if (dump_enabled_p ())
563 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
564 (gimple *) phi);
566 /* Skip virtual phi's. The data dependences that are associated with
567 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
568 if (virtual_operand_p (def))
569 continue;
571 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
573 /* Analyze the evolution function. */
574 access_fn = analyze_scalar_evolution (loop, def);
575 if (access_fn)
577 STRIP_NOPS (access_fn);
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Access function of PHI: %T\n", access_fn);
581 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
582 = initial_condition_in_loop_num (access_fn, loop->num);
583 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
584 = evolution_part_in_loop_num (access_fn, loop->num);
587 if ((!access_fn
588 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
589 || !vect_is_simple_iv_evolution (loop->num, access_fn,
590 &init, &step)
591 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
592 && TREE_CODE (step) != INTEGER_CST))
593 /* Only handle nonlinear iv for same loop. */
594 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
595 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
596 phi, &init, &step)))
598 worklist.safe_push (stmt_vinfo);
599 continue;
602 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
603 != NULL_TREE);
604 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
608 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
612 /* Second - identify all reductions and nested cycles. */
613 while (worklist.length () > 0)
615 stmt_vec_info stmt_vinfo = worklist.pop ();
616 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
617 tree def = PHI_RESULT (phi);
619 if (dump_enabled_p ())
620 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
621 (gimple *) phi);
623 gcc_assert (!virtual_operand_p (def)
624 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
626 stmt_vec_info reduc_stmt_info
627 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
628 &reduc_chain, slp);
629 if (reduc_stmt_info)
631 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
632 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
633 if (double_reduc)
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_NOTE, vect_location,
637 "Detected double reduction.\n");
639 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
640 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
642 else
644 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "Detected vectorizable nested cycle.\n");
650 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
652 else
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_NOTE, vect_location,
656 "Detected reduction.\n");
658 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
659 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
660 /* Store the reduction cycles for possible vectorization in
661 loop-aware SLP if it was not detected as reduction
662 chain. */
663 if (! reduc_chain)
664 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
665 (reduc_stmt_info);
669 else
670 if (dump_enabled_p ())
671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
672 "Unknown def-use cycle pattern.\n");
677 /* Function vect_analyze_scalar_cycles.
679 Examine the cross iteration def-use cycles of scalar variables, by
680 analyzing the loop-header PHIs of scalar variables. Classify each
681 cycle as one of the following: invariant, induction, reduction, unknown.
682 We do that for the loop represented by LOOP_VINFO, and also to its
683 inner-loop, if exists.
684 Examples for scalar cycles:
686 Example1: reduction:
688 loop1:
689 for (i=0; i<N; i++)
690 sum += a[i];
692 Example2: induction:
694 loop2:
695 for (i=0; i<N; i++)
696 a[i] = i; */
698 static void
699 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
701 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
703 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
705 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
706 Reductions in such inner-loop therefore have different properties than
707 the reductions in the nest that gets vectorized:
708 1. When vectorized, they are executed in the same order as in the original
709 scalar loop, so we can't change the order of computation when
710 vectorizing them.
711 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
712 current checks are too strict. */
714 if (loop->inner)
715 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
718 /* Transfer group and reduction information from STMT_INFO to its
719 pattern stmt. */
721 static void
722 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
724 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
725 stmt_vec_info stmtp;
726 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
727 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
728 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
731 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
732 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
733 == STMT_VINFO_DEF_TYPE (stmt_info));
734 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
735 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
736 if (stmt_info)
737 REDUC_GROUP_NEXT_ELEMENT (stmtp)
738 = STMT_VINFO_RELATED_STMT (stmt_info);
740 while (stmt_info);
743 /* Fixup scalar cycles that now have their stmts detected as patterns. */
745 static void
746 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
748 stmt_vec_info first;
749 unsigned i;
751 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
753 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
754 while (next)
756 if ((STMT_VINFO_IN_PATTERN_P (next)
757 != STMT_VINFO_IN_PATTERN_P (first))
758 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
759 break;
760 next = REDUC_GROUP_NEXT_ELEMENT (next);
762 /* If all reduction chain members are well-formed patterns adjust
763 the group to group the pattern stmts instead. */
764 if (! next
765 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
767 if (STMT_VINFO_IN_PATTERN_P (first))
769 vect_fixup_reduc_chain (first);
770 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
771 = STMT_VINFO_RELATED_STMT (first);
774 /* If not all stmt in the chain are patterns or if we failed
775 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
776 it as regular reduction instead. */
777 else
779 stmt_vec_info vinfo = first;
780 stmt_vec_info last = NULL;
781 while (vinfo)
783 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
784 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
785 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
786 last = vinfo;
787 vinfo = next;
789 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
790 = vect_internal_def;
791 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
792 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
793 --i;
798 /* Function vect_get_loop_niters.
800 Determine how many iterations the loop is executed and place it
801 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
802 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
803 niter information holds in ASSUMPTIONS.
805 Return the loop exit condition. */
808 static gcond *
809 vect_get_loop_niters (class loop *loop, tree *assumptions,
810 tree *number_of_iterations, tree *number_of_iterationsm1)
812 edge exit = single_exit (loop);
813 class tree_niter_desc niter_desc;
814 tree niter_assumptions, niter, may_be_zero;
815 gcond *cond = get_loop_exit_condition (loop);
817 *assumptions = boolean_true_node;
818 *number_of_iterationsm1 = chrec_dont_know;
819 *number_of_iterations = chrec_dont_know;
820 DUMP_VECT_SCOPE ("get_loop_niters");
822 if (!exit)
823 return cond;
825 may_be_zero = NULL_TREE;
826 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
827 || chrec_contains_undetermined (niter_desc.niter))
828 return cond;
830 niter_assumptions = niter_desc.assumptions;
831 may_be_zero = niter_desc.may_be_zero;
832 niter = niter_desc.niter;
834 if (may_be_zero && integer_zerop (may_be_zero))
835 may_be_zero = NULL_TREE;
837 if (may_be_zero)
839 if (COMPARISON_CLASS_P (may_be_zero))
841 /* Try to combine may_be_zero with assumptions, this can simplify
842 computation of niter expression. */
843 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
844 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
845 niter_assumptions,
846 fold_build1 (TRUTH_NOT_EXPR,
847 boolean_type_node,
848 may_be_zero));
849 else
850 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
851 build_int_cst (TREE_TYPE (niter), 0),
852 rewrite_to_non_trapping_overflow (niter));
854 may_be_zero = NULL_TREE;
856 else if (integer_nonzerop (may_be_zero))
858 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
859 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
860 return cond;
862 else
863 return cond;
866 *assumptions = niter_assumptions;
867 *number_of_iterationsm1 = niter;
869 /* We want the number of loop header executions which is the number
870 of latch executions plus one.
871 ??? For UINT_MAX latch executions this number overflows to zero
872 for loops like do { n++; } while (n != 0); */
873 if (niter && !chrec_contains_undetermined (niter))
874 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
875 build_int_cst (TREE_TYPE (niter), 1));
876 *number_of_iterations = niter;
878 return cond;
881 /* Function bb_in_loop_p
883 Used as predicate for dfs order traversal of the loop bbs. */
885 static bool
886 bb_in_loop_p (const_basic_block bb, const void *data)
888 const class loop *const loop = (const class loop *)data;
889 if (flow_bb_inside_loop_p (loop, bb))
890 return true;
891 return false;
895 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
896 stmt_vec_info structs for all the stmts in LOOP_IN. */
898 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
899 : vec_info (vec_info::loop, shared),
900 loop (loop_in),
901 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
902 num_itersm1 (NULL_TREE),
903 num_iters (NULL_TREE),
904 num_iters_unchanged (NULL_TREE),
905 num_iters_assumptions (NULL_TREE),
906 vector_costs (nullptr),
907 scalar_costs (nullptr),
908 th (0),
909 versioning_threshold (0),
910 vectorization_factor (0),
911 main_loop_edge (nullptr),
912 skip_main_loop_edge (nullptr),
913 skip_this_loop_edge (nullptr),
914 reusable_accumulators (),
915 suggested_unroll_factor (1),
916 max_vectorization_factor (0),
917 mask_skip_niters (NULL_TREE),
918 rgroup_compare_type (NULL_TREE),
919 simd_if_cond (NULL_TREE),
920 unaligned_dr (NULL),
921 peeling_for_alignment (0),
922 ptr_mask (0),
923 ivexpr_map (NULL),
924 scan_map (NULL),
925 slp_unrolling_factor (1),
926 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
927 vectorizable (false),
928 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
929 using_partial_vectors_p (false),
930 epil_using_partial_vectors_p (false),
931 partial_load_store_bias (0),
932 peeling_for_gaps (false),
933 peeling_for_niter (false),
934 no_data_dependencies (false),
935 has_mask_store (false),
936 scalar_loop_scaling (profile_probability::uninitialized ()),
937 scalar_loop (NULL),
938 orig_loop_info (NULL)
940 /* CHECKME: We want to visit all BBs before their successors (except for
941 latch blocks, for which this assertion wouldn't hold). In the simple
942 case of the loop forms we allow, a dfs order of the BBs would the same
943 as reversed postorder traversal, so we are safe. */
945 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
946 bbs, loop->num_nodes, loop);
947 gcc_assert (nbbs == loop->num_nodes);
949 for (unsigned int i = 0; i < nbbs; i++)
951 basic_block bb = bbs[i];
952 gimple_stmt_iterator si;
954 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
956 gimple *phi = gsi_stmt (si);
957 gimple_set_uid (phi, 0);
958 add_stmt (phi);
961 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
963 gimple *stmt = gsi_stmt (si);
964 gimple_set_uid (stmt, 0);
965 if (is_gimple_debug (stmt))
966 continue;
967 add_stmt (stmt);
968 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
969 third argument is the #pragma omp simd if (x) condition, when 0,
970 loop shouldn't be vectorized, when non-zero constant, it should
971 be vectorized normally, otherwise versioned with vectorized loop
972 done if the condition is non-zero at runtime. */
973 if (loop_in->simduid
974 && is_gimple_call (stmt)
975 && gimple_call_internal_p (stmt)
976 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
977 && gimple_call_num_args (stmt) >= 3
978 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
979 && (loop_in->simduid
980 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
982 tree arg = gimple_call_arg (stmt, 2);
983 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
984 simd_if_cond = arg;
985 else
986 gcc_assert (integer_nonzerop (arg));
991 epilogue_vinfos.create (6);
994 /* Free all levels of rgroup CONTROLS. */
996 void
997 release_vec_loop_controls (vec<rgroup_controls> *controls)
999 rgroup_controls *rgc;
1000 unsigned int i;
1001 FOR_EACH_VEC_ELT (*controls, i, rgc)
1002 rgc->controls.release ();
1003 controls->release ();
1006 /* Free all memory used by the _loop_vec_info, as well as all the
1007 stmt_vec_info structs of all the stmts in the loop. */
1009 _loop_vec_info::~_loop_vec_info ()
1011 free (bbs);
1013 release_vec_loop_controls (&masks);
1014 release_vec_loop_controls (&lens);
1015 delete ivexpr_map;
1016 delete scan_map;
1017 epilogue_vinfos.release ();
1018 delete scalar_costs;
1019 delete vector_costs;
1021 /* When we release an epiloge vinfo that we do not intend to use
1022 avoid clearing AUX of the main loop which should continue to
1023 point to the main loop vinfo since otherwise we'll leak that. */
1024 if (loop->aux == this)
1025 loop->aux = NULL;
1028 /* Return an invariant or register for EXPR and emit necessary
1029 computations in the LOOP_VINFO loop preheader. */
1031 tree
1032 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1034 if (is_gimple_reg (expr)
1035 || is_gimple_min_invariant (expr))
1036 return expr;
1038 if (! loop_vinfo->ivexpr_map)
1039 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1040 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1041 if (! cached)
1043 gimple_seq stmts = NULL;
1044 cached = force_gimple_operand (unshare_expr (expr),
1045 &stmts, true, NULL_TREE);
1046 if (stmts)
1048 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1049 gsi_insert_seq_on_edge_immediate (e, stmts);
1052 return cached;
1055 /* Return true if we can use CMP_TYPE as the comparison type to produce
1056 all masks required to mask LOOP_VINFO. */
1058 static bool
1059 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1061 rgroup_controls *rgm;
1062 unsigned int i;
1063 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1064 if (rgm->type != NULL_TREE
1065 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1066 cmp_type, rgm->type,
1067 OPTIMIZE_FOR_SPEED))
1068 return false;
1069 return true;
1072 /* Calculate the maximum number of scalars per iteration for every
1073 rgroup in LOOP_VINFO. */
1075 static unsigned int
1076 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1078 unsigned int res = 1;
1079 unsigned int i;
1080 rgroup_controls *rgm;
1081 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1082 res = MAX (res, rgm->max_nscalars_per_iter);
1083 return res;
1086 /* Calculate the minimum precision necessary to represent:
1088 MAX_NITERS * FACTOR
1090 as an unsigned integer, where MAX_NITERS is the maximum number of
1091 loop header iterations for the original scalar form of LOOP_VINFO. */
1093 static unsigned
1094 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1096 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1098 /* Get the maximum number of iterations that is representable
1099 in the counter type. */
1100 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1101 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1103 /* Get a more refined estimate for the number of iterations. */
1104 widest_int max_back_edges;
1105 if (max_loop_iterations (loop, &max_back_edges))
1106 max_ni = wi::smin (max_ni, max_back_edges + 1);
1108 /* Work out how many bits we need to represent the limit. */
1109 return wi::min_precision (max_ni * factor, UNSIGNED);
1112 /* True if the loop needs peeling or partial vectors when vectorized. */
1114 static bool
1115 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1117 unsigned HOST_WIDE_INT const_vf;
1118 HOST_WIDE_INT max_niter
1119 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1121 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1122 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1123 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1124 (loop_vinfo));
1126 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1127 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1129 /* Work out the (constant) number of iterations that need to be
1130 peeled for reasons other than niters. */
1131 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1132 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1133 peel_niter += 1;
1134 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1135 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1136 return true;
1138 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1139 /* ??? When peeling for gaps but not alignment, we could
1140 try to check whether the (variable) niters is known to be
1141 VF * N + 1. That's something of a niche case though. */
1142 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1143 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1144 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1145 < (unsigned) exact_log2 (const_vf))
1146 /* In case of versioning, check if the maximum number of
1147 iterations is greater than th. If they are identical,
1148 the epilogue is unnecessary. */
1149 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1150 || ((unsigned HOST_WIDE_INT) max_niter
1151 > (th / const_vf) * const_vf))))
1152 return true;
1154 return false;
1157 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1158 whether we can actually generate the masks required. Return true if so,
1159 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1161 static bool
1162 vect_verify_full_masking (loop_vec_info loop_vinfo)
1164 unsigned int min_ni_width;
1165 unsigned int max_nscalars_per_iter
1166 = vect_get_max_nscalars_per_iter (loop_vinfo);
1168 /* Use a normal loop if there are no statements that need masking.
1169 This only happens in rare degenerate cases: it means that the loop
1170 has no loads, no stores, and no live-out values. */
1171 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1172 return false;
1174 /* Work out how many bits we need to represent the limit. */
1175 min_ni_width
1176 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1178 /* Find a scalar mode for which WHILE_ULT is supported. */
1179 opt_scalar_int_mode cmp_mode_iter;
1180 tree cmp_type = NULL_TREE;
1181 tree iv_type = NULL_TREE;
1182 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1183 unsigned int iv_precision = UINT_MAX;
1185 if (iv_limit != -1)
1186 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1187 UNSIGNED);
1189 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1191 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1192 if (cmp_bits >= min_ni_width
1193 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1195 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1196 if (this_type
1197 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1199 /* Although we could stop as soon as we find a valid mode,
1200 there are at least two reasons why that's not always the
1201 best choice:
1203 - An IV that's Pmode or wider is more likely to be reusable
1204 in address calculations than an IV that's narrower than
1205 Pmode.
1207 - Doing the comparison in IV_PRECISION or wider allows
1208 a natural 0-based IV, whereas using a narrower comparison
1209 type requires mitigations against wrap-around.
1211 Conversely, if the IV limit is variable, doing the comparison
1212 in a wider type than the original type can introduce
1213 unnecessary extensions, so picking the widest valid mode
1214 is not always a good choice either.
1216 Here we prefer the first IV type that's Pmode or wider,
1217 and the first comparison type that's IV_PRECISION or wider.
1218 (The comparison type must be no wider than the IV type,
1219 to avoid extensions in the vector loop.)
1221 ??? We might want to try continuing beyond Pmode for ILP32
1222 targets if CMP_BITS < IV_PRECISION. */
1223 iv_type = this_type;
1224 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1225 cmp_type = this_type;
1226 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1227 break;
1232 if (!cmp_type)
1233 return false;
1235 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1236 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1237 return true;
1240 /* Check whether we can use vector access with length based on precison
1241 comparison. So far, to keep it simple, we only allow the case that the
1242 precision of the target supported length is larger than the precision
1243 required by loop niters. */
1245 static bool
1246 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1248 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1249 return false;
1251 machine_mode len_load_mode = get_len_load_store_mode
1252 (loop_vinfo->vector_mode, true).require ();
1253 machine_mode len_store_mode = get_len_load_store_mode
1254 (loop_vinfo->vector_mode, false).require ();
1256 signed char partial_load_bias = internal_len_load_store_bias
1257 (IFN_LEN_LOAD, len_load_mode);
1259 signed char partial_store_bias = internal_len_load_store_bias
1260 (IFN_LEN_STORE, len_store_mode);
1262 gcc_assert (partial_load_bias == partial_store_bias);
1264 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1265 return false;
1267 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1268 len_loads with a length of zero. In order to avoid that we prohibit
1269 more than one loop length here. */
1270 if (partial_load_bias == -1
1271 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1272 return false;
1274 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1276 unsigned int max_nitems_per_iter = 1;
1277 unsigned int i;
1278 rgroup_controls *rgl;
1279 /* Find the maximum number of items per iteration for every rgroup. */
1280 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1282 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1283 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1286 /* Work out how many bits we need to represent the length limit. */
1287 unsigned int min_ni_prec
1288 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1290 /* Now use the maximum of below precisions for one suitable IV type:
1291 - the IV's natural precision
1292 - the precision needed to hold: the maximum number of scalar
1293 iterations multiplied by the scale factor (min_ni_prec above)
1294 - the Pmode precision
1296 If min_ni_prec is less than the precision of the current niters,
1297 we perfer to still use the niters type. Prefer to use Pmode and
1298 wider IV to avoid narrow conversions. */
1300 unsigned int ni_prec
1301 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1302 min_ni_prec = MAX (min_ni_prec, ni_prec);
1303 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1305 tree iv_type = NULL_TREE;
1306 opt_scalar_int_mode tmode_iter;
1307 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1309 scalar_mode tmode = tmode_iter.require ();
1310 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1312 /* ??? Do we really want to construct one IV whose precision exceeds
1313 BITS_PER_WORD? */
1314 if (tbits > BITS_PER_WORD)
1315 break;
1317 /* Find the first available standard integral type. */
1318 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1320 iv_type = build_nonstandard_integer_type (tbits, true);
1321 break;
1325 if (!iv_type)
1327 if (dump_enabled_p ())
1328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1329 "can't vectorize with length-based partial vectors"
1330 " because there is no suitable iv type.\n");
1331 return false;
1334 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1335 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1337 return true;
1340 /* Calculate the cost of one scalar iteration of the loop. */
1341 static void
1342 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1344 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1345 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1346 int nbbs = loop->num_nodes, factor;
1347 int innerloop_iters, i;
1349 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1351 /* Gather costs for statements in the scalar loop. */
1353 /* FORNOW. */
1354 innerloop_iters = 1;
1355 if (loop->inner)
1356 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1358 for (i = 0; i < nbbs; i++)
1360 gimple_stmt_iterator si;
1361 basic_block bb = bbs[i];
1363 if (bb->loop_father == loop->inner)
1364 factor = innerloop_iters;
1365 else
1366 factor = 1;
1368 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1370 gimple *stmt = gsi_stmt (si);
1371 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1373 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1374 continue;
1376 /* Skip stmts that are not vectorized inside the loop. */
1377 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1378 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1379 && (!STMT_VINFO_LIVE_P (vstmt_info)
1380 || !VECTORIZABLE_CYCLE_DEF
1381 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1382 continue;
1384 vect_cost_for_stmt kind;
1385 if (STMT_VINFO_DATA_REF (stmt_info))
1387 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1388 kind = scalar_load;
1389 else
1390 kind = scalar_store;
1392 else if (vect_nop_conversion_p (stmt_info))
1393 continue;
1394 else
1395 kind = scalar_stmt;
1397 /* We are using vect_prologue here to avoid scaling twice
1398 by the inner loop factor. */
1399 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1400 factor, kind, stmt_info, 0, vect_prologue);
1404 /* Now accumulate cost. */
1405 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1406 add_stmt_costs (loop_vinfo->scalar_costs,
1407 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1408 loop_vinfo->scalar_costs->finish_cost (nullptr);
1412 /* Function vect_analyze_loop_form.
1414 Verify that certain CFG restrictions hold, including:
1415 - the loop has a pre-header
1416 - the loop has a single entry and exit
1417 - the loop exit condition is simple enough
1418 - the number of iterations can be analyzed, i.e, a countable loop. The
1419 niter could be analyzed under some assumptions. */
1421 opt_result
1422 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1424 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1426 /* Different restrictions apply when we are considering an inner-most loop,
1427 vs. an outer (nested) loop.
1428 (FORNOW. May want to relax some of these restrictions in the future). */
1430 info->inner_loop_cond = NULL;
1431 if (!loop->inner)
1433 /* Inner-most loop. We currently require that the number of BBs is
1434 exactly 2 (the header and latch). Vectorizable inner-most loops
1435 look like this:
1437 (pre-header)
1439 header <--------+
1440 | | |
1441 | +--> latch --+
1443 (exit-bb) */
1445 if (loop->num_nodes != 2)
1446 return opt_result::failure_at (vect_location,
1447 "not vectorized:"
1448 " control flow in loop.\n");
1450 if (empty_block_p (loop->header))
1451 return opt_result::failure_at (vect_location,
1452 "not vectorized: empty loop.\n");
1454 else
1456 class loop *innerloop = loop->inner;
1457 edge entryedge;
1459 /* Nested loop. We currently require that the loop is doubly-nested,
1460 contains a single inner loop, and the number of BBs is exactly 5.
1461 Vectorizable outer-loops look like this:
1463 (pre-header)
1465 header <---+
1467 inner-loop |
1469 tail ------+
1471 (exit-bb)
1473 The inner-loop has the properties expected of inner-most loops
1474 as described above. */
1476 if ((loop->inner)->inner || (loop->inner)->next)
1477 return opt_result::failure_at (vect_location,
1478 "not vectorized:"
1479 " multiple nested loops.\n");
1481 if (loop->num_nodes != 5)
1482 return opt_result::failure_at (vect_location,
1483 "not vectorized:"
1484 " control flow in loop.\n");
1486 entryedge = loop_preheader_edge (innerloop);
1487 if (entryedge->src != loop->header
1488 || !single_exit (innerloop)
1489 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1490 return opt_result::failure_at (vect_location,
1491 "not vectorized:"
1492 " unsupported outerloop form.\n");
1494 /* Analyze the inner-loop. */
1495 vect_loop_form_info inner;
1496 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1497 if (!res)
1499 if (dump_enabled_p ())
1500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1501 "not vectorized: Bad inner loop.\n");
1502 return res;
1505 /* Don't support analyzing niter under assumptions for inner
1506 loop. */
1507 if (!integer_onep (inner.assumptions))
1508 return opt_result::failure_at (vect_location,
1509 "not vectorized: Bad inner loop.\n");
1511 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1512 return opt_result::failure_at (vect_location,
1513 "not vectorized: inner-loop count not"
1514 " invariant.\n");
1516 if (dump_enabled_p ())
1517 dump_printf_loc (MSG_NOTE, vect_location,
1518 "Considering outer-loop vectorization.\n");
1519 info->inner_loop_cond = inner.loop_cond;
1522 if (!single_exit (loop))
1523 return opt_result::failure_at (vect_location,
1524 "not vectorized: multiple exits.\n");
1525 if (EDGE_COUNT (loop->header->preds) != 2)
1526 return opt_result::failure_at (vect_location,
1527 "not vectorized:"
1528 " too many incoming edges.\n");
1530 /* We assume that the loop exit condition is at the end of the loop. i.e,
1531 that the loop is represented as a do-while (with a proper if-guard
1532 before the loop if needed), where the loop header contains all the
1533 executable statements, and the latch is empty. */
1534 if (!empty_block_p (loop->latch)
1535 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1536 return opt_result::failure_at (vect_location,
1537 "not vectorized: latch block not empty.\n");
1539 /* Make sure the exit is not abnormal. */
1540 edge e = single_exit (loop);
1541 if (e->flags & EDGE_ABNORMAL)
1542 return opt_result::failure_at (vect_location,
1543 "not vectorized:"
1544 " abnormal loop exit edge.\n");
1546 info->loop_cond
1547 = vect_get_loop_niters (loop, &info->assumptions,
1548 &info->number_of_iterations,
1549 &info->number_of_iterationsm1);
1550 if (!info->loop_cond)
1551 return opt_result::failure_at
1552 (vect_location,
1553 "not vectorized: complicated exit condition.\n");
1555 if (integer_zerop (info->assumptions)
1556 || !info->number_of_iterations
1557 || chrec_contains_undetermined (info->number_of_iterations))
1558 return opt_result::failure_at
1559 (info->loop_cond,
1560 "not vectorized: number of iterations cannot be computed.\n");
1562 if (integer_zerop (info->number_of_iterations))
1563 return opt_result::failure_at
1564 (info->loop_cond,
1565 "not vectorized: number of iterations = 0.\n");
1567 if (!(tree_fits_shwi_p (info->number_of_iterations)
1568 && tree_to_shwi (info->number_of_iterations) > 0))
1570 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_NOTE, vect_location,
1573 "Symbolic number of iterations is ");
1574 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1575 dump_printf (MSG_NOTE, "\n");
1579 return opt_result::success ();
1582 /* Create a loop_vec_info for LOOP with SHARED and the
1583 vect_analyze_loop_form result. */
1585 loop_vec_info
1586 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1587 const vect_loop_form_info *info,
1588 loop_vec_info main_loop_info)
1590 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1591 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1592 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1593 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1594 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1595 /* Also record the assumptions for versioning. */
1596 if (!integer_onep (info->assumptions) && !main_loop_info)
1597 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1599 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1600 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1601 if (info->inner_loop_cond)
1603 stmt_vec_info inner_loop_cond_info
1604 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1605 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1606 /* If we have an estimate on the number of iterations of the inner
1607 loop use that to limit the scale for costing, otherwise use
1608 --param vect-inner-loop-cost-factor literally. */
1609 widest_int nit;
1610 if (estimated_stmt_executions (loop->inner, &nit))
1611 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1612 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1615 return loop_vinfo;
1620 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1621 statements update the vectorization factor. */
1623 static void
1624 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1626 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1627 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1628 int nbbs = loop->num_nodes;
1629 poly_uint64 vectorization_factor;
1630 int i;
1632 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1634 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1635 gcc_assert (known_ne (vectorization_factor, 0U));
1637 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1638 vectorization factor of the loop is the unrolling factor required by
1639 the SLP instances. If that unrolling factor is 1, we say, that we
1640 perform pure SLP on loop - cross iteration parallelism is not
1641 exploited. */
1642 bool only_slp_in_loop = true;
1643 for (i = 0; i < nbbs; i++)
1645 basic_block bb = bbs[i];
1646 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1647 gsi_next (&si))
1649 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1650 if (!stmt_info)
1651 continue;
1652 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1653 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1654 && !PURE_SLP_STMT (stmt_info))
1655 /* STMT needs both SLP and loop-based vectorization. */
1656 only_slp_in_loop = false;
1658 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1659 gsi_next (&si))
1661 if (is_gimple_debug (gsi_stmt (si)))
1662 continue;
1663 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1664 stmt_info = vect_stmt_to_vectorize (stmt_info);
1665 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1666 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1667 && !PURE_SLP_STMT (stmt_info))
1668 /* STMT needs both SLP and loop-based vectorization. */
1669 only_slp_in_loop = false;
1673 if (only_slp_in_loop)
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_NOTE, vect_location,
1677 "Loop contains only SLP stmts\n");
1678 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1680 else
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "Loop contains SLP and non-SLP stmts\n");
1685 /* Both the vectorization factor and unroll factor have the form
1686 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1687 so they must have a common multiple. */
1688 vectorization_factor
1689 = force_common_multiple (vectorization_factor,
1690 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1693 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1694 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_NOTE, vect_location,
1697 "Updating vectorization factor to ");
1698 dump_dec (MSG_NOTE, vectorization_factor);
1699 dump_printf (MSG_NOTE, ".\n");
1703 /* Return true if STMT_INFO describes a double reduction phi and if
1704 the other phi in the reduction is also relevant for vectorization.
1705 This rejects cases such as:
1707 outer1:
1708 x_1 = PHI <x_3(outer2), ...>;
1711 inner:
1712 x_2 = ...;
1715 outer2:
1716 x_3 = PHI <x_2(inner)>;
1718 if nothing in x_2 or elsewhere makes x_1 relevant. */
1720 static bool
1721 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1723 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1724 return false;
1726 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1729 /* Function vect_analyze_loop_operations.
1731 Scan the loop stmts and make sure they are all vectorizable. */
1733 static opt_result
1734 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1736 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1737 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1738 int nbbs = loop->num_nodes;
1739 int i;
1740 stmt_vec_info stmt_info;
1741 bool need_to_vectorize = false;
1742 bool ok;
1744 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1746 auto_vec<stmt_info_for_cost> cost_vec;
1748 for (i = 0; i < nbbs; i++)
1750 basic_block bb = bbs[i];
1752 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1753 gsi_next (&si))
1755 gphi *phi = si.phi ();
1756 ok = true;
1758 stmt_info = loop_vinfo->lookup_stmt (phi);
1759 if (dump_enabled_p ())
1760 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1761 (gimple *) phi);
1762 if (virtual_operand_p (gimple_phi_result (phi)))
1763 continue;
1765 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1766 (i.e., a phi in the tail of the outer-loop). */
1767 if (! is_loop_header_bb_p (bb))
1769 /* FORNOW: we currently don't support the case that these phis
1770 are not used in the outerloop (unless it is double reduction,
1771 i.e., this phi is vect_reduction_def), cause this case
1772 requires to actually do something here. */
1773 if (STMT_VINFO_LIVE_P (stmt_info)
1774 && !vect_active_double_reduction_p (stmt_info))
1775 return opt_result::failure_at (phi,
1776 "Unsupported loop-closed phi"
1777 " in outer-loop.\n");
1779 /* If PHI is used in the outer loop, we check that its operand
1780 is defined in the inner loop. */
1781 if (STMT_VINFO_RELEVANT_P (stmt_info))
1783 tree phi_op;
1785 if (gimple_phi_num_args (phi) != 1)
1786 return opt_result::failure_at (phi, "unsupported phi");
1788 phi_op = PHI_ARG_DEF (phi, 0);
1789 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1790 if (!op_def_info)
1791 return opt_result::failure_at (phi, "unsupported phi\n");
1793 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1794 && (STMT_VINFO_RELEVANT (op_def_info)
1795 != vect_used_in_outer_by_reduction))
1796 return opt_result::failure_at (phi, "unsupported phi\n");
1798 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1799 || (STMT_VINFO_DEF_TYPE (stmt_info)
1800 == vect_double_reduction_def))
1801 && !vectorizable_lc_phi (loop_vinfo,
1802 stmt_info, NULL, NULL))
1803 return opt_result::failure_at (phi, "unsupported phi\n");
1806 continue;
1809 gcc_assert (stmt_info);
1811 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1812 || STMT_VINFO_LIVE_P (stmt_info))
1813 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1814 /* A scalar-dependence cycle that we don't support. */
1815 return opt_result::failure_at (phi,
1816 "not vectorized:"
1817 " scalar dependence cycle.\n");
1819 if (STMT_VINFO_RELEVANT_P (stmt_info))
1821 need_to_vectorize = true;
1822 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1823 && ! PURE_SLP_STMT (stmt_info))
1824 ok = vectorizable_induction (loop_vinfo,
1825 stmt_info, NULL, NULL,
1826 &cost_vec);
1827 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1828 || (STMT_VINFO_DEF_TYPE (stmt_info)
1829 == vect_double_reduction_def)
1830 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1831 && ! PURE_SLP_STMT (stmt_info))
1832 ok = vectorizable_reduction (loop_vinfo,
1833 stmt_info, NULL, NULL, &cost_vec);
1836 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1837 if (ok
1838 && STMT_VINFO_LIVE_P (stmt_info)
1839 && !PURE_SLP_STMT (stmt_info))
1840 ok = vectorizable_live_operation (loop_vinfo,
1841 stmt_info, NULL, NULL, NULL,
1842 -1, false, &cost_vec);
1844 if (!ok)
1845 return opt_result::failure_at (phi,
1846 "not vectorized: relevant phi not "
1847 "supported: %G",
1848 static_cast <gimple *> (phi));
1851 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1852 gsi_next (&si))
1854 gimple *stmt = gsi_stmt (si);
1855 if (!gimple_clobber_p (stmt)
1856 && !is_gimple_debug (stmt))
1858 opt_result res
1859 = vect_analyze_stmt (loop_vinfo,
1860 loop_vinfo->lookup_stmt (stmt),
1861 &need_to_vectorize,
1862 NULL, NULL, &cost_vec);
1863 if (!res)
1864 return res;
1867 } /* bbs */
1869 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1871 /* All operations in the loop are either irrelevant (deal with loop
1872 control, or dead), or only used outside the loop and can be moved
1873 out of the loop (e.g. invariants, inductions). The loop can be
1874 optimized away by scalar optimizations. We're better off not
1875 touching this loop. */
1876 if (!need_to_vectorize)
1878 if (dump_enabled_p ())
1879 dump_printf_loc (MSG_NOTE, vect_location,
1880 "All the computation can be taken out of the loop.\n");
1881 return opt_result::failure_at
1882 (vect_location,
1883 "not vectorized: redundant loop. no profit to vectorize.\n");
1886 return opt_result::success ();
1889 /* Return true if we know that the iteration count is smaller than the
1890 vectorization factor. Return false if it isn't, or if we can't be sure
1891 either way. */
1893 static bool
1894 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1896 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1898 HOST_WIDE_INT max_niter;
1899 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1900 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1901 else
1902 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1904 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1905 return true;
1907 return false;
1910 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1911 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1912 definitely no, or -1 if it's worth retrying. */
1914 static int
1915 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1916 unsigned *suggested_unroll_factor)
1918 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1919 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1921 /* Only loops that can handle partially-populated vectors can have iteration
1922 counts less than the vectorization factor. */
1923 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1925 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "not vectorized: iteration count smaller than "
1930 "vectorization factor.\n");
1931 return 0;
1935 /* If using the "very cheap" model, reject cases in which we'd keep
1936 a copy of the scalar code (even if we might be able to vectorize it). */
1937 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1938 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1939 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1940 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944 "some scalar iterations would need to be peeled\n");
1945 return 0;
1948 int min_profitable_iters, min_profitable_estimate;
1949 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1950 &min_profitable_estimate,
1951 suggested_unroll_factor);
1953 if (min_profitable_iters < 0)
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1957 "not vectorized: vectorization not profitable.\n");
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1960 "not vectorized: vector version will never be "
1961 "profitable.\n");
1962 return -1;
1965 int min_scalar_loop_bound = (param_min_vect_loop_bound
1966 * assumed_vf);
1968 /* Use the cost model only if it is more conservative than user specified
1969 threshold. */
1970 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1971 min_profitable_iters);
1973 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1975 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1976 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1978 if (dump_enabled_p ())
1979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980 "not vectorized: vectorization not profitable.\n");
1981 if (dump_enabled_p ())
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "not vectorized: iteration count smaller than user "
1984 "specified loop bound parameter or minimum profitable "
1985 "iterations (whichever is more conservative).\n");
1986 return 0;
1989 /* The static profitability threshold min_profitable_estimate includes
1990 the cost of having to check at runtime whether the scalar loop
1991 should be used instead. If it turns out that we don't need or want
1992 such a check, the threshold we should use for the static estimate
1993 is simply the point at which the vector loop becomes more profitable
1994 than the scalar loop. */
1995 if (min_profitable_estimate > min_profitable_iters
1996 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1997 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1998 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1999 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2003 " choice between the scalar and vector loops\n");
2004 min_profitable_estimate = min_profitable_iters;
2007 /* If the vector loop needs multiple iterations to be beneficial then
2008 things are probably too close to call, and the conservative thing
2009 would be to stick with the scalar code. */
2010 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2011 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "one iteration of the vector loop would be"
2016 " more expensive than the equivalent number of"
2017 " iterations of the scalar loop\n");
2018 return 0;
2021 HOST_WIDE_INT estimated_niter;
2023 /* If we are vectorizing an epilogue then we know the maximum number of
2024 scalar iterations it will cover is at least one lower than the
2025 vectorization factor of the main loop. */
2026 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2027 estimated_niter
2028 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2029 else
2031 estimated_niter = estimated_stmt_executions_int (loop);
2032 if (estimated_niter == -1)
2033 estimated_niter = likely_max_stmt_executions_int (loop);
2035 if (estimated_niter != -1
2036 && ((unsigned HOST_WIDE_INT) estimated_niter
2037 < MAX (th, (unsigned) min_profitable_estimate)))
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2041 "not vectorized: estimated iteration count too "
2042 "small.\n");
2043 if (dump_enabled_p ())
2044 dump_printf_loc (MSG_NOTE, vect_location,
2045 "not vectorized: estimated iteration count smaller "
2046 "than specified loop bound parameter or minimum "
2047 "profitable iterations (whichever is more "
2048 "conservative).\n");
2049 return -1;
2052 return 1;
2055 static opt_result
2056 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2057 vec<data_reference_p> *datarefs,
2058 unsigned int *n_stmts)
2060 *n_stmts = 0;
2061 for (unsigned i = 0; i < loop->num_nodes; i++)
2062 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2063 !gsi_end_p (gsi); gsi_next (&gsi))
2065 gimple *stmt = gsi_stmt (gsi);
2066 if (is_gimple_debug (stmt))
2067 continue;
2068 ++(*n_stmts);
2069 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2070 NULL, 0);
2071 if (!res)
2073 if (is_gimple_call (stmt) && loop->safelen)
2075 tree fndecl = gimple_call_fndecl (stmt), op;
2076 if (fndecl != NULL_TREE)
2078 cgraph_node *node = cgraph_node::get (fndecl);
2079 if (node != NULL && node->simd_clones != NULL)
2081 unsigned int j, n = gimple_call_num_args (stmt);
2082 for (j = 0; j < n; j++)
2084 op = gimple_call_arg (stmt, j);
2085 if (DECL_P (op)
2086 || (REFERENCE_CLASS_P (op)
2087 && get_base_address (op)))
2088 break;
2090 op = gimple_call_lhs (stmt);
2091 /* Ignore #pragma omp declare simd functions
2092 if they don't have data references in the
2093 call stmt itself. */
2094 if (j == n
2095 && !(op
2096 && (DECL_P (op)
2097 || (REFERENCE_CLASS_P (op)
2098 && get_base_address (op)))))
2099 continue;
2103 return res;
2105 /* If dependence analysis will give up due to the limit on the
2106 number of datarefs stop here and fail fatally. */
2107 if (datarefs->length ()
2108 > (unsigned)param_loop_max_datarefs_for_datadeps)
2109 return opt_result::failure_at (stmt, "exceeded param "
2110 "loop-max-datarefs-for-datadeps\n");
2112 return opt_result::success ();
2115 /* Look for SLP-only access groups and turn each individual access into its own
2116 group. */
2117 static void
2118 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2120 unsigned int i;
2121 struct data_reference *dr;
2123 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2125 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2126 FOR_EACH_VEC_ELT (datarefs, i, dr)
2128 gcc_assert (DR_REF (dr));
2129 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2131 /* Check if the load is a part of an interleaving chain. */
2132 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2134 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2135 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2136 unsigned int group_size = DR_GROUP_SIZE (first_element);
2138 /* Check if SLP-only groups. */
2139 if (!STMT_SLP_TYPE (stmt_info)
2140 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2142 /* Dissolve the group. */
2143 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2145 stmt_vec_info vinfo = first_element;
2146 while (vinfo)
2148 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2149 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2150 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2151 DR_GROUP_SIZE (vinfo) = 1;
2152 if (STMT_VINFO_STRIDED_P (first_element))
2153 DR_GROUP_GAP (vinfo) = 0;
2154 else
2155 DR_GROUP_GAP (vinfo) = group_size - 1;
2156 /* Duplicate and adjust alignment info, it needs to
2157 be present on each group leader, see dr_misalignment. */
2158 if (vinfo != first_element)
2160 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2161 dr_info2->target_alignment = dr_info->target_alignment;
2162 int misalignment = dr_info->misalignment;
2163 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2165 HOST_WIDE_INT diff
2166 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2167 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2168 unsigned HOST_WIDE_INT align_c
2169 = dr_info->target_alignment.to_constant ();
2170 misalignment = (misalignment + diff) % align_c;
2172 dr_info2->misalignment = misalignment;
2174 vinfo = next;
2181 /* Determine if operating on full vectors for LOOP_VINFO might leave
2182 some scalar iterations still to do. If so, decide how we should
2183 handle those scalar iterations. The possibilities are:
2185 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2186 In this case:
2188 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2189 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2190 LOOP_VINFO_PEELING_FOR_NITER == false
2192 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2193 to handle the remaining scalar iterations. In this case:
2195 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2196 LOOP_VINFO_PEELING_FOR_NITER == true
2198 There are two choices:
2200 (2a) Consider vectorizing the epilogue loop at the same VF as the
2201 main loop, but using partial vectors instead of full vectors.
2202 In this case:
2204 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2206 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2207 In this case:
2209 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2211 When FOR_EPILOGUE_P is true, make this determination based on the
2212 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2213 based on the assumption that LOOP_VINFO is the main loop. The caller
2214 has made sure that the number of iterations is set appropriately for
2215 this value of FOR_EPILOGUE_P. */
2217 opt_result
2218 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2219 bool for_epilogue_p)
2221 /* Determine whether there would be any scalar iterations left over. */
2222 bool need_peeling_or_partial_vectors_p
2223 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2225 /* Decide whether to vectorize the loop with partial vectors. */
2226 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2227 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2228 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2229 && need_peeling_or_partial_vectors_p)
2231 /* For partial-vector-usage=1, try to push the handling of partial
2232 vectors to the epilogue, with the main loop continuing to operate
2233 on full vectors.
2235 If we are unrolling we also do not want to use partial vectors. This
2236 is to avoid the overhead of generating multiple masks and also to
2237 avoid having to execute entire iterations of FALSE masked instructions
2238 when dealing with one or less full iterations.
2240 ??? We could then end up failing to use partial vectors if we
2241 decide to peel iterations into a prologue, and if the main loop
2242 then ends up processing fewer than VF iterations. */
2243 if ((param_vect_partial_vector_usage == 1
2244 || loop_vinfo->suggested_unroll_factor > 1)
2245 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2246 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2247 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2248 else
2249 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2252 if (dump_enabled_p ())
2254 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2255 dump_printf_loc (MSG_NOTE, vect_location,
2256 "operating on partial vectors%s.\n",
2257 for_epilogue_p ? " for epilogue loop" : "");
2258 else
2259 dump_printf_loc (MSG_NOTE, vect_location,
2260 "operating only on full vectors%s.\n",
2261 for_epilogue_p ? " for epilogue loop" : "");
2264 if (for_epilogue_p)
2266 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2267 gcc_assert (orig_loop_vinfo);
2268 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2269 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2270 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2273 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2274 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2276 /* Check that the loop processes at least one full vector. */
2277 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2278 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2279 if (known_lt (wi::to_widest (scalar_niters), vf))
2280 return opt_result::failure_at (vect_location,
2281 "loop does not have enough iterations"
2282 " to support vectorization.\n");
2284 /* If we need to peel an extra epilogue iteration to handle data
2285 accesses with gaps, check that there are enough scalar iterations
2286 available.
2288 The check above is redundant with this one when peeling for gaps,
2289 but the distinction is useful for diagnostics. */
2290 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2291 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2292 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2293 return opt_result::failure_at (vect_location,
2294 "loop does not have enough iterations"
2295 " to support peeling for gaps.\n");
2298 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2299 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2300 && need_peeling_or_partial_vectors_p);
2302 return opt_result::success ();
2305 /* Function vect_analyze_loop_2.
2307 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2308 analyses will record information in some members of LOOP_VINFO. FATAL
2309 indicates whether a failing analysis hit a fatal error. If a non-NULL
2310 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2311 with a worked out suggested unroll factor, while a NULL pointer shows we
2312 are going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2313 is to hold the slp decision when the suggested unroll factor is worked
2314 out.

On success LOOP_VINFO_VECTORIZABLE_P (LOOP_VINFO) is set and a successful
opt_result is returned; otherwise the failing opt_result is returned.
Some failures that happen after SLP was chosen jump to the 'again' label,
which rolls the analysis state back and restarts from 'start_over' with
SLP disabled, unless the checks made there show that retrying is
pointless. */
2315 static opt_result
2316 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2317 unsigned *suggested_unroll_factor,
2318 bool& slp_done_for_suggested_uf)
2320 opt_result ok = opt_result::success ();
2321 int res;
2322 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2323 poly_uint64 min_vf = 2;
2324 loop_vec_info orig_loop_vinfo = NULL;
2326 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2327 loop_vec_info of the first vectorized loop. */
2328 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2329 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2330 else
2331 orig_loop_vinfo = loop_vinfo;
2332 gcc_assert (orig_loop_vinfo);
2334 /* The first group of checks is independent of the vector size. */
2335 fatal = true;
2337 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2338 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2339 return opt_result::failure_at (vect_location,
2340 "not vectorized: simd if(0)\n");
2342 /* Find all data references in the loop (which correspond to vdefs/vuses)
2343 and analyze their evolution in the loop. */
2345 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2347 /* Gather the data references and count stmts in the loop. */
/* The data references are only gathered when LOOP_VINFO does not have
them yet; otherwise the shared copy is merely checked. */
2348 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
/* Note that this RES is a separate opt_result local to this scope; it
shadows the integer RES declared at the top of the function. */
2350 opt_result res
2351 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2352 &LOOP_VINFO_DATAREFS (loop_vinfo),
2353 &LOOP_VINFO_N_STMTS (loop_vinfo));
2354 if (!res)
2356 if (dump_enabled_p ())
2357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2358 "not vectorized: loop contains function "
2359 "calls or data references that cannot "
2360 "be analyzed\n");
2361 return res;
2363 loop_vinfo->shared->save_datarefs ();
2365 else
2366 loop_vinfo->shared->check_datarefs ();
2368 /* Analyze the data references and also adjust the minimal
2369 vectorization factor according to the loads and stores. */
2371 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2372 if (!ok)
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376 "bad data references.\n");
2377 return ok;
2380 /* Check if we are applying unroll factor now. */
2381 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
/* A caller that applies a suggested unroll factor must not ask for a
new one at the same time. */
2382 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2384 /* If the slp decision is false when suggested unroll factor is worked
2385 out, and we are applying suggested unroll factor, we can simply skip
2386 all slp related analyses this time. */
2387 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2389 /* Classify all cross-iteration scalar data-flow cycles.
2390 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2391 vect_analyze_scalar_cycles (loop_vinfo, slp);
2393 vect_pattern_recog (loop_vinfo);
2395 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2397 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2398 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2400 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2401 if (!ok)
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2405 "bad data access.\n");
2406 return ok;
2409 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2411 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2412 if (!ok)
2414 if (dump_enabled_p ())
2415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2416 "unexpected pattern.\n");
2417 return ok;
2420 /* The rest of the analysis below depends on the vector size in some
way, so FATAL is cleared from here on. */
2421 fatal = false;
2423 /* Analyze data dependences between the data-refs in the loop
2424 and adjust the maximum vectorization factor according to
2425 the dependences.
2426 FORNOW: fail at the first data dependence that we encounter. */
2428 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2429 if (!ok)
2431 if (dump_enabled_p ())
2432 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2433 "bad data dependence.\n");
2434 return ok;
/* Give up if the dependences limited MAX_VF to something that may be
smaller than the minimum VF required by the data references. */
2436 if (max_vf != MAX_VECTORIZATION_FACTOR
2437 && maybe_lt (max_vf, min_vf))
2438 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2439 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2441 ok = vect_determine_vectorization_factor (loop_vinfo);
2442 if (!ok)
2444 if (dump_enabled_p ())
2445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2446 "can't determine vectorization factor.\n");
2447 return ok;
/* Likewise if the chosen VF may exceed the limit set by the
dependences. */
2449 if (max_vf != MAX_VECTORIZATION_FACTOR
2450 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2451 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2453 /* Compute the scalar iteration cost. */
2454 vect_compute_single_scalar_iteration_cost (loop_vinfo);
/* Remember the VF as it is before any SLP adjustment, so that the
rollback at 'again' can restore it. */
2456 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2458 if (slp)
2460 /* Check the SLP opportunities in the loop, analyze and build
2461 SLP trees. */
2462 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2463 if (!ok)
2464 return ok;
2466 /* If there are any SLP instances mark them as pure_slp. */
2467 slp = vect_make_slp_decision (loop_vinfo);
2468 if (slp)
2470 /* Find stmts that need to be both vectorized and SLPed. */
2471 vect_detect_hybrid_slp (loop_vinfo);
2473 /* Update the vectorization factor based on the SLP decision. */
2474 vect_update_vf_for_slp (loop_vinfo);
2476 /* Optimize the SLP graph with the vectorization factor fixed. */
2477 vect_optimize_slp (loop_vinfo);
2479 /* Gather the loads reachable from the SLP graph entries. */
2480 vect_gather_slp_loads (loop_vinfo);
/* Saved so that the rollback at 'again' can restore the flag, which
the analysis below may clear. */
2484 bool saved_can_use_partial_vectors_p
2485 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2487 /* We don't expect to have to roll back to anything other than an empty
2488 set of rgroups. */
2489 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2491 /* This is the point where we can re-start analysis with SLP forced off. */
2492 start_over:
2494 /* Apply the suggested unrolling factor, this was determined by the backend
2495 during finish_cost the first time we ran the analysis for this
2496 vector mode. */
2497 if (applying_suggested_uf)
2498 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2500 /* Now the vectorization factor is final. */
2501 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2502 gcc_assert (known_ne (vectorization_factor, 0U));
2504 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2506 dump_printf_loc (MSG_NOTE, vect_location,
2507 "vectorization_factor = ");
2508 dump_dec (MSG_NOTE, vectorization_factor);
2509 dump_printf (MSG_NOTE, ", niters = %wd\n",
2510 LOOP_VINFO_INT_NITERS (loop_vinfo));
/* Fresh target cost data for this attempt; the rollback at 'again'
deletes it before jumping back to 'start_over'. */
2513 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2515 /* Analyze the alignment of the data-refs in the loop.
2516 Fail if a data reference is found that cannot be vectorized. */
2518 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2519 if (!ok)
2521 if (dump_enabled_p ())
2522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2523 "bad data alignment.\n");
2524 return ok;
2527 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2528 It is important to call pruning after vect_analyze_data_ref_accesses,
2529 since we use grouping information gathered by interleaving analysis. */
2530 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2531 if (!ok)
2532 return ok;
2534 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2535 vectorization, since we do not want to add extra peeling or
2536 add versioning for alignment. */
2537 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2538 /* This pass will decide on using loop versioning and/or loop peeling in
2539 order to enhance the alignment of data references in the loop. */
2540 ok = vect_enhance_data_refs_alignment (loop_vinfo);
/* For epilogues the call above is skipped and OK still holds the
successful result of vect_prune_runtime_alias_test_list, so this
check cannot trigger in that case. */
2541 if (!ok)
2542 return ok;
2544 if (slp)
2546 /* Analyze operations in the SLP instances. Note this may
2547 remove unsupported SLP instances which makes the above
2548 SLP kind detection invalid. */
2549 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2550 vect_slp_analyze_operations (loop_vinfo);
2551 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2553 ok = opt_result::failure_at (vect_location,
2554 "unsupported SLP instances\n");
2555 goto again;
2558 /* Check whether any load in ALL SLP instances is possibly permuted. */
2559 slp_tree load_node, slp_root;
2560 unsigned i, x;
2561 slp_instance instance;
/* Cleared as soon as an instance is seen that is not a store instance
with permuted loads and store-lanes support, or one of whose loads is
strided or lacks load-lanes support. */
2562 bool can_use_lanes = true;
2563 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2565 slp_root = SLP_INSTANCE_TREE (instance);
2566 int group_size = SLP_TREE_LANES (slp_root);
2567 tree vectype = SLP_TREE_VECTYPE (slp_root);
2568 bool loads_permuted = false;
2569 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2571 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2572 continue;
2573 unsigned j;
2574 stmt_vec_info load_info;
2575 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2576 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2578 loads_permuted = true;
2579 break;
2583 /* If the loads and stores can be handled with load/store-lane
2584 instructions record it and move on to the next instance. */
2585 if (loads_permuted
2586 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2587 && vect_store_lanes_supported (vectype, group_size, false))
2589 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2591 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2592 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2593 /* Use SLP for strided accesses (or if we can't
2594 load-lanes). */
2595 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2596 || ! vect_load_lanes_supported
2597 (STMT_VINFO_VECTYPE (stmt_vinfo),
2598 DR_GROUP_SIZE (stmt_vinfo), false))
2599 break;
/* I equals the number of loads only if the loop above ran to the end
without hitting the break. */
2602 can_use_lanes
2603 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2605 if (can_use_lanes && dump_enabled_p ())
2606 dump_printf_loc (MSG_NOTE, vect_location,
2607 "SLP instance %p can use load/store-lanes\n",
2608 (void *) instance);
2610 else
2612 can_use_lanes = false;
2613 break;
2617 /* If all SLP instances can use load/store-lanes abort SLP and try again
2618 with SLP disabled. */
2619 if (can_use_lanes)
2621 ok = opt_result::failure_at (vect_location,
2622 "Built SLP cancelled: can use "
2623 "load/store-lanes\n");
2624 if (dump_enabled_p ())
2625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2626 "Built SLP cancelled: all SLP instances support "
2627 "load/store-lanes\n");
2628 goto again;
2632 /* Dissolve SLP-only groups. */
2633 vect_dissolve_slp_only_groups (loop_vinfo);
2635 /* Scan all the remaining operations in the loop that are not subject
2636 to SLP and make sure they are vectorizable. */
2637 ok = vect_analyze_loop_operations (loop_vinfo);
2638 if (!ok)
2640 if (dump_enabled_p ())
2641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2642 "bad operation or unsupported loop bound.\n");
2643 return ok;
2646 /* For now, we don't expect to mix both masking and length approaches for one
2647 loop, disable it if both are recorded. */
2648 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2649 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2650 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2652 if (dump_enabled_p ())
2653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2654 "can't vectorize a loop with partial vectors"
2655 " because we don't expect to mix different"
2656 " approaches with partial vectors for the"
2657 " same loop.\n");
2658 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2661 /* If we still have the option of using partial vectors,
2662 check whether we can generate the necessary loop controls. */
2663 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2664 && !vect_verify_full_masking (loop_vinfo)
2665 && !vect_verify_loop_lens (loop_vinfo))
2666 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2668 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2669 to be able to handle fewer than VF scalars, or needs to have a lower VF
2670 than the main loop. */
2671 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2672 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2673 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2674 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2675 return opt_result::failure_at (vect_location,
2676 "Vectorization factor too high for"
2677 " epilogue loop.\n");
2679 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2680 assuming that the loop will be used as a main loop. We will redo
2681 this analysis later if we instead decide to use the loop as an
2682 epilogue loop. */
2683 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2684 if (!ok)
2685 return ok;
2687 /* Check the costings of the loop make vectorizing worthwhile. */
/* A negative RES takes the 'again' path (retry with SLP disabled where
that is possible), while a zero RES fails outright. */
2688 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2689 if (res < 0)
2691 ok = opt_result::failure_at (vect_location,
2692 "Loop costings may not be worthwhile.\n");
2693 goto again;
2695 if (!res)
2696 return opt_result::failure_at (vect_location,
2697 "Loop costings not worthwhile.\n");
2699 /* If an epilogue loop is required make sure we can create one. */
2700 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2701 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2703 if (dump_enabled_p ())
2704 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2705 if (!vect_can_advance_ivs_p (loop_vinfo)
2706 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2707 single_exit (LOOP_VINFO_LOOP
2708 (loop_vinfo))))
2710 ok = opt_result::failure_at (vect_location,
2711 "not vectorized: can't create required "
2712 "epilog loop\n");
2713 goto again;
2717 /* During peeling, we need to check if number of loop iterations is
2718 enough for both peeled prolog loop and vector loop. This check
2719 can be merged along with threshold check of loop versioning, so
2720 increase threshold for this case if necessary.
2722 If we are analyzing an epilogue we still want to check what its
2723 versioning threshold would be. If we decide to vectorize the epilogues we
2724 will want to use the lowest versioning threshold of all epilogues and main
2725 loop. This will enable us to enter a vectorized epilogue even when
2726 versioning the loop. We can't simply check whether the epilogue requires
2727 versioning though since we may have skipped some versioning checks when
2728 analyzing the epilogue. For instance, checks for alias versioning will be
2729 skipped when dealing with epilogues as we assume we already checked them
2730 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2731 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2733 poly_uint64 niters_th = 0;
2734 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2736 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2738 /* Niters for peeled prolog loop. */
/* A negative peeling amount is handled by assuming the largest possible
prolog, i.e. one less than the number of vector elements. */
2739 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2741 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2742 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2743 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2745 else
2746 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2749 /* Niters for at least one iteration of vectorized loop. */
2750 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2751 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2752 /* One additional iteration because of peeling for gap. */
2753 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2754 niters_th += 1;
2756 /* Use the same condition as vect_transform_loop to decide when to use
2757 the cost to determine a versioning threshold. */
2758 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2759 && ordered_p (th, niters_th))
2760 niters_th = ordered_max (poly_uint64 (th), niters_th);
2762 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
/* The VF must not have changed since it was declared final above. */
2765 gcc_assert (known_eq (vectorization_factor,
2766 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
/* Hand the final SLP decision back to the caller. */
2768 slp_done_for_suggested_uf = slp;
2770 /* Ok to vectorize! */
2771 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2772 return opt_result::success ();
2774 again:
2775 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2776 gcc_assert (!ok);
2778 /* Try again with SLP forced off but if we didn't do any SLP there is
2779 no point in re-trying. */
2780 if (!slp)
2781 return ok;
2783 /* If the slp decision is true when suggested unroll factor is worked
2784 out, and we are applying suggested unroll factor, we don't need to
2785 re-try any more. */
2786 if (applying_suggested_uf && slp_done_for_suggested_uf)
2787 return ok;
2789 /* If there are reduction chains re-trying will fail anyway. */
2790 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2791 return ok;
2793 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2794 via interleaving or lane instructions. */
2795 slp_instance instance;
2796 slp_tree node;
2797 unsigned i, j;
2798 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2800 stmt_vec_info vinfo;
2801 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2802 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2803 continue;
2804 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2805 unsigned int size = DR_GROUP_SIZE (vinfo);
2806 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2807 if (! vect_store_lanes_supported (vectype, size, false)
2808 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2809 && ! vect_grouped_store_supported (vectype, size))
2810 return opt_result::failure_at (vinfo->stmt,
2811 "unsupported grouped store\n");
2812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2814 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2815 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2816 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2817 size = DR_GROUP_SIZE (vinfo);
2818 vectype = STMT_VINFO_VECTYPE (vinfo);
2819 if (! vect_load_lanes_supported (vectype, size, false)
2820 && ! vect_grouped_load_supported (vectype, single_element_p,
2821 size))
2822 return opt_result::failure_at (vinfo->stmt,
2823 "unsupported grouped load\n");
2827 if (dump_enabled_p ())
2828 dump_printf_loc (MSG_NOTE, vect_location,
2829 "re-trying with SLP disabled\n");
2831 /* Roll back state appropriately. No SLP this time. */
2832 slp = false;
2833 /* Restore vectorization factor as it were without SLP. */
2834 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2835 /* Free the SLP instances. */
2836 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2837 vect_free_slp_instance (instance);
2838 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2839 /* Reset SLP type to loop_vect on all stmts. */
2840 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2842 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
/* First the PHIs of the block ... */
2843 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2844 !gsi_end_p (si); gsi_next (&si))
2846 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2847 STMT_SLP_TYPE (stmt_info) = loop_vect;
2848 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2849 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2851 /* vectorizable_reduction adjusts reduction stmt def-types,
2852 restore them to that of the PHI. */
2853 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2854 = STMT_VINFO_DEF_TYPE (stmt_info);
2855 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2856 (STMT_VINFO_REDUC_DEF (stmt_info)))
2857 = STMT_VINFO_DEF_TYPE (stmt_info);
/* ... then its non-debug statements, including any pattern statements
and pattern definition sequences attached to them. */
2860 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2861 !gsi_end_p (si); gsi_next (&si))
2863 if (is_gimple_debug (gsi_stmt (si)))
2864 continue;
2865 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2866 STMT_SLP_TYPE (stmt_info) = loop_vect;
2867 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2869 stmt_vec_info pattern_stmt_info
2870 = STMT_VINFO_RELATED_STMT (stmt_info);
/* A pattern flagged as SLP-only is detached from its statement for the
non-SLP retry. */
2871 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2872 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2874 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2875 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2876 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2877 !gsi_end_p (pi); gsi_next (&pi))
2878 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2879 = loop_vect;
2883 /* Free optimized alias test DDRS. */
2884 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2885 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2886 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2887 /* Reset target cost data. */
2888 delete loop_vinfo->vector_costs;
2889 loop_vinfo->vector_costs = nullptr;
2890 /* Reset accumulated rgroup information. */
2891 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2892 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2893 /* Reset assorted flags. */
2894 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2895 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2896 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2897 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2898 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2899 = saved_can_use_partial_vectors_p;
/* Redo the vector-size dependent part of the analysis, now without
SLP. */
2901 goto start_over;
2904 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2905 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2906 OLD_LOOP_VINFO is better unless something specifically indicates
2907 otherwise.
2909 Note that this deliberately isn't a partial order. */
2911 static bool
2912 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2913 loop_vec_info old_loop_vinfo)
2915 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2916 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2918 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2919 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2921 /* Always prefer a VF of loop->simdlen over any other VF. */
2922 if (loop->simdlen)
2924 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2925 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2926 if (new_simdlen_p != old_simdlen_p)
2927 return new_simdlen_p;
2930 const auto *old_costs = old_loop_vinfo->vector_costs;
2931 const auto *new_costs = new_loop_vinfo->vector_costs;
2932 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2933 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2935 return new_costs->better_main_loop_than_p (old_costs);
2938 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2939 true if we should. */
2941 static bool
2942 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2943 loop_vec_info old_loop_vinfo)
2945 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2946 return false;
2948 if (dump_enabled_p ())
2949 dump_printf_loc (MSG_NOTE, vect_location,
2950 "***** Preferring vector mode %s to vector mode %s\n",
2951 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2952 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2953 return true;
2956 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2957 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
2958 MODE_I to the next mode useful to analyze.
2959 Return the loop_vinfo on success and wrapped null on failure. */
2961 static opt_loop_vec_info
2962 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2963 const vect_loop_form_info *loop_form_info,
2964 loop_vec_info main_loop_vinfo,
2965 const vector_modes &vector_modes, unsigned &mode_i,
2966 machine_mode &autodetected_vector_mode,
2967 bool &fatal)
2969 loop_vec_info loop_vinfo
2970 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2972 machine_mode vector_mode = vector_modes[mode_i];
2973 loop_vinfo->vector_mode = vector_mode;
2974 unsigned int suggested_unroll_factor = 1;
2975 bool slp_done_for_suggested_uf;
2977 /* Run the main analysis. */
2978 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
2979 &suggested_unroll_factor,
2980 slp_done_for_suggested_uf);
2981 if (dump_enabled_p ())
2982 dump_printf_loc (MSG_NOTE, vect_location,
2983 "***** Analysis %s with vector mode %s\n",
2984 res ? "succeeded" : " failed",
2985 GET_MODE_NAME (loop_vinfo->vector_mode));
2987 if (!main_loop_vinfo && suggested_unroll_factor > 1)
2989 if (dump_enabled_p ())
2990 dump_printf_loc (MSG_NOTE, vect_location,
2991 "***** Re-trying analysis for unrolling"
2992 " with unroll factor %d and slp %s.\n",
2993 suggested_unroll_factor,
2994 slp_done_for_suggested_uf ? "on" : "off");
2995 loop_vec_info unroll_vinfo
2996 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2997 unroll_vinfo->vector_mode = vector_mode;
2998 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
2999 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3000 slp_done_for_suggested_uf);
3001 if (new_res)
3003 delete loop_vinfo;
3004 loop_vinfo = unroll_vinfo;
3006 else
3007 delete unroll_vinfo;
3010 /* Remember the autodetected vector mode. */
3011 if (vector_mode == VOIDmode)
3012 autodetected_vector_mode = loop_vinfo->vector_mode;
3014 /* Advance mode_i, first skipping modes that would result in the
3015 same analysis result. */
3016 while (mode_i + 1 < vector_modes.length ()
3017 && vect_chooses_same_modes_p (loop_vinfo,
3018 vector_modes[mode_i + 1]))
3020 if (dump_enabled_p ())
3021 dump_printf_loc (MSG_NOTE, vect_location,
3022 "***** The result for vector mode %s would"
3023 " be the same\n",
3024 GET_MODE_NAME (vector_modes[mode_i + 1]));
3025 mode_i += 1;
3027 if (mode_i + 1 < vector_modes.length ()
3028 && VECTOR_MODE_P (autodetected_vector_mode)
3029 && (related_vector_mode (vector_modes[mode_i + 1],
3030 GET_MODE_INNER (autodetected_vector_mode))
3031 == autodetected_vector_mode)
3032 && (related_vector_mode (autodetected_vector_mode,
3033 GET_MODE_INNER (vector_modes[mode_i + 1]))
3034 == vector_modes[mode_i + 1]))
3036 if (dump_enabled_p ())
3037 dump_printf_loc (MSG_NOTE, vect_location,
3038 "***** Skipping vector mode %s, which would"
3039 " repeat the analysis for %s\n",
3040 GET_MODE_NAME (vector_modes[mode_i + 1]),
3041 GET_MODE_NAME (autodetected_vector_mode));
3042 mode_i += 1;
3044 mode_i++;
3046 if (!res)
3048 delete loop_vinfo;
3049 if (fatal)
3050 gcc_checking_assert (main_loop_vinfo == NULL);
3051 return opt_loop_vec_info::propagate_failure (res);
3054 return opt_loop_vec_info::success (loop_vinfo);
3057 /* Function vect_analyze_loop.
3059 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3060 for it. The different analyses will record information in the
3061 loop_vec_info struct. */
3062 opt_loop_vec_info
3063 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3065 DUMP_VECT_SCOPE ("analyze_loop_nest");
3067 if (loop_outer (loop)
3068 && loop_vec_info_for_loop (loop_outer (loop))
3069 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3070 return opt_loop_vec_info::failure_at (vect_location,
3071 "outer-loop already vectorized.\n");
3073 if (!find_loop_nest (loop, &shared->loop_nest))
3074 return opt_loop_vec_info::failure_at
3075 (vect_location,
3076 "not vectorized: loop nest containing two or more consecutive inner"
3077 " loops cannot be vectorized\n");
3079 /* Analyze the loop form. */
3080 vect_loop_form_info loop_form_info;
3081 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3082 if (!res)
3084 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3086 "bad loop form.\n");
3087 return opt_loop_vec_info::propagate_failure (res);
3089 if (!integer_onep (loop_form_info.assumptions))
3091 /* We consider to vectorize this loop by versioning it under
3092 some assumptions. In order to do this, we need to clear
3093 existing information computed by scev and niter analyzer. */
3094 scev_reset_htab ();
3095 free_numbers_of_iterations_estimates (loop);
3096 /* Also set flag for this loop so that following scev and niter
3097 analysis are done under the assumptions. */
3098 loop_constraint_set (loop, LOOP_C_FINITE);
3101 auto_vector_modes vector_modes;
3102 /* Autodetect first vector size we try. */
3103 vector_modes.safe_push (VOIDmode);
3104 unsigned int autovec_flags
3105 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3106 loop->simdlen != 0);
3107 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3108 && !unlimited_cost_model (loop));
3109 machine_mode autodetected_vector_mode = VOIDmode;
3110 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3111 unsigned int mode_i = 0;
3112 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3114 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3115 a mode has not been analyzed. */
3116 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3117 for (unsigned i = 0; i < vector_modes.length (); ++i)
3118 cached_vf_per_mode.safe_push (0);
3120 /* First determine the main loop vectorization mode, either the first
3121 one that works, starting with auto-detecting the vector mode and then
3122 following the targets order of preference, or the one with the
3123 lowest cost if pick_lowest_cost_p. */
3124 while (1)
3126 bool fatal;
3127 unsigned int last_mode_i = mode_i;
3128 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3129 failed. */
3130 cached_vf_per_mode[last_mode_i] = -1;
3131 opt_loop_vec_info loop_vinfo
3132 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3133 NULL, vector_modes, mode_i,
3134 autodetected_vector_mode, fatal);
3135 if (fatal)
3136 break;
3138 if (loop_vinfo)
3140 /* Analyzis has been successful so update the VF value. The
3141 VF should always be a multiple of unroll_factor and we want to
3142 capture the original VF here. */
3143 cached_vf_per_mode[last_mode_i]
3144 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3145 loop_vinfo->suggested_unroll_factor);
3146 /* Once we hit the desired simdlen for the first time,
3147 discard any previous attempts. */
3148 if (simdlen
3149 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3151 delete first_loop_vinfo;
3152 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3153 simdlen = 0;
3155 else if (pick_lowest_cost_p
3156 && first_loop_vinfo
3157 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3159 /* Pick loop_vinfo over first_loop_vinfo. */
3160 delete first_loop_vinfo;
3161 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3163 if (first_loop_vinfo == NULL)
3164 first_loop_vinfo = loop_vinfo;
3165 else
3167 delete loop_vinfo;
3168 loop_vinfo = opt_loop_vec_info::success (NULL);
3171 /* Commit to first_loop_vinfo if we have no reason to try
3172 alternatives. */
3173 if (!simdlen && !pick_lowest_cost_p)
3174 break;
3176 if (mode_i == vector_modes.length ()
3177 || autodetected_vector_mode == VOIDmode)
3178 break;
3180 /* Try the next biggest vector size. */
3181 if (dump_enabled_p ())
3182 dump_printf_loc (MSG_NOTE, vect_location,
3183 "***** Re-trying analysis with vector mode %s\n",
3184 GET_MODE_NAME (vector_modes[mode_i]));
3186 if (!first_loop_vinfo)
3187 return opt_loop_vec_info::propagate_failure (res);
3189 if (dump_enabled_p ())
3190 dump_printf_loc (MSG_NOTE, vect_location,
3191 "***** Choosing vector mode %s\n",
3192 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3194 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3195 enabled, SIMDUID is not set, it is the innermost loop and we have
3196 either already found the loop's SIMDLEN or there was no SIMDLEN to
3197 begin with.
3198 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3199 bool vect_epilogues = (!simdlen
3200 && loop->inner == NULL
3201 && param_vect_epilogues_nomask
3202 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3203 && !loop->simduid);
3204 if (!vect_epilogues)
3205 return first_loop_vinfo;
3207 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3208 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3210 /* For epilogues start the analysis from the first mode. The motivation
3211 behind starting from the beginning comes from cases where the VECTOR_MODES
3212 array may contain length-agnostic and length-specific modes. Their
3213 ordering is not guaranteed, so we could end up picking a mode for the main
3214 loop that is after the epilogue's optimal mode. */
3215 vector_modes[0] = autodetected_vector_mode;
3216 mode_i = 0;
3218 bool supports_partial_vectors =
3219 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3220 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3222 while (1)
3224 /* If the target does not support partial vectors we can shorten the
3225 number of modes to analyze for the epilogue as we know we can't pick a
3226 mode that would lead to a VF at least as big as the
3227 FIRST_VINFO_VF. */
3228 if (!supports_partial_vectors
3229 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3231 mode_i++;
3232 if (mode_i == vector_modes.length ())
3233 break;
3234 continue;
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "***** Re-trying epilogue analysis with vector "
3240 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3242 bool fatal;
3243 opt_loop_vec_info loop_vinfo
3244 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3245 first_loop_vinfo,
3246 vector_modes, mode_i,
3247 autodetected_vector_mode, fatal);
3248 if (fatal)
3249 break;
3251 if (loop_vinfo)
3253 if (pick_lowest_cost_p)
3255 /* Keep trying to roll back vectorization attempts while the
3256 loop_vec_infos they produced were worse than this one. */
3257 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3258 while (!vinfos.is_empty ()
3259 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3261 gcc_assert (vect_epilogues);
3262 delete vinfos.pop ();
3265 /* For now only allow one epilogue loop. */
3266 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3268 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3269 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3270 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3271 || maybe_ne (lowest_th, 0U));
3272 /* Keep track of the known smallest versioning
3273 threshold. */
3274 if (ordered_p (lowest_th, th))
3275 lowest_th = ordered_min (lowest_th, th);
3277 else
3279 delete loop_vinfo;
3280 loop_vinfo = opt_loop_vec_info::success (NULL);
3283 /* For now only allow one epilogue loop, but allow
3284 pick_lowest_cost_p to replace it, so commit to the
3285 first epilogue if we have no reason to try alternatives. */
3286 if (!pick_lowest_cost_p)
3287 break;
3290 if (mode_i == vector_modes.length ())
3291 break;
3295 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3297 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3298 if (dump_enabled_p ())
3299 dump_printf_loc (MSG_NOTE, vect_location,
3300 "***** Choosing epilogue vector mode %s\n",
3301 GET_MODE_NAME
3302 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3305 return first_loop_vinfo;
3308 /* Return true if there is an in-order reduction function for CODE, storing
3309 it in *REDUC_FN if so. */
3311 static bool
3312 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3314 if (code == PLUS_EXPR)
3316 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3317 return true;
3319 return false;
3322 /* Function reduction_fn_for_scalar_code
3324 Input:
3325 CODE - tree_code of a reduction operations.
3327 Output:
3328 REDUC_FN - the corresponding internal function to be used to reduce the
3329 vector of partial results into a single scalar result, or IFN_LAST
3330 if the operation is a supported reduction operation, but does not have
3331 such an internal function.
3333 Return FALSE if CODE currently cannot be vectorized as reduction. */
3335 bool
3336 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3338 if (code.is_tree_code ())
3339 switch (tree_code (code))
3341 case MAX_EXPR:
3342 *reduc_fn = IFN_REDUC_MAX;
3343 return true;
3345 case MIN_EXPR:
3346 *reduc_fn = IFN_REDUC_MIN;
3347 return true;
3349 case PLUS_EXPR:
3350 *reduc_fn = IFN_REDUC_PLUS;
3351 return true;
3353 case BIT_AND_EXPR:
3354 *reduc_fn = IFN_REDUC_AND;
3355 return true;
3357 case BIT_IOR_EXPR:
3358 *reduc_fn = IFN_REDUC_IOR;
3359 return true;
3361 case BIT_XOR_EXPR:
3362 *reduc_fn = IFN_REDUC_XOR;
3363 return true;
3365 case MULT_EXPR:
3366 case MINUS_EXPR:
3367 *reduc_fn = IFN_LAST;
3368 return true;
3370 default:
3371 return false;
3373 else
3374 switch (combined_fn (code))
3376 CASE_CFN_FMAX:
3377 *reduc_fn = IFN_REDUC_FMAX;
3378 return true;
3380 CASE_CFN_FMIN:
3381 *reduc_fn = IFN_REDUC_FMIN;
3382 return true;
3384 default:
3385 return false;
3389 /* If there is a neutral value X such that a reduction would not be affected
3390 by the introduction of additional X elements, return that X, otherwise
3391 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3392 of the scalar elements. If the reduction has just a single initial value
3393 then INITIAL_VALUE is that value, otherwise it is null. */
3395 tree
3396 neutral_op_for_reduction (tree scalar_type, code_helper code,
3397 tree initial_value)
3399 if (code.is_tree_code ())
3400 switch (tree_code (code))
3402 case WIDEN_SUM_EXPR:
3403 case DOT_PROD_EXPR:
3404 case SAD_EXPR:
3405 case PLUS_EXPR:
3406 case MINUS_EXPR:
3407 case BIT_IOR_EXPR:
3408 case BIT_XOR_EXPR:
3409 return build_zero_cst (scalar_type);
3411 case MULT_EXPR:
3412 return build_one_cst (scalar_type);
3414 case BIT_AND_EXPR:
3415 return build_all_ones_cst (scalar_type);
3417 case MAX_EXPR:
3418 case MIN_EXPR:
3419 return initial_value;
3421 default:
3422 return NULL_TREE;
3424 else
3425 switch (combined_fn (code))
3427 CASE_CFN_FMIN:
3428 CASE_CFN_FMAX:
3429 return initial_value;
3431 default:
3432 return NULL_TREE;
3436 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3437 STMT is printed with a message MSG. */
3439 static void
3440 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3442 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3445 /* Return true if we need an in-order reduction for operation CODE
3446 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3447 overflow must wrap. */
3449 bool
3450 needs_fold_left_reduction_p (tree type, code_helper code)
3452 /* CHECKME: check for !flag_finite_math_only too? */
3453 if (SCALAR_FLOAT_TYPE_P (type))
3455 if (code.is_tree_code ())
3456 switch (tree_code (code))
3458 case MIN_EXPR:
3459 case MAX_EXPR:
3460 return false;
3462 default:
3463 return !flag_associative_math;
3465 else
3466 switch (combined_fn (code))
3468 CASE_CFN_FMIN:
3469 CASE_CFN_FMAX:
3470 return false;
3472 default:
3473 return !flag_associative_math;
3477 if (INTEGRAL_TYPE_P (type))
3478 return (!code.is_tree_code ()
3479 || !operation_no_trapping_overflow (type, tree_code (code)));
3481 if (SAT_FIXED_POINT_TYPE_P (type))
3482 return true;
3484 return false;
3487 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3488 has a handled computation expression. Store the main reduction
3489 operation in *CODE. */
3491 static bool
3492 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3493 tree loop_arg, code_helper *code,
3494 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3496 auto_bitmap visited;
3497 tree lookfor = PHI_RESULT (phi);
3498 ssa_op_iter curri;
3499 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3500 while (USE_FROM_PTR (curr) != loop_arg)
3501 curr = op_iter_next_use (&curri);
3502 curri.i = curri.numops;
3505 path.safe_push (std::make_pair (curri, curr));
3506 tree use = USE_FROM_PTR (curr);
3507 if (use == lookfor)
3508 break;
3509 gimple *def = SSA_NAME_DEF_STMT (use);
3510 if (gimple_nop_p (def)
3511 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3513 pop:
3516 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3517 curri = x.first;
3518 curr = x.second;
3520 curr = op_iter_next_use (&curri);
3521 /* Skip already visited or non-SSA operands (from iterating
3522 over PHI args). */
3523 while (curr != NULL_USE_OPERAND_P
3524 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3525 || ! bitmap_set_bit (visited,
3526 SSA_NAME_VERSION
3527 (USE_FROM_PTR (curr)))));
3529 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3530 if (curr == NULL_USE_OPERAND_P)
3531 break;
3533 else
3535 if (gimple_code (def) == GIMPLE_PHI)
3536 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3537 else
3538 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3539 while (curr != NULL_USE_OPERAND_P
3540 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3541 || ! bitmap_set_bit (visited,
3542 SSA_NAME_VERSION
3543 (USE_FROM_PTR (curr)))))
3544 curr = op_iter_next_use (&curri);
3545 if (curr == NULL_USE_OPERAND_P)
3546 goto pop;
3549 while (1);
3550 if (dump_file && (dump_flags & TDF_DETAILS))
3552 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3553 unsigned i;
3554 std::pair<ssa_op_iter, use_operand_p> *x;
3555 FOR_EACH_VEC_ELT (path, i, x)
3556 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3557 dump_printf (MSG_NOTE, "\n");
3560 /* Check whether the reduction path detected is valid. */
3561 bool fail = path.length () == 0;
3562 bool neg = false;
3563 int sign = -1;
3564 *code = ERROR_MARK;
3565 for (unsigned i = 1; i < path.length (); ++i)
3567 gimple *use_stmt = USE_STMT (path[i].second);
3568 gimple_match_op op;
3569 if (!gimple_extract_op (use_stmt, &op))
3571 fail = true;
3572 break;
3574 unsigned int opi = op.num_ops;
3575 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3577 /* The following make sure we can compute the operand index
3578 easily plus it mostly disallows chaining via COND_EXPR condition
3579 operands. */
3580 for (opi = 0; opi < op.num_ops; ++opi)
3581 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3582 break;
3584 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3586 for (opi = 0; opi < op.num_ops; ++opi)
3587 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3588 break;
3590 if (opi == op.num_ops)
3592 fail = true;
3593 break;
3595 op.code = canonicalize_code (op.code, op.type);
3596 if (op.code == MINUS_EXPR)
3598 op.code = PLUS_EXPR;
3599 /* Track whether we negate the reduction value each iteration. */
3600 if (op.ops[1] == op.ops[opi])
3601 neg = ! neg;
3603 if (CONVERT_EXPR_CODE_P (op.code)
3604 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3606 else if (*code == ERROR_MARK)
3608 *code = op.code;
3609 sign = TYPE_SIGN (op.type);
3611 else if (op.code != *code)
3613 fail = true;
3614 break;
3616 else if ((op.code == MIN_EXPR
3617 || op.code == MAX_EXPR)
3618 && sign != TYPE_SIGN (op.type))
3620 fail = true;
3621 break;
3623 /* Check there's only a single stmt the op is used on. For the
3624 not value-changing tail and the last stmt allow out-of-loop uses.
3625 ??? We could relax this and handle arbitrary live stmts by
3626 forcing a scalar epilogue for example. */
3627 imm_use_iterator imm_iter;
3628 gimple *op_use_stmt;
3629 unsigned cnt = 0;
3630 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3631 if (!is_gimple_debug (op_use_stmt)
3632 && (*code != ERROR_MARK
3633 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3635 /* We want to allow x + x but not x < 1 ? x : 2. */
3636 if (is_gimple_assign (op_use_stmt)
3637 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3639 use_operand_p use_p;
3640 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3641 cnt++;
3643 else
3644 cnt++;
3646 if (cnt != 1)
3648 fail = true;
3649 break;
3652 return ! fail && ! neg && *code != ERROR_MARK;
3655 bool
3656 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3657 tree loop_arg, enum tree_code code)
3659 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3660 code_helper code_;
3661 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3662 && code_ == code);
3667 /* Function vect_is_simple_reduction
3669 (1) Detect a cross-iteration def-use cycle that represents a simple
3670 reduction computation. We look for the following pattern:
3672 loop_header:
3673 a1 = phi < a0, a2 >
3674 a3 = ...
3675 a2 = operation (a3, a1)
3679 a3 = ...
3680 loop_header:
3681 a1 = phi < a0, a2 >
3682 a2 = operation (a3, a1)
3684 such that:
3685 1. operation is commutative and associative and it is safe to
3686 change the order of the computation
3687 2. no uses for a2 in the loop (a2 is used out of the loop)
3688 3. no uses of a1 in the loop besides the reduction operation
3689 4. no uses of a1 outside the loop.
3691 Conditions 1,4 are tested here.
3692 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3694 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3695 nested cycles.
3697 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3698 reductions:
3700 a1 = phi < a0, a2 >
3701 inner loop (def of a3)
3702 a2 = phi < a3 >
3704 (4) Detect condition expressions, ie:
3705 for (int i = 0; i < N; i++)
3706 if (a[i] < val)
3707 ret_val = a[i];
3711 static stmt_vec_info
3712 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3713 bool *double_reduc, bool *reduc_chain_p, bool slp)
3715 gphi *phi = as_a <gphi *> (phi_info->stmt);
3716 gimple *phi_use_stmt = NULL;
3717 imm_use_iterator imm_iter;
3718 use_operand_p use_p;
3720 *double_reduc = false;
3721 *reduc_chain_p = false;
3722 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3724 tree phi_name = PHI_RESULT (phi);
3725 /* ??? If there are no uses of the PHI result the inner loop reduction
3726 won't be detected as possibly double-reduction by vectorizable_reduction
3727 because that tries to walk the PHI arg from the preheader edge which
3728 can be constant. See PR60382. */
3729 if (has_zero_uses (phi_name))
3730 return NULL;
3731 class loop *loop = (gimple_bb (phi))->loop_father;
3732 unsigned nphi_def_loop_uses = 0;
3733 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3735 gimple *use_stmt = USE_STMT (use_p);
3736 if (is_gimple_debug (use_stmt))
3737 continue;
3739 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3741 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3743 "intermediate value used outside loop.\n");
3745 return NULL;
3748 nphi_def_loop_uses++;
3749 phi_use_stmt = use_stmt;
3752 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3753 if (TREE_CODE (latch_def) != SSA_NAME)
3755 if (dump_enabled_p ())
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757 "reduction: not ssa_name: %T\n", latch_def);
3758 return NULL;
3761 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3762 if (!def_stmt_info
3763 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3764 return NULL;
3766 bool nested_in_vect_loop
3767 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3768 unsigned nlatch_def_loop_uses = 0;
3769 auto_vec<gphi *, 3> lcphis;
3770 bool inner_loop_of_double_reduc = false;
3771 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3773 gimple *use_stmt = USE_STMT (use_p);
3774 if (is_gimple_debug (use_stmt))
3775 continue;
3776 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3777 nlatch_def_loop_uses++;
3778 else
3780 /* We can have more than one loop-closed PHI. */
3781 lcphis.safe_push (as_a <gphi *> (use_stmt));
3782 if (nested_in_vect_loop
3783 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3784 == vect_double_reduction_def))
3785 inner_loop_of_double_reduc = true;
3789 /* If we are vectorizing an inner reduction we are executing that
3790 in the original order only in case we are not dealing with a
3791 double reduction. */
3792 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3794 if (dump_enabled_p ())
3795 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3796 "detected nested cycle: ");
3797 return def_stmt_info;
3800 /* When the inner loop of a double reduction ends up with more than
3801 one loop-closed PHI we have failed to classify alternate such
3802 PHIs as double reduction, leading to wrong code. See PR103237. */
3803 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3807 "unhandle double reduction\n");
3808 return NULL;
3811 /* If this isn't a nested cycle or if the nested cycle reduction value
3812 is used ouside of the inner loop we cannot handle uses of the reduction
3813 value. */
3814 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3816 if (dump_enabled_p ())
3817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3818 "reduction used in loop.\n");
3819 return NULL;
3822 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3823 defined in the inner loop. */
3824 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3826 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3827 if (gimple_phi_num_args (def_stmt) != 1
3828 || TREE_CODE (op1) != SSA_NAME)
3830 if (dump_enabled_p ())
3831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3832 "unsupported phi node definition.\n");
3834 return NULL;
3837 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3838 if (gimple_bb (def1)
3839 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3840 && loop->inner
3841 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3842 && (is_gimple_assign (def1) || is_gimple_call (def1))
3843 && is_a <gphi *> (phi_use_stmt)
3844 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3846 if (dump_enabled_p ())
3847 report_vect_op (MSG_NOTE, def_stmt,
3848 "detected double reduction: ");
3850 *double_reduc = true;
3851 return def_stmt_info;
3854 return NULL;
3857 /* Look for the expression computing latch_def from then loop PHI result. */
3858 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3859 code_helper code;
3860 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3861 path))
3863 STMT_VINFO_REDUC_CODE (phi_info) = code;
3864 if (code == COND_EXPR && !nested_in_vect_loop)
3865 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3867 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3868 reduction chain for which the additional restriction is that
3869 all operations in the chain are the same. */
3870 auto_vec<stmt_vec_info, 8> reduc_chain;
3871 unsigned i;
3872 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3873 for (i = path.length () - 1; i >= 1; --i)
3875 gimple *stmt = USE_STMT (path[i].second);
3876 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3877 gimple_match_op op;
3878 if (!gimple_extract_op (stmt, &op))
3879 gcc_unreachable ();
3880 if (gassign *assign = dyn_cast<gassign *> (stmt))
3881 STMT_VINFO_REDUC_IDX (stmt_info)
3882 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3883 else
3885 gcall *call = as_a<gcall *> (stmt);
3886 STMT_VINFO_REDUC_IDX (stmt_info)
3887 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3889 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3890 && (i == 1 || i == path.length () - 1));
3891 if ((op.code != code && !leading_conversion)
3892 /* We can only handle the final value in epilogue
3893 generation for reduction chains. */
3894 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3895 is_slp_reduc = false;
3896 /* For reduction chains we support a trailing/leading
3897 conversions. We do not store those in the actual chain. */
3898 if (leading_conversion)
3899 continue;
3900 reduc_chain.safe_push (stmt_info);
3902 if (slp && is_slp_reduc && reduc_chain.length () > 1)
3904 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3906 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3907 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3909 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3910 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3912 /* Save the chain for further analysis in SLP detection. */
3913 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3914 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3916 *reduc_chain_p = true;
3917 if (dump_enabled_p ())
3918 dump_printf_loc (MSG_NOTE, vect_location,
3919 "reduction: detected reduction chain\n");
3921 else if (dump_enabled_p ())
3922 dump_printf_loc (MSG_NOTE, vect_location,
3923 "reduction: detected reduction\n");
3925 return def_stmt_info;
3928 if (dump_enabled_p ())
3929 dump_printf_loc (MSG_NOTE, vect_location,
3930 "reduction: unknown pattern\n");
3932 return NULL;
3935 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3936 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3937 or -1 if not known. */
3939 static int
3940 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3942 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3943 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3945 if (dump_enabled_p ())
3946 dump_printf_loc (MSG_NOTE, vect_location,
3947 "cost model: epilogue peel iters set to vf/2 "
3948 "because loop iterations are unknown .\n");
3949 return assumed_vf / 2;
3951 else
3953 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3954 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3955 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3956 /* If we need to peel for gaps, but no peeling is required, we have to
3957 peel VF iterations. */
3958 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3959 peel_iters_epilogue = assumed_vf;
3960 return peel_iters_epilogue;
3964 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3966 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3967 int *peel_iters_epilogue,
3968 stmt_vector_for_cost *scalar_cost_vec,
3969 stmt_vector_for_cost *prologue_cost_vec,
3970 stmt_vector_for_cost *epilogue_cost_vec)
3972 int retval = 0;
3974 *peel_iters_epilogue
3975 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3977 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3979 /* If peeled iterations are known but number of scalar loop
3980 iterations are unknown, count a taken branch per peeled loop. */
3981 if (peel_iters_prologue > 0)
3982 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3983 vect_prologue);
3984 if (*peel_iters_epilogue > 0)
3985 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3986 vect_epilogue);
3989 stmt_info_for_cost *si;
3990 int j;
3991 if (peel_iters_prologue)
3992 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3993 retval += record_stmt_cost (prologue_cost_vec,
3994 si->count * peel_iters_prologue,
3995 si->kind, si->stmt_info, si->misalign,
3996 vect_prologue);
3997 if (*peel_iters_epilogue)
3998 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3999 retval += record_stmt_cost (epilogue_cost_vec,
4000 si->count * *peel_iters_epilogue,
4001 si->kind, si->stmt_info, si->misalign,
4002 vect_epilogue);
4004 return retval;
4007 /* Function vect_estimate_min_profitable_iters
4009 Return the number of iterations required for the vector version of the
4010 loop to be profitable relative to the cost of the scalar version of the
4011 loop.
4013 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4014 of iterations for vectorization. -1 value means loop vectorization
4015 is not profitable. This returned value may be used for dynamic
4016 profitability check.
4018 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4019 for static check against estimated number of iterations. */
/* LOOP_VINFO supplies the scalar and vector cost data read below.
   SUGGESTED_UNROLL_FACTOR is forwarded to finish_cost; when it is non-null
   and the value left in it is greater than 1, it is reset to 1 if the
   unrolled vectorization factor is not known to fit within
   LOOP_VINFO_MAX_VECT_FACTOR.  Both result pointers are written on every
   return path: 0/0 when the cost model is disabled, -1/-1 when a vector
   iteration saves nothing, and the computed thresholds otherwise.  */
4021 static void
4022 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4023 int *ret_min_profitable_niters,
4024 int *ret_min_profitable_estimate,
4025 unsigned *suggested_unroll_factor)
4027 int min_profitable_iters;
4028 int min_profitable_estimate;
4029 int peel_iters_prologue;
4030 int peel_iters_epilogue;
4031 unsigned vec_inside_cost = 0;
4032 int vec_outside_cost = 0;
4033 unsigned vec_prologue_cost = 0;
4034 unsigned vec_epilogue_cost = 0;
4035 int scalar_single_iter_cost = 0;
4036 int scalar_outside_cost = 0;
4037 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4038 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4039 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4041 /* Cost model disabled. */
4042 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4044 if (dump_enabled_p ())
4045 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4046 *ret_min_profitable_niters = 0;
4047 *ret_min_profitable_estimate = 0;
4048 return;
4051 /* Requires loop versioning tests to handle misalignment. */
4052 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4054 /* FIXME: Make cost depend on complexity of individual check. */
4055 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4056 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4057 if (dump_enabled_p ())
4058 dump_printf (MSG_NOTE,
4059 "cost model: Adding cost of checks for loop "
4060 "versioning to treat misalignment.\n");
4063 /* Requires loop versioning with alias checks. */
4064 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4066 /* FIXME: Make cost depend on complexity of individual check. */
4067 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4068 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4069 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4070 if (len)
4071 /* Count LEN - 1 ANDs and LEN comparisons. */
4072 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4073 scalar_stmt, vect_prologue);
4074 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4075 if (len)
4077 /* Count LEN - 1 ANDs and LEN comparisons. */
4078 unsigned int nstmts = len * 2 - 1;
4079 /* +1 for each bias that needs adding. */
4080 for (unsigned int i = 0; i < len; ++i)
4081 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4082 nstmts += 1;
4083 (void) add_stmt_cost (target_cost_data, nstmts,
4084 scalar_stmt, vect_prologue);
4086 if (dump_enabled_p ())
4087 dump_printf (MSG_NOTE,
4088 "cost model: Adding cost of checks for loop "
4089 "versioning aliasing.\n");
4092 /* Requires loop versioning with niter checks. */
4093 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4095 /* FIXME: Make cost depend on complexity of individual check. */
4096 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4097 NULL, NULL, NULL_TREE, 0, vect_prologue);
4098 if (dump_enabled_p ())
4099 dump_printf (MSG_NOTE,
4100 "cost model: Adding cost of checks for loop "
4101 "versioning niters.\n");
/* Any kind of versioning additionally costs one taken branch in the
   prologue.  */
4104 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4105 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4106 vect_prologue);
4108 /* Count statements in scalar loop. Using this as scalar cost for a single
4109 iteration for now.
4111 TODO: Add outer loop support.
4113 TODO: Consider assigning different costs to different scalar
4114 statements. */
4116 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4118 /* Add additional cost for the peeled instructions in prologue and epilogue
4119 loop. (For fully-masked loops there will be no peeling.)
4121 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4122 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4124 TODO: Build an expression that represents peel_iters for prologue and
4125 epilogue to be used in a run-time test. */
4127 bool prologue_need_br_taken_cost = false;
4128 bool prologue_need_br_not_taken_cost = false;
4130 /* Calculate peel_iters_prologue. */
4131 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4132 peel_iters_prologue = 0;
4133 else if (npeel < 0)
4135 peel_iters_prologue = assumed_vf / 2;
4136 if (dump_enabled_p ())
4137 dump_printf (MSG_NOTE, "cost model: "
4138 "prologue peel iters set to vf/2.\n");
4140 /* If peeled iterations are unknown, count a taken branch and a not taken
4141 branch per peeled loop. Even if scalar loop iterations are known,
4142 vector iterations are not known since peeled prologue iterations are
4143 not known. Hence guards remain the same. */
4144 prologue_need_br_taken_cost = true;
4145 prologue_need_br_not_taken_cost = true;
4147 else
4149 peel_iters_prologue = npeel;
4150 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4151 /* If peeled iterations are known but number of scalar loop
4152 iterations are unknown, count a taken branch per peeled loop. */
4153 prologue_need_br_taken_cost = true;
4156 bool epilogue_need_br_taken_cost = false;
4157 bool epilogue_need_br_not_taken_cost = false;
4159 /* Calculate peel_iters_epilogue. */
4160 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4161 /* We need to peel exactly one iteration for gaps. */
4162 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4163 else if (npeel < 0)
4165 /* If peeling for alignment is unknown, loop bound of main loop
4166 becomes unknown. */
4167 peel_iters_epilogue = assumed_vf / 2;
4168 if (dump_enabled_p ())
4169 dump_printf (MSG_NOTE, "cost model: "
4170 "epilogue peel iters set to vf/2 because "
4171 "peeling for alignment is unknown.\n");
4173 /* See the same reason above in peel_iters_prologue calculation. */
4174 epilogue_need_br_taken_cost = true;
4175 epilogue_need_br_not_taken_cost = true;
4177 else
4179 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4180 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4181 /* If peeled iterations are known but number of scalar loop
4182 iterations are unknown, count a taken branch per peeled loop. */
4183 epilogue_need_br_taken_cost = true;
4186 stmt_info_for_cost *si;
4187 int j;
4188 /* Add costs associated with peel_iters_prologue. */
4189 if (peel_iters_prologue)
4190 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4192 (void) add_stmt_cost (target_cost_data,
4193 si->count * peel_iters_prologue, si->kind,
4194 si->stmt_info, si->node, si->vectype,
4195 si->misalign, vect_prologue);
4198 /* Add costs associated with peel_iters_epilogue. */
4199 if (peel_iters_epilogue)
4200 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4202 (void) add_stmt_cost (target_cost_data,
4203 si->count * peel_iters_epilogue, si->kind,
4204 si->stmt_info, si->node, si->vectype,
4205 si->misalign, vect_epilogue);
4208 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4210 if (prologue_need_br_taken_cost)
4211 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4212 vect_prologue);
4214 if (prologue_need_br_not_taken_cost)
4215 (void) add_stmt_cost (target_cost_data, 1,
4216 cond_branch_not_taken, vect_prologue);
4218 if (epilogue_need_br_taken_cost)
4219 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4220 vect_epilogue);
4222 if (epilogue_need_br_not_taken_cost)
4223 (void) add_stmt_cost (target_cost_data, 1,
4224 cond_branch_not_taken, vect_epilogue);
4226 /* Take care of special costs for rgroup controls of partial vectors. */
4227 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4229 /* Calculate how many masks we need to generate. */
4230 unsigned int num_masks = 0;
4231 rgroup_controls *rgm;
4232 unsigned int num_vectors_m1;
4233 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4234 if (rgm->type)
4235 num_masks += num_vectors_m1 + 1;
4236 gcc_assert (num_masks > 0);
4238 /* In the worst case, we need to generate each mask in the prologue
4239 and in the loop body. One of the loop body mask instructions
4240 replaces the comparison in the scalar loop, and since we don't
4241 count the scalar comparison against the scalar body, we shouldn't
4242 count that vector instruction against the vector body either.
4244 Sometimes we can use unpacks instead of generating prologue
4245 masks and sometimes the prologue mask will fold to a constant,
4246 so the actual prologue cost might be smaller. However, it's
4247 simpler and safer to use the worst-case cost; if this ends up
4248 being the tie-breaker between vectorizing or not, then it's
4249 probably better not to vectorize. */
4250 (void) add_stmt_cost (target_cost_data, num_masks,
4251 vector_stmt, NULL, NULL, NULL_TREE, 0,
4252 vect_prologue);
4253 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4254 vector_stmt, NULL, NULL, NULL_TREE, 0,
4255 vect_body);
4257 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4259 /* Referring to the functions vect_set_loop_condition_partial_vectors
4260 and vect_set_loop_controls_directly, we need to generate each
4261 length in the prologue and in the loop body if required. Although
4262 there are some possible optimizations, we consider the worst case
4263 here. */
4265 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4266 signed char partial_load_store_bias
4267 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4268 bool need_iterate_p
4269 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4270 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4272 /* Calculate how many statements to be added. */
4273 unsigned int prologue_stmts = 0;
4274 unsigned int body_stmts = 0;
4276 rgroup_controls *rgc;
4277 unsigned int num_vectors_m1;
4278 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4279 if (rgc->type)
4281 /* May need one SHIFT for nitems_total computation. */
4282 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4283 if (nitems != 1 && !niters_known_p)
4284 prologue_stmts += 1;
4286 /* May need one MAX and one MINUS for wrap around. */
4287 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4288 prologue_stmts += 2;
4290 /* Need one MAX and one MINUS for each batch limit excepting for
4291 the 1st one. */
4292 prologue_stmts += num_vectors_m1 * 2;
4294 unsigned int num_vectors = num_vectors_m1 + 1;
4296 /* Need to set up lengths in prologue, only one MIN required
4297 for each since start index is zero. */
4298 prologue_stmts += num_vectors;
4300 /* If we have a non-zero partial load bias, we need one PLUS
4301 to adjust the load length. */
4302 if (partial_load_store_bias != 0)
4303 body_stmts += 1;
4305 /* Each may need two MINs and one MINUS to update lengths in body
4306 for next iteration. */
4307 if (need_iterate_p)
4308 body_stmts += 3 * num_vectors;
4311 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4312 scalar_stmt, vect_prologue);
4313 (void) add_stmt_cost (target_cost_data, body_stmts,
4314 scalar_stmt, vect_body);
4317 /* FORNOW: The scalar outside cost is incremented in one of the
4318 following ways:
4320 1. The vectorizer checks for alignment and aliasing and generates
4321 a condition that allows dynamic vectorization. A cost model
4322 check is ANDED with the versioning condition. Hence scalar code
4323 path now has the added cost of the versioning check.
4325 if (cost > th & versioning_check)
4326 jmp to vector code
4328 Hence run-time scalar is incremented by not-taken branch cost.
4330 2. The vectorizer then checks if a prologue is required. If the
4331 cost model check was not done before during versioning, it has to
4332 be done before the prologue check.
4334 if (cost <= th)
4335 prologue = scalar_iters
4336 if (prologue == 0)
4337 jmp to vector code
4338 else
4339 execute prologue
4340 if (prologue == num_iters)
4341 go to exit
4343 Hence the run-time scalar cost is incremented by a taken branch,
4344 plus a not-taken branch, plus a taken branch cost.
4346 3. The vectorizer then checks if an epilogue is required. If the
4347 cost model check was not done before during prologue check, it
4348 has to be done with the epilogue check.
4350 if (prologue == 0)
4351 jmp to vector code
4352 else
4353 execute prologue
4354 if (prologue == num_iters)
4355 go to exit
4356 vector code:
4357 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4358 jmp to epilogue
4360 Hence the run-time scalar cost should be incremented by 2 taken
4361 branches.
4363 TODO: The back end may reorder the BBS's differently and reverse
4364 conditions/branch directions. Change the estimates below to
4365 something more reasonable. */
4367 /* If the number of iterations is known and we do not do versioning, we can
4368 decide whether to vectorize at compile time. Hence the scalar version
4369 do not carry cost model guard costs. */
4370 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4371 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4373 /* Cost model check occurs at versioning. */
4374 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4375 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4376 else
4378 /* Cost model check occurs at prologue generation. */
4379 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4380 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4381 + vect_get_stmt_cost (cond_branch_not_taken);
4382 /* Cost model check occurs at epilogue generation. */
4383 else
4384 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4388 /* Complete the target-specific cost calculations. */
4389 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4390 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4391 suggested_unroll_factor);
/* Drop the suggested unroll factor when the loop has a maximum
   vectorization factor other than MAX_VECTORIZATION_FACTOR and
   VF * factor is not known to be within it.  */
4393 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4394 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4395 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4396 *suggested_unroll_factor,
4397 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4399 if (dump_enabled_p ())
4400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4401 "can't unroll as unrolled vectorization factor larger"
4402 " than maximum vectorization factor: "
4403 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4404 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4405 *suggested_unroll_factor = 1;
/* The cost outside the loop body is the prologue cost plus the epilogue
   cost.  */
4408 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4410 if (dump_enabled_p ())
4412 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4413 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4414 vec_inside_cost);
4415 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4416 vec_prologue_cost);
4417 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4418 vec_epilogue_cost);
4419 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4420 scalar_single_iter_cost);
4421 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4422 scalar_outside_cost);
4423 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4424 vec_outside_cost);
4425 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4426 peel_iters_prologue);
4427 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4428 peel_iters_epilogue);
4431 /* Calculate number of iterations required to make the vector version
4432 profitable, relative to the loop bodies only. The following condition
4433 must hold true:
4434 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4435 where
4436 SIC = scalar iteration cost, VIC = vector iteration cost,
4437 VOC = vector outside cost, VF = vectorization factor,
4438 NPEEL = prologue iterations + epilogue iterations,
4439 SOC = scalar outside cost for run time cost model check. */
/* SAVING_PER_VITER is the cost of ASSUMED_VF scalar iterations minus the
   cost of one vector iteration.  */
4441 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4442 - vec_inside_cost);
/* Without a positive saving both thresholds are reported as -1.  */
4443 if (saving_per_viter <= 0)
4445 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4446 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4447 "vectorization did not happen for a simd loop");
4449 if (dump_enabled_p ())
4450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4451 "cost model: the vector iteration cost = %d "
4452 "divided by the scalar iteration cost = %d "
4453 "is greater or equal to the vectorization factor = %d"
4454 ".\n",
4455 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4456 *ret_min_profitable_niters = -1;
4457 *ret_min_profitable_estimate = -1;
4458 return;
4461 /* ??? The "if" arm is written to handle all cases; see below for what
4462 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4463 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4465 /* Rewriting the condition above in terms of the number of
4466 vector iterations (vniters) rather than the number of
4467 scalar iterations (niters) gives:
4469 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4471 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4473 For integer N, X and Y when X > 0:
4475 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4476 int outside_overhead = (vec_outside_cost
4477 - scalar_single_iter_cost * peel_iters_prologue
4478 - scalar_single_iter_cost * peel_iters_epilogue
4479 - scalar_outside_cost);
4480 /* We're only interested in cases that require at least one
4481 vector iteration. */
4482 int min_vec_niters = 1;
4483 if (outside_overhead > 0)
4484 min_vec_niters = outside_overhead / saving_per_viter + 1;
4486 if (dump_enabled_p ())
4487 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4488 min_vec_niters);
/* NOTE(review): this test repeats the condition of the enclosing "if", so
   the "else" arm below appears to be unreachable as written; it seems to be
   kept for the general case mentioned in the "???" comment above.  */
4490 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4492 /* Now that we know the minimum number of vector iterations,
4493 find the minimum niters for which the scalar cost is larger:
4495 SIC * niters > VIC * vniters + VOC - SOC
4497 We know that the minimum niters is no more than
4498 vniters * VF + NPEEL, but it might be (and often is) less
4499 than that if a partial vector iteration is cheaper than the
4500 equivalent scalar code. */
4501 int threshold = (vec_inside_cost * min_vec_niters
4502 + vec_outside_cost
4503 - scalar_outside_cost);
4504 if (threshold <= 0)
4505 min_profitable_iters = 1;
4506 else
4507 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4509 else
4510 /* Convert the number of vector iterations into a number of
4511 scalar iterations. */
4512 min_profitable_iters = (min_vec_niters * assumed_vf
4513 + peel_iters_prologue
4514 + peel_iters_epilogue);
4516 else
4518 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4519 * assumed_vf
4520 - vec_inside_cost * peel_iters_prologue
4521 - vec_inside_cost * peel_iters_epilogue);
4522 if (min_profitable_iters <= 0)
4523 min_profitable_iters = 0;
4524 else
4526 min_profitable_iters /= saving_per_viter;
/* The division above truncates; add one more iteration if the scalar cost
   at the truncated count does not yet exceed the vector cost.  */
4528 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4529 <= (((int) vec_inside_cost * min_profitable_iters)
4530 + (((int) vec_outside_cost - scalar_outside_cost)
4531 * assumed_vf)))
4532 min_profitable_iters++;
4536 if (dump_enabled_p ())
4537 dump_printf (MSG_NOTE,
4538 " Calculated minimum iters for profitability: %d\n",
4539 min_profitable_iters);
4541 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4542 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4543 /* We want the vectorized loop to execute at least once. */
4544 min_profitable_iters = assumed_vf + peel_iters_prologue;
4545 else if (min_profitable_iters < peel_iters_prologue)
4546 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4547 vectorized loop executes at least once. */
4548 min_profitable_iters = peel_iters_prologue;
4550 if (dump_enabled_p ())
4551 dump_printf_loc (MSG_NOTE, vect_location,
4552 " Runtime profitability threshold = %d\n",
4553 min_profitable_iters);
4555 *ret_min_profitable_niters = min_profitable_iters;
4557 /* Calculate number of iterations required to make the vector version
4558 profitable, relative to the loop bodies only.
4560 Non-vectorized variant is SIC * niters and it must win over vector
4561 variant on the expected loop trip count. The following condition must hold true:
4562 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4564 if (vec_outside_cost <= 0)
4565 min_profitable_estimate = 0;
4566 /* ??? This "else if" arm is written to handle all cases; see below for
4567 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4568 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4570 /* This is a repeat of the code above, but with + SOC rather
4571 than - SOC. */
4572 int outside_overhead = (vec_outside_cost
4573 - scalar_single_iter_cost * peel_iters_prologue
4574 - scalar_single_iter_cost * peel_iters_epilogue
4575 + scalar_outside_cost);
4576 int min_vec_niters = 1;
4577 if (outside_overhead > 0)
4578 min_vec_niters = outside_overhead / saving_per_viter + 1;
4580 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4582 int threshold = (vec_inside_cost * min_vec_niters
4583 + vec_outside_cost
4584 + scalar_outside_cost);
4585 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4587 else
4588 min_profitable_estimate = (min_vec_niters * assumed_vf
4589 + peel_iters_prologue
4590 + peel_iters_epilogue);
4592 else
4594 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4595 * assumed_vf
4596 - vec_inside_cost * peel_iters_prologue
4597 - vec_inside_cost * peel_iters_epilogue)
4598 / ((scalar_single_iter_cost * assumed_vf)
4599 - vec_inside_cost);
/* The static estimate is never allowed to be lower than the runtime
   threshold computed above.  */
4601 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4602 if (dump_enabled_p ())
4603 dump_printf_loc (MSG_NOTE, vect_location,
4604 " Static estimate profitability threshold = %d\n",
4605 min_profitable_estimate);
4607 *ret_min_profitable_estimate = min_profitable_estimate;
4610 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4611 vector elements (not bits) for a vector with NELT elements. */
4612 static void
4613 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4614 vec_perm_builder *sel)
4616 /* The encoding is a single stepped pattern. Any wrap-around is handled
4617 by vec_perm_indices. */
4618 sel->new_vector (nelt, 1, 3);
4619 for (unsigned int i = 0; i < 3; i++)
4620 sel->quick_push (i + offset);
4623 /* Checks whether the target supports whole-vector shifts for vectors of mode
4624 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4625 it supports vec_perm_const with masks for all necessary shift amounts. */
4626 static bool
4627 have_whole_vector_shift (machine_mode mode)
4629 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4630 return true;
4632 /* Variable-length vectors should be handled via the optab. */
4633 unsigned int nelt;
4634 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4635 return false;
4637 vec_perm_builder sel;
4638 vec_perm_indices indices;
4639 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4641 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4642 indices.new_vector (sel, 2, nelt);
4643 if (!can_vec_perm_const_p (mode, mode, indices, false))
4644 return false;
4646 return true;
4649 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4650 multiplication operands have differing signs and (b) we intend
4651 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4652 See vect_emulate_mixed_dot_prod for the actual sequence used. */
4654 static bool
4655 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4656 stmt_vec_info stmt_info)
4658 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4659 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4660 return false;
4662 tree rhs1 = gimple_assign_rhs1 (assign);
4663 tree rhs2 = gimple_assign_rhs2 (assign);
4664 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4665 return false;
4667 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4668 gcc_assert (reduc_info->is_reduc_info);
4669 return !directly_supported_p (DOT_PROD_EXPR,
4670 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4671 optab_vector_mixed_sign);
4674 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4675 functions. Design better to avoid maintenance issues. */
4677 /* Function vect_model_reduction_cost.
4679 Models cost for a reduction operation, including the vector ops
4680 generated within the strip-mine loop in some cases, the initial
4681 definition before the loop, and the epilogue code that must be generated. */
/* All costs are recorded in COST_VEC through record_stmt_cost.  The local
   totals INSIDE_COST, PROLOGUE_COST and EPILOGUE_COST are only used for
   the dump message at the end.  REDUCTION_TYPE selects which of the
   costing schemes below applies and REDUC_FN is compared against IFN_LAST
   to choose between costing a single reduction operation and an
   open-coded sequence.  NCOPIES is doubled for COND_REDUCTION.  LOOP_VINFO
   may be null, in which case the statement is treated as not being nested
   in a vectorized loop.  */
4683 static void
4684 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4685 stmt_vec_info stmt_info, internal_fn reduc_fn,
4686 vect_reduction_type reduction_type,
4687 int ncopies, stmt_vector_for_cost *cost_vec)
4689 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4690 tree vectype;
4691 machine_mode mode;
4692 class loop *loop = NULL;
4694 if (loop_vinfo)
4695 loop = LOOP_VINFO_LOOP (loop_vinfo);
4697 /* Condition reductions generate two reductions in the loop. */
4698 if (reduction_type == COND_REDUCTION)
4699 ncopies *= 2;
4701 vectype = STMT_VINFO_VECTYPE (stmt_info);
4702 mode = TYPE_MODE (vectype);
4703 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
/* OP describes the original scalar statement; its code and type are used
   when costing the open-coded epilogue below.  */
4705 gimple_match_op op;
4706 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4707 gcc_unreachable ();
4709 bool emulated_mixed_dot_prod
4710 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4711 if (reduction_type == EXTRACT_LAST_REDUCTION)
4712 /* No extra instructions are needed in the prologue. The loop body
4713 operations are costed in vectorizable_condition. */
4714 inside_cost = 0;
4715 else if (reduction_type == FOLD_LEFT_REDUCTION)
4717 /* No extra instructions needed in the prologue. */
4718 prologue_cost = 0;
4720 if (reduc_fn != IFN_LAST)
4721 /* Count one reduction-like operation per vector. */
4722 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4723 stmt_info, 0, vect_body);
4724 else
4726 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4727 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4728 inside_cost = record_stmt_cost (cost_vec, nelements,
4729 vec_to_scalar, stmt_info, 0,
4730 vect_body);
4731 inside_cost += record_stmt_cost (cost_vec, nelements,
4732 scalar_stmt, stmt_info, 0,
4733 vect_body);
4736 else
4738 /* Add in the cost of the initial definitions. */
4739 int prologue_stmts;
4740 if (reduction_type == COND_REDUCTION)
4741 /* For cond reductions we have four vectors: initial index, step,
4742 initial result of the data reduction, initial value of the index
4743 reduction. */
4744 prologue_stmts = 4;
4745 else if (emulated_mixed_dot_prod)
4746 /* We need the initial reduction value and two invariants:
4747 one that contains the minimum signed value and one that
4748 contains half of its negative. */
4749 prologue_stmts = 3;
4750 else
4751 prologue_stmts = 1;
4752 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4753 scalar_to_vec, stmt_info, 0,
4754 vect_prologue);
4757 /* Determine cost of epilogue code.
4759 We have a reduction operator that will reduce the vector in one statement.
4760 Also requires scalar extract. */
/* The epilogue is only costed when there is no loop or the original
   statement is not nested in the vectorized loop.  */
4762 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4764 if (reduc_fn != IFN_LAST)
4766 if (reduction_type == COND_REDUCTION)
4768 /* An EQ stmt and an COND_EXPR stmt. */
4769 epilogue_cost += record_stmt_cost (cost_vec, 2,
4770 vector_stmt, stmt_info, 0,
4771 vect_epilogue);
4772 /* Reduction of the max index and a reduction of the found
4773 values. */
4774 epilogue_cost += record_stmt_cost (cost_vec, 2,
4775 vec_to_scalar, stmt_info, 0,
4776 vect_epilogue);
4777 /* A broadcast of the max value. */
4778 epilogue_cost += record_stmt_cost (cost_vec, 1,
4779 scalar_to_vec, stmt_info, 0,
4780 vect_epilogue);
4782 else
4784 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4785 stmt_info, 0, vect_epilogue);
4786 epilogue_cost += record_stmt_cost (cost_vec, 1,
4787 vec_to_scalar, stmt_info, 0,
4788 vect_epilogue);
4791 else if (reduction_type == COND_REDUCTION)
4793 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4794 /* Extraction of scalar elements. */
4795 epilogue_cost += record_stmt_cost (cost_vec,
4796 2 * estimated_nunits,
4797 vec_to_scalar, stmt_info, 0,
4798 vect_epilogue);
4799 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
/* NOTE(review): ESTIMATED_NUNITS is unsigned, so the count below would wrap
   around if it were ever 1 -- presumably that cannot happen for a vector
   type; confirm.  */
4800 epilogue_cost += record_stmt_cost (cost_vec,
4801 2 * estimated_nunits - 3,
4802 scalar_stmt, stmt_info, 0,
4803 vect_epilogue);
4805 else if (reduction_type == EXTRACT_LAST_REDUCTION
4806 || reduction_type == FOLD_LEFT_REDUCTION)
4807 /* No extra instructions need in the epilogue. */
4809 else
/* Open-coded final reduction: the element count is the vector size in
   bits divided by the size in bits of OP's type.  */
4811 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4812 tree bitsize = TYPE_SIZE (op.type);
4813 int element_bitsize = tree_to_uhwi (bitsize);
4814 int nelements = vec_size_in_bits / element_bitsize;
/* A COND_EXPR is costed as if it were a MAX_EXPR.  */
4816 if (op.code == COND_EXPR)
4817 op.code = MAX_EXPR;
4819 /* We have a whole vector shift available. */
4820 if (VECTOR_MODE_P (mode)
4821 && directly_supported_p (op.code, vectype)
4822 && have_whole_vector_shift (mode))
4824 /* Final reduction via vector shifts and the reduction operator.
4825 Also requires scalar extract. */
4826 epilogue_cost += record_stmt_cost (cost_vec,
4827 exact_log2 (nelements) * 2,
4828 vector_stmt, stmt_info, 0,
4829 vect_epilogue);
4830 epilogue_cost += record_stmt_cost (cost_vec, 1,
4831 vec_to_scalar, stmt_info, 0,
4832 vect_epilogue);
4834 else
4835 /* Use extracts and reduction op for final reduction. For N
4836 elements, we have N extracts and N-1 reduction ops. */
4837 epilogue_cost += record_stmt_cost (cost_vec,
4838 nelements + nelements - 1,
4839 vector_stmt, stmt_info, 0,
4840 vect_epilogue);
4844 if (dump_enabled_p ())
4845 dump_printf (MSG_NOTE,
4846 "vect_model_reduction_cost: inside_cost = %d, "
4847 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4848 prologue_cost, epilogue_cost);
4851 /* SEQ is a sequence of instructions that initialize the reduction
4852 described by REDUC_INFO. Emit them in the appropriate place. */
4854 static void
4855 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4856 stmt_vec_info reduc_info, gimple *seq)
4858 if (reduc_info->reused_accumulator)
4860 /* When reusing an accumulator from the main loop, we only need
4861 initialization instructions if the main loop can be skipped.
4862 In that case, emit the initialization instructions at the end
4863 of the guard block that does the skip. */
4864 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4865 gcc_assert (skip_edge);
4866 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4867 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4869 else
4871 /* The normal case: emit the initialization instructions on the
4872 preheader edge. */
4873 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4874 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4878 /* Function get_initial_def_for_reduction
4880 Input:
4881 REDUC_INFO - the info_for_reduction
4882 INIT_VAL - the initial value of the reduction variable
4883 NEUTRAL_OP - a value that has no effect on the reduction, as per
4884 neutral_op_for_reduction
4886 Output:
4887 Return a vector variable, initialized according to the operation that
4888 STMT_VINFO performs. This vector will be used as the initial value
4889 of the vector of partial results.
4891 The value we need is a vector in which element 0 has value INIT_VAL
4892 and every other element has value NEUTRAL_OP. */
4894 static tree
4895 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4896 stmt_vec_info reduc_info,
4897 tree init_val, tree neutral_op)
4899 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4900 tree scalar_type = TREE_TYPE (init_val);
4901 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4902 tree init_def;
4903 gimple_seq stmts = NULL;
4905 gcc_assert (vectype);
4907 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4908 || SCALAR_FLOAT_TYPE_P (scalar_type));
4910 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4911 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4913 if (operand_equal_p (init_val, neutral_op))
4915 /* If both elements are equal then the vector described above is
4916 just a splat. */
4917 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4918 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4920 else
4922 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4923 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4924 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4926 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4927 element 0. */
4928 init_def = gimple_build_vector_from_val (&stmts, vectype,
4929 neutral_op);
4930 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4931 vectype, init_def, init_val);
4933 else
4935 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4936 tree_vector_builder elts (vectype, 1, 2);
4937 elts.quick_push (init_val);
4938 elts.quick_push (neutral_op);
4939 init_def = gimple_build_vector (&stmts, &elts);
4943 if (stmts)
4944 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4945 return init_def;
4948 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4949 which performs a reduction involving GROUP_SIZE scalar statements.
4950 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4951 is nonnull, introducing extra elements of that value will not change the
4952 result. */
4954 static void
4955 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4956 stmt_vec_info reduc_info,
4957 vec<tree> *vec_oprnds,
4958 unsigned int number_of_vectors,
4959 unsigned int group_size, tree neutral_op)
4961 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4962 unsigned HOST_WIDE_INT nunits;
4963 unsigned j, number_of_places_left_in_vector;
4964 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4965 unsigned int i;
4967 gcc_assert (group_size == initial_values.length () || neutral_op);
4969 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4970 created vectors. It is greater than 1 if unrolling is performed.
4972 For example, we have two scalar operands, s1 and s2 (e.g., group of
4973 strided accesses of size two), while NUNITS is four (i.e., four scalars
4974 of this type can be packed in a vector). The output vector will contain
4975 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4976 will be 2).
4978 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4979 vectors containing the operands.
4981 For example, NUNITS is four as before, and the group size is 8
4982 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4983 {s5, s6, s7, s8}. */
4985 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4986 nunits = group_size;
4988 number_of_places_left_in_vector = nunits;
4989 bool constant_p = true;
4990 tree_vector_builder elts (vector_type, nunits, 1);
4991 elts.quick_grow (nunits);
4992 gimple_seq ctor_seq = NULL;
4993 for (j = 0; j < nunits * number_of_vectors; ++j)
4995 tree op;
4996 i = j % group_size;
4998 /* Get the def before the loop. In reduction chain we have only
4999 one initial value. Else we have as many as PHIs in the group. */
5000 if (i >= initial_values.length () || (j > i && neutral_op))
5001 op = neutral_op;
5002 else
5003 op = initial_values[i];
5005 /* Create 'vect_ = {op0,op1,...,opn}'. */
5006 number_of_places_left_in_vector--;
5007 elts[nunits - number_of_places_left_in_vector - 1] = op;
5008 if (!CONSTANT_CLASS_P (op))
5009 constant_p = false;
5011 if (number_of_places_left_in_vector == 0)
5013 tree init;
5014 if (constant_p && !neutral_op
5015 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5016 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5017 /* Build the vector directly from ELTS. */
5018 init = gimple_build_vector (&ctor_seq, &elts);
5019 else if (neutral_op)
5021 /* Build a vector of the neutral value and shift the
5022 other elements into place. */
5023 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5024 neutral_op);
5025 int k = nunits;
5026 while (k > 0 && elts[k - 1] == neutral_op)
5027 k -= 1;
5028 while (k > 0)
5030 k -= 1;
5031 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5032 vector_type, init, elts[k]);
5035 else
5037 /* First time round, duplicate ELTS to fill the
5038 required number of vectors. */
5039 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5040 elts, number_of_vectors, *vec_oprnds);
5041 break;
5043 vec_oprnds->quick_push (init);
5045 number_of_places_left_in_vector = nunits;
5046 elts.new_vector (vector_type, nunits, 1);
5047 elts.quick_grow (nunits);
5048 constant_p = true;
5051 if (ctor_seq != NULL)
5052 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5055 /* For a statement STMT_INFO taking part in a reduction operation return
5056 the stmt_vec_info the meta information is stored on. */
5058 stmt_vec_info
5059 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5061 stmt_info = vect_orig_stmt (stmt_info);
5062 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5063 if (!is_a <gphi *> (stmt_info->stmt)
5064 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5065 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5066 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5067 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5069 if (gimple_phi_num_args (phi) == 1)
5070 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5072 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5074 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5075 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5076 stmt_info = info;
5078 return stmt_info;
5081 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5082 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5083 return false. */
5085 static bool
5086 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5087 stmt_vec_info reduc_info)
5089 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5090 if (!main_loop_vinfo)
5091 return false;
5093 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5094 return false;
5096 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5097 auto_vec<tree, 16> main_loop_results (num_phis);
5098 auto_vec<tree, 16> initial_values (num_phis);
5099 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5101 /* The epilogue loop can be entered either from the main loop or
5102 from an earlier guard block. */
5103 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5104 for (tree incoming_value : reduc_info->reduc_initial_values)
5106 /* Look for:
5108 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5109 INITIAL_VALUE(guard block)>. */
5110 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5112 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5113 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5115 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5116 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5118 main_loop_results.quick_push (from_main_loop);
5119 initial_values.quick_push (from_skip);
5122 else
5123 /* The main loop dominates the epilogue loop. */
5124 main_loop_results.splice (reduc_info->reduc_initial_values);
5126 /* See if the main loop has the kind of accumulator we need. */
5127 vect_reusable_accumulator *accumulator
5128 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5129 if (!accumulator
5130 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5131 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5132 accumulator->reduc_info->reduc_scalar_results.begin ()))
5133 return false;
5135 /* Handle the case where we can reduce wider vectors to narrower ones. */
5136 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5137 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5138 unsigned HOST_WIDE_INT m;
5139 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5140 TYPE_VECTOR_SUBPARTS (vectype), &m))
5141 return false;
5142 /* Check the intermediate vector types and operations are available. */
5143 tree prev_vectype = old_vectype;
5144 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5145 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5147 intermediate_nunits = exact_div (intermediate_nunits, 2);
5148 tree intermediate_vectype = get_related_vectype_for_scalar_type
5149 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5150 if (!intermediate_vectype
5151 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5152 intermediate_vectype)
5153 || !can_vec_extract (TYPE_MODE (prev_vectype),
5154 TYPE_MODE (intermediate_vectype)))
5155 return false;
5156 prev_vectype = intermediate_vectype;
5159 /* Non-SLP reductions might apply an adjustment after the reduction
5160 operation, in order to simplify the initialization of the accumulator.
5161 If the epilogue loop carries on from where the main loop left off,
5162 it should apply the same adjustment to the final reduction result.
5164 If the epilogue loop can also be entered directly (rather than via
5165 the main loop), we need to be able to handle that case in the same way,
5166 with the same adjustment. (In principle we could add a PHI node
5167 to select the correct adjustment, but in practice that shouldn't be
5168 necessary.) */
5169 tree main_adjustment
5170 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5171 if (loop_vinfo->main_loop_edge && main_adjustment)
5173 gcc_assert (num_phis == 1);
5174 tree initial_value = initial_values[0];
5175 /* Check that we can use INITIAL_VALUE as the adjustment and
5176 initialize the accumulator with a neutral value instead. */
5177 if (!operand_equal_p (initial_value, main_adjustment))
5178 return false;
5179 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5180 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5181 code, initial_value);
5183 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5184 reduc_info->reduc_initial_values.truncate (0);
5185 reduc_info->reduc_initial_values.splice (initial_values);
5186 reduc_info->reused_accumulator = accumulator;
5187 return true;
5190 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5191 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5193 static tree
5194 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5195 gimple_seq *seq)
5197 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5198 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5199 tree stype = TREE_TYPE (vectype);
5200 tree new_temp = vec_def;
5201 while (nunits > nunits1)
5203 nunits /= 2;
5204 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5205 stype, nunits);
5206 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5208 /* The target has to make sure we support lowpart/highpart
5209 extraction, either via direct vector extract or through
5210 an integer mode punning. */
5211 tree dst1, dst2;
5212 gimple *epilog_stmt;
5213 if (convert_optab_handler (vec_extract_optab,
5214 TYPE_MODE (TREE_TYPE (new_temp)),
5215 TYPE_MODE (vectype1))
5216 != CODE_FOR_nothing)
5218 /* Extract sub-vectors directly once vec_extract becomes
5219 a conversion optab. */
5220 dst1 = make_ssa_name (vectype1);
5221 epilog_stmt
5222 = gimple_build_assign (dst1, BIT_FIELD_REF,
5223 build3 (BIT_FIELD_REF, vectype1,
5224 new_temp, TYPE_SIZE (vectype1),
5225 bitsize_int (0)));
5226 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5227 dst2 = make_ssa_name (vectype1);
5228 epilog_stmt
5229 = gimple_build_assign (dst2, BIT_FIELD_REF,
5230 build3 (BIT_FIELD_REF, vectype1,
5231 new_temp, TYPE_SIZE (vectype1),
5232 bitsize_int (bitsize)));
5233 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5235 else
5237 /* Extract via punning to appropriately sized integer mode
5238 vector. */
5239 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5240 tree etype = build_vector_type (eltype, 2);
5241 gcc_assert (convert_optab_handler (vec_extract_optab,
5242 TYPE_MODE (etype),
5243 TYPE_MODE (eltype))
5244 != CODE_FOR_nothing);
5245 tree tem = make_ssa_name (etype);
5246 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5247 build1 (VIEW_CONVERT_EXPR,
5248 etype, new_temp));
5249 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5250 new_temp = tem;
5251 tem = make_ssa_name (eltype);
5252 epilog_stmt
5253 = gimple_build_assign (tem, BIT_FIELD_REF,
5254 build3 (BIT_FIELD_REF, eltype,
5255 new_temp, TYPE_SIZE (eltype),
5256 bitsize_int (0)));
5257 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5258 dst1 = make_ssa_name (vectype1);
5259 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5260 build1 (VIEW_CONVERT_EXPR,
5261 vectype1, tem));
5262 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5263 tem = make_ssa_name (eltype);
5264 epilog_stmt
5265 = gimple_build_assign (tem, BIT_FIELD_REF,
5266 build3 (BIT_FIELD_REF, eltype,
5267 new_temp, TYPE_SIZE (eltype),
5268 bitsize_int (bitsize)));
5269 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5270 dst2 = make_ssa_name (vectype1);
5271 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5272 build1 (VIEW_CONVERT_EXPR,
5273 vectype1, tem));
5274 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5277 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5280 return new_temp;
5283 /* Function vect_create_epilog_for_reduction
5285 Create code at the loop-epilog to finalize the result of a reduction
5286 computation.
5288 STMT_INFO is the scalar reduction stmt that is being vectorized.
5289 SLP_NODE is an SLP node containing a group of reduction statements. The
5290 first one in this group is STMT_INFO.
5291 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5292 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5293 (counting from 0)
5295 This function:
5296 1. Completes the reduction def-use cycles.
5297 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5298 by calling the function specified by REDUC_FN if available, or by
5299 other means (whole-vector shifts or a scalar loop).
5300 The function also creates a new phi node at the loop exit to preserve
5301 loop-closed form, as illustrated below.
5303 The flow at the entry to this function:
5305 loop:
5306 vec_def = phi <vec_init, null> # REDUCTION_PHI
5307 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5308 s_loop = scalar_stmt # (scalar) STMT_INFO
5309 loop_exit:
5310 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5311 use <s_out0>
5312 use <s_out0>
5314 The above is transformed by this function into:
5316 loop:
5317 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5318 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5319 s_loop = scalar_stmt # (scalar) STMT_INFO
5320 loop_exit:
5321 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5322 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5323 v_out2 = reduce <v_out1>
5324 s_out3 = extract_field <v_out2, 0>
5325 s_out4 = adjust_result <s_out3>
5326 use <s_out4>
5327 use <s_out4>
5330 static void
5331 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5332 stmt_vec_info stmt_info,
5333 slp_tree slp_node,
5334 slp_instance slp_node_instance)
5336 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5337 gcc_assert (reduc_info->is_reduc_info);
5338 /* For double reductions we need to get at the inner loop reduction
5339 stmt which has the meta info attached. Our stmt_info is that of the
5340 loop-closed PHI of the inner loop which we remember as
5341 def for the reduction PHI generation. */
5342 bool double_reduc = false;
5343 stmt_vec_info rdef_info = stmt_info;
5344 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5346 gcc_assert (!slp_node);
5347 double_reduc = true;
5348 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5349 (stmt_info->stmt, 0));
5350 stmt_info = vect_stmt_to_vectorize (stmt_info);
5352 gphi *reduc_def_stmt
5353 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5354 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5355 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5356 tree vectype;
5357 machine_mode mode;
5358 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5359 basic_block exit_bb;
5360 tree scalar_dest;
5361 tree scalar_type;
5362 gimple *new_phi = NULL, *phi;
5363 gimple_stmt_iterator exit_gsi;
5364 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5365 gimple *epilog_stmt = NULL;
5366 gimple *exit_phi;
5367 tree bitsize;
5368 tree def;
5369 tree orig_name, scalar_result;
5370 imm_use_iterator imm_iter, phi_imm_iter;
5371 use_operand_p use_p, phi_use_p;
5372 gimple *use_stmt;
5373 auto_vec<tree> reduc_inputs;
5374 int j, i;
5375 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5376 unsigned int group_size = 1, k;
5377 auto_vec<gimple *> phis;
5378 /* SLP reduction without reduction chain, e.g.,
5379 # a1 = phi <a2, a0>
5380 # b1 = phi <b2, b0>
5381 a2 = operation (a1)
5382 b2 = operation (b1) */
5383 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5384 bool direct_slp_reduc;
5385 tree induction_index = NULL_TREE;
5387 if (slp_node)
5388 group_size = SLP_TREE_LANES (slp_node);
5390 if (nested_in_vect_loop_p (loop, stmt_info))
5392 outer_loop = loop;
5393 loop = loop->inner;
5394 gcc_assert (!slp_node && double_reduc);
5397 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5398 gcc_assert (vectype);
5399 mode = TYPE_MODE (vectype);
5401 tree induc_val = NULL_TREE;
5402 tree adjustment_def = NULL;
5403 if (slp_node)
5405 else
5407 /* Optimize: for induction condition reduction, if we can't use zero
5408 for induc_val, use initial_def. */
5409 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5410 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5411 else if (double_reduc)
5413 else
5414 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5417 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5418 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5419 if (slp_reduc)
5420 /* All statements produce live-out values. */
5421 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5422 else if (slp_node)
5424 /* The last statement in the reduction chain produces the live-out
5425 value. Note SLP optimization can shuffle scalar stmts to
5426 optimize permutations so we have to search for the last stmt. */
5427 for (k = 0; k < group_size; ++k)
5428 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5430 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5431 break;
5435 unsigned vec_num;
5436 int ncopies;
5437 if (slp_node)
5439 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5440 ncopies = 1;
5442 else
5444 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5445 vec_num = 1;
5446 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5449 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5450 which is updated with the current index of the loop for every match of
5451 the original loop's cond_expr (VEC_STMT). This results in a vector
5452 containing the last time the condition passed for that vector lane.
5453 The first match will be a 1 to allow 0 to be used for non-matching
5454 indexes. If there are no matches at all then the vector will be all
5455 zeroes.
5457 PR92772: This algorithm is broken for architectures that support
5458 masked vectors, but do not provide fold_extract_last. */
5459 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5461 auto_vec<std::pair<tree, bool>, 2> ccompares;
5462 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5463 cond_info = vect_stmt_to_vectorize (cond_info);
5464 while (cond_info != reduc_info)
5466 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5468 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5469 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5470 ccompares.safe_push
5471 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5472 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5474 cond_info
5475 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5476 1 + STMT_VINFO_REDUC_IDX
5477 (cond_info)));
5478 cond_info = vect_stmt_to_vectorize (cond_info);
5480 gcc_assert (ccompares.length () != 0);
5482 tree indx_before_incr, indx_after_incr;
5483 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5484 int scalar_precision
5485 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5486 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5487 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5488 (TYPE_MODE (vectype), cr_index_scalar_type,
5489 TYPE_VECTOR_SUBPARTS (vectype));
5491 /* First we create a simple vector induction variable which starts
5492 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5493 vector size (STEP). */
5495 /* Create a {1,2,3,...} vector. */
5496 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5498 /* Create a vector of the step value. */
5499 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5500 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5502 /* Create an induction variable. */
5503 gimple_stmt_iterator incr_gsi;
5504 bool insert_after;
5505 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5506 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5507 insert_after, &indx_before_incr, &indx_after_incr);
5509 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5510 filled with zeros (VEC_ZERO). */
5512 /* Create a vector of 0s. */
5513 tree zero = build_zero_cst (cr_index_scalar_type);
5514 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5516 /* Create a vector phi node. */
5517 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5518 new_phi = create_phi_node (new_phi_tree, loop->header);
5519 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5520 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5522 /* Now take the condition from the loops original cond_exprs
5523 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5524 every match uses values from the induction variable
5525 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5526 (NEW_PHI_TREE).
5527 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5528 the new cond_expr (INDEX_COND_EXPR). */
5529 gimple_seq stmts = NULL;
5530 for (int i = ccompares.length () - 1; i != -1; --i)
5532 tree ccompare = ccompares[i].first;
5533 if (ccompares[i].second)
5534 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5535 cr_index_vector_type,
5536 ccompare,
5537 indx_before_incr, new_phi_tree);
5538 else
5539 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5540 cr_index_vector_type,
5541 ccompare,
5542 new_phi_tree, indx_before_incr);
5544 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5546 /* Update the phi with the vec cond. */
5547 induction_index = new_phi_tree;
5548 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5549 loop_latch_edge (loop), UNKNOWN_LOCATION);
5552 /* 2. Create epilog code.
5553 The reduction epilog code operates across the elements of the vector
5554 of partial results computed by the vectorized loop.
5555 The reduction epilog code consists of:
5557 step 1: compute the scalar result in a vector (v_out2)
5558 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5559 step 3: adjust the scalar result (s_out3) if needed.
5561 Step 1 can be accomplished using one the following three schemes:
5562 (scheme 1) using reduc_fn, if available.
5563 (scheme 2) using whole-vector shifts, if available.
5564 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5565 combined.
5567 The overall epilog code looks like this:
5569 s_out0 = phi <s_loop> # original EXIT_PHI
5570 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5571 v_out2 = reduce <v_out1> # step 1
5572 s_out3 = extract_field <v_out2, 0> # step 2
5573 s_out4 = adjust_result <s_out3> # step 3
5575 (step 3 is optional, and steps 1 and 2 may be combined).
5576 Lastly, the uses of s_out0 are replaced by s_out4. */
5579 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5580 v_out1 = phi <VECT_DEF>
5581 Store them in NEW_PHIS. */
5582 if (double_reduc)
5583 loop = outer_loop;
5584 exit_bb = single_exit (loop)->dest;
5585 exit_gsi = gsi_after_labels (exit_bb);
5586 reduc_inputs.create (slp_node ? vec_num : ncopies);
5587 for (unsigned i = 0; i < vec_num; i++)
5589 gimple_seq stmts = NULL;
5590 if (slp_node)
5591 def = vect_get_slp_vect_def (slp_node, i);
5592 else
5593 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5594 for (j = 0; j < ncopies; j++)
5596 tree new_def = copy_ssa_name (def);
5597 phi = create_phi_node (new_def, exit_bb);
5598 if (j)
5599 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5600 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5601 new_def = gimple_convert (&stmts, vectype, new_def);
5602 reduc_inputs.quick_push (new_def);
5604 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5607 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5608 (i.e. when reduc_fn is not available) and in the final adjustment
5609 code (if needed). Also get the original scalar reduction variable as
5610 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5611 represents a reduction pattern), the tree-code and scalar-def are
5612 taken from the original stmt that the pattern-stmt (STMT) replaces.
5613 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5614 are taken from STMT. */
5616 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5617 if (orig_stmt_info != stmt_info)
5619 /* Reduction pattern */
5620 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5621 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5624 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5625 scalar_type = TREE_TYPE (scalar_dest);
5626 scalar_results.truncate (0);
5627 scalar_results.reserve_exact (group_size);
5628 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5629 bitsize = TYPE_SIZE (scalar_type);
5631 /* True if we should implement SLP_REDUC using native reduction operations
5632 instead of scalar operations. */
5633 direct_slp_reduc = (reduc_fn != IFN_LAST
5634 && slp_reduc
5635 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5637 /* In case of reduction chain, e.g.,
5638 # a1 = phi <a3, a0>
5639 a2 = operation (a1)
5640 a3 = operation (a2),
5642 we may end up with more than one vector result. Here we reduce them
5643 to one vector.
5645 The same is true if we couldn't use a single defuse cycle. */
5646 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5647 || direct_slp_reduc
5648 || ncopies > 1)
5650 gimple_seq stmts = NULL;
5651 tree single_input = reduc_inputs[0];
5652 for (k = 1; k < reduc_inputs.length (); k++)
5653 single_input = gimple_build (&stmts, code, vectype,
5654 single_input, reduc_inputs[k]);
5655 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5657 reduc_inputs.truncate (0);
5658 reduc_inputs.safe_push (single_input);
5661 tree orig_reduc_input = reduc_inputs[0];
5663 /* If this loop is an epilogue loop that can be skipped after the
5664 main loop, we can only share a reduction operation between the
5665 main loop and the epilogue if we put it at the target of the
5666 skip edge.
5668 We can still reuse accumulators if this check fails. Doing so has
5669 the minor(?) benefit of making the epilogue loop's scalar result
5670 independent of the main loop's scalar result. */
5671 bool unify_with_main_loop_p = false;
5672 if (reduc_info->reused_accumulator
5673 && loop_vinfo->skip_this_loop_edge
5674 && single_succ_p (exit_bb)
5675 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5677 unify_with_main_loop_p = true;
5679 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5680 reduc_inputs[0] = make_ssa_name (vectype);
5681 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5682 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5683 UNKNOWN_LOCATION);
5684 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5685 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5686 exit_gsi = gsi_after_labels (reduc_block);
5689 /* Shouldn't be used beyond this point. */
5690 exit_bb = nullptr;
5692 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5693 && reduc_fn != IFN_LAST)
5695 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5696 various data values where the condition matched and another vector
5697 (INDUCTION_INDEX) containing all the indexes of those matches. We
5698 need to extract the last matching index (which will be the index with
5699 highest value) and use this to index into the data vector.
5700 For the case where there were no matches, the data vector will contain
5701 all default values and the index vector will be all zeros. */
5703 /* Get various versions of the type of the vector of indexes. */
5704 tree index_vec_type = TREE_TYPE (induction_index);
5705 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5706 tree index_scalar_type = TREE_TYPE (index_vec_type);
5707 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5709 /* Get an unsigned integer version of the type of the data vector. */
5710 int scalar_precision
5711 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5712 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5713 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5714 vectype);
5716 /* First we need to create a vector (ZERO_VEC) of zeros and another
5717 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5718 can create using a MAX reduction and then expanding.
5719 In the case where the loop never made any matches, the max index will
5720 be zero. */
5722 /* Vector of {0, 0, 0,...}. */
5723 tree zero_vec = build_zero_cst (vectype);
5725 /* Find maximum value from the vector of found indexes. */
5726 tree max_index = make_ssa_name (index_scalar_type);
5727 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5728 1, induction_index);
5729 gimple_call_set_lhs (max_index_stmt, max_index);
5730 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5732 /* Vector of {max_index, max_index, max_index,...}. */
5733 tree max_index_vec = make_ssa_name (index_vec_type);
5734 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5735 max_index);
5736 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5737 max_index_vec_rhs);
5738 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5740 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5741 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5742 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5743 otherwise. Only one value should match, resulting in a vector
5744 (VEC_COND) with one data value and the rest zeros.
5745 In the case where the loop never made any matches, every index will
5746 match, resulting in a vector with all data values (which will all be
5747 the default value). */
5749 /* Compare the max index vector to the vector of found indexes to find
5750 the position of the max value. */
5751 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5752 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5753 induction_index,
5754 max_index_vec);
5755 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5757 /* Use the compare to choose either values from the data vector or
5758 zero. */
5759 tree vec_cond = make_ssa_name (vectype);
5760 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5761 vec_compare,
5762 reduc_inputs[0],
5763 zero_vec);
5764 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5766 /* Finally we need to extract the data value from the vector (VEC_COND)
5767 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5768 reduction, but because this doesn't exist, we can use a MAX reduction
5769 instead. The data value might be signed or a float so we need to cast
5770 it first.
5771 In the case where the loop never made any matches, the data values are
5772 all identical, and so will reduce down correctly. */
5774 /* Make the matched data values unsigned. */
5775 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5776 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5777 vec_cond);
5778 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5779 VIEW_CONVERT_EXPR,
5780 vec_cond_cast_rhs);
5781 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5783 /* Reduce down to a scalar value. */
5784 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5785 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5786 1, vec_cond_cast);
5787 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5788 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5790 /* Convert the reduced value back to the result type and set as the
5791 result. */
5792 gimple_seq stmts = NULL;
5793 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5794 data_reduc);
5795 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796 scalar_results.safe_push (new_temp);
5798 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5799 && reduc_fn == IFN_LAST)
5801 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5802 idx = 0;
5803 idx_val = induction_index[0];
5804 val = data_reduc[0];
5805 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5806 if (induction_index[i] > idx_val)
5807 val = data_reduc[i], idx_val = induction_index[i];
5808 return val; */
5810 tree data_eltype = TREE_TYPE (vectype);
5811 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5812 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5813 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5814 /* Enforced by vectorizable_reduction, which ensures we have target
5815 support before allowing a conditional reduction on variable-length
5816 vectors. */
5817 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5818 tree idx_val = NULL_TREE, val = NULL_TREE;
5819 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5821 tree old_idx_val = idx_val;
5822 tree old_val = val;
5823 idx_val = make_ssa_name (idx_eltype);
5824 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5825 build3 (BIT_FIELD_REF, idx_eltype,
5826 induction_index,
5827 bitsize_int (el_size),
5828 bitsize_int (off)));
5829 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5830 val = make_ssa_name (data_eltype);
5831 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5832 build3 (BIT_FIELD_REF,
5833 data_eltype,
5834 reduc_inputs[0],
5835 bitsize_int (el_size),
5836 bitsize_int (off)));
5837 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5838 if (off != 0)
5840 tree new_idx_val = idx_val;
5841 if (off != v_size - el_size)
5843 new_idx_val = make_ssa_name (idx_eltype);
5844 epilog_stmt = gimple_build_assign (new_idx_val,
5845 MAX_EXPR, idx_val,
5846 old_idx_val);
5847 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5849 tree cond = make_ssa_name (boolean_type_node);
5850 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5851 idx_val, old_idx_val);
5852 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5853 tree new_val = make_ssa_name (data_eltype);
5854 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5855 cond, val, old_val);
5856 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5857 idx_val = new_idx_val;
5858 val = new_val;
5861 /* Convert the reduced value back to the result type and set as the
5862 result. */
5863 gimple_seq stmts = NULL;
5864 val = gimple_convert (&stmts, scalar_type, val);
5865 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5866 scalar_results.safe_push (val);
5869 /* 2.3 Create the reduction code, using one of the three schemes described
5870 above. In SLP we simply need to extract all the elements from the
5871 vector (without reducing them), so we use scalar shifts. */
5872 else if (reduc_fn != IFN_LAST && !slp_reduc)
5874 tree tmp;
5875 tree vec_elem_type;
5877 /* Case 1: Create:
5878 v_out2 = reduc_expr <v_out1> */
5880 if (dump_enabled_p ())
5881 dump_printf_loc (MSG_NOTE, vect_location,
5882 "Reduce using direct vector reduction.\n");
5884 gimple_seq stmts = NULL;
5885 vec_elem_type = TREE_TYPE (vectype);
5886 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5887 vec_elem_type, reduc_inputs[0]);
5888 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5889 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5891 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5892 && induc_val)
5894 /* Earlier we set the initial value to be a vector of induc_val
5895 values. Check the result and if it is induc_val then replace
5896 with the original initial value, unless induc_val is
5897 the same as initial_def already. */
5898 tree zcompare = make_ssa_name (boolean_type_node);
5899 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5900 new_temp, induc_val);
5901 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5902 tree initial_def = reduc_info->reduc_initial_values[0];
5903 tmp = make_ssa_name (new_scalar_dest);
5904 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5905 initial_def, new_temp);
5906 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5907 new_temp = tmp;
5910 scalar_results.safe_push (new_temp);
5912 else if (direct_slp_reduc)
5914 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5915 with the elements for other SLP statements replaced with the
5916 neutral value. We can then do a normal reduction on each vector. */
5918 /* Enforced by vectorizable_reduction. */
5919 gcc_assert (reduc_inputs.length () == 1);
5920 gcc_assert (pow2p_hwi (group_size));
5922 gimple_seq seq = NULL;
5924 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5925 and the same element size as VECTYPE. */
5926 tree index = build_index_vector (vectype, 0, 1);
5927 tree index_type = TREE_TYPE (index);
5928 tree index_elt_type = TREE_TYPE (index_type);
5929 tree mask_type = truth_type_for (index_type);
5931 /* Create a vector that, for each element, identifies which of
5932 the REDUC_GROUP_SIZE results should use it. */
5933 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5934 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5935 build_vector_from_val (index_type, index_mask));
5937 /* Get a neutral vector value. This is simply a splat of the neutral
5938 scalar value if we have one, otherwise the initial scalar value
5939 is itself a neutral value. */
5940 tree vector_identity = NULL_TREE;
5941 tree neutral_op = NULL_TREE;
5942 if (slp_node)
5944 tree initial_value = NULL_TREE;
5945 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5946 initial_value = reduc_info->reduc_initial_values[0];
5947 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5948 initial_value);
5950 if (neutral_op)
5951 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5952 neutral_op);
5953 for (unsigned int i = 0; i < group_size; ++i)
5955 /* If there's no universal neutral value, we can use the
5956 initial scalar value from the original PHI. This is used
5957 for MIN and MAX reduction, for example. */
5958 if (!neutral_op)
5960 tree scalar_value = reduc_info->reduc_initial_values[i];
5961 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5962 scalar_value);
5963 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5964 scalar_value);
5967 /* Calculate the equivalent of:
5969 sel[j] = (index[j] == i);
5971 which selects the elements of REDUC_INPUTS[0] that should
5972 be included in the result. */
5973 tree compare_val = build_int_cst (index_elt_type, i);
5974 compare_val = build_vector_from_val (index_type, compare_val);
5975 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5976 index, compare_val);
5978 /* Calculate the equivalent of:
5980 vec = sel ? reduc_inputs[0] : vector_identity;
5982 VEC is now suitable for a full vector reduction. */
5983 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5984 sel, reduc_inputs[0], vector_identity);
5986 /* Do the reduction and convert it to the appropriate type. */
5987 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5988 TREE_TYPE (vectype), vec);
5989 scalar = gimple_convert (&seq, scalar_type, scalar);
5990 scalar_results.safe_push (scalar);
5992 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5994 else
5996 bool reduce_with_shift;
5997 tree vec_temp;
5999 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6001 /* See if the target wants to do the final (shift) reduction
6002 in a vector mode of smaller size and first reduce upper/lower
6003 halves against each other. */
6004 enum machine_mode mode1 = mode;
6005 tree stype = TREE_TYPE (vectype);
6006 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6007 unsigned nunits1 = nunits;
6008 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6009 && reduc_inputs.length () == 1)
6011 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6012 /* For SLP reductions we have to make sure lanes match up, but
6013 since we're doing individual element final reduction reducing
6014 vector width here is even more important.
6015 ??? We can also separate lanes with permutes, for the common
6016 case of power-of-two group-size odd/even extracts would work. */
6017 if (slp_reduc && nunits != nunits1)
6019 nunits1 = least_common_multiple (nunits1, group_size);
6020 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6023 if (!slp_reduc
6024 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6025 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6027 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6028 stype, nunits1);
6029 reduce_with_shift = have_whole_vector_shift (mode1);
6030 if (!VECTOR_MODE_P (mode1)
6031 || !directly_supported_p (code, vectype1))
6032 reduce_with_shift = false;
6034 /* First reduce the vector to the desired vector size we should
6035 do shift reduction on by combining upper and lower halves. */
6036 gimple_seq stmts = NULL;
6037 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6038 code, &stmts);
6039 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6040 reduc_inputs[0] = new_temp;
6042 if (reduce_with_shift && !slp_reduc)
6044 int element_bitsize = tree_to_uhwi (bitsize);
6045 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6046 for variable-length vectors and also requires direct target support
6047 for loop reductions. */
6048 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6049 int nelements = vec_size_in_bits / element_bitsize;
6050 vec_perm_builder sel;
6051 vec_perm_indices indices;
6053 int elt_offset;
6055 tree zero_vec = build_zero_cst (vectype1);
6056 /* Case 2: Create:
6057 for (offset = nelements/2; offset >= 1; offset/=2)
6059 Create: va' = vec_shift <va, offset>
6060 Create: va = vop <va, va'>
6061 } */
6063 tree rhs;
6065 if (dump_enabled_p ())
6066 dump_printf_loc (MSG_NOTE, vect_location,
6067 "Reduce using vector shifts\n");
6069 gimple_seq stmts = NULL;
6070 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6071 for (elt_offset = nelements / 2;
6072 elt_offset >= 1;
6073 elt_offset /= 2)
6075 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6076 indices.new_vector (sel, 2, nelements);
6077 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6078 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6079 new_temp, zero_vec, mask);
6080 new_temp = gimple_build (&stmts, code,
6081 vectype1, new_name, new_temp);
6083 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6085 /* 2.4 Extract the final scalar result. Create:
6086 s_out3 = extract_field <v_out2, bitpos> */
6088 if (dump_enabled_p ())
6089 dump_printf_loc (MSG_NOTE, vect_location,
6090 "extract scalar result\n");
6092 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6093 bitsize, bitsize_zero_node);
6094 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6095 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6096 gimple_assign_set_lhs (epilog_stmt, new_temp);
6097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6098 scalar_results.safe_push (new_temp);
6100 else
6102 /* Case 3: Create:
6103 s = extract_field <v_out2, 0>
6104 for (offset = element_size;
6105 offset < vector_size;
6106 offset += element_size;)
6108 Create: s' = extract_field <v_out2, offset>
6109 Create: s = op <s, s'> // For non SLP cases
6110 } */
6112 if (dump_enabled_p ())
6113 dump_printf_loc (MSG_NOTE, vect_location,
6114 "Reduce using scalar code.\n");
6116 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6117 int element_bitsize = tree_to_uhwi (bitsize);
6118 tree compute_type = TREE_TYPE (vectype);
6119 gimple_seq stmts = NULL;
6120 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6122 int bit_offset;
6123 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6124 vec_temp, bitsize, bitsize_zero_node);
6126 /* In SLP we don't need to apply reduction operation, so we just
6127 collect s' values in SCALAR_RESULTS. */
6128 if (slp_reduc)
6129 scalar_results.safe_push (new_temp);
6131 for (bit_offset = element_bitsize;
6132 bit_offset < vec_size_in_bits;
6133 bit_offset += element_bitsize)
6135 tree bitpos = bitsize_int (bit_offset);
6136 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6137 compute_type, vec_temp,
6138 bitsize, bitpos);
6139 if (slp_reduc)
6141 /* In SLP we don't need to apply reduction operation, so
6142 we just collect s' values in SCALAR_RESULTS. */
6143 new_temp = new_name;
6144 scalar_results.safe_push (new_name);
6146 else
6147 new_temp = gimple_build (&stmts, code, compute_type,
6148 new_name, new_temp);
6152 /* The only case where we need to reduce scalar results in SLP, is
6153 unrolling. If the size of SCALAR_RESULTS is greater than
6154 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6155 REDUC_GROUP_SIZE. */
6156 if (slp_reduc)
6158 tree res, first_res, new_res;
6160 /* Reduce multiple scalar results in case of SLP unrolling. */
6161 for (j = group_size; scalar_results.iterate (j, &res);
6162 j++)
6164 first_res = scalar_results[j % group_size];
6165 new_res = gimple_build (&stmts, code, compute_type,
6166 first_res, res);
6167 scalar_results[j % group_size] = new_res;
6169 scalar_results.truncate (group_size);
6170 for (k = 0; k < group_size; k++)
6171 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6172 scalar_results[k]);
6174 else
6176 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6177 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6178 scalar_results.safe_push (new_temp);
6181 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6184 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6185 && induc_val)
6187 /* Earlier we set the initial value to be a vector of induc_val
6188 values. Check the result and if it is induc_val then replace
6189 with the original initial value, unless induc_val is
6190 the same as initial_def already. */
6191 tree zcompare = make_ssa_name (boolean_type_node);
6192 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6193 induc_val);
6194 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6195 tree initial_def = reduc_info->reduc_initial_values[0];
6196 tree tmp = make_ssa_name (new_scalar_dest);
6197 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6198 initial_def, new_temp);
6199 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6200 scalar_results[0] = tmp;
6204 /* 2.5 Adjust the final result by the initial value of the reduction
6205 variable. (When such adjustment is not needed, then
6206 'adjustment_def' is zero). For example, if code is PLUS we create:
6207 new_temp = loop_exit_def + adjustment_def */
6209 if (adjustment_def)
6211 gcc_assert (!slp_reduc);
6212 gimple_seq stmts = NULL;
6213 if (double_reduc)
6215 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6216 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6217 new_temp = gimple_build (&stmts, code, vectype,
6218 reduc_inputs[0], adjustment_def);
6220 else
6222 new_temp = scalar_results[0];
6223 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6224 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6225 new_temp = gimple_build (&stmts, code, scalar_type,
6226 new_temp, adjustment_def);
6229 epilog_stmt = gimple_seq_last_stmt (stmts);
6230 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6231 scalar_results[0] = new_temp;
6234 /* Record this operation if it could be reused by the epilogue loop. */
6235 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
6236 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6237 { orig_reduc_input, reduc_info });
6239 if (double_reduc)
6240 loop = outer_loop;
6242 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6243 phis with new adjusted scalar results, i.e., replace use <s_out0>
6244 with use <s_out4>.
6246 Transform:
6247 loop_exit:
6248 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6249 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6250 v_out2 = reduce <v_out1>
6251 s_out3 = extract_field <v_out2, 0>
6252 s_out4 = adjust_result <s_out3>
6253 use <s_out0>
6254 use <s_out0>
6256 into:
6258 loop_exit:
6259 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6260 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6261 v_out2 = reduce <v_out1>
6262 s_out3 = extract_field <v_out2, 0>
6263 s_out4 = adjust_result <s_out3>
6264 use <s_out4>
6265 use <s_out4> */
6267 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6268 for (k = 0; k < live_out_stmts.size (); k++)
6270 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6271 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6273 phis.create (3);
6274 /* Find the loop-closed-use at the loop exit of the original scalar
6275 result. (The reduction result is expected to have two immediate uses,
6276 one at the latch block, and one at the loop exit). For double
6277 reductions we are looking for exit phis of the outer loop. */
6278 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6280 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6282 if (!is_gimple_debug (USE_STMT (use_p)))
6283 phis.safe_push (USE_STMT (use_p));
6285 else
6287 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6289 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6291 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6293 if (!flow_bb_inside_loop_p (loop,
6294 gimple_bb (USE_STMT (phi_use_p)))
6295 && !is_gimple_debug (USE_STMT (phi_use_p)))
6296 phis.safe_push (USE_STMT (phi_use_p));
6302 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6304 /* Replace the uses: */
6305 orig_name = PHI_RESULT (exit_phi);
6307 /* Look for a single use at the target of the skip edge. */
6308 if (unify_with_main_loop_p)
6310 use_operand_p use_p;
6311 gimple *user;
6312 if (!single_imm_use (orig_name, &use_p, &user))
6313 gcc_unreachable ();
6314 orig_name = gimple_get_lhs (user);
6317 scalar_result = scalar_results[k];
6318 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6320 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6321 SET_USE (use_p, scalar_result);
6322 update_stmt (use_stmt);
6326 phis.release ();
6330 /* Return a vector of type VECTYPE that is equal to the vector select
6331 operation "MASK ? VEC : IDENTITY". Insert the select statements
6332 before GSI. */
6334 static tree
6335 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6336 tree vec, tree identity)
6338 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6339 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6340 mask, vec, identity);
6341 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6342 return cond;
6345 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6346 order, starting with LHS. Insert the extraction statements before GSI and
6347 associate the new scalar SSA names with variable SCALAR_DEST.
6348 Return the SSA name for the result. */
6350 static tree
6351 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6352 tree_code code, tree lhs, tree vector_rhs)
6354 tree vectype = TREE_TYPE (vector_rhs);
6355 tree scalar_type = TREE_TYPE (vectype);
6356 tree bitsize = TYPE_SIZE (scalar_type);
6357 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6358 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6360 for (unsigned HOST_WIDE_INT bit_offset = 0;
6361 bit_offset < vec_size_in_bits;
6362 bit_offset += element_bitsize)
6364 tree bitpos = bitsize_int (bit_offset);
6365 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6366 bitsize, bitpos);
6368 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6369 rhs = make_ssa_name (scalar_dest, stmt);
6370 gimple_assign_set_lhs (stmt, rhs);
6371 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6373 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6374 tree new_name = make_ssa_name (scalar_dest, stmt);
6375 gimple_assign_set_lhs (stmt, new_name);
6376 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6377 lhs = new_name;
6379 return lhs;
6382 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6383 type of the vector input. */
6385 static internal_fn
6386 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6388 internal_fn mask_reduc_fn;
6390 switch (reduc_fn)
6392 case IFN_FOLD_LEFT_PLUS:
6393 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6394 break;
6396 default:
6397 return IFN_LAST;
6400 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6401 OPTIMIZE_FOR_SPEED))
6402 return mask_reduc_fn;
6403 return IFN_LAST;
6406 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6407 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6408 statement. CODE is the operation performed by STMT_INFO and OPS are
6409 its scalar operands. REDUC_INDEX is the index of the operand in
6410 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6411 implements in-order reduction, or IFN_LAST if we should open-code it.
6412 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6413 that should be used to control the operation in a fully-masked loop. */
6415 static bool
6416 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6417 stmt_vec_info stmt_info,
6418 gimple_stmt_iterator *gsi,
6419 gimple **vec_stmt, slp_tree slp_node,
6420 gimple *reduc_def_stmt,
6421 tree_code code, internal_fn reduc_fn,
6422 tree ops[3], tree vectype_in,
6423 int reduc_index, vec_loop_masks *masks)
6425 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6426 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6427 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6429 int ncopies;
6430 if (slp_node)
6431 ncopies = 1;
6432 else
6433 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6435 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6436 gcc_assert (ncopies == 1);
6437 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6439 if (slp_node)
6440 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6441 TYPE_VECTOR_SUBPARTS (vectype_in)));
6443 tree op0 = ops[1 - reduc_index];
6445 int group_size = 1;
6446 stmt_vec_info scalar_dest_def_info;
6447 auto_vec<tree> vec_oprnds0;
6448 if (slp_node)
6450 auto_vec<vec<tree> > vec_defs (2);
6451 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6452 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6453 vec_defs[0].release ();
6454 vec_defs[1].release ();
6455 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6456 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6458 else
6460 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6461 op0, &vec_oprnds0);
6462 scalar_dest_def_info = stmt_info;
6465 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6466 tree scalar_type = TREE_TYPE (scalar_dest);
6467 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6469 int vec_num = vec_oprnds0.length ();
6470 gcc_assert (vec_num == 1 || slp_node);
6471 tree vec_elem_type = TREE_TYPE (vectype_out);
6472 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6474 tree vector_identity = NULL_TREE;
6475 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6476 vector_identity = build_zero_cst (vectype_out);
6478 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6479 int i;
6480 tree def0;
6481 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6483 gimple *new_stmt;
6484 tree mask = NULL_TREE;
6485 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6486 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6488 /* Handle MINUS by adding the negative. */
6489 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6491 tree negated = make_ssa_name (vectype_out);
6492 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6493 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6494 def0 = negated;
6497 if (mask && mask_reduc_fn == IFN_LAST)
6498 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6499 vector_identity);
6501 /* On the first iteration the input is simply the scalar phi
6502 result, and for subsequent iterations it is the output of
6503 the preceding operation. */
6504 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6506 if (mask && mask_reduc_fn != IFN_LAST)
6507 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6508 def0, mask);
6509 else
6510 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6511 def0);
6512 /* For chained SLP reductions the output of the previous reduction
6513 operation serves as the input of the next. For the final statement
6514 the output cannot be a temporary - we reuse the original
6515 scalar destination of the last statement. */
6516 if (i != vec_num - 1)
6518 gimple_set_lhs (new_stmt, scalar_dest_var);
6519 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6520 gimple_set_lhs (new_stmt, reduc_var);
6523 else
6525 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6526 reduc_var, def0);
6527 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6528 /* Remove the statement, so that we can use the same code paths
6529 as for statements that we've just created. */
6530 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6531 gsi_remove (&tmp_gsi, true);
6534 if (i == vec_num - 1)
6536 gimple_set_lhs (new_stmt, scalar_dest);
6537 vect_finish_replace_stmt (loop_vinfo,
6538 scalar_dest_def_info,
6539 new_stmt);
6541 else
6542 vect_finish_stmt_generation (loop_vinfo,
6543 scalar_dest_def_info,
6544 new_stmt, gsi);
6546 if (slp_node)
6547 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6548 else
6550 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6551 *vec_stmt = new_stmt;
6555 return true;
6558 /* Function is_nonwrapping_integer_induction.
6560 Check if STMT_VINO (which is part of loop LOOP) both increments and
6561 does not cause overflow. */
6563 static bool
6564 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6566 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6567 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6568 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6569 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6570 widest_int ni, max_loop_value, lhs_max;
6571 wi::overflow_type overflow = wi::OVF_NONE;
6573 /* Make sure the loop is integer based. */
6574 if (TREE_CODE (base) != INTEGER_CST
6575 || TREE_CODE (step) != INTEGER_CST)
6576 return false;
6578 /* Check that the max size of the loop will not wrap. */
6580 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6581 return true;
6583 if (! max_stmt_executions (loop, &ni))
6584 return false;
6586 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6587 &overflow);
6588 if (overflow)
6589 return false;
6591 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6592 TYPE_SIGN (lhs_type), &overflow);
6593 if (overflow)
6594 return false;
6596 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6597 <= TYPE_PRECISION (lhs_type));
6600 /* Check if masking can be supported by inserting a conditional expression.
6601 CODE is the code for the operation. COND_FN is the conditional internal
6602 function, if it exists. VECTYPE_IN is the type of the vector input. */
6603 static bool
6604 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6605 tree vectype_in)
6607 if (cond_fn != IFN_LAST
6608 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6609 OPTIMIZE_FOR_SPEED))
6610 return false;
6612 if (code.is_tree_code ())
6613 switch (tree_code (code))
6615 case DOT_PROD_EXPR:
6616 case SAD_EXPR:
6617 return true;
6619 default:
6620 break;
6622 return false;
6625 /* Insert a conditional expression to enable masked vectorization. CODE is the
6626 code for the operation. VOP is the array of operands. MASK is the loop
6627 mask. GSI is a statement iterator used to place the new conditional
6628 expression. */
6629 static void
6630 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6631 gimple_stmt_iterator *gsi)
6633 switch (tree_code (code))
6635 case DOT_PROD_EXPR:
6637 tree vectype = TREE_TYPE (vop[1]);
6638 tree zero = build_zero_cst (vectype);
6639 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6640 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6641 mask, vop[1], zero);
6642 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6643 vop[1] = masked_op1;
6644 break;
6647 case SAD_EXPR:
6649 tree vectype = TREE_TYPE (vop[1]);
6650 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6651 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6652 mask, vop[1], vop[0]);
6653 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6654 vop[1] = masked_op1;
6655 break;
6658 default:
6659 gcc_unreachable ();
6663 /* Function vectorizable_reduction.
6665 Check if STMT_INFO performs a reduction operation that can be vectorized.
6666 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6667 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6668 Return true if STMT_INFO is vectorizable in this way.
6670 This function also handles reduction idioms (patterns) that have been
6671 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6672 may be of this form:
6673 X = pattern_expr (arg0, arg1, ..., X)
6674 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6675 sequence that had been detected and replaced by the pattern-stmt
6676 (STMT_INFO).
6678 This function also handles reduction of condition expressions, for example:
6679 for (int i = 0; i < N; i++)
6680 if (a[i] < value)
6681 last = a[i];
6682 This is handled by vectorising the loop and creating an additional vector
6683 containing the loop indexes for which "a[i] < value" was true. In the
6684 function epilogue this is reduced to a single max value and then used to
6685 index into the vector of results.
6687 In some cases of reduction patterns, the type of the reduction variable X is
6688 different than the type of the other arguments of STMT_INFO.
6689 In such cases, the vectype that is used when transforming STMT_INFO into
6690 a vector stmt is different than the vectype that is used to determine the
6691 vectorization factor, because it consists of a different number of elements
6692 than the actual number of elements that are being operated upon in parallel.
6694 For example, consider an accumulation of shorts into an int accumulator.
6695 On some targets it's possible to vectorize this pattern operating on 8
6696 shorts at a time (hence, the vectype for purposes of determining the
6697 vectorization factor should be V8HI); on the other hand, the vectype that
6698 is used to create the vector form is actually V4SI (the type of the result).
6700 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6701 indicates what is the actual level of parallelism (V8HI in the example), so
6702 that the right vectorization factor would be derived. This vectype
6703 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6704 be used to create the vectorized stmt. The right vectype for the vectorized
6705 stmt is obtained from the type of the result X:
6706 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6708 This means that, contrary to "regular" reductions (or "regular" stmts in
6709 general), the following equation:
6710 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6711 does *NOT* necessarily hold for reduction patterns. */
6713 bool
6714 vectorizable_reduction (loop_vec_info loop_vinfo,
6715 stmt_vec_info stmt_info, slp_tree slp_node,
6716 slp_instance slp_node_instance,
6717 stmt_vector_for_cost *cost_vec)
6719 tree vectype_in = NULL_TREE;
6720 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6721 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6722 stmt_vec_info cond_stmt_vinfo = NULL;
6723 int i;
6724 int ncopies;
6725 bool single_defuse_cycle = false;
6726 bool nested_cycle = false;
6727 bool double_reduc = false;
6728 int vec_num;
6729 tree tem;
6730 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6731 tree cond_reduc_val = NULL_TREE;
6733 /* Make sure it was already recognized as a reduction computation. */
6734 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6735 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6736 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6737 return false;
6739 /* The stmt we store reduction analysis meta on. */
6740 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6741 reduc_info->is_reduc_info = true;
6743 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6745 if (is_a <gphi *> (stmt_info->stmt))
6747 if (slp_node)
6749 /* We eventually need to set a vector type on invariant
6750 arguments. */
6751 unsigned j;
6752 slp_tree child;
6753 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6754 if (!vect_maybe_update_slp_op_vectype
6755 (child, SLP_TREE_VECTYPE (slp_node)))
6757 if (dump_enabled_p ())
6758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759 "incompatible vector types for "
6760 "invariants\n");
6761 return false;
6764 /* Analysis for double-reduction is done on the outer
6765 loop PHI, nested cycles have no further restrictions. */
6766 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6768 else
6769 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6770 return true;
6773 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6774 stmt_vec_info phi_info = stmt_info;
6775 if (!is_a <gphi *> (stmt_info->stmt))
6777 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6778 return true;
6780 if (slp_node)
6782 slp_node_instance->reduc_phis = slp_node;
6783 /* ??? We're leaving slp_node to point to the PHIs, we only
6784 need it to get at the number of vector stmts which wasn't
6785 yet initialized for the instance root. */
6787 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6789 use_operand_p use_p;
6790 gimple *use_stmt;
6791 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6792 &use_p, &use_stmt);
6793 gcc_assert (res);
6794 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6797 /* PHIs should not participate in patterns. */
6798 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6799 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6801 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6802 and compute the reduction chain length. Discover the real
6803 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6804 tree reduc_def
6805 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6806 loop_latch_edge
6807 (gimple_bb (reduc_def_phi)->loop_father));
6808 unsigned reduc_chain_length = 0;
6809 bool only_slp_reduc_chain = true;
6810 stmt_info = NULL;
6811 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6812 while (reduc_def != PHI_RESULT (reduc_def_phi))
6814 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6815 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6816 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820 "reduction chain broken by patterns.\n");
6821 return false;
6823 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6824 only_slp_reduc_chain = false;
6825 /* ??? For epilogue generation live members of the chain need
6826 to point back to the PHI via their original stmt for
6827 info_for_reduction to work. */
6828 if (STMT_VINFO_LIVE_P (vdef))
6829 STMT_VINFO_REDUC_DEF (def) = phi_info;
6830 gimple_match_op op;
6831 if (!gimple_extract_op (vdef->stmt, &op))
6833 if (dump_enabled_p ())
6834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6835 "reduction chain includes unsupported"
6836 " statement type.\n");
6837 return false;
6839 if (CONVERT_EXPR_CODE_P (op.code))
6841 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845 "conversion in the reduction chain.\n");
6846 return false;
6849 else if (!stmt_info)
6850 /* First non-conversion stmt. */
6851 stmt_info = vdef;
6852 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6853 reduc_chain_length++;
6854 if (!stmt_info && slp_node)
6855 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6857 /* PHIs should not participate in patterns. */
6858 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6860 if (nested_in_vect_loop_p (loop, stmt_info))
6862 loop = loop->inner;
6863 nested_cycle = true;
6866 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6867 element. */
6868 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6870 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6871 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6873 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6874 gcc_assert (slp_node
6875 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6877 /* 1. Is vectorizable reduction? */
6878 /* Not supportable if the reduction variable is used in the loop, unless
6879 it's a reduction chain. */
6880 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6881 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6882 return false;
6884 /* Reductions that are not used even in an enclosing outer-loop,
6885 are expected to be "live" (used out of the loop). */
6886 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6887 && !STMT_VINFO_LIVE_P (stmt_info))
6888 return false;
6890 /* 2. Has this been recognized as a reduction pattern?
6892 Check if STMT represents a pattern that has been recognized
6893 in earlier analysis stages. For stmts that represent a pattern,
6894 the STMT_VINFO_RELATED_STMT field records the last stmt in
6895 the original sequence that constitutes the pattern. */
6897 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6898 if (orig_stmt_info)
6900 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6901 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6904 /* 3. Check the operands of the operation. The first operands are defined
6905 inside the loop body. The last operand is the reduction variable,
6906 which is defined by the loop-header-phi. */
6908 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6909 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6910 gimple_match_op op;
6911 if (!gimple_extract_op (stmt_info->stmt, &op))
6912 gcc_unreachable ();
6913 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6914 || op.code == WIDEN_SUM_EXPR
6915 || op.code == SAD_EXPR);
6917 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6918 && !SCALAR_FLOAT_TYPE_P (op.type))
6919 return false;
6921 /* Do not try to vectorize bit-precision reductions. */
6922 if (!type_has_mode_precision_p (op.type))
6923 return false;
6925 /* For lane-reducing ops we're reducing the number of reduction PHIs
6926 which means the only use of that may be in the lane-reducing operation. */
6927 if (lane_reduc_code_p
6928 && reduc_chain_length != 1
6929 && !only_slp_reduc_chain)
6931 if (dump_enabled_p ())
6932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6933 "lane-reducing reduction with extra stmts.\n");
6934 return false;
6937 /* All uses but the last are expected to be defined in the loop.
6938 The last use is the reduction variable. In case of nested cycle this
6939 assumption is not true: we use reduc_index to record the index of the
6940 reduction variable. */
6941 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
6942 /* We need to skip an extra operand for COND_EXPRs with embedded
6943 comparison. */
6944 unsigned opno_adjust = 0;
6945 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
6946 opno_adjust = 1;
6947 for (i = 0; i < (int) op.num_ops; i++)
6949 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6950 if (i == 0 && op.code == COND_EXPR)
6951 continue;
6953 stmt_vec_info def_stmt_info;
6954 enum vect_def_type dt;
6955 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6956 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
6957 &tem, &def_stmt_info))
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6961 "use not simple.\n");
6962 return false;
6964 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6965 continue;
6967 /* There should be only one cycle def in the stmt, the one
6968 leading to reduc_def. */
6969 if (VECTORIZABLE_CYCLE_DEF (dt))
6970 return false;
6972 /* To properly compute ncopies we are interested in the widest
6973 non-reduction input type in case we're looking at a widening
6974 accumulation that we later handle in vect_transform_reduction. */
6975 if (lane_reduc_code_p
6976 && tem
6977 && (!vectype_in
6978 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6979 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6980 vectype_in = tem;
6982 if (op.code == COND_EXPR)
6984 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6985 if (dt == vect_constant_def)
6987 cond_reduc_dt = dt;
6988 cond_reduc_val = op.ops[i];
6990 if (dt == vect_induction_def
6991 && def_stmt_info
6992 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6994 cond_reduc_dt = dt;
6995 cond_stmt_vinfo = def_stmt_info;
6999 if (!vectype_in)
7000 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7001 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7003 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7004 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7005 /* If we have a condition reduction, see if we can simplify it further. */
7006 if (v_reduc_type == COND_REDUCTION)
7008 if (slp_node)
7009 return false;
7011 /* When the condition uses the reduction value in the condition, fail. */
7012 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7014 if (dump_enabled_p ())
7015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7016 "condition depends on previous iteration\n");
7017 return false;
7020 if (reduc_chain_length == 1
7021 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7022 vectype_in, OPTIMIZE_FOR_SPEED))
7024 if (dump_enabled_p ())
7025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7026 "optimizing condition reduction with"
7027 " FOLD_EXTRACT_LAST.\n");
7028 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7030 else if (cond_reduc_dt == vect_induction_def)
7032 tree base
7033 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7034 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7036 gcc_assert (TREE_CODE (base) == INTEGER_CST
7037 && TREE_CODE (step) == INTEGER_CST);
7038 cond_reduc_val = NULL_TREE;
7039 enum tree_code cond_reduc_op_code = ERROR_MARK;
7040 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7041 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7043 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7044 above base; punt if base is the minimum value of the type for
7045 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7046 else if (tree_int_cst_sgn (step) == -1)
7048 cond_reduc_op_code = MIN_EXPR;
7049 if (tree_int_cst_sgn (base) == -1)
7050 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7051 else if (tree_int_cst_lt (base,
7052 TYPE_MAX_VALUE (TREE_TYPE (base))))
7053 cond_reduc_val
7054 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7056 else
7058 cond_reduc_op_code = MAX_EXPR;
7059 if (tree_int_cst_sgn (base) == 1)
7060 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7061 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7062 base))
7063 cond_reduc_val
7064 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7066 if (cond_reduc_val)
7068 if (dump_enabled_p ())
7069 dump_printf_loc (MSG_NOTE, vect_location,
7070 "condition expression based on "
7071 "integer induction.\n");
7072 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7073 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7074 = cond_reduc_val;
7075 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7078 else if (cond_reduc_dt == vect_constant_def)
7080 enum vect_def_type cond_initial_dt;
7081 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7082 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7083 if (cond_initial_dt == vect_constant_def
7084 && types_compatible_p (TREE_TYPE (cond_initial_val),
7085 TREE_TYPE (cond_reduc_val)))
7087 tree e = fold_binary (LE_EXPR, boolean_type_node,
7088 cond_initial_val, cond_reduc_val);
7089 if (e && (integer_onep (e) || integer_zerop (e)))
7091 if (dump_enabled_p ())
7092 dump_printf_loc (MSG_NOTE, vect_location,
7093 "condition expression based on "
7094 "compile time constant.\n");
7095 /* Record reduction code at analysis stage. */
7096 STMT_VINFO_REDUC_CODE (reduc_info)
7097 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7098 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7104 if (STMT_VINFO_LIVE_P (phi_info))
7105 return false;
7107 if (slp_node)
7108 ncopies = 1;
7109 else
7110 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7112 gcc_assert (ncopies >= 1);
7114 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7116 if (nested_cycle)
7118 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7119 == vect_double_reduction_def);
7120 double_reduc = true;
7123 /* 4.2. Check support for the epilog operation.
7125 If STMT represents a reduction pattern, then the type of the
7126 reduction variable may be different than the type of the rest
7127 of the arguments. For example, consider the case of accumulation
7128 of shorts into an int accumulator; The original code:
7129 S1: int_a = (int) short_a;
7130 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7132 was replaced with:
7133 STMT: int_acc = widen_sum <short_a, int_acc>
7135 This means that:
7136 1. The tree-code that is used to create the vector operation in the
7137 epilog code (that reduces the partial results) is not the
7138 tree-code of STMT, but is rather the tree-code of the original
7139 stmt from the pattern that STMT is replacing. I.e, in the example
7140 above we want to use 'widen_sum' in the loop, but 'plus' in the
7141 epilog.
7142 2. The type (mode) we use to check available target support
7143 for the vector operation to be created in the *epilog*, is
7144 determined by the type of the reduction variable (in the example
7145 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7146 However the type (mode) we use to check available target support
7147 for the vector operation to be created *inside the loop*, is
7148 determined by the type of the other arguments to STMT (in the
7149 example we'd check this: optab_handler (widen_sum_optab,
7150 vect_short_mode)).
7152 This is contrary to "regular" reductions, in which the types of all
7153 the arguments are the same as the type of the reduction variable.
7154 For "regular" reductions we can therefore use the same vector type
7155 (and also the same tree-code) when generating the epilog code and
7156 when generating the code inside the loop. */
7158 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7159 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7161 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7162 if (reduction_type == TREE_CODE_REDUCTION)
7164 /* Check whether it's ok to change the order of the computation.
7165 Generally, when vectorizing a reduction we change the order of the
7166 computation. This may change the behavior of the program in some
7167 cases, so we need to check that this is ok. One exception is when
7168 vectorizing an outer-loop: the inner-loop is executed sequentially,
7169 and therefore vectorizing reductions in the inner-loop during
7170 outer-loop vectorization is safe. Likewise when we are vectorizing
7171 a series of reductions using SLP and the VF is one the reductions
7172 are performed in scalar order. */
7173 if (slp_node
7174 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7175 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7177 else if (needs_fold_left_reduction_p (op.type, orig_code))
7179 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7180 is not directly used in stmt. */
7181 if (!only_slp_reduc_chain
7182 && reduc_chain_length != 1)
7184 if (dump_enabled_p ())
7185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7186 "in-order reduction chain without SLP.\n");
7187 return false;
7189 STMT_VINFO_REDUC_TYPE (reduc_info)
7190 = reduction_type = FOLD_LEFT_REDUCTION;
7192 else if (!commutative_binary_op_p (orig_code, op.type)
7193 || !associative_binary_op_p (orig_code, op.type))
7195 if (dump_enabled_p ())
7196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7197 "reduction: not commutative/associative");
7198 return false;
7202 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7203 && ncopies > 1)
7205 if (dump_enabled_p ())
7206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7207 "multiple types in double reduction or condition "
7208 "reduction or fold-left reduction.\n");
7209 return false;
7212 internal_fn reduc_fn = IFN_LAST;
7213 if (reduction_type == TREE_CODE_REDUCTION
7214 || reduction_type == FOLD_LEFT_REDUCTION
7215 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7216 || reduction_type == CONST_COND_REDUCTION)
7218 if (reduction_type == FOLD_LEFT_REDUCTION
7219 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7220 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7222 if (reduc_fn != IFN_LAST
7223 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7224 OPTIMIZE_FOR_SPEED))
7226 if (dump_enabled_p ())
7227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7228 "reduc op not supported by target.\n");
7230 reduc_fn = IFN_LAST;
7233 else
7235 if (!nested_cycle || double_reduc)
7237 if (dump_enabled_p ())
7238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7239 "no reduc code for scalar code.\n");
7241 return false;
7245 else if (reduction_type == COND_REDUCTION)
7247 int scalar_precision
7248 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7249 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7250 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7251 vectype_out);
7253 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7254 OPTIMIZE_FOR_SPEED))
7255 reduc_fn = IFN_REDUC_MAX;
7257 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7259 if (reduction_type != EXTRACT_LAST_REDUCTION
7260 && (!nested_cycle || double_reduc)
7261 && reduc_fn == IFN_LAST
7262 && !nunits_out.is_constant ())
7264 if (dump_enabled_p ())
7265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7266 "missing target support for reduction on"
7267 " variable-length vectors.\n");
7268 return false;
7271 /* For SLP reductions, see if there is a neutral value we can use. */
7272 tree neutral_op = NULL_TREE;
7273 if (slp_node)
7275 tree initial_value = NULL_TREE;
7276 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7277 initial_value = vect_phi_initial_value (reduc_def_phi);
7278 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7279 orig_code, initial_value);
7282 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7284 /* We can't support in-order reductions of code such as this:
7286 for (int i = 0; i < n1; ++i)
7287 for (int j = 0; j < n2; ++j)
7288 l += a[j];
7290 since GCC effectively transforms the loop when vectorizing:
7292 for (int i = 0; i < n1 / VF; ++i)
7293 for (int j = 0; j < n2; ++j)
7294 for (int k = 0; k < VF; ++k)
7295 l += a[j];
7297 which is a reassociation of the original operation. */
7298 if (dump_enabled_p ())
7299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7300 "in-order double reduction not supported.\n");
7302 return false;
7305 if (reduction_type == FOLD_LEFT_REDUCTION
7306 && slp_node
7307 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7309 /* We cannot use in-order reductions in this case because there is
7310 an implicit reassociation of the operations involved. */
7311 if (dump_enabled_p ())
7312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7313 "in-order unchained SLP reductions not supported.\n");
7314 return false;
7317 /* For double reductions, and for SLP reductions with a neutral value,
7318 we construct a variable-length initial vector by loading a vector
7319 full of the neutral value and then shift-and-inserting the start
7320 values into the low-numbered elements. */
7321 if ((double_reduc || neutral_op)
7322 && !nunits_out.is_constant ()
7323 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7324 vectype_out, OPTIMIZE_FOR_SPEED))
7326 if (dump_enabled_p ())
7327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7328 "reduction on variable-length vectors requires"
7329 " target support for a vector-shift-and-insert"
7330 " operation.\n");
7331 return false;
7334 /* Check extra constraints for variable-length unchained SLP reductions. */
7335 if (STMT_SLP_TYPE (stmt_info)
7336 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7337 && !nunits_out.is_constant ())
7339 /* We checked above that we could build the initial vector when
7340 there's a neutral element value. Check here for the case in
7341 which each SLP statement has its own initial value and in which
7342 that value needs to be repeated for every instance of the
7343 statement within the initial vector. */
7344 unsigned int group_size = SLP_TREE_LANES (slp_node);
7345 if (!neutral_op
7346 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7347 TREE_TYPE (vectype_out)))
7349 if (dump_enabled_p ())
7350 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7351 "unsupported form of SLP reduction for"
7352 " variable-length vectors: cannot build"
7353 " initial vector.\n");
7354 return false;
7356 /* The epilogue code relies on the number of elements being a multiple
7357 of the group size. The duplicate-and-interleave approach to setting
7358 up the initial vector does too. */
7359 if (!multiple_p (nunits_out, group_size))
7361 if (dump_enabled_p ())
7362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363 "unsupported form of SLP reduction for"
7364 " variable-length vectors: the vector size"
7365 " is not a multiple of the number of results.\n");
7366 return false;
7370 if (reduction_type == COND_REDUCTION)
7372 widest_int ni;
7374 if (! max_loop_iterations (loop, &ni))
7376 if (dump_enabled_p ())
7377 dump_printf_loc (MSG_NOTE, vect_location,
7378 "loop count not known, cannot create cond "
7379 "reduction.\n");
7380 return false;
7382 /* Convert backedges to iterations. */
7383 ni += 1;
7385 /* The additional index will be the same type as the condition. Check
7386 that the loop can fit into this less one (because we'll use up the
7387 zero slot for when there are no matches). */
7388 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7389 if (wi::geu_p (ni, wi::to_widest (max_index)))
7391 if (dump_enabled_p ())
7392 dump_printf_loc (MSG_NOTE, vect_location,
7393 "loop size is greater than data size.\n");
7394 return false;
7398 /* In case the vectorization factor (VF) is bigger than the number
7399 of elements that we can fit in a vectype (nunits), we have to generate
7400 more than one vector stmt - i.e - we need to "unroll" the
7401 vector stmt by a factor VF/nunits. For more details see documentation
7402 in vectorizable_operation. */
7404 /* If the reduction is used in an outer loop we need to generate
7405 VF intermediate results, like so (e.g. for ncopies=2):
7406 r0 = phi (init, r0)
7407 r1 = phi (init, r1)
7408 r0 = x0 + r0;
7409 r1 = x1 + r1;
7410 (i.e. we generate VF results in 2 registers).
7411 In this case we have a separate def-use cycle for each copy, and therefore
7412 for each copy we get the vector def for the reduction variable from the
7413 respective phi node created for this copy.
7415 Otherwise (the reduction is unused in the loop nest), we can combine
7416 together intermediate results, like so (e.g. for ncopies=2):
7417 r = phi (init, r)
7418 r = x0 + r;
7419 r = x1 + r;
7420 (i.e. we generate VF/2 results in a single register).
7421 In this case for each copy we get the vector def for the reduction variable
7422 from the vectorized reduction operation generated in the previous iteration.
7424 This only works when we see both the reduction PHI and its only consumer
7425 in vectorizable_reduction and there are no intermediate stmts
7426 participating. When unrolling we want each unrolled iteration to have its
7427 own reduction accumulator since one of the main goals of unrolling a
7428 reduction is to reduce the aggregate loop-carried latency. */
7429 if (ncopies > 1
7430 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7431 && reduc_chain_length == 1
7432 && loop_vinfo->suggested_unroll_factor == 1)
7433 single_defuse_cycle = true;
7435 if (single_defuse_cycle || lane_reduc_code_p)
7437 gcc_assert (op.code != COND_EXPR);
7439 /* 4. Supportable by target? */
7440 bool ok = true;
7442 /* 4.1. check support for the operation in the loop
7444 This isn't necessary for the lane reduction codes, since they
7445 can only be produced by pattern matching, and it's up to the
7446 pattern matcher to test for support. The main reason for
7447 specifically skipping this step is to avoid rechecking whether
7448 mixed-sign dot-products can be implemented using signed
7449 dot-products. */
7450 machine_mode vec_mode = TYPE_MODE (vectype_in);
7451 if (!lane_reduc_code_p
7452 && !directly_supported_p (op.code, vectype_in, optab_vector))
7454 if (dump_enabled_p ())
7455 dump_printf (MSG_NOTE, "op not supported by target.\n");
7456 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7457 || !vect_can_vectorize_without_simd_p (op.code))
7458 ok = false;
7459 else
7460 if (dump_enabled_p ())
7461 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7464 if (vect_emulated_vector_p (vectype_in)
7465 && !vect_can_vectorize_without_simd_p (op.code))
7467 if (dump_enabled_p ())
7468 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7469 return false;
7472 /* lane-reducing operations have to go through vect_transform_reduction.
7473 For the other cases try without the single cycle optimization. */
7474 if (!ok)
7476 if (lane_reduc_code_p)
7477 return false;
7478 else
7479 single_defuse_cycle = false;
7482 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7484 /* If the reduction stmt is one of the patterns that have lane
7485 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7486 if ((ncopies > 1 && ! single_defuse_cycle)
7487 && lane_reduc_code_p)
7489 if (dump_enabled_p ())
7490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7491 "multi def-use cycle not possible for lane-reducing "
7492 "reduction operation\n");
7493 return false;
7496 if (slp_node
7497 && !(!single_defuse_cycle
7498 && !lane_reduc_code_p
7499 && reduction_type != FOLD_LEFT_REDUCTION))
7500 for (i = 0; i < (int) op.num_ops; i++)
7501 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7503 if (dump_enabled_p ())
7504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7505 "incompatible vector types for invariants\n");
7506 return false;
7509 if (slp_node)
7510 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7511 else
7512 vec_num = 1;
7514 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7515 reduction_type, ncopies, cost_vec);
7516 /* Cost the reduction op inside the loop if transformed via
7517 vect_transform_reduction. Otherwise this is costed by the
7518 separate vectorizable_* routines. */
7519 if (single_defuse_cycle || lane_reduc_code_p)
7521 int factor = 1;
7522 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7523 /* Three dot-products and a subtraction. */
7524 factor = 4;
7525 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7526 stmt_info, 0, vect_body);
7529 if (dump_enabled_p ()
7530 && reduction_type == FOLD_LEFT_REDUCTION)
7531 dump_printf_loc (MSG_NOTE, vect_location,
7532 "using an in-order (fold-left) reduction.\n");
7533 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7534 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7535 reductions go through their own vectorizable_* routines. */
7536 if (!single_defuse_cycle
7537 && !lane_reduc_code_p
7538 && reduction_type != FOLD_LEFT_REDUCTION)
7540 stmt_vec_info tem
7541 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7542 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7544 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7545 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7547 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7548 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7550 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7552 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7553 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7555 if (reduction_type != FOLD_LEFT_REDUCTION
7556 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7557 && (cond_fn == IFN_LAST
7558 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7559 OPTIMIZE_FOR_SPEED)))
7561 if (dump_enabled_p ())
7562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7563 "can't operate on partial vectors because"
7564 " no conditional operation is available.\n");
7565 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7567 else if (reduction_type == FOLD_LEFT_REDUCTION
7568 && reduc_fn == IFN_LAST
7569 && !expand_vec_cond_expr_p (vectype_in,
7570 truth_type_for (vectype_in),
7571 SSA_NAME))
7573 if (dump_enabled_p ())
7574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7575 "can't operate on partial vectors because"
7576 " no conditional operation is available.\n");
7577 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7579 else
7580 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7581 vectype_in, NULL);
7583 return true;
7586 /* STMT_INFO is a dot-product reduction whose multiplication operands
7587 have different signs. Emit a sequence to emulate the operation
7588 using a series of signed DOT_PROD_EXPRs and return the last
7589 statement generated. VEC_DEST is the result of the vector operation
7590 and VOP lists its inputs.

   VOP is updated in place: its first two entries are swapped if needed so
   that VOP[0] is the unsigned multiplication operand, and every unsigned
   entry is replaced by a signed SSA name holding the converted value.

   All intermediate statements are emitted through
   vect_finish_stmt_generation at GSI.  The returned assignment to VEC_DEST
   is only built here; this function does not emit it.  */
7592 static gassign *
7593 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7594 gimple_stmt_iterator *gsi, tree vec_dest,
7595 tree vop[3])
/* WIDE_VECTYPE is the signed variant of the accumulator vector type,
   NARROW_VECTYPE the signed variant of the multiplication input type.  */
7597 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7598 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7599 tree narrow_elttype = TREE_TYPE (narrow_vectype);
7600 gimple *new_stmt;
7602 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7603 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7604 std::swap (vop[0], vop[1]);
7606 /* Convert all inputs to signed types. */
7607 for (int i = 0; i < 3; ++i)
7608 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7610 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7611 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7612 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7613 vop[i] = tmp;
7616 /* In the comments below we assume 8-bit inputs for simplicity,
7617 but the approach works for any full integer type. */
7619 /* Create a vector of -128. */
7620 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7621 tree min_narrow = build_vector_from_val (narrow_vectype,
7622 min_narrow_elttype);
7624 /* Create a vector of 64.  This is the bit pattern of -128 logically
   shifted right by one.  */
7625 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7626 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7627 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7629 /* Emit: SUB_RES = VOP[0] - 128.  This is done by adding the vector
   of -128 built above.  */
7630 tree sub_res = make_ssa_name (narrow_vectype);
7631 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7632 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7634 /* Emit:
7636 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7637 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7638 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
7640 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7641 Doing the two 64 * y steps first allows more time to compute x. */
7642 tree stage1 = make_ssa_name (wide_vectype);
7643 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7644 vop[1], half_narrow, vop[2]);
7645 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7647 tree stage2 = make_ssa_name (wide_vectype);
7648 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7649 vop[1], half_narrow, stage1);
7650 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7652 tree stage3 = make_ssa_name (wide_vectype);
7653 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7654 sub_res, vop[1], stage2);
7655 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7657 /* Convert STAGE3 to the reduction type. */
7658 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7661 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7662 value.

   Fold-left reductions are handed to vectorize_fold_left_reduction.  For
   everything else one vector statement is generated per vector operand at
   GSI, masked when the loop is fully masked.  With SLP_NODE the generated
   statements are recorded on the node; otherwise they are recorded in
   STMT_VINFO_VEC_STMTS (STMT_INFO) and *VEC_STMT is set to the first one.
   Returns true unless the fold-left helper says otherwise.  */
7664 bool
7665 vect_transform_reduction (loop_vec_info loop_vinfo,
7666 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7667 gimple **vec_stmt, slp_tree slp_node)
7669 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7670 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7671 int i;
7672 int ncopies;
7673 int vec_num;
7675 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7676 gcc_assert (reduc_info->is_reduc_info);
7678 if (nested_in_vect_loop_p (loop, stmt_info))
7680 loop = loop->inner;
7681 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
/* Decompose the scalar statement; only tree codes are handled here.  */
7684 gimple_match_op op;
7685 if (!gimple_extract_op (stmt_info->stmt, &op))
7686 gcc_unreachable ();
7687 gcc_assert (op.code.is_tree_code ());
7688 auto code = tree_code (op.code);
7690 /* All uses but the last are expected to be defined in the loop.
7691 The last use is the reduction variable. In case of nested cycle this
7692 assumption is not true: we use reduc_index to record the index of the
7693 reduction variable. */
7694 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7695 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7696 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7697 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
/* With SLP the number of vector statements comes from the node and
   NCOPIES is 1; without SLP it is the other way round.  */
7699 if (slp_node)
7701 ncopies = 1;
7702 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7704 else
7706 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7707 vec_num = 1;
7710 internal_fn cond_fn = get_conditional_internal_fn (code);
7711 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7712 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7714 /* Transform. */
7715 tree new_temp = NULL_TREE;
7716 auto_vec<tree> vec_oprnds0;
7717 auto_vec<tree> vec_oprnds1;
7718 auto_vec<tree> vec_oprnds2;
7719 tree def0;
7721 if (dump_enabled_p ())
7722 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7724 /* FORNOW: Multiple types are not supported for condition. */
7725 if (code == COND_EXPR)
7726 gcc_assert (ncopies == 1);
7728 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7730 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7731 if (reduction_type == FOLD_LEFT_REDUCTION)
7733 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7734 return vectorize_fold_left_reduction
7735 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7736 reduc_fn, op.ops, vectype_in, reduc_index, masks);
/* Beyond this point only single def-use cycles and the lane-reducing
   codes listed in the assert are expected.  */
7739 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7740 gcc_assert (single_defuse_cycle
7741 || code == DOT_PROD_EXPR
7742 || code == WIDEN_SUM_EXPR
7743 || code == SAD_EXPR);
7745 /* Create the destination vector */
7746 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7747 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
/* Get the vector defs of the operands.  For a single def-use cycle the
   reduction operand is passed as NULL_TREE here and fetched separately
   below, asking for a single copy only.  */
7749 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7750 single_defuse_cycle && reduc_index == 0
7751 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7752 single_defuse_cycle && reduc_index == 1
7753 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7754 op.num_ops == 3
7755 && !(single_defuse_cycle && reduc_index == 2)
7756 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7757 if (single_defuse_cycle)
7759 gcc_assert (!slp_node);
7760 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7761 op.ops[reduc_index],
7762 reduc_index == 0 ? &vec_oprnds0
7763 : (reduc_index == 1 ? &vec_oprnds1
7764 : &vec_oprnds2));
7767 bool emulated_mixed_dot_prod
7768 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
/* Generate one vector statement per entry of VEC_OPRNDS0.  Note that in a
   single def-use cycle with REDUC_INDEX == 0 the loop body below pushes
   onto VEC_OPRNDS0 itself, which extends this iteration.  */
7769 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7771 gimple *new_stmt;
7772 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7773 if (masked_loop_p && !mask_by_cond_expr)
7775 /* No conditional ifns have been defined for dot-product yet. */
7776 gcc_assert (code != DOT_PROD_EXPR);
7778 /* Make sure that the reduction accumulator is vop[0]. */
7779 if (reduc_index == 1)
7781 gcc_assert (commutative_tree_code (code));
7782 std::swap (vop[0], vop[1]);
7784 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7785 vectype_in, i);
/* Build COND_FN (MASK, VOP[0], VOP[1], VOP[0]); the accumulator is passed
   both as the first operand and as the final argument.  */
7786 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7787 vop[0], vop[1], vop[0]);
7788 new_temp = make_ssa_name (vec_dest, call);
7789 gimple_call_set_lhs (call, new_temp);
7790 gimple_call_set_nothrow (call, true);
7791 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7792 new_stmt = call;
7794 else
7796 if (op.num_ops == 3)
7797 vop[2] = vec_oprnds2[i];
7799 if (masked_loop_p && mask_by_cond_expr)
7801 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7802 vectype_in, i);
7803 build_vect_cond_expr (code, vop, mask, gsi);
/* Build the (possibly emulated) vector operation, give it a fresh SSA
   result and emit it.  */
7806 if (emulated_mixed_dot_prod)
7807 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7808 vec_dest, vop);
7809 else
7810 new_stmt = gimple_build_assign (vec_dest, code,
7811 vop[0], vop[1], vop[2]);
7812 new_temp = make_ssa_name (vec_dest, new_stmt);
7813 gimple_assign_set_lhs (new_stmt, new_temp);
7814 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
/* Record the statement.  In a single def-use cycle the result of every
   copy but the last becomes the reduction operand of the next copy
   instead of being recorded, so only the last statement ends up in
   STMT_VINFO_VEC_STMTS.  */
7817 if (slp_node)
7818 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7819 else if (single_defuse_cycle
7820 && i < ncopies - 1)
7822 if (reduc_index == 0)
7823 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7824 else if (reduc_index == 1)
7825 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7826 else if (reduc_index == 2)
7827 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7829 else
7830 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7833 if (!slp_node)
7834 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7836 return true;
7839 /* Transform phase of a cycle PHI.

   For EXTRACT_LAST_REDUCTION and FOLD_LEFT_REDUCTION nothing is generated.
   Otherwise the loop-entry (preheader) vector values are computed and
   VEC_NUM * NCOPIES vector PHIs are created in the loop header with only
   their preheader argument set.  With SLP_NODE the PHIs are recorded on the
   node; otherwise they are recorded in STMT_VINFO_VEC_STMTS (STMT_INFO) and
   *VEC_STMT is set to the first one.  Always returns true.  */
7841 bool
7842 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7843 stmt_vec_info stmt_info, gimple **vec_stmt,
7844 slp_tree slp_node, slp_instance slp_node_instance)
7846 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7847 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7848 int i;
7849 int ncopies;
7850 int j;
7851 bool nested_cycle = false;
7852 int vec_num;
7854 if (nested_in_vect_loop_p (loop, stmt_info))
7856 loop = loop->inner;
7857 nested_cycle = true;
7860 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7861 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7862 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7863 gcc_assert (reduc_info->is_reduc_info);
7865 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7866 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7867 /* Leave the scalar phi in place. */
7868 return true;
7870 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7871 /* For a nested cycle we do not fill the above. */
7872 if (!vectype_in)
7873 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7874 gcc_assert (vectype_in);
7876 if (slp_node)
7878 /* The size vect_schedule_slp_instance computes is off for us. */
7879 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7880 * SLP_TREE_LANES (slp_node), vectype_in);
7881 ncopies = 1;
7883 else
7885 vec_num = 1;
7886 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7889 /* Check whether we should use a single PHI node and accumulate
7890 vectors to one before the backedge. */
7891 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7892 ncopies = 1;
7894 /* Create the destination vector */
7895 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7896 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7897 vectype_out);
7899 /* Get the loop-entry arguments.  Either VEC_INITIAL_DEFS is filled
   directly, or a single VEC_INITIAL_DEF is computed and replicated
   further below.  */
7900 tree vec_initial_def = NULL_TREE;
7901 auto_vec<tree> vec_initial_defs;
7902 if (slp_node)
7904 vec_initial_defs.reserve (vec_num);
7905 if (nested_cycle)
7907 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7908 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7909 &vec_initial_defs);
7911 else
7913 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7914 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7915 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
/* Record the scalar initial value of each PHI; a reduction chain
   contributes a single PHI only.  */
7917 unsigned int num_phis = stmts.length ();
7918 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7919 num_phis = 1;
7920 initial_values.reserve (num_phis);
7921 for (unsigned int i = 0; i < num_phis; ++i)
7923 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7924 initial_values.quick_push (vect_phi_initial_value (this_phi));
7926 if (vec_num == 1)
7927 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
/* NOTE(review): INITIAL_VALUES can presumably be emptied by
   vect_find_reusable_accumulator above -- confirm.  */
7928 if (!initial_values.is_empty ())
7930 tree initial_value
7931 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7932 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7933 tree neutral_op
7934 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7935 code, initial_value);
7936 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7937 &vec_initial_defs, vec_num,
7938 stmts.length (), neutral_op);
7942 else
7944 /* Get at the scalar def before the loop, that defines the initial
7945 value of the reduction variable. */
7946 tree initial_def = vect_phi_initial_value (phi);
7947 reduc_info->reduc_initial_values.safe_push (initial_def);
7948 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7949 and we can't use zero for induc_val, use initial_def. Similarly
7950 for REDUC_MIN and initial_def larger than the base. */
7951 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7953 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7954 if (TREE_CODE (initial_def) == INTEGER_CST
7955 && !integer_zerop (induc_val)
7956 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7957 && tree_int_cst_lt (initial_def, induc_val))
7958 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7959 && tree_int_cst_lt (induc_val, initial_def))))
7961 induc_val = initial_def;
7962 /* Communicate we used the initial_def to epilogue
7963 generation. */
7964 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7966 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7968 else if (nested_cycle)
7970 /* Do not use an adjustment def as that case is not supported
7971 correctly if ncopies is not one. */
7972 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7973 ncopies, initial_def,
7974 &vec_initial_defs);
7976 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7977 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7978 /* Fill the initial vector with the initial scalar value. */
7979 vec_initial_def
7980 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7981 initial_def, initial_def);
7982 else
7984 if (ncopies == 1)
7985 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7986 if (!reduc_info->reduc_initial_values.is_empty ())
7988 initial_def = reduc_info->reduc_initial_values[0];
7989 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
7990 tree neutral_op
7991 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7992 code, initial_def);
7993 gcc_assert (neutral_op);
7994 /* Try to simplify the vector initialization by applying an
7995 adjustment after the reduction has been performed. */
7996 if (!reduc_info->reused_accumulator
7997 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7998 && !operand_equal_p (neutral_op, initial_def))
8000 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8001 = initial_def;
8002 initial_def = neutral_op;
8004 vec_initial_def
8005 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8006 initial_def, neutral_op);
/* A single initial vector was computed above: use it for every copy.  */
8011 if (vec_initial_def)
8013 vec_initial_defs.create (ncopies);
8014 for (i = 0; i < ncopies; ++i)
8015 vec_initial_defs.quick_push (vec_initial_def);
/* If an accumulator is being reused, adapt its value to VECTYPE_OUT
   (reducing it to fewer lanes and/or converting mode and sign as needed)
   and use the result as the initial value of the first PHI.
   NOTE(review): REUSED_ACCUMULATOR is presumably set by
   vect_find_reusable_accumulator -- confirm.  */
8018 if (auto *accumulator = reduc_info->reused_accumulator)
8020 tree def = accumulator->reduc_input;
8021 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8023 unsigned int nreduc;
8024 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8025 (TREE_TYPE (def)),
8026 TYPE_VECTOR_SUBPARTS (vectype_out),
8027 &nreduc);
8028 gcc_assert (res);
8029 gimple_seq stmts = NULL;
8030 /* Reduce the single vector to a smaller one. */
8031 if (nreduc != 1)
8033 /* Perform the reduction in the appropriate type. */
8034 tree rvectype = vectype_out;
8035 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8036 TREE_TYPE (TREE_TYPE (def))))
8037 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8038 TYPE_VECTOR_SUBPARTS
8039 (vectype_out));
8040 def = vect_create_partial_epilog (def, rvectype,
8041 STMT_VINFO_REDUC_CODE
8042 (reduc_info),
8043 &stmts);
8045 /* The epilogue loop might use a different vector mode, like
8046 VNx2DI vs. V2DI. */
8047 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8049 tree reduc_type = build_vector_type_for_mode
8050 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8051 def = gimple_convert (&stmts, reduc_type, def);
8053 /* Adjust the input so we pick up the partially reduced value
8054 for the skip edge in vect_create_epilog_for_reduction. */
8055 accumulator->reduc_input = def;
8056 /* And the reduction could be carried out using a different sign. */
8057 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8058 def = gimple_convert (&stmts, vectype_out, def);
8059 if (loop_vinfo->main_loop_edge)
8061 /* While we'd like to insert on the edge this will split
8062 blocks and disturb bookkeeping, we also will eventually
8063 need this on the skip edge. Rely on sinking to
8064 fixup optimal placement and insert in the pred. */
8065 gimple_stmt_iterator gsi
8066 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8067 /* Insert before a cond that eventually skips the
8068 epilogue. */
8069 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8070 gsi_prev (&gsi);
8071 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8073 else
8074 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8075 stmts);
/* With a main-loop edge the first initial def is replaced by the value
   vect_get_main_loop_result returns for DEF and the existing initial def;
   otherwise DEF itself is appended as an initial def.  */
8077 if (loop_vinfo->main_loop_edge)
8078 vec_initial_defs[0]
8079 = vect_get_main_loop_result (loop_vinfo, def,
8080 vec_initial_defs[0]);
8081 else
8082 vec_initial_defs.safe_push (def);
8085 /* Generate the reduction PHIs upfront. */
8086 for (i = 0; i < vec_num; i++)
8088 tree vec_init_def = vec_initial_defs[i];
8089 for (j = 0; j < ncopies; j++)
8091 /* Create the reduction-phi that defines the reduction
8092 operand. */
8093 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8095 /* Set the loop-entry arg of the reduction-phi.  For a nested cycle
   each copy after the first uses its own entry of VEC_INITIAL_DEFS.  */
8096 if (j != 0 && nested_cycle)
8097 vec_init_def = vec_initial_defs[j];
8098 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8099 UNKNOWN_LOCATION);
8101 /* The loop-latch arg is set in epilogue processing. */
8103 if (slp_node)
8104 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8105 else
8107 if (j == 0)
8108 *vec_stmt = new_phi;
8109 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8114 return true;
8117 /* Vectorizes LC PHIs. */
8119 bool
8120 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8121 stmt_vec_info stmt_info, gimple **vec_stmt,
8122 slp_tree slp_node)
8124 if (!loop_vinfo
8125 || !is_a <gphi *> (stmt_info->stmt)
8126 || gimple_phi_num_args (stmt_info->stmt) != 1)
8127 return false;
8129 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8130 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8131 return false;
8133 if (!vec_stmt) /* transformation not required. */
8135 /* Deal with copies from externs or constants that disguise as
8136 loop-closed PHI nodes (PR97886). */
8137 if (slp_node
8138 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8139 SLP_TREE_VECTYPE (slp_node)))
8141 if (dump_enabled_p ())
8142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8143 "incompatible vector types for invariants\n");
8144 return false;
8146 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8147 return true;
8150 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8151 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8152 basic_block bb = gimple_bb (stmt_info->stmt);
8153 edge e = single_pred_edge (bb);
8154 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8155 auto_vec<tree> vec_oprnds;
8156 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8157 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8158 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8159 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8161 /* Create the vectorized LC PHI node. */
8162 gphi *new_phi = create_phi_node (vec_dest, bb);
8163 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8164 if (slp_node)
8165 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8166 else
8167 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8169 if (!slp_node)
8170 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8172 return true;
8175 /* Vectorizes PHIs. */
8177 bool
8178 vectorizable_phi (vec_info *,
8179 stmt_vec_info stmt_info, gimple **vec_stmt,
8180 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8182 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8183 return false;
8185 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8186 return false;
8188 tree vectype = SLP_TREE_VECTYPE (slp_node);
8190 if (!vec_stmt) /* transformation not required. */
8192 slp_tree child;
8193 unsigned i;
8194 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8195 if (!child)
8197 if (dump_enabled_p ())
8198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8199 "PHI node with unvectorized backedge def\n");
8200 return false;
8202 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8204 if (dump_enabled_p ())
8205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8206 "incompatible vector types for invariants\n");
8207 return false;
8209 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8210 && !useless_type_conversion_p (vectype,
8211 SLP_TREE_VECTYPE (child)))
8213 /* With bools we can have mask and non-mask precision vectors
8214 or different non-mask precisions. while pattern recog is
8215 supposed to guarantee consistency here bugs in it can cause
8216 mismatches (PR103489 and PR103800 for example).
8217 Deal with them here instead of ICEing later. */
8218 if (dump_enabled_p ())
8219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8220 "incompatible vector type setup from "
8221 "bool pattern detection\n");
8222 return false;
8225 /* For single-argument PHIs assume coalescing which means zero cost
8226 for the scalar and the vector PHIs. This avoids artificially
8227 favoring the vector path (but may pessimize it in some cases). */
8228 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8229 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8230 vector_stmt, stmt_info, vectype, 0, vect_body);
8231 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8232 return true;
8235 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8236 basic_block bb = gimple_bb (stmt_info->stmt);
8237 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8238 auto_vec<gphi *> new_phis;
8239 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8241 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8243 /* Skip not yet vectorized defs. */
8244 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8245 && SLP_TREE_VEC_STMTS (child).is_empty ())
8246 continue;
8248 auto_vec<tree> vec_oprnds;
8249 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8250 if (!new_phis.exists ())
8252 new_phis.create (vec_oprnds.length ());
8253 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8255 /* Create the vectorized LC PHI node. */
8256 new_phis.quick_push (create_phi_node (vec_dest, bb));
8257 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8260 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8261 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8262 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8264 /* We should have at least one already vectorized child. */
8265 gcc_assert (new_phis.exists ());
8267 return true;
8270 /* Return true if VECTYPE represents a vector that requires lowering
8271 by the vector lowering pass. */
8273 bool
8274 vect_emulated_vector_p (tree vectype)
8276 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8277 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8278 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8281 /* Return true if we can emulate CODE on an integer mode representation
8282 of a vector. */
8284 bool
8285 vect_can_vectorize_without_simd_p (tree_code code)
8287 switch (code)
8289 case PLUS_EXPR:
8290 case MINUS_EXPR:
8291 case NEGATE_EXPR:
8292 case BIT_AND_EXPR:
8293 case BIT_IOR_EXPR:
8294 case BIT_XOR_EXPR:
8295 case BIT_NOT_EXPR:
8296 return true;
8298 default:
8299 return false;
8303 /* Likewise, but taking a code_helper. */
8305 bool
8306 vect_can_vectorize_without_simd_p (code_helper code)
8308 return (code.is_tree_code ()
8309 && vect_can_vectorize_without_simd_p (tree_code (code)));
8312 /* Create vector init for vectorized iv. */
8313 static tree
8314 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8315 tree step_expr, poly_uint64 nunits,
8316 tree vectype,
8317 enum vect_induction_op_type induction_type)
8319 unsigned HOST_WIDE_INT const_nunits;
8320 tree vec_shift, vec_init, new_name;
8321 unsigned i;
8322 tree itype = TREE_TYPE (vectype);
8324 /* iv_loop is the loop to be vectorized. Create:
8325 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8326 new_name = gimple_convert (stmts, itype, init_expr);
8327 switch (induction_type)
8329 case vect_step_op_shr:
8330 case vect_step_op_shl:
8331 /* Build the Initial value from shift_expr. */
8332 vec_init = gimple_build_vector_from_val (stmts,
8333 vectype,
8334 new_name);
8335 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8336 build_zero_cst (itype), step_expr);
8337 vec_init = gimple_build (stmts,
8338 (induction_type == vect_step_op_shr
8339 ? RSHIFT_EXPR : LSHIFT_EXPR),
8340 vectype, vec_init, vec_shift);
8341 break;
8343 case vect_step_op_neg:
8345 vec_init = gimple_build_vector_from_val (stmts,
8346 vectype,
8347 new_name);
8348 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8349 vectype, vec_init);
8350 /* The encoding has 2 interleaved stepped patterns. */
8351 vec_perm_builder sel (nunits, 2, 3);
8352 sel.quick_grow (6);
8353 for (i = 0; i < 3; i++)
8355 sel[2 * i] = i;
8356 sel[2 * i + 1] = i + nunits;
8358 vec_perm_indices indices (sel, 2, nunits);
8359 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8360 fail when vec_init is const vector. In that situation vec_perm is not
8361 really needed. */
8362 tree perm_mask_even
8363 = vect_gen_perm_mask_any (vectype, indices);
8364 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8365 vectype,
8366 vec_init, vec_neg,
8367 perm_mask_even);
8369 break;
8371 case vect_step_op_mul:
8373 /* Use unsigned mult to avoid UD integer overflow. */
8374 gcc_assert (nunits.is_constant (&const_nunits));
8375 tree utype = unsigned_type_for (itype);
8376 tree uvectype = build_vector_type (utype,
8377 TYPE_VECTOR_SUBPARTS (vectype));
8378 new_name = gimple_convert (stmts, utype, new_name);
8379 vec_init = gimple_build_vector_from_val (stmts,
8380 uvectype,
8381 new_name);
8382 tree_vector_builder elts (uvectype, const_nunits, 1);
8383 tree elt_step = build_one_cst (utype);
8385 elts.quick_push (elt_step);
8386 for (i = 1; i < const_nunits; i++)
8388 /* Create: new_name_i = new_name + step_expr. */
8389 elt_step = gimple_build (stmts, MULT_EXPR,
8390 utype, elt_step, step_expr);
8391 elts.quick_push (elt_step);
8393 /* Create a vector from [new_name_0, new_name_1, ...,
8394 new_name_nunits-1]. */
8395 tree vec_mul = gimple_build_vector (stmts, &elts);
8396 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8397 vec_init, vec_mul);
8398 vec_init = gimple_convert (stmts, vectype, vec_init);
8400 break;
8402 default:
8403 gcc_unreachable ();
8406 return vec_init;
8409 /* Peel init_expr by skip_niter for induction_type. */
8410 tree
8411 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8412 tree skip_niters, tree step_expr,
8413 enum vect_induction_op_type induction_type)
8415 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8416 tree type = TREE_TYPE (init_expr);
8417 unsigned prec = TYPE_PRECISION (type);
8418 switch (induction_type)
8420 case vect_step_op_neg:
8421 if (TREE_INT_CST_LOW (skip_niters) % 2)
8422 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8423 /* else no change. */
8424 break;
8426 case vect_step_op_shr:
8427 case vect_step_op_shl:
8428 skip_niters = gimple_convert (stmts, type, skip_niters);
8429 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8430 /* When shift mount >= precision, need to avoid UD.
8431 In the original loop, there's no UD, and according to semantic,
8432 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8433 if (!tree_fits_uhwi_p (step_expr)
8434 || tree_to_uhwi (step_expr) >= prec)
8436 if (induction_type == vect_step_op_shl
8437 || TYPE_UNSIGNED (type))
8438 init_expr = build_zero_cst (type);
8439 else
8440 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8441 init_expr,
8442 wide_int_to_tree (type, prec - 1));
8444 else
8445 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8446 ? RSHIFT_EXPR : LSHIFT_EXPR),
8447 type, init_expr, step_expr);
8448 break;
8450 case vect_step_op_mul:
8452 tree utype = unsigned_type_for (type);
8453 init_expr = gimple_convert (stmts, utype, init_expr);
8454 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8455 wide_int begin = wi::to_wide (step_expr);
8456 for (unsigned i = 0; i != skipn - 1; i++)
8457 begin = wi::mul (begin, wi::to_wide (step_expr));
8458 tree mult_expr = wide_int_to_tree (utype, begin);
8459 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8460 init_expr = gimple_convert (stmts, type, init_expr);
8462 break;
8464 default:
8465 gcc_unreachable ();
8468 return init_expr;
8471 /* Create vector step for vectorized iv. */
8472 static tree
8473 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8474 poly_uint64 vf,
8475 enum vect_induction_op_type induction_type)
8477 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8478 tree new_name = NULL;
8479 /* Step should be pow (step, vf) for mult induction. */
8480 if (induction_type == vect_step_op_mul)
8482 gcc_assert (vf.is_constant ());
8483 wide_int begin = wi::to_wide (step_expr);
8485 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8486 begin = wi::mul (begin, wi::to_wide (step_expr));
8488 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8490 else if (induction_type == vect_step_op_neg)
8491 /* Do nothing. */
8493 else
8494 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8495 expr, step_expr);
8496 return new_name;
8499 static tree
8500 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8501 stmt_vec_info stmt_info,
8502 tree new_name, tree vectype,
8503 enum vect_induction_op_type induction_type)
8505 /* No step is needed for neg induction. */
8506 if (induction_type == vect_step_op_neg)
8507 return NULL;
8509 tree t = unshare_expr (new_name);
8510 gcc_assert (CONSTANT_CLASS_P (new_name)
8511 || TREE_CODE (new_name) == SSA_NAME);
8512 tree new_vec = build_vector_from_val (vectype, t);
8513 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8514 new_vec, vectype, NULL);
8515 return vec_step;
8518 /* Update vectorized iv with vect_step, induc_def is init. */
8519 static tree
8520 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8521 tree induc_def, tree vec_step,
8522 enum vect_induction_op_type induction_type)
8524 tree vec_def = induc_def;
8525 switch (induction_type)
8527 case vect_step_op_mul:
8529 /* Use unsigned mult to avoid UD integer overflow. */
8530 tree uvectype
8531 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8532 TYPE_VECTOR_SUBPARTS (vectype));
8533 vec_def = gimple_convert (stmts, uvectype, vec_def);
8534 vec_step = gimple_convert (stmts, uvectype, vec_step);
8535 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8536 vec_def, vec_step);
8537 vec_def = gimple_convert (stmts, vectype, vec_def);
8539 break;
8541 case vect_step_op_shr:
8542 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8543 vec_def, vec_step);
8544 break;
8546 case vect_step_op_shl:
8547 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8548 vec_def, vec_step);
8549 break;
8550 case vect_step_op_neg:
8551 vec_def = induc_def;
8552 /* Do nothing. */
8553 break;
8554 default:
8555 gcc_unreachable ();
8558 return vec_def;
8562 /* Return true if vectorizer can peel for nonlinear iv. */
8563 bool
8564 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
8565 enum vect_induction_op_type induction_type)
8567 tree niters_skip;
8568 /* Init_expr will be update by vect_update_ivs_after_vectorizer,
8569 if niters is unkown:
8570 For shift, when shift mount >= precision, there would be UD.
8571 For mult, don't known how to generate
8572 init_expr * pow (step, niters) for variable niters.
8573 For neg, it should be ok, since niters of vectorized main loop
8574 will always be multiple of 2. */
8575 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8576 && induction_type != vect_step_op_neg)
8578 if (dump_enabled_p ())
8579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8580 "Peeling for epilogue is not supported"
8581 " for nonlinear induction except neg"
8582 " when iteration count is unknown.\n");
8583 return false;
8586 /* Also doens't support peel for neg when niter is variable.
8587 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
8588 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8589 if ((niters_skip != NULL_TREE
8590 && TREE_CODE (niters_skip) != INTEGER_CST)
8591 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
8592 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
8594 if (dump_enabled_p ())
8595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8596 "Peeling for alignement is not supported"
8597 " for nonlinear induction when niters_skip"
8598 " is not constant.\n");
8599 return false;
8602 return true;
8605 /* Function vectorizable_induction
8607 Check if STMT_INFO performs an nonlinear induction computation that can be
8608 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8609 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8610 basic block.
8611 Return true if STMT_INFO is vectorizable in this way. */
8613 static bool
8614 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8615 stmt_vec_info stmt_info,
8616 gimple **vec_stmt, slp_tree slp_node,
8617 stmt_vector_for_cost *cost_vec)
8619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8620 unsigned ncopies;
8621 bool nested_in_vect_loop = false;
8622 class loop *iv_loop;
8623 tree vec_def;
8624 edge pe = loop_preheader_edge (loop);
8625 basic_block new_bb;
8626 tree vec_init, vec_step;
8627 tree new_name;
8628 gimple *new_stmt;
8629 gphi *induction_phi;
8630 tree induc_def, vec_dest;
8631 tree init_expr, step_expr;
8632 tree niters_skip;
8633 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8634 unsigned i;
8635 gimple_stmt_iterator si;
8637 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8639 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8640 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8641 enum vect_induction_op_type induction_type
8642 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8644 gcc_assert (induction_type > vect_step_op_add);
8646 if (slp_node)
8647 ncopies = 1;
8648 else
8649 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8650 gcc_assert (ncopies >= 1);
8652 /* FORNOW. Only handle nonlinear induction in the same loop. */
8653 if (nested_in_vect_loop_p (loop, stmt_info))
8655 if (dump_enabled_p ())
8656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8657 "nonlinear induction in nested loop.\n");
8658 return false;
8661 iv_loop = loop;
8662 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8664 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
8665 update for each iv and a permutation to generate wanted vector iv. */
8666 if (slp_node)
8668 if (dump_enabled_p ())
8669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8670 "SLP induction not supported for nonlinear"
8671 " induction.\n");
8672 return false;
8675 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
8676 return false;
8678 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8680 if (dump_enabled_p ())
8681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8682 "floating point nonlinear induction vectorization"
8683 " not supported.\n");
8684 return false;
8687 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8688 init_expr = vect_phi_initial_value (phi);
8689 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8690 && TREE_CODE (step_expr) == INTEGER_CST);
8691 /* step_expr should be aligned with init_expr,
8692 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
8693 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8695 if (TREE_CODE (init_expr) == INTEGER_CST)
8696 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8697 else
8698 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8699 TREE_TYPE (init_expr)));
8701 switch (induction_type)
8703 case vect_step_op_neg:
8704 if (TREE_CODE (init_expr) != INTEGER_CST
8705 && TREE_CODE (init_expr) != REAL_CST)
8707 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8708 if (!directly_supported_p (NEGATE_EXPR, vectype))
8709 return false;
8711 /* The encoding has 2 interleaved stepped patterns. */
8712 vec_perm_builder sel (nunits, 2, 3);
8713 machine_mode mode = TYPE_MODE (vectype);
8714 sel.quick_grow (6);
8715 for (i = 0; i < 3; i++)
8717 sel[i * 2] = i;
8718 sel[i * 2 + 1] = i + nunits;
8720 vec_perm_indices indices (sel, 2, nunits);
8721 if (!can_vec_perm_const_p (mode, mode, indices))
8722 return false;
8724 break;
8726 case vect_step_op_mul:
8728 /* Check for backend support of MULT_EXPR. */
8729 if (!directly_supported_p (MULT_EXPR, vectype))
8730 return false;
8732 /* ?? How to construct vector step for variable number vector.
8733 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
8734 if (!vf.is_constant ())
8735 return false;
8737 break;
8739 case vect_step_op_shr:
8740 /* Check for backend support of RSHIFT_EXPR. */
8741 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8742 return false;
8744 /* Don't shift more than type precision to avoid UD. */
8745 if (!tree_fits_uhwi_p (step_expr)
8746 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8747 TYPE_PRECISION (TREE_TYPE (init_expr))))
8748 return false;
8749 break;
8751 case vect_step_op_shl:
8752 /* Check for backend support of RSHIFT_EXPR. */
8753 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
8754 return false;
8756 /* Don't shift more than type precision to avoid UD. */
8757 if (!tree_fits_uhwi_p (step_expr)
8758 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8759 TYPE_PRECISION (TREE_TYPE (init_expr))))
8760 return false;
8762 break;
8764 default:
8765 gcc_unreachable ();
8768 if (!vec_stmt) /* transformation not required. */
8770 unsigned inside_cost = 0, prologue_cost = 0;
8771 /* loop cost for vec_loop. Neg induction doesn't have any
8772 inside_cost. */
8773 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8774 stmt_info, 0, vect_body);
8776 /* loop cost for vec_loop. Neg induction doesn't have any
8777 inside_cost. */
8778 if (induction_type == vect_step_op_neg)
8779 inside_cost = 0;
8781 /* prologue cost for vec_init and vec_step. */
8782 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8783 stmt_info, 0, vect_prologue);
8785 if (dump_enabled_p ())
8786 dump_printf_loc (MSG_NOTE, vect_location,
8787 "vect_model_induction_cost: inside_cost = %d, "
8788 "prologue_cost = %d. \n", inside_cost,
8789 prologue_cost);
8791 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8792 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
8793 return true;
8796 /* Transform. */
8798 /* Compute a vector variable, initialized with the first VF values of
8799 the induction variable. E.g., for an iv with IV_PHI='X' and
8800 evolution S, for a vector of 4 units, we want to compute:
8801 [X, X + S, X + 2*S, X + 3*S]. */
8803 if (dump_enabled_p ())
8804 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8806 pe = loop_preheader_edge (iv_loop);
8807 /* Find the first insertion point in the BB. */
8808 basic_block bb = gimple_bb (phi);
8809 si = gsi_after_labels (bb);
8811 gimple_seq stmts = NULL;
8813 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8814 /* If we are using the loop mask to "peel" for alignment then we need
8815 to adjust the start value here. */
8816 if (niters_skip != NULL_TREE)
8817 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
8818 step_expr, induction_type);
8820 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
8821 step_expr, nunits, vectype,
8822 induction_type);
8823 if (stmts)
8825 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8826 gcc_assert (!new_bb);
8829 stmts = NULL;
8830 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
8831 vf, induction_type);
8832 if (stmts)
8834 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8835 gcc_assert (!new_bb);
8838 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
8839 new_name, vectype,
8840 induction_type);
8841 /* Create the following def-use cycle:
8842 loop prolog:
8843 vec_init = ...
8844 vec_step = ...
8845 loop:
8846 vec_iv = PHI <vec_init, vec_loop>
8848 STMT
8850 vec_loop = vec_iv + vec_step; */
8852 /* Create the induction-phi that defines the induction-operand. */
8853 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8854 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8855 induc_def = PHI_RESULT (induction_phi);
8857 /* Create the iv update inside the loop. */
8858 stmts = NULL;
8859 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
8860 induc_def, vec_step,
8861 induction_type);
8863 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8864 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8866 /* Set the arguments of the phi node: */
8867 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8868 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8869 UNKNOWN_LOCATION);
8871 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8872 *vec_stmt = induction_phi;
8874 /* In case that vectorization factor (VF) is bigger than the number
8875 of elements that we can fit in a vectype (nunits), we have to generate
8876 more than one vector stmt - i.e - we need to "unroll" the
8877 vector stmt by a factor VF/nunits. For more details see documentation
8878 in vectorizable_operation. */
8880 if (ncopies > 1)
8882 stmts = NULL;
8883 /* FORNOW. This restriction should be relaxed. */
8884 gcc_assert (!nested_in_vect_loop);
8886 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
8887 nunits, induction_type);
8889 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
8890 new_name, vectype,
8891 induction_type);
8892 vec_def = induc_def;
8893 for (i = 1; i < ncopies; i++)
8895 /* vec_i = vec_prev + vec_step. */
8896 stmts = NULL;
8897 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
8898 vec_def, vec_step,
8899 induction_type);
8900 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8901 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8902 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8906 if (dump_enabled_p ())
8907 dump_printf_loc (MSG_NOTE, vect_location,
8908 "transform induction: created def-use cycle: %G%G",
8909 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
8911 return true;
8914 /* Function vectorizable_induction
8916 Check if STMT_INFO performs an induction computation that can be vectorized.
8917 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
8918 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
8919 Return true if STMT_INFO is vectorizable in this way. */
8921 bool
8922 vectorizable_induction (loop_vec_info loop_vinfo,
8923 stmt_vec_info stmt_info,
8924 gimple **vec_stmt, slp_tree slp_node,
8925 stmt_vector_for_cost *cost_vec)
8927 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8928 unsigned ncopies;
8929 bool nested_in_vect_loop = false;
8930 class loop *iv_loop;
8931 tree vec_def;
8932 edge pe = loop_preheader_edge (loop);
8933 basic_block new_bb;
8934 tree new_vec, vec_init, vec_step, t;
8935 tree new_name;
8936 gimple *new_stmt;
8937 gphi *induction_phi;
8938 tree induc_def, vec_dest;
8939 tree init_expr, step_expr;
8940 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8941 unsigned i;
8942 tree expr;
8943 gimple_stmt_iterator si;
8944 enum vect_induction_op_type induction_type
8945 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8947 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8948 if (!phi)
8949 return false;
8951 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8952 return false;
8954 /* Make sure it was recognized as induction computation. */
8955 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
8956 return false;
8958 /* Handle nonlinear induction in a separate place. */
8959 if (induction_type != vect_step_op_add)
8960 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
8961 vec_stmt, slp_node, cost_vec);
8963 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8964 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8966 if (slp_node)
8967 ncopies = 1;
8968 else
8969 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8970 gcc_assert (ncopies >= 1);
8972 /* FORNOW. These restrictions should be relaxed. */
8973 if (nested_in_vect_loop_p (loop, stmt_info))
8975 imm_use_iterator imm_iter;
8976 use_operand_p use_p;
8977 gimple *exit_phi;
8978 edge latch_e;
8979 tree loop_arg;
8981 if (ncopies > 1)
8983 if (dump_enabled_p ())
8984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8985 "multiple types in nested loop.\n");
8986 return false;
8989 exit_phi = NULL;
8990 latch_e = loop_latch_edge (loop->inner);
8991 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
8992 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8994 gimple *use_stmt = USE_STMT (use_p);
8995 if (is_gimple_debug (use_stmt))
8996 continue;
8998 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9000 exit_phi = use_stmt;
9001 break;
9004 if (exit_phi)
9006 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9007 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9008 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9010 if (dump_enabled_p ())
9011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9012 "inner-loop induction only used outside "
9013 "of the outer vectorized loop.\n");
9014 return false;
9018 nested_in_vect_loop = true;
9019 iv_loop = loop->inner;
9021 else
9022 iv_loop = loop;
9023 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9025 if (slp_node && !nunits.is_constant ())
9027 /* The current SLP code creates the step value element-by-element. */
9028 if (dump_enabled_p ())
9029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9030 "SLP induction not supported for variable-length"
9031 " vectors.\n");
9032 return false;
9035 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9037 if (dump_enabled_p ())
9038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9039 "floating point induction vectorization disabled\n");
9040 return false;
9043 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9044 gcc_assert (step_expr != NULL_TREE);
9045 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9047 /* Check for backend support of PLUS/MINUS_EXPR. */
9048 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9049 || !directly_supported_p (MINUS_EXPR, step_vectype))
9050 return false;
9052 if (!vec_stmt) /* transformation not required. */
9054 unsigned inside_cost = 0, prologue_cost = 0;
9055 if (slp_node)
9057 /* We eventually need to set a vector type on invariant
9058 arguments. */
9059 unsigned j;
9060 slp_tree child;
9061 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9062 if (!vect_maybe_update_slp_op_vectype
9063 (child, SLP_TREE_VECTYPE (slp_node)))
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9067 "incompatible vector types for "
9068 "invariants\n");
9069 return false;
9071 /* loop cost for vec_loop. */
9072 inside_cost
9073 = record_stmt_cost (cost_vec,
9074 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9075 vector_stmt, stmt_info, 0, vect_body);
9076 /* prologue cost for vec_init (if not nested) and step. */
9077 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9078 scalar_to_vec,
9079 stmt_info, 0, vect_prologue);
9081 else /* if (!slp_node) */
9083 /* loop cost for vec_loop. */
9084 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9085 stmt_info, 0, vect_body);
9086 /* prologue cost for vec_init and vec_step. */
9087 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9088 stmt_info, 0, vect_prologue);
9090 if (dump_enabled_p ())
9091 dump_printf_loc (MSG_NOTE, vect_location,
9092 "vect_model_induction_cost: inside_cost = %d, "
9093 "prologue_cost = %d .\n", inside_cost,
9094 prologue_cost);
9096 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9097 DUMP_VECT_SCOPE ("vectorizable_induction");
9098 return true;
9101 /* Transform. */
9103 /* Compute a vector variable, initialized with the first VF values of
9104 the induction variable. E.g., for an iv with IV_PHI='X' and
9105 evolution S, for a vector of 4 units, we want to compute:
9106 [X, X + S, X + 2*S, X + 3*S]. */
9108 if (dump_enabled_p ())
9109 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9111 pe = loop_preheader_edge (iv_loop);
9112 /* Find the first insertion point in the BB. */
9113 basic_block bb = gimple_bb (phi);
9114 si = gsi_after_labels (bb);
9116 /* For SLP induction we have to generate several IVs as for example
9117 with group size 3 we need
9118 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9119 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9120 if (slp_node)
9122 /* Enforced above. */
9123 unsigned int const_nunits = nunits.to_constant ();
9125 /* The initial values are vectorized, but any lanes > group_size
9126 need adjustment. */
9127 slp_tree init_node
9128 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9130 /* Gather steps. Since we do not vectorize inductions as
9131 cycles we have to reconstruct the step from SCEV data. */
9132 unsigned group_size = SLP_TREE_LANES (slp_node);
9133 tree *steps = XALLOCAVEC (tree, group_size);
9134 tree *inits = XALLOCAVEC (tree, group_size);
9135 stmt_vec_info phi_info;
9136 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9138 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9139 if (!init_node)
9140 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9141 pe->dest_idx);
9144 /* Now generate the IVs. */
9145 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9146 gcc_assert ((const_nunits * nvects) % group_size == 0);
9147 unsigned nivs;
9148 if (nested_in_vect_loop)
9149 nivs = nvects;
9150 else
9152 /* Compute the number of distinct IVs we need. First reduce
9153 group_size if it is a multiple of const_nunits so we get
9154 one IV for a group_size of 4 but const_nunits 2. */
9155 unsigned group_sizep = group_size;
9156 if (group_sizep % const_nunits == 0)
9157 group_sizep = group_sizep / const_nunits;
9158 nivs = least_common_multiple (group_sizep,
9159 const_nunits) / const_nunits;
9161 tree stept = TREE_TYPE (step_vectype);
9162 tree lupdate_mul = NULL_TREE;
9163 if (!nested_in_vect_loop)
9165 /* The number of iterations covered in one vector iteration. */
9166 unsigned lup_mul = (nvects * const_nunits) / group_size;
9167 lupdate_mul
9168 = build_vector_from_val (step_vectype,
9169 SCALAR_FLOAT_TYPE_P (stept)
9170 ? build_real_from_wide (stept, lup_mul,
9171 UNSIGNED)
9172 : build_int_cstu (stept, lup_mul));
9174 tree peel_mul = NULL_TREE;
9175 gimple_seq init_stmts = NULL;
9176 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9178 if (SCALAR_FLOAT_TYPE_P (stept))
9179 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9180 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9181 else
9182 peel_mul = gimple_convert (&init_stmts, stept,
9183 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9184 peel_mul = gimple_build_vector_from_val (&init_stmts,
9185 step_vectype, peel_mul);
9187 unsigned ivn;
9188 auto_vec<tree> vec_steps;
9189 for (ivn = 0; ivn < nivs; ++ivn)
9191 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9192 tree_vector_builder init_elts (vectype, const_nunits, 1);
9193 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9194 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9196 /* The scalar steps of the IVs. */
9197 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9198 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9199 step_elts.quick_push (elt);
9200 if (!init_node)
9202 /* The scalar inits of the IVs if not vectorized. */
9203 elt = inits[(ivn*const_nunits + eltn) % group_size];
9204 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9205 TREE_TYPE (elt)))
9206 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9207 TREE_TYPE (vectype), elt);
9208 init_elts.quick_push (elt);
9210 /* The number of steps to add to the initial values. */
9211 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9212 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9213 ? build_real_from_wide (stept,
9214 mul_elt, UNSIGNED)
9215 : build_int_cstu (stept, mul_elt));
9217 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9218 vec_steps.safe_push (vec_step);
9219 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9220 if (peel_mul)
9221 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9222 step_mul, peel_mul);
9223 if (!init_node)
9224 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9226 /* Create the induction-phi that defines the induction-operand. */
9227 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9228 "vec_iv_");
9229 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9230 induc_def = PHI_RESULT (induction_phi);
9232 /* Create the iv update inside the loop */
9233 tree up = vec_step;
9234 if (lupdate_mul)
9235 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9236 vec_step, lupdate_mul);
9237 gimple_seq stmts = NULL;
9238 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9239 vec_def = gimple_build (&stmts,
9240 PLUS_EXPR, step_vectype, vec_def, up);
9241 vec_def = gimple_convert (&stmts, vectype, vec_def);
9242 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9243 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9244 UNKNOWN_LOCATION);
9246 if (init_node)
9247 vec_init = vect_get_slp_vect_def (init_node, ivn);
9248 if (!nested_in_vect_loop
9249 && !integer_zerop (step_mul))
9251 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9252 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9253 vec_step, step_mul);
9254 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9255 vec_def, up);
9256 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9259 /* Set the arguments of the phi node: */
9260 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9262 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9264 if (!nested_in_vect_loop)
9266 /* Fill up to the number of vectors we need for the whole group. */
9267 nivs = least_common_multiple (group_size,
9268 const_nunits) / const_nunits;
9269 vec_steps.reserve (nivs-ivn);
9270 for (; ivn < nivs; ++ivn)
9272 SLP_TREE_VEC_STMTS (slp_node)
9273 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9274 vec_steps.quick_push (vec_steps[0]);
9278 /* Re-use IVs when we can. We are generating further vector
9279 stmts by adding VF' * stride to the IVs generated above. */
9280 if (ivn < nvects)
9282 unsigned vfp
9283 = least_common_multiple (group_size, const_nunits) / group_size;
9284 tree lupdate_mul
9285 = build_vector_from_val (step_vectype,
9286 SCALAR_FLOAT_TYPE_P (stept)
9287 ? build_real_from_wide (stept,
9288 vfp, UNSIGNED)
9289 : build_int_cstu (stept, vfp));
9290 for (; ivn < nvects; ++ivn)
9292 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9293 tree def = gimple_get_lhs (iv);
9294 if (ivn < 2*nivs)
9295 vec_steps[ivn - nivs]
9296 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9297 vec_steps[ivn - nivs], lupdate_mul);
9298 gimple_seq stmts = NULL;
9299 def = gimple_convert (&stmts, step_vectype, def);
9300 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9301 def, vec_steps[ivn % nivs]);
9302 def = gimple_convert (&stmts, vectype, def);
9303 if (gimple_code (iv) == GIMPLE_PHI)
9304 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9305 else
9307 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9308 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9310 SLP_TREE_VEC_STMTS (slp_node)
9311 .quick_push (SSA_NAME_DEF_STMT (def));
9315 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9316 gcc_assert (!new_bb);
9318 return true;
9321 init_expr = vect_phi_initial_value (phi);
9323 gimple_seq stmts = NULL;
9324 if (!nested_in_vect_loop)
9326 /* Convert the initial value to the IV update type. */
9327 tree new_type = TREE_TYPE (step_expr);
9328 init_expr = gimple_convert (&stmts, new_type, init_expr);
9330 /* If we are using the loop mask to "peel" for alignment then we need
9331 to adjust the start value here. */
9332 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9333 if (skip_niters != NULL_TREE)
9335 if (FLOAT_TYPE_P (vectype))
9336 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9337 skip_niters);
9338 else
9339 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9340 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9341 skip_niters, step_expr);
9342 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9343 init_expr, skip_step);
9347 if (stmts)
9349 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9350 gcc_assert (!new_bb);
9353 /* Create the vector that holds the initial_value of the induction. */
9354 if (nested_in_vect_loop)
9356 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9357 been created during vectorization of previous stmts. We obtain it
9358 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9359 auto_vec<tree> vec_inits;
9360 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9361 init_expr, &vec_inits);
9362 vec_init = vec_inits[0];
9363 /* If the initial value is not of proper type, convert it. */
9364 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9366 new_stmt
9367 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9368 vect_simple_var,
9369 "vec_iv_"),
9370 VIEW_CONVERT_EXPR,
9371 build1 (VIEW_CONVERT_EXPR, vectype,
9372 vec_init));
9373 vec_init = gimple_assign_lhs (new_stmt);
9374 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9375 new_stmt);
9376 gcc_assert (!new_bb);
9379 else
9381 /* iv_loop is the loop to be vectorized. Create:
9382 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9383 stmts = NULL;
9384 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9386 unsigned HOST_WIDE_INT const_nunits;
9387 if (nunits.is_constant (&const_nunits))
9389 tree_vector_builder elts (step_vectype, const_nunits, 1);
9390 elts.quick_push (new_name);
9391 for (i = 1; i < const_nunits; i++)
9393 /* Create: new_name_i = new_name + step_expr */
9394 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9395 new_name, step_expr);
9396 elts.quick_push (new_name);
9398 /* Create a vector from [new_name_0, new_name_1, ...,
9399 new_name_nunits-1] */
9400 vec_init = gimple_build_vector (&stmts, &elts);
9402 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9403 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9404 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9405 new_name, step_expr);
9406 else
9408 /* Build:
9409 [base, base, base, ...]
9410 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9411 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9412 gcc_assert (flag_associative_math);
9413 tree index = build_index_vector (step_vectype, 0, 1);
9414 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9415 new_name);
9416 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9417 step_expr);
9418 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9419 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9420 vec_init, step_vec);
9421 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9422 vec_init, base_vec);
9424 vec_init = gimple_convert (&stmts, vectype, vec_init);
9426 if (stmts)
9428 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9429 gcc_assert (!new_bb);
9434 /* Create the vector that holds the step of the induction. */
9435 if (nested_in_vect_loop)
9436 /* iv_loop is nested in the loop to be vectorized. Generate:
9437 vec_step = [S, S, S, S] */
9438 new_name = step_expr;
9439 else
9441 /* iv_loop is the loop to be vectorized. Generate:
9442 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9443 gimple_seq seq = NULL;
9444 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9446 expr = build_int_cst (integer_type_node, vf);
9447 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9449 else
9450 expr = build_int_cst (TREE_TYPE (step_expr), vf);
9451 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9452 expr, step_expr);
9453 if (seq)
9455 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9456 gcc_assert (!new_bb);
9460 t = unshare_expr (new_name);
9461 gcc_assert (CONSTANT_CLASS_P (new_name)
9462 || TREE_CODE (new_name) == SSA_NAME);
9463 new_vec = build_vector_from_val (step_vectype, t);
9464 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9465 new_vec, step_vectype, NULL);
9468 /* Create the following def-use cycle:
9469 loop prolog:
9470 vec_init = ...
9471 vec_step = ...
9472 loop:
9473 vec_iv = PHI <vec_init, vec_loop>
9475 STMT
9477 vec_loop = vec_iv + vec_step; */
9479 /* Create the induction-phi that defines the induction-operand. */
9480 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9481 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9482 induc_def = PHI_RESULT (induction_phi);
9484 /* Create the iv update inside the loop */
9485 stmts = NULL;
9486 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9487 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9488 vec_def = gimple_convert (&stmts, vectype, vec_def);
9489 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9490 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9492 /* Set the arguments of the phi node: */
9493 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9494 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9495 UNKNOWN_LOCATION);
9497 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9498 *vec_stmt = induction_phi;
9500 /* In case that vectorization factor (VF) is bigger than the number
9501 of elements that we can fit in a vectype (nunits), we have to generate
9502 more than one vector stmt - i.e - we need to "unroll" the
9503 vector stmt by a factor VF/nunits. For more details see documentation
9504 in vectorizable_operation. */
9506 if (ncopies > 1)
9508 gimple_seq seq = NULL;
9509 /* FORNOW. This restriction should be relaxed. */
9510 gcc_assert (!nested_in_vect_loop);
9512 /* Create the vector that holds the step of the induction. */
9513 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9515 expr = build_int_cst (integer_type_node, nunits);
9516 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9518 else
9519 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9520 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9521 expr, step_expr);
9522 if (seq)
9524 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9525 gcc_assert (!new_bb);
9528 t = unshare_expr (new_name);
9529 gcc_assert (CONSTANT_CLASS_P (new_name)
9530 || TREE_CODE (new_name) == SSA_NAME);
9531 new_vec = build_vector_from_val (step_vectype, t);
9532 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9533 new_vec, step_vectype, NULL);
9535 vec_def = induc_def;
9536 for (i = 1; i < ncopies; i++)
9538 /* vec_i = vec_prev + vec_step */
9539 gimple_seq stmts = NULL;
9540 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9541 vec_def = gimple_build (&stmts,
9542 PLUS_EXPR, step_vectype, vec_def, vec_step);
9543 vec_def = gimple_convert (&stmts, vectype, vec_def);
9545 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9546 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9547 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9551 if (dump_enabled_p ())
9552 dump_printf_loc (MSG_NOTE, vect_location,
9553 "transform induction: created def-use cycle: %G%G",
9554 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9556 return true;
9559 /* Function vectorizable_live_operation.
9561 STMT_INFO computes a value that is used outside the loop. Check if
9562 it can be supported. */
9564 bool
9565 vectorizable_live_operation (vec_info *vinfo,
9566 stmt_vec_info stmt_info,
9567 gimple_stmt_iterator *gsi,
9568 slp_tree slp_node, slp_instance slp_node_instance,
9569 int slp_index, bool vec_stmt_p,
9570 stmt_vector_for_cost *cost_vec)
9572 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9573 imm_use_iterator imm_iter;
9574 tree lhs, lhs_type, bitsize;
9575 tree vectype = (slp_node
9576 ? SLP_TREE_VECTYPE (slp_node)
9577 : STMT_VINFO_VECTYPE (stmt_info));
9578 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9579 int ncopies;
9580 gimple *use_stmt;
9581 auto_vec<tree> vec_oprnds;
9582 int vec_entry = 0;
9583 poly_uint64 vec_index = 0;
9585 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9587 /* If a stmt of a reduction is live, vectorize it via
9588 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9589 validity so just trigger the transform here. */
9590 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9592 if (!vec_stmt_p)
9593 return true;
9594 if (slp_node)
9596 /* For reduction chains the meta-info is attached to
9597 the group leader. */
9598 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9599 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9600 /* For SLP reductions we vectorize the epilogue for
9601 all involved stmts together. */
9602 else if (slp_index != 0)
9603 return true;
9604 else
9605 /* For SLP reductions the meta-info is attached to
9606 the representative. */
9607 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
9609 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9610 gcc_assert (reduc_info->is_reduc_info);
9611 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9612 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9613 return true;
9614 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9615 slp_node_instance);
9616 return true;
9619 /* If STMT is not relevant and it is a simple assignment and its inputs are
9620 invariant then it can remain in place, unvectorized. The original last
9621 scalar value that it computes will be used. */
9622 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9624 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9625 if (dump_enabled_p ())
9626 dump_printf_loc (MSG_NOTE, vect_location,
9627 "statement is simple and uses invariant. Leaving in "
9628 "place.\n");
9629 return true;
9632 if (slp_node)
9633 ncopies = 1;
9634 else
9635 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9637 if (slp_node)
9639 gcc_assert (slp_index >= 0);
9641 /* Get the last occurrence of the scalar index from the concatenation of
9642 all the slp vectors. Calculate which slp vector it is and the index
9643 within. */
9644 int num_scalar = SLP_TREE_LANES (slp_node);
9645 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9646 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9648 /* Calculate which vector contains the result, and which lane of
9649 that vector we need. */
9650 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9652 if (dump_enabled_p ())
9653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9654 "Cannot determine which vector holds the"
9655 " final result.\n");
9656 return false;
9660 if (!vec_stmt_p)
9662 /* No transformation required. */
9663 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9665 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9666 OPTIMIZE_FOR_SPEED))
9668 if (dump_enabled_p ())
9669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670 "can't operate on partial vectors "
9671 "because the target doesn't support extract "
9672 "last reduction.\n");
9673 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9675 else if (slp_node)
9677 if (dump_enabled_p ())
9678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9679 "can't operate on partial vectors "
9680 "because an SLP statement is live after "
9681 "the loop.\n");
9682 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9684 else if (ncopies > 1)
9686 if (dump_enabled_p ())
9687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9688 "can't operate on partial vectors "
9689 "because ncopies is greater than 1.\n");
9690 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9692 else
9694 gcc_assert (ncopies == 1 && !slp_node);
9695 vect_record_loop_mask (loop_vinfo,
9696 &LOOP_VINFO_MASKS (loop_vinfo),
9697 1, vectype, NULL);
9700 /* ??? Enable for loop costing as well. */
9701 if (!loop_vinfo)
9702 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9703 0, vect_epilogue);
9704 return true;
9707 /* Use the lhs of the original scalar statement. */
9708 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9709 if (dump_enabled_p ())
9710 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9711 "stmt %G", stmt);
9713 lhs = gimple_get_lhs (stmt);
9714 lhs_type = TREE_TYPE (lhs);
9716 bitsize = vector_element_bits_tree (vectype);
9718 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9719 tree vec_lhs, bitstart;
9720 gimple *vec_stmt;
9721 if (slp_node)
9723 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9725 /* Get the correct slp vectorized stmt. */
9726 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9727 vec_lhs = gimple_get_lhs (vec_stmt);
9729 /* Get entry to use. */
9730 bitstart = bitsize_int (vec_index);
9731 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9733 else
9735 /* For multiple copies, get the last copy. */
9736 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9737 vec_lhs = gimple_get_lhs (vec_stmt);
9739 /* Get the last lane in the vector. */
9740 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9743 if (loop_vinfo)
9745 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
9746 requirement, insert one phi node for it. It looks like:
9747 loop;
9749 # lhs' = PHI <lhs>
9751 loop;
9753 # vec_lhs' = PHI <vec_lhs>
9754 new_tree = lane_extract <vec_lhs', ...>;
9755 lhs' = new_tree; */
9757 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9758 basic_block exit_bb = single_exit (loop)->dest;
9759 gcc_assert (single_pred_p (exit_bb));
9761 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9762 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9763 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
9765 gimple_seq stmts = NULL;
9766 tree new_tree;
9767 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9769 /* Emit:
9771 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9773 where VEC_LHS is the vectorized live-out result and MASK is
9774 the loop mask for the final iteration. */
9775 gcc_assert (ncopies == 1 && !slp_node);
9776 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
9777 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
9778 1, vectype, 0);
9779 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
9780 mask, vec_lhs_phi);
9782 /* Convert the extracted vector element to the scalar type. */
9783 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9785 else
9787 tree bftype = TREE_TYPE (vectype);
9788 if (VECTOR_BOOLEAN_TYPE_P (vectype))
9789 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9790 new_tree = build3 (BIT_FIELD_REF, bftype,
9791 vec_lhs_phi, bitsize, bitstart);
9792 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9793 &stmts, true, NULL_TREE);
9796 if (stmts)
9798 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
9799 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
9801 /* Remove existing phi from lhs and create one copy from new_tree. */
9802 tree lhs_phi = NULL_TREE;
9803 gimple_stmt_iterator gsi;
9804 for (gsi = gsi_start_phis (exit_bb);
9805 !gsi_end_p (gsi); gsi_next (&gsi))
9807 gimple *phi = gsi_stmt (gsi);
9808 if ((gimple_phi_arg_def (phi, 0) == lhs))
9810 remove_phi_node (&gsi, false);
9811 lhs_phi = gimple_phi_result (phi);
9812 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
9813 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
9814 break;
9819 /* Replace use of lhs with newly computed result. If the use stmt is a
9820 single arg PHI, just replace all uses of PHI result. It's necessary
9821 because lcssa PHI defining lhs may be before newly inserted stmt. */
9822 use_operand_p use_p;
9823 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9824 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
9825 && !is_gimple_debug (use_stmt))
9827 if (gimple_code (use_stmt) == GIMPLE_PHI
9828 && gimple_phi_num_args (use_stmt) == 1)
9830 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
9832 else
9834 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9835 SET_USE (use_p, new_tree);
9837 update_stmt (use_stmt);
9840 else
9842 /* For basic-block vectorization simply insert the lane-extraction. */
9843 tree bftype = TREE_TYPE (vectype);
9844 if (VECTOR_BOOLEAN_TYPE_P (vectype))
9845 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9846 tree new_tree = build3 (BIT_FIELD_REF, bftype,
9847 vec_lhs, bitsize, bitstart);
9848 gimple_seq stmts = NULL;
9849 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9850 &stmts, true, NULL_TREE);
9851 if (TREE_CODE (new_tree) == SSA_NAME
9852 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
9853 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
9854 if (is_a <gphi *> (vec_stmt))
9856 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
9857 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9859 else
9861 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
9862 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9865 /* Replace use of lhs with newly computed result. If the use stmt is a
9866 single arg PHI, just replace all uses of PHI result. It's necessary
9867 because lcssa PHI defining lhs may be before newly inserted stmt. */
9868 use_operand_p use_p;
9869 stmt_vec_info use_stmt_info;
9870 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
9871 if (!is_gimple_debug (use_stmt)
9872 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
9873 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
9875 /* ??? This can happen when the live lane ends up being
9876 used in a vector construction code-generated by an
9877 external SLP node (and code-generation for that already
9878 happened). See gcc.dg/vect/bb-slp-47.c.
9879 Doing this is what would happen if that vector CTOR
9880 were not code-generated yet so it is not too bad.
9881 ??? In fact we'd likely want to avoid this situation
9882 in the first place. */
9883 if (TREE_CODE (new_tree) == SSA_NAME
9884 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9885 && gimple_code (use_stmt) != GIMPLE_PHI
9886 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
9887 use_stmt))
9889 enum tree_code code = gimple_assign_rhs_code (use_stmt);
9890 gcc_assert (code == CONSTRUCTOR
9891 || code == VIEW_CONVERT_EXPR
9892 || CONVERT_EXPR_CODE_P (code));
9893 if (dump_enabled_p ())
9894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9895 "Using original scalar computation for "
9896 "live lane because use preceeds vector "
9897 "def\n");
9898 continue;
9900 /* ??? It can also happen that we end up pulling a def into
9901 a loop where replacing out-of-loop uses would require
9902 a new LC SSA PHI node. Retain the original scalar in
9903 those cases as well. PR98064. */
9904 if (TREE_CODE (new_tree) == SSA_NAME
9905 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
9906 && (gimple_bb (use_stmt)->loop_father
9907 != gimple_bb (vec_stmt)->loop_father)
9908 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
9909 gimple_bb (use_stmt)->loop_father))
9911 if (dump_enabled_p ())
9912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9913 "Using original scalar computation for "
9914 "live lane because there is an out-of-loop "
9915 "definition for it\n");
9916 continue;
9918 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
9919 SET_USE (use_p, new_tree);
9920 update_stmt (use_stmt);
9924 return true;
9927 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
9929 static void
9930 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
9932 ssa_op_iter op_iter;
9933 imm_use_iterator imm_iter;
9934 def_operand_p def_p;
9935 gimple *ustmt;
9937 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
9939 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
9941 basic_block bb;
9943 if (!is_gimple_debug (ustmt))
9944 continue;
9946 bb = gimple_bb (ustmt);
9948 if (!flow_bb_inside_loop_p (loop, bb))
9950 if (gimple_debug_bind_p (ustmt))
9952 if (dump_enabled_p ())
9953 dump_printf_loc (MSG_NOTE, vect_location,
9954 "killing debug use\n");
9956 gimple_debug_bind_reset_value (ustmt);
9957 update_stmt (ustmt);
9959 else
9960 gcc_unreachable ();
9966 /* Given loop represented by LOOP_VINFO, return true if computation of
9967 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
9968 otherwise. */
9970 static bool
9971 loop_niters_no_overflow (loop_vec_info loop_vinfo)
9973 /* Constant case. */
9974 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
9976 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
9977 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
9979 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
9980 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
9981 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
9982 return true;
9985 widest_int max;
9986 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9987 /* Check the upper bound of loop niters. */
9988 if (get_max_loop_iterations (loop, &max))
9990 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
9991 signop sgn = TYPE_SIGN (type);
9992 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
9993 if (max < type_max)
9994 return true;
9996 return false;
9999 /* Return a mask type with half the number of elements as OLD_TYPE,
10000 given that it should have mode NEW_MODE. */
10002 tree
10003 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10005 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10006 return build_truth_vector_type_for_mode (nunits, new_mode);
10009 /* Return a mask type with twice as many elements as OLD_TYPE,
10010 given that it should have mode NEW_MODE. */
10012 tree
10013 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10015 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10016 return build_truth_vector_type_for_mode (nunits, new_mode);
10019 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10020 contain a sequence of NVECTORS masks that each control a vector of type
10021 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10022 these vector masks with the vector version of SCALAR_MASK. */
10024 void
10025 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10026 unsigned int nvectors, tree vectype, tree scalar_mask)
10028 gcc_assert (nvectors != 0);
10029 if (masks->length () < nvectors)
10030 masks->safe_grow_cleared (nvectors, true);
10031 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10032 /* The number of scalars per iteration and the number of vectors are
10033 both compile-time constants. */
10034 unsigned int nscalars_per_iter
10035 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10036 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10038 if (scalar_mask)
10040 scalar_cond_masked_key cond (scalar_mask, nvectors);
10041 loop_vinfo->scalar_cond_masked_set.add (cond);
10044 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10046 rgm->max_nscalars_per_iter = nscalars_per_iter;
10047 rgm->type = truth_type_for (vectype);
10048 rgm->factor = 1;
10052 /* Given a complete set of masks MASKS, extract mask number INDEX
10053 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10054 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10056 See the comment above vec_loop_masks for more details about the mask
10057 arrangement. */
10059 tree
10060 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10061 unsigned int nvectors, tree vectype, unsigned int index)
10063 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10064 tree mask_type = rgm->type;
10066 /* Populate the rgroup's mask array, if this is the first time we've
10067 used it. */
10068 if (rgm->controls.is_empty ())
10070 rgm->controls.safe_grow_cleared (nvectors, true);
10071 for (unsigned int i = 0; i < nvectors; ++i)
10073 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10074 /* Provide a dummy definition until the real one is available. */
10075 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10076 rgm->controls[i] = mask;
10080 tree mask = rgm->controls[index];
10081 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10082 TYPE_VECTOR_SUBPARTS (vectype)))
10084 /* A loop mask for data type X can be reused for data type Y
10085 if X has N times more elements than Y and if Y's elements
10086 are N times bigger than X's. In this case each sequence
10087 of N elements in the loop mask will be all-zero or all-one.
10088 We can then view-convert the mask so that each sequence of
10089 N elements is replaced by a single element. */
10090 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10091 TYPE_VECTOR_SUBPARTS (vectype)));
10092 gimple_seq seq = NULL;
10093 mask_type = truth_type_for (vectype);
10094 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10095 if (seq)
10096 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10098 return mask;
10101 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10102 lengths for controlling an operation on VECTYPE. The operation splits
10103 each element of VECTYPE into FACTOR separate subelements, measuring the
10104 length as a number of these subelements. */
10106 void
10107 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10108 unsigned int nvectors, tree vectype, unsigned int factor)
10110 gcc_assert (nvectors != 0);
10111 if (lens->length () < nvectors)
10112 lens->safe_grow_cleared (nvectors, true);
10113 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10115 /* The number of scalars per iteration, scalar occupied bytes and
10116 the number of vectors are both compile-time constants. */
10117 unsigned int nscalars_per_iter
10118 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10119 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10121 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10123 /* For now, we only support cases in which all loads and stores fall back
10124 to VnQI or none do. */
10125 gcc_assert (!rgl->max_nscalars_per_iter
10126 || (rgl->factor == 1 && factor == 1)
10127 || (rgl->max_nscalars_per_iter * rgl->factor
10128 == nscalars_per_iter * factor));
10129 rgl->max_nscalars_per_iter = nscalars_per_iter;
10130 rgl->type = vectype;
10131 rgl->factor = factor;
10135 /* Given a complete set of length LENS, extract length number INDEX for an
10136 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
10138 tree
10139 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10140 unsigned int nvectors, unsigned int index)
10142 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10143 bool use_bias_adjusted_len =
10144 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10146 /* Populate the rgroup's len array, if this is the first time we've
10147 used it. */
10148 if (rgl->controls.is_empty ())
10150 rgl->controls.safe_grow_cleared (nvectors, true);
10151 for (unsigned int i = 0; i < nvectors; ++i)
10153 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10154 gcc_assert (len_type != NULL_TREE);
10156 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10158 /* Provide a dummy definition until the real one is available. */
10159 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10160 rgl->controls[i] = len;
10162 if (use_bias_adjusted_len)
10164 gcc_assert (i == 0);
10165 tree adjusted_len =
10166 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10167 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10168 rgl->bias_adjusted_ctrl = adjusted_len;
10173 if (use_bias_adjusted_len)
10174 return rgl->bias_adjusted_ctrl;
10175 else
10176 return rgl->controls[index];
10179 /* Scale profiling counters by estimation for LOOP which is vectorized
10180 by factor VF. */
10182 static void
10183 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10185 edge preheader = loop_preheader_edge (loop);
10186 /* Reduce loop iterations by the vectorization factor. */
10187 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10188 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10190 if (freq_h.nonzero_p ())
10192 profile_probability p;
10194 /* Avoid dropping loop body profile counter to 0 because of zero count
10195 in loop's preheader. */
10196 if (!(freq_e == profile_count::zero ()))
10197 freq_e = freq_e.force_nonzero ();
10198 p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10199 scale_loop_frequencies (loop, p);
10202 edge exit_e = single_exit (loop);
10203 exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10205 edge exit_l = single_pred_edge (loop->latch);
10206 profile_probability prob = exit_l->probability;
10207 exit_l->probability = exit_e->probability.invert ();
10208 if (prob.initialized_p () && exit_l->probability.initialized_p ())
10209 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10212 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10213 latch edge values originally defined by it. */
10215 static void
10216 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10217 stmt_vec_info def_stmt_info)
10219 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10220 if (!def || TREE_CODE (def) != SSA_NAME)
10221 return;
10222 stmt_vec_info phi_info;
10223 imm_use_iterator iter;
10224 use_operand_p use_p;
10225 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10226 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
10227 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10228 && (phi_info = loop_vinfo->lookup_stmt (phi))
10229 && STMT_VINFO_RELEVANT_P (phi_info)
10230 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10231 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10232 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10234 loop_p loop = gimple_bb (phi)->loop_father;
10235 edge e = loop_latch_edge (loop);
10236 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
10238 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10239 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10240 gcc_assert (phi_defs.length () == latch_defs.length ());
10241 for (unsigned i = 0; i < phi_defs.length (); ++i)
10242 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10243 gimple_get_lhs (latch_defs[i]), e,
10244 gimple_phi_arg_location (phi, e->dest_idx));
10249 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10250 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10251 stmt_vec_info. */
10253 static bool
10254 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10255 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10257 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10258 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10260 if (dump_enabled_p ())
10261 dump_printf_loc (MSG_NOTE, vect_location,
10262 "------>vectorizing statement: %G", stmt_info->stmt);
10264 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10265 vect_loop_kill_debug_uses (loop, stmt_info);
10267 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10268 && !STMT_VINFO_LIVE_P (stmt_info))
10269 return false;
10271 if (STMT_VINFO_VECTYPE (stmt_info))
10273 poly_uint64 nunits
10274 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10275 if (!STMT_SLP_TYPE (stmt_info)
10276 && maybe_ne (nunits, vf)
10277 && dump_enabled_p ())
10278 /* For SLP VF is set according to unrolling factor, and not
10279 to vector size, hence for SLP this print is not valid. */
10280 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10283 /* Pure SLP statements have already been vectorized. We still need
10284 to apply loop vectorization to hybrid SLP statements. */
10285 if (PURE_SLP_STMT (stmt_info))
10286 return false;
10288 if (dump_enabled_p ())
10289 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10291 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10292 *seen_store = stmt_info;
10294 return true;
10297 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10298 in the hash_map with its corresponding values. */
10300 static tree
10301 find_in_mapping (tree t, void *context)
10303 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10305 tree *value = mapping->get (t);
10306 return value ? *value : t;
10309 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10310 original loop that has now been vectorized.
10312 The inits of the data_references need to be advanced with the number of
10313 iterations of the main loop. This has been computed in vect_do_peeling and
10314 is stored in parameter ADVANCE. We first restore the data_references
10315 initial offset with the values recorded in ORIG_DRS_INIT.
10317 Since the loop_vec_info of this EPILOGUE was constructed for the original
10318 loop, its stmt_vec_infos all point to the original statements. These need
10319 to be updated to point to their corresponding copies as well as the SSA_NAMES
10320 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10322 The data_reference's connections also need to be updated. Their
10323 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10324 stmt_vec_infos, their statements need to point to their corresponding copy,
10325 if they are gather loads or scatter stores then their reference needs to be
10326 updated to point to its corresponding copy and finally we set
10327 'base_misaligned' to false as we have already peeled for alignment in the
10328 prologue of the main loop. */
10330 static void
10331 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10333 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10334 auto_vec<gimple *> stmt_worklist;
10335 hash_map<tree,tree> mapping;
10336 gimple *orig_stmt, *new_stmt;
10337 gimple_stmt_iterator epilogue_gsi;
10338 gphi_iterator epilogue_phi_gsi;
10339 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10340 basic_block *epilogue_bbs = get_loop_body (epilogue);
10341 unsigned i;
10343 free (LOOP_VINFO_BBS (epilogue_vinfo));
10344 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10346 /* Advance data_reference's with the number of iterations of the previous
10347 loop and its prologue. */
10348 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10351 /* The EPILOGUE loop is a copy of the original loop so they share the same
10352 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10353 point to the copied statements. We also create a mapping of all LHS' in
10354 the original loop and all the LHS' in the EPILOGUE and create worklists to
10355 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
10356 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10358 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10359 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10361 new_stmt = epilogue_phi_gsi.phi ();
10363 gcc_assert (gimple_uid (new_stmt) > 0);
10364 stmt_vinfo
10365 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10367 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10368 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10370 mapping.put (gimple_phi_result (orig_stmt),
10371 gimple_phi_result (new_stmt));
10372 /* PHI nodes can not have patterns or related statements. */
10373 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10374 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10377 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10378 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10380 new_stmt = gsi_stmt (epilogue_gsi);
10381 if (is_gimple_debug (new_stmt))
10382 continue;
10384 gcc_assert (gimple_uid (new_stmt) > 0);
10385 stmt_vinfo
10386 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10388 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10389 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10391 if (tree old_lhs = gimple_get_lhs (orig_stmt))
10392 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10394 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10396 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10397 for (gimple_stmt_iterator gsi = gsi_start (seq);
10398 !gsi_end_p (gsi); gsi_next (&gsi))
10399 stmt_worklist.safe_push (gsi_stmt (gsi));
10402 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10403 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10405 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10406 stmt_worklist.safe_push (stmt);
10407 /* Set BB such that the assert in
10408 'get_initial_def_for_reduction' is able to determine that
10409 the BB of the related stmt is inside this loop. */
10410 gimple_set_bb (stmt,
10411 gimple_bb (new_stmt));
10412 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10413 gcc_assert (related_vinfo == NULL
10414 || related_vinfo == stmt_vinfo);
10419 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10420 using the original main loop and thus need to be updated to refer to the
10421 cloned variables used in the epilogue. */
10422 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10424 gimple *stmt = stmt_worklist[i];
10425 tree *new_op;
10427 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10429 tree op = gimple_op (stmt, j);
10430 if ((new_op = mapping.get(op)))
10431 gimple_set_op (stmt, j, *new_op);
10432 else
10434 /* PR92429: The last argument of simplify_replace_tree disables
10435 folding when replacing arguments. This is required as
10436 otherwise you might end up with different statements than the
10437 ones analyzed in vect_loop_analyze, leading to different
10438 vectorization. */
10439 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10440 &find_in_mapping, &mapping, false);
10441 gimple_set_op (stmt, j, op);
10446 struct data_reference *dr;
10447 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10448 FOR_EACH_VEC_ELT (datarefs, i, dr)
10450 orig_stmt = DR_STMT (dr);
10451 gcc_assert (gimple_uid (orig_stmt) > 0);
10452 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10453 /* Data references for gather loads and scatter stores do not use the
10454 updated offset we set using ADVANCE. Instead we have to make sure the
10455 reference in the data references point to the corresponding copy of
10456 the original in the epilogue. */
10457 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10458 == VMAT_GATHER_SCATTER)
10460 DR_REF (dr)
10461 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10462 &find_in_mapping, &mapping);
10463 DR_BASE_ADDRESS (dr)
10464 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10465 &find_in_mapping, &mapping);
10467 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10468 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10469 /* The vector size of the epilogue is smaller than that of the main loop
10470 so the alignment is either the same or lower. This means the dr will
10471 thus by definition be aligned. */
10472 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10475 epilogue_vinfo->shared->datarefs_copy.release ();
10476 epilogue_vinfo->shared->save_datarefs ();
10479 /* Function vect_transform_loop.
10481 The analysis phase has determined that the loop is vectorizable.
10482 Vectorize the loop - created vectorized stmts to replace the scalar
10483 stmts in the loop, and update the loop exit condition.
10484 Returns scalar epilogue loop if any. */
10486 class loop *
10487 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10489 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10490 class loop *epilogue = NULL;
10491 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10492 int nbbs = loop->num_nodes;
10493 int i;
10494 tree niters_vector = NULL_TREE;
10495 tree step_vector = NULL_TREE;
10496 tree niters_vector_mult_vf = NULL_TREE;
10497 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10498 unsigned int lowest_vf = constant_lower_bound (vf);
10499 gimple *stmt;
10500 bool check_profitability = false;
10501 unsigned int th;
10503 DUMP_VECT_SCOPE ("vec_transform_loop");
10505 loop_vinfo->shared->check_datarefs ();
10507 /* Use the more conservative vectorization threshold. If the number
10508 of iterations is constant assume the cost check has been performed
10509 by our caller. If the threshold makes all loops profitable that
10510 run at least the (estimated) vectorization factor number of times
10511 checking is pointless, too. */
10512 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10513 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10515 if (dump_enabled_p ())
10516 dump_printf_loc (MSG_NOTE, vect_location,
10517 "Profitability threshold is %d loop iterations.\n",
10518 th);
10519 check_profitability = true;
10522 /* Make sure there exists a single-predecessor exit bb. Do this before
10523 versioning. */
10524 edge e = single_exit (loop);
10525 if (! single_pred_p (e->dest))
10527 split_loop_exit_edge (e, true);
10528 if (dump_enabled_p ())
10529 dump_printf (MSG_NOTE, "split exit edge\n");
10532 /* Version the loop first, if required, so the profitability check
10533 comes first. */
10535 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10537 class loop *sloop
10538 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10539 sloop->force_vectorize = false;
10540 check_profitability = false;
10543 /* Make sure there exists a single-predecessor exit bb also on the
10544 scalar loop copy. Do this after versioning but before peeling
10545 so CFG structure is fine for both scalar and if-converted loop
10546 to make slpeel_duplicate_current_defs_from_edges face matched
10547 loop closed PHI nodes on the exit. */
10548 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10550 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10551 if (! single_pred_p (e->dest))
10553 split_loop_exit_edge (e, true);
10554 if (dump_enabled_p ())
10555 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10559 tree niters = vect_build_loop_niters (loop_vinfo);
10560 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10561 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10562 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10563 tree advance;
10564 drs_init_vec orig_drs_init;
10566 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10567 &step_vector, &niters_vector_mult_vf, th,
10568 check_profitability, niters_no_overflow,
10569 &advance);
10571 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10572 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10573 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10574 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10576 if (niters_vector == NULL_TREE)
10578 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10579 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10580 && known_eq (lowest_vf, vf))
10582 niters_vector
10583 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10584 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10585 step_vector = build_one_cst (TREE_TYPE (niters));
10587 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10588 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10589 &step_vector, niters_no_overflow);
10590 else
10591 /* vect_do_peeling subtracted the number of peeled prologue
10592 iterations from LOOP_VINFO_NITERS. */
10593 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10594 &niters_vector, &step_vector,
10595 niters_no_overflow);
10598 /* 1) Make sure the loop header has exactly two entries
10599 2) Make sure we have a preheader basic block. */
10601 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10603 split_edge (loop_preheader_edge (loop));
10605 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10606 /* This will deal with any possible peeling. */
10607 vect_prepare_for_masked_peels (loop_vinfo);
10609 /* Schedule the SLP instances first, then handle loop vectorization
10610 below. */
10611 if (!loop_vinfo->slp_instances.is_empty ())
10613 DUMP_VECT_SCOPE ("scheduling SLP instances");
10614 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10617 /* FORNOW: the vectorizer supports only loops which body consist
10618 of one basic block (header + empty latch). When the vectorizer will
10619 support more involved loop forms, the order by which the BBs are
10620 traversed need to be reconsidered. */
10622 for (i = 0; i < nbbs; i++)
10624 basic_block bb = bbs[i];
10625 stmt_vec_info stmt_info;
10627 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10628 gsi_next (&si))
10630 gphi *phi = si.phi ();
10631 if (dump_enabled_p ())
10632 dump_printf_loc (MSG_NOTE, vect_location,
10633 "------>vectorizing phi: %G", (gimple *) phi);
10634 stmt_info = loop_vinfo->lookup_stmt (phi);
10635 if (!stmt_info)
10636 continue;
10638 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10639 vect_loop_kill_debug_uses (loop, stmt_info);
10641 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10642 && !STMT_VINFO_LIVE_P (stmt_info))
10643 continue;
10645 if (STMT_VINFO_VECTYPE (stmt_info)
10646 && (maybe_ne
10647 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10648 && dump_enabled_p ())
10649 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10651 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10652 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10653 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10654 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10655 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10656 && ! PURE_SLP_STMT (stmt_info))
10658 if (dump_enabled_p ())
10659 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10660 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10664 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10665 gsi_next (&si))
10667 gphi *phi = si.phi ();
10668 stmt_info = loop_vinfo->lookup_stmt (phi);
10669 if (!stmt_info)
10670 continue;
10672 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10673 && !STMT_VINFO_LIVE_P (stmt_info))
10674 continue;
10676 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10677 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10678 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10679 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10680 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10681 && ! PURE_SLP_STMT (stmt_info))
10682 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10685 for (gimple_stmt_iterator si = gsi_start_bb (bb);
10686 !gsi_end_p (si);)
10688 stmt = gsi_stmt (si);
10689 /* During vectorization remove existing clobber stmts. */
10690 if (gimple_clobber_p (stmt))
10692 unlink_stmt_vdef (stmt);
10693 gsi_remove (&si, true);
10694 release_defs (stmt);
10696 else
10698 /* Ignore vector stmts created in the outer loop. */
10699 stmt_info = loop_vinfo->lookup_stmt (stmt);
10701 /* vector stmts created in the outer-loop during vectorization of
10702 stmts in an inner-loop may not have a stmt_info, and do not
10703 need to be vectorized. */
10704 stmt_vec_info seen_store = NULL;
10705 if (stmt_info)
10707 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10709 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10710 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10711 !gsi_end_p (subsi); gsi_next (&subsi))
10713 stmt_vec_info pat_stmt_info
10714 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10715 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10716 &si, &seen_store);
10718 stmt_vec_info pat_stmt_info
10719 = STMT_VINFO_RELATED_STMT (stmt_info);
10720 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10721 &si, &seen_store))
10722 maybe_set_vectorized_backedge_value (loop_vinfo,
10723 pat_stmt_info);
10725 else
10727 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
10728 &seen_store))
10729 maybe_set_vectorized_backedge_value (loop_vinfo,
10730 stmt_info);
10733 gsi_next (&si);
10734 if (seen_store)
10736 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
10737 /* Interleaving. If IS_STORE is TRUE, the
10738 vectorization of the interleaving chain was
10739 completed - free all the stores in the chain. */
10740 vect_remove_stores (loop_vinfo,
10741 DR_GROUP_FIRST_ELEMENT (seen_store));
10742 else
10743 /* Free the attached stmt_vec_info and remove the stmt. */
10744 loop_vinfo->remove_stmt (stmt_info);
10749 /* Stub out scalar statements that must not survive vectorization.
10750 Doing this here helps with grouped statements, or statements that
10751 are involved in patterns. */
10752 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
10753 !gsi_end_p (gsi); gsi_next (&gsi))
10755 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
10756 if (!call || !gimple_call_internal_p (call))
10757 continue;
10758 internal_fn ifn = gimple_call_internal_fn (call);
10759 if (ifn == IFN_MASK_LOAD)
10761 tree lhs = gimple_get_lhs (call);
10762 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10764 tree zero = build_zero_cst (TREE_TYPE (lhs));
10765 gimple *new_stmt = gimple_build_assign (lhs, zero);
10766 gsi_replace (&gsi, new_stmt, true);
10769 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
10771 tree lhs = gimple_get_lhs (call);
10772 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10774 tree else_arg
10775 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
10776 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
10777 gsi_replace (&gsi, new_stmt, true);
10781 } /* BBs in loop */
10783 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
10784 a zero NITERS becomes a nonzero NITERS_VECTOR. */
10785 if (integer_onep (step_vector))
10786 niters_no_overflow = true;
10787 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
10788 niters_vector_mult_vf, !niters_no_overflow);
10790 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
10791 scale_profile_for_vect_loop (loop, assumed_vf);
10793 /* True if the final iteration might not handle a full vector's
10794 worth of scalar iterations. */
10795 bool final_iter_may_be_partial
10796 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
10797 /* The minimum number of iterations performed by the epilogue. This
10798 is 1 when peeling for gaps because we always need a final scalar
10799 iteration. */
10800 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
10801 /* +1 to convert latch counts to loop iteration counts,
10802 -min_epilogue_iters to remove iterations that cannot be performed
10803 by the vector code. */
10804 int bias_for_lowest = 1 - min_epilogue_iters;
10805 int bias_for_assumed = bias_for_lowest;
10806 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
10807 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
10809 /* When the amount of peeling is known at compile time, the first
10810 iteration will have exactly alignment_npeels active elements.
10811 In the worst case it will have at least one. */
10812 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
10813 bias_for_lowest += lowest_vf - min_first_active;
10814 bias_for_assumed += assumed_vf - min_first_active;
10816 /* In these calculations the "- 1" converts loop iteration counts
10817 back to latch counts. */
10818 if (loop->any_upper_bound)
10820 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
10821 loop->nb_iterations_upper_bound
10822 = (final_iter_may_be_partial
10823 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
10824 lowest_vf) - 1
10825 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
10826 lowest_vf) - 1);
10827 if (main_vinfo
10828 /* Both peeling for alignment and peeling for gaps can end up
10829 with the scalar epilogue running for more than VF-1 iterations. */
10830 && !main_vinfo->peeling_for_alignment
10831 && !main_vinfo->peeling_for_gaps)
10833 unsigned int bound;
10834 poly_uint64 main_iters
10835 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
10836 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
10837 main_iters
10838 = upper_bound (main_iters,
10839 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
10840 if (can_div_away_from_zero_p (main_iters,
10841 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
10842 &bound))
10843 loop->nb_iterations_upper_bound
10844 = wi::umin ((widest_int) (bound - 1),
10845 loop->nb_iterations_upper_bound);
10848 if (loop->any_likely_upper_bound)
10849 loop->nb_iterations_likely_upper_bound
10850 = (final_iter_may_be_partial
10851 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
10852 + bias_for_lowest, lowest_vf) - 1
10853 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
10854 + bias_for_lowest, lowest_vf) - 1);
10855 if (loop->any_estimate)
10856 loop->nb_iterations_estimate
10857 = (final_iter_may_be_partial
10858 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
10859 assumed_vf) - 1
10860 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
10861 assumed_vf) - 1);
10863 if (dump_enabled_p ())
10865 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
10867 dump_printf_loc (MSG_NOTE, vect_location,
10868 "LOOP VECTORIZED\n");
10869 if (loop->inner)
10870 dump_printf_loc (MSG_NOTE, vect_location,
10871 "OUTER LOOP VECTORIZED\n");
10872 dump_printf (MSG_NOTE, "\n");
10874 else
10875 dump_printf_loc (MSG_NOTE, vect_location,
10876 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
10877 GET_MODE_NAME (loop_vinfo->vector_mode));
10880 /* Loops vectorized with a variable factor won't benefit from
10881 unrolling/peeling. */
10882 if (!vf.is_constant ())
10884 loop->unroll = 1;
10885 if (dump_enabled_p ())
10886 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
10887 " variable-length vectorization factor\n");
10889 /* Free SLP instances here because otherwise stmt reference counting
10890 won't work. */
10891 slp_instance instance;
10892 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
10893 vect_free_slp_instance (instance);
10894 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
10895 /* Clear-up safelen field since its value is invalid after vectorization
10896 since vectorized loop can have loop-carried dependencies. */
10897 loop->safelen = 0;
10899 if (epilogue)
10901 update_epilogue_loop_vinfo (epilogue, advance);
10903 epilogue->simduid = loop->simduid;
10904 epilogue->force_vectorize = loop->force_vectorize;
10905 epilogue->dont_vectorize = false;
10908 return epilogue;
10911 /* The code below is trying to perform simple optimization - revert
10912 if-conversion for masked stores, i.e. if the mask of a store is zero
10913 do not perform it and all stored value producers also if possible.
10914 For example,
10915 for (i=0; i<n; i++)
10916 if (c[i])
10918 p1[i] += 1;
10919 p2[i] = p3[i] +2;
10921 this transformation will produce the following semi-hammock:
10923 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
10925 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
10926 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
10927 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
10928 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
10929 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
10930 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
10934 void
10935 optimize_mask_stores (class loop *loop)
10937 basic_block *bbs = get_loop_body (loop);
10938 unsigned nbbs = loop->num_nodes;
10939 unsigned i;
10940 basic_block bb;
10941 class loop *bb_loop;
10942 gimple_stmt_iterator gsi;
10943 gimple *stmt;
10944 auto_vec<gimple *> worklist;
10945 auto_purge_vect_location sentinel;
10947 vect_location = find_loop_location (loop);
10948 /* Pick up all masked stores in loop if any. */
10949 for (i = 0; i < nbbs; i++)
10951 bb = bbs[i];
10952 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
10953 gsi_next (&gsi))
10955 stmt = gsi_stmt (gsi);
10956 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
10957 worklist.safe_push (stmt);
10961 free (bbs);
10962 if (worklist.is_empty ())
10963 return;
10965 /* Loop has masked stores. */
10966 while (!worklist.is_empty ())
10968 gimple *last, *last_store;
10969 edge e, efalse;
10970 tree mask;
10971 basic_block store_bb, join_bb;
10972 gimple_stmt_iterator gsi_to;
10973 tree vdef, new_vdef;
10974 gphi *phi;
10975 tree vectype;
10976 tree zero;
10978 last = worklist.pop ();
10979 mask = gimple_call_arg (last, 2);
10980 bb = gimple_bb (last);
10981 /* Create then_bb and if-then structure in CFG, then_bb belongs to
10982 the same loop as if_bb. It could be different to LOOP when two
10983 level loop-nest is vectorized and mask_store belongs to the inner
10984 one. */
10985 e = split_block (bb, last);
10986 bb_loop = bb->loop_father;
10987 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
10988 join_bb = e->dest;
10989 store_bb = create_empty_bb (bb);
10990 add_bb_to_loop (store_bb, bb_loop);
10991 e->flags = EDGE_TRUE_VALUE;
10992 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
10993 /* Put STORE_BB to likely part. */
10994 efalse->probability = profile_probability::unlikely ();
10995 store_bb->count = efalse->count ();
10996 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
10997 if (dom_info_available_p (CDI_DOMINATORS))
10998 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
10999 if (dump_enabled_p ())
11000 dump_printf_loc (MSG_NOTE, vect_location,
11001 "Create new block %d to sink mask stores.",
11002 store_bb->index);
11003 /* Create vector comparison with boolean result. */
11004 vectype = TREE_TYPE (mask);
11005 zero = build_zero_cst (vectype);
11006 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11007 gsi = gsi_last_bb (bb);
11008 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11009 /* Create new PHI node for vdef of the last masked store:
11010 .MEM_2 = VDEF <.MEM_1>
11011 will be converted to
11012 .MEM.3 = VDEF <.MEM_1>
11013 and new PHI node will be created in join bb
11014 .MEM_2 = PHI <.MEM_1, .MEM_3>
11016 vdef = gimple_vdef (last);
11017 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11018 gimple_set_vdef (last, new_vdef);
11019 phi = create_phi_node (vdef, join_bb);
11020 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11022 /* Put all masked stores with the same mask to STORE_BB if possible. */
11023 while (true)
11025 gimple_stmt_iterator gsi_from;
11026 gimple *stmt1 = NULL;
11028 /* Move masked store to STORE_BB. */
11029 last_store = last;
11030 gsi = gsi_for_stmt (last);
11031 gsi_from = gsi;
11032 /* Shift GSI to the previous stmt for further traversal. */
11033 gsi_prev (&gsi);
11034 gsi_to = gsi_start_bb (store_bb);
11035 gsi_move_before (&gsi_from, &gsi_to);
11036 /* Setup GSI_TO to the non-empty block start. */
11037 gsi_to = gsi_start_bb (store_bb);
11038 if (dump_enabled_p ())
11039 dump_printf_loc (MSG_NOTE, vect_location,
11040 "Move stmt to created bb\n%G", last);
11041 /* Move all stored value producers if possible. */
11042 while (!gsi_end_p (gsi))
11044 tree lhs;
11045 imm_use_iterator imm_iter;
11046 use_operand_p use_p;
11047 bool res;
11049 /* Skip debug statements. */
11050 if (is_gimple_debug (gsi_stmt (gsi)))
11052 gsi_prev (&gsi);
11053 continue;
11055 stmt1 = gsi_stmt (gsi);
11056 /* Do not consider statements writing to memory or having
11057 volatile operand. */
11058 if (gimple_vdef (stmt1)
11059 || gimple_has_volatile_ops (stmt1))
11060 break;
11061 gsi_from = gsi;
11062 gsi_prev (&gsi);
11063 lhs = gimple_get_lhs (stmt1);
11064 if (!lhs)
11065 break;
11067 /* LHS of vectorized stmt must be SSA_NAME. */
11068 if (TREE_CODE (lhs) != SSA_NAME)
11069 break;
11071 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11073 /* Remove dead scalar statement. */
11074 if (has_zero_uses (lhs))
11076 gsi_remove (&gsi_from, true);
11077 continue;
11081 /* Check that LHS does not have uses outside of STORE_BB. */
11082 res = true;
11083 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11085 gimple *use_stmt;
11086 use_stmt = USE_STMT (use_p);
11087 if (is_gimple_debug (use_stmt))
11088 continue;
11089 if (gimple_bb (use_stmt) != store_bb)
11091 res = false;
11092 break;
11095 if (!res)
11096 break;
11098 if (gimple_vuse (stmt1)
11099 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11100 break;
11102 /* Can move STMT1 to STORE_BB. */
11103 if (dump_enabled_p ())
11104 dump_printf_loc (MSG_NOTE, vect_location,
11105 "Move stmt to created bb\n%G", stmt1);
11106 gsi_move_before (&gsi_from, &gsi_to);
11107 /* Shift GSI_TO for further insertion. */
11108 gsi_prev (&gsi_to);
11110 /* Put other masked stores with the same mask to STORE_BB. */
11111 if (worklist.is_empty ()
11112 || gimple_call_arg (worklist.last (), 2) != mask
11113 || worklist.last () != stmt1)
11114 break;
11115 last = worklist.pop ();
11117 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11121 /* Decide whether it is possible to use a zero-based induction variable
11122 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11123 the value that the induction variable must be able to hold in order
11124 to ensure that the rgroups eventually have no active vector elements.
11125 Return -1 otherwise. */
11127 widest_int
11128 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11130 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11131 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11132 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11134 /* Calculate the value that the induction variable must be able
11135 to hit in order to ensure that we end the loop with an all-false mask.
11136 This involves adding the maximum number of inactive trailing scalar
11137 iterations. */
11138 widest_int iv_limit = -1;
11139 if (max_loop_iterations (loop, &iv_limit))
11141 if (niters_skip)
11143 /* Add the maximum number of skipped iterations to the
11144 maximum iteration count. */
11145 if (TREE_CODE (niters_skip) == INTEGER_CST)
11146 iv_limit += wi::to_widest (niters_skip);
11147 else
11148 iv_limit += max_vf - 1;
11150 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11151 /* Make a conservatively-correct assumption. */
11152 iv_limit += max_vf - 1;
11154 /* IV_LIMIT is the maximum number of latch iterations, which is also
11155 the maximum in-range IV value. Round this value down to the previous
11156 vector alignment boundary and then add an extra full iteration. */
11157 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11158 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11160 return iv_limit;
11163 /* For the given rgroup_controls RGC, check whether an induction variable
11164 would ever hit a value that produces a set of all-false masks or zero
11165 lengths before wrapping around. Return true if it's possible to wrap
11166 around before hitting the desirable value, otherwise return false. */
11168 bool
11169 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11171 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11173 if (iv_limit == -1)
11174 return true;
11176 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11177 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11178 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11180 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11181 return true;
11183 return false;