d: Merge upstream dmd, druntime 26f049fb26, phobos 330d6a4fd.
[official-gcc.git] / gcc / tree-vect-loop.cc
blob1fcd8d07ea1448920b9fb203d96b5f5cb0c2282a
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 #include "langhooks.h"
60 /* Loop Vectorization Pass.
62 This pass tries to vectorize loops.
64 For example, the vectorizer transforms the following simple loop:
66 short a[N]; short b[N]; short c[N]; int i;
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
72 as if it was manually vectorized by rewriting the source code into:
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
94 vectorizer currently supports are ARRAY_REFS which base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of following
117 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
120 For example, say stmt S1 was vectorized into stmt VS1:
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
136 Operands that are not SSA_NAMEs, are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
139 Target modeling:
140 =================
141 Currently the only target specific information that is used is the
142 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
143 Targets that can support different sizes of vectors, for now will need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
145 flexibility will be added in the future.
147 Since we only vectorize operations which vector form can be
148 expressed using existing tree codes, to verify that an operation is
149 supported, the vectorizer checks the relevant optab at the relevant
150 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
159 unsigned *);
160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
161 bool *, bool *, bool);
163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
164 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
165 may already be set for general statements (not just data refs). */
167 static opt_result
168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
169 bool vectype_maybe_set_p,
170 poly_uint64 *vf)
172 gimple *stmt = stmt_info->stmt;
174 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
176 || gimple_clobber_p (stmt))
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
180 return opt_result::success ();
183 tree stmt_vectype, nunits_vectype;
184 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
185 &stmt_vectype,
186 &nunits_vectype);
187 if (!res)
188 return res;
190 if (stmt_vectype)
192 if (STMT_VINFO_VECTYPE (stmt_info))
193 /* The only case when a vectype had been already set is for stmts
194 that contain a data ref, or for "pattern-stmts" (stmts generated
195 by the vectorizer to represent/replace a certain idiom). */
196 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
197 || vectype_maybe_set_p)
198 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
199 else
200 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 if (nunits_vectype)
204 vect_update_max_nunits (vf, nunits_vectype);
206 return opt_result::success ();
209 /* Subroutine of vect_determine_vectorization_factor. Set the vector
210 types of STMT_INFO and all attached pattern statements and update
211 the vectorization factor VF accordingly. Return true on success
212 or false if something prevented vectorization. */
214 static opt_result
215 vect_determine_vf_for_stmt (vec_info *vinfo,
216 stmt_vec_info stmt_info, poly_uint64 *vf)
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
222 if (!res)
223 return res;
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: %G",
239 def_stmt_info->stmt);
240 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
241 if (!res)
242 return res;
245 if (dump_enabled_p ())
246 dump_printf_loc (MSG_NOTE, vect_location,
247 "==> examining pattern statement: %G",
248 stmt_info->stmt);
249 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
250 if (!res)
251 return res;
254 return opt_result::success ();
257 /* Function vect_determine_vectorization_factor
259 Determine the vectorization factor (VF). VF is the number of data elements
260 that are operated upon in parallel in a single iteration of the vectorized
261 loop. For example, when vectorizing a loop that operates on 4byte elements,
262 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
263 elements can fit in a single vector register.
265 We currently support vectorization of loops in which all types operated upon
266 are of the same size. Therefore this function currently sets VF according to
267 the size of the types operated upon, and fails if there are multiple sizes
268 in the loop.
270 VF is also the factor by which the loop iterations are strip-mined, e.g.:
271 original loop:
272 for (i=0; i<N; i++){
273 a[i] = b[i] + c[i];
276 vectorized loop:
277 for (i=0; i<N; i+=VF){
278 a[i:VF] = b[i:VF] + c[i:VF];
282 static opt_result
283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
287 unsigned nbbs = loop->num_nodes;
288 poly_uint64 vectorization_factor = 1;
289 tree scalar_type = NULL_TREE;
290 gphi *phi;
291 tree vectype;
292 stmt_vec_info stmt_info;
293 unsigned i;
295 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
297 for (i = 0; i < nbbs; i++)
299 basic_block bb = bbs[i];
301 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
302 gsi_next (&si))
304 phi = si.phi ();
305 stmt_info = loop_vinfo->lookup_stmt (phi);
306 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
308 (gimple *) phi);
310 gcc_assert (stmt_info);
312 if (STMT_VINFO_RELEVANT_P (stmt_info)
313 || STMT_VINFO_LIVE_P (stmt_info))
315 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
316 scalar_type = TREE_TYPE (PHI_RESULT (phi));
318 if (dump_enabled_p ())
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "get vectype for scalar type: %T\n",
321 scalar_type);
323 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
324 if (!vectype)
325 return opt_result::failure_at (phi,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 STMT_VINFO_VECTYPE (stmt_info) = vectype;
331 if (dump_enabled_p ())
332 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
333 vectype);
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
338 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
339 dump_printf (MSG_NOTE, "\n");
342 vect_update_max_nunits (&vectorization_factor, vectype);
346 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
347 gsi_next (&si))
349 if (is_gimple_debug (gsi_stmt (si)))
350 continue;
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 opt_result res
353 = vect_determine_vf_for_stmt (loop_vinfo,
354 stmt_info, &vectorization_factor);
355 if (!res)
356 return res;
360 /* TODO: Analyze cost. Decide if worth while to vectorize. */
361 if (dump_enabled_p ())
363 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
364 dump_dec (MSG_NOTE, vectorization_factor);
365 dump_printf (MSG_NOTE, "\n");
368 if (known_le (vectorization_factor, 1U))
369 return opt_result::failure_at (vect_location,
370 "not vectorized: unsupported data-type\n");
371 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
372 return opt_result::success ();
376 /* Function vect_is_simple_iv_evolution.
378 FORNOW: A simple evolution of an induction variables in the loop is
379 considered a polynomial evolution. */
381 static bool
382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
383 tree * step)
385 tree init_expr;
386 tree step_expr;
387 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
388 basic_block bb;
390 /* When there is no evolution in this loop, the evolution function
391 is not "simple". */
392 if (evolution_part == NULL_TREE)
393 return false;
395 /* When the evolution is a polynomial of degree >= 2
396 the evolution function is not "simple". */
397 if (tree_is_chrec (evolution_part))
398 return false;
400 step_expr = evolution_part;
401 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
405 step_expr, init_expr);
407 *init = init_expr;
408 *step = step_expr;
410 if (TREE_CODE (step_expr) != INTEGER_CST
411 && (TREE_CODE (step_expr) != SSA_NAME
412 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
413 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
414 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
415 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
416 || !flag_associative_math)))
417 && (TREE_CODE (step_expr) != REAL_CST
418 || !flag_associative_math))
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "step unknown.\n");
423 return false;
426 return true;
429 /* Function vect_is_nonlinear_iv_evolution
431 Only support nonlinear induction for integer type
432 1. neg
433 2. mul by constant
434 3. lshift/rshift by constant.
436 For neg induction, return a fake step as integer -1. */
437 static bool
438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
439 gphi* loop_phi_node, tree *init, tree *step)
441 tree init_expr, ev_expr, result, op1, op2;
442 gimple* def;
444 if (gimple_phi_num_args (loop_phi_node) != 2)
445 return false;
447 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
448 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
450 /* Support nonlinear induction only for integer type. */
451 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
452 return false;
454 *init = init_expr;
455 result = PHI_RESULT (loop_phi_node);
457 if (TREE_CODE (ev_expr) != SSA_NAME
458 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
459 || !is_gimple_assign (def))
460 return false;
462 enum tree_code t_code = gimple_assign_rhs_code (def);
463 switch (t_code)
465 case NEGATE_EXPR:
466 if (gimple_assign_rhs1 (def) != result)
467 return false;
468 *step = build_int_cst (TREE_TYPE (init_expr), -1);
469 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
470 break;
472 case RSHIFT_EXPR:
473 case LSHIFT_EXPR:
474 case MULT_EXPR:
475 op1 = gimple_assign_rhs1 (def);
476 op2 = gimple_assign_rhs2 (def);
477 if (TREE_CODE (op2) != INTEGER_CST
478 || op1 != result)
479 return false;
480 *step = op2;
481 if (t_code == LSHIFT_EXPR)
482 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
483 else if (t_code == RSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
485 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
486 else
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
488 break;
490 default:
491 return false;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
495 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
497 return true;
500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
501 what we are assuming is a double reduction. For example, given
502 a structure like this:
504 outer1:
505 x_1 = PHI <x_4(outer2), ...>;
508 inner:
509 x_2 = PHI <x_1(outer1), ...>;
511 x_3 = ...;
514 outer2:
515 x_4 = PHI <x_3(inner)>;
518 outer loop analysis would treat x_1 as a double reduction phi and
519 this function would then return true for x_2. */
521 static bool
522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
524 use_operand_p use_p;
525 ssa_op_iter op_iter;
526 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
527 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
528 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
529 return true;
530 return false;
533 /* Returns true if Phi is a first-order recurrence. A first-order
534 recurrence is a non-reduction recurrence relation in which the value of
535 the recurrence in the current loop iteration equals a value defined in
536 the previous iteration. */
538 static bool
539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
540 gphi *phi)
542 /* A nested cycle isn't vectorizable as first order recurrence. */
543 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
544 return false;
546 /* Ensure the loop latch definition is from within the loop. */
547 edge latch = loop_latch_edge (loop);
548 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
549 if (TREE_CODE (ldef) != SSA_NAME
550 || SSA_NAME_IS_DEFAULT_DEF (ldef)
551 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
552 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
553 return false;
555 tree def = gimple_phi_result (phi);
557 /* Ensure every use_stmt of the phi node is dominated by the latch
558 definition. */
559 imm_use_iterator imm_iter;
560 use_operand_p use_p;
561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
562 if (!is_gimple_debug (USE_STMT (use_p))
563 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
564 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
565 USE_STMT (use_p))))
566 return false;
568 /* First-order recurrence autovectorization needs shuffle vector. */
569 tree scalar_type = TREE_TYPE (def);
570 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
571 if (!vectype)
572 return false;
574 return true;
577 /* Function vect_analyze_scalar_cycles_1.
579 Examine the cross iteration def-use cycles of scalar variables
580 in LOOP. LOOP_VINFO represents the loop that is now being
581 considered for vectorization (can be LOOP, or an outer-loop
582 enclosing LOOP). SLP indicates there will be some subsequent
583 slp analyses or not. */
585 static void
586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
587 bool slp)
589 basic_block bb = loop->header;
590 tree init, step;
591 auto_vec<stmt_vec_info, 64> worklist;
592 gphi_iterator gsi;
593 bool double_reduc, reduc_chain;
595 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
597 /* First - identify all inductions. Reduction detection assumes that all the
598 inductions have been identified, therefore, this order must not be
599 changed. */
600 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
602 gphi *phi = gsi.phi ();
603 tree access_fn = NULL;
604 tree def = PHI_RESULT (phi);
605 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
609 (gimple *) phi);
611 /* Skip virtual phi's. The data dependences that are associated with
612 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
613 if (virtual_operand_p (def))
614 continue;
616 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
618 /* Analyze the evolution function. */
619 access_fn = analyze_scalar_evolution (loop, def);
620 if (access_fn)
622 STRIP_NOPS (access_fn);
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_NOTE, vect_location,
625 "Access function of PHI: %T\n", access_fn);
626 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
627 = initial_condition_in_loop_num (access_fn, loop->num);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
629 = evolution_part_in_loop_num (access_fn, loop->num);
632 if ((!access_fn
633 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
634 || !vect_is_simple_iv_evolution (loop->num, access_fn,
635 &init, &step)
636 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
637 && TREE_CODE (step) != INTEGER_CST))
638 /* Only handle nonlinear iv for same loop. */
639 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
640 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
641 phi, &init, &step)))
643 worklist.safe_push (stmt_vinfo);
644 continue;
647 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
648 != NULL_TREE);
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
651 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
653 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
657 /* Second - identify all reductions and nested cycles. */
658 while (worklist.length () > 0)
660 stmt_vec_info stmt_vinfo = worklist.pop ();
661 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
662 tree def = PHI_RESULT (phi);
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
666 (gimple *) phi);
668 gcc_assert (!virtual_operand_p (def)
669 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
671 stmt_vec_info reduc_stmt_info
672 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
673 &reduc_chain, slp);
674 if (reduc_stmt_info)
676 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
677 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
678 if (double_reduc)
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected double reduction.\n");
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
685 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
687 else
689 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location,
693 "Detected vectorizable nested cycle.\n");
695 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
697 else
699 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location,
701 "Detected reduction.\n");
703 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
704 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
705 /* Store the reduction cycles for possible vectorization in
706 loop-aware SLP if it was not detected as reduction
707 chain. */
708 if (! reduc_chain)
709 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
710 (reduc_stmt_info);
714 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
715 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
716 else
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Unknown def-use cycle pattern.\n");
724 /* Function vect_analyze_scalar_cycles.
726 Examine the cross iteration def-use cycles of scalar variables, by
727 analyzing the loop-header PHIs of scalar variables. Classify each
728 cycle as one of the following: invariant, induction, reduction, unknown.
729 We do that for the loop represented by LOOP_VINFO, and also to its
730 inner-loop, if exists.
731 Examples for scalar cycles:
733 Example1: reduction:
735 loop1:
736 for (i=0; i<N; i++)
737 sum += a[i];
739 Example2: induction:
741 loop2:
742 for (i=0; i<N; i++)
743 a[i] = i; */
745 static void
746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
748 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
750 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
752 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
753 Reductions in such inner-loop therefore have different properties than
754 the reductions in the nest that gets vectorized:
755 1. When vectorized, they are executed in the same order as in the original
756 scalar loop, so we can't change the order of computation when
757 vectorizing them.
758 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
759 current checks are too strict. */
761 if (loop->inner)
762 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
765 /* Transfer group and reduction information from STMT_INFO to its
766 pattern stmt. */
768 static void
769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
771 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
772 stmt_vec_info stmtp;
773 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
774 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
775 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
778 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
779 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
780 == STMT_VINFO_DEF_TYPE (stmt_info));
781 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
782 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
783 if (stmt_info)
784 REDUC_GROUP_NEXT_ELEMENT (stmtp)
785 = STMT_VINFO_RELATED_STMT (stmt_info);
787 while (stmt_info);
790 /* Fixup scalar cycles that now have their stmts detected as patterns. */
792 static void
793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
795 stmt_vec_info first;
796 unsigned i;
798 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
800 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
801 while (next)
803 if ((STMT_VINFO_IN_PATTERN_P (next)
804 != STMT_VINFO_IN_PATTERN_P (first))
805 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
806 break;
807 next = REDUC_GROUP_NEXT_ELEMENT (next);
809 /* If all reduction chain members are well-formed patterns adjust
810 the group to group the pattern stmts instead. */
811 if (! next
812 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
814 if (STMT_VINFO_IN_PATTERN_P (first))
816 vect_fixup_reduc_chain (first);
817 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
818 = STMT_VINFO_RELATED_STMT (first);
821 /* If not all stmt in the chain are patterns or if we failed
822 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
823 it as regular reduction instead. */
824 else
826 stmt_vec_info vinfo = first;
827 stmt_vec_info last = NULL;
828 while (vinfo)
830 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
831 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
832 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
833 last = vinfo;
834 vinfo = next;
836 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
837 = vect_internal_def;
838 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
839 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
840 --i;
845 /* Function vect_get_loop_niters.
847 Determine how many iterations the loop is executed and place it
848 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
849 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
850 niter information holds in ASSUMPTIONS.
852 Return the loop exit condition. */
855 static gcond *
856 vect_get_loop_niters (class loop *loop, tree *assumptions,
857 tree *number_of_iterations, tree *number_of_iterationsm1)
859 edge exit = single_exit (loop);
860 class tree_niter_desc niter_desc;
861 tree niter_assumptions, niter, may_be_zero;
862 gcond *cond = get_loop_exit_condition (loop);
864 *assumptions = boolean_true_node;
865 *number_of_iterationsm1 = chrec_dont_know;
866 *number_of_iterations = chrec_dont_know;
867 DUMP_VECT_SCOPE ("get_loop_niters");
869 if (!exit)
870 return cond;
872 may_be_zero = NULL_TREE;
873 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
874 || chrec_contains_undetermined (niter_desc.niter))
875 return cond;
877 niter_assumptions = niter_desc.assumptions;
878 may_be_zero = niter_desc.may_be_zero;
879 niter = niter_desc.niter;
881 if (may_be_zero && integer_zerop (may_be_zero))
882 may_be_zero = NULL_TREE;
884 if (may_be_zero)
886 if (COMPARISON_CLASS_P (may_be_zero))
888 /* Try to combine may_be_zero with assumptions, this can simplify
889 computation of niter expression. */
890 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
891 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
892 niter_assumptions,
893 fold_build1 (TRUTH_NOT_EXPR,
894 boolean_type_node,
895 may_be_zero));
896 else
897 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
898 build_int_cst (TREE_TYPE (niter), 0),
899 rewrite_to_non_trapping_overflow (niter));
901 may_be_zero = NULL_TREE;
903 else if (integer_nonzerop (may_be_zero))
905 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
906 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
907 return cond;
909 else
910 return cond;
913 *assumptions = niter_assumptions;
914 *number_of_iterationsm1 = niter;
916 /* We want the number of loop header executions which is the number
917 of latch executions plus one.
918 ??? For UINT_MAX latch executions this number overflows to zero
919 for loops like do { n++; } while (n != 0); */
920 if (niter && !chrec_contains_undetermined (niter))
921 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
922 build_int_cst (TREE_TYPE (niter), 1));
923 *number_of_iterations = niter;
925 return cond;
928 /* Function bb_in_loop_p
930 Used as predicate for dfs order traversal of the loop bbs. */
932 static bool
933 bb_in_loop_p (const_basic_block bb, const void *data)
935 const class loop *const loop = (const class loop *)data;
936 if (flow_bb_inside_loop_p (loop, bb))
937 return true;
938 return false;
942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
943 stmt_vec_info structs for all the stmts in LOOP_IN. */
945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
946 : vec_info (vec_info::loop, shared),
947 loop (loop_in),
948 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
949 num_itersm1 (NULL_TREE),
950 num_iters (NULL_TREE),
951 num_iters_unchanged (NULL_TREE),
952 num_iters_assumptions (NULL_TREE),
953 vector_costs (nullptr),
954 scalar_costs (nullptr),
955 th (0),
956 versioning_threshold (0),
957 vectorization_factor (0),
958 main_loop_edge (nullptr),
959 skip_main_loop_edge (nullptr),
960 skip_this_loop_edge (nullptr),
961 reusable_accumulators (),
962 suggested_unroll_factor (1),
963 max_vectorization_factor (0),
964 mask_skip_niters (NULL_TREE),
965 rgroup_compare_type (NULL_TREE),
966 simd_if_cond (NULL_TREE),
967 partial_vector_style (vect_partial_vectors_none),
968 unaligned_dr (NULL),
969 peeling_for_alignment (0),
970 ptr_mask (0),
971 ivexpr_map (NULL),
972 scan_map (NULL),
973 slp_unrolling_factor (1),
974 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
975 vectorizable (false),
976 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
977 using_partial_vectors_p (false),
978 using_decrementing_iv_p (false),
979 using_select_vl_p (false),
980 epil_using_partial_vectors_p (false),
981 partial_load_store_bias (0),
982 peeling_for_gaps (false),
983 peeling_for_niter (false),
984 no_data_dependencies (false),
985 has_mask_store (false),
986 scalar_loop_scaling (profile_probability::uninitialized ()),
987 scalar_loop (NULL),
988 orig_loop_info (NULL)
990 /* CHECKME: We want to visit all BBs before their successors (except for
991 latch blocks, for which this assertion wouldn't hold). In the simple
992 case of the loop forms we allow, a dfs order of the BBs would the same
993 as reversed postorder traversal, so we are safe. */
995 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
996 bbs, loop->num_nodes, loop);
997 gcc_assert (nbbs == loop->num_nodes);
999 for (unsigned int i = 0; i < nbbs; i++)
1001 basic_block bb = bbs[i];
1002 gimple_stmt_iterator si;
1004 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1006 gimple *phi = gsi_stmt (si);
1007 gimple_set_uid (phi, 0);
1008 add_stmt (phi);
1011 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1013 gimple *stmt = gsi_stmt (si);
1014 gimple_set_uid (stmt, 0);
1015 if (is_gimple_debug (stmt))
1016 continue;
1017 add_stmt (stmt);
1018 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1019 third argument is the #pragma omp simd if (x) condition, when 0,
1020 loop shouldn't be vectorized, when non-zero constant, it should
1021 be vectorized normally, otherwise versioned with vectorized loop
1022 done if the condition is non-zero at runtime. */
1023 if (loop_in->simduid
1024 && is_gimple_call (stmt)
1025 && gimple_call_internal_p (stmt)
1026 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027 && gimple_call_num_args (stmt) >= 3
1028 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029 && (loop_in->simduid
1030 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1032 tree arg = gimple_call_arg (stmt, 2);
1033 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034 simd_if_cond = arg;
1035 else
1036 gcc_assert (integer_nonzerop (arg));
1041 epilogue_vinfos.create (6);
1044 /* Free all levels of rgroup CONTROLS. */
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1049 rgroup_controls *rgc;
1050 unsigned int i;
1051 FOR_EACH_VEC_ELT (*controls, i, rgc)
1052 rgc->controls.release ();
1053 controls->release ();
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057 stmt_vec_info structs of all the stmts in the loop. */
1059 _loop_vec_info::~_loop_vec_info ()
1061 free (bbs);
1063 release_vec_loop_controls (&masks.rgc_vec);
1064 release_vec_loop_controls (&lens);
1065 delete ivexpr_map;
1066 delete scan_map;
1067 epilogue_vinfos.release ();
1068 delete scalar_costs;
1069 delete vector_costs;
1071 /* When we release an epiloge vinfo that we do not intend to use
1072 avoid clearing AUX of the main loop which should continue to
1073 point to the main loop vinfo since otherwise we'll leak that. */
1074 if (loop->aux == this)
1075 loop->aux = NULL;
1078 /* Return an invariant or register for EXPR and emit necessary
1079 computations in the LOOP_VINFO loop preheader. */
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1084 if (is_gimple_reg (expr)
1085 || is_gimple_min_invariant (expr))
1086 return expr;
1088 if (! loop_vinfo->ivexpr_map)
1089 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091 if (! cached)
1093 gimple_seq stmts = NULL;
1094 cached = force_gimple_operand (unshare_expr (expr),
1095 &stmts, true, NULL_TREE);
1096 if (stmts)
1098 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099 gsi_insert_seq_on_edge_immediate (e, stmts);
1102 return cached;
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106 all masks required to mask LOOP_VINFO. */
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1111 rgroup_controls *rgm;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114 if (rgm->type != NULL_TREE
1115 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116 cmp_type, rgm->type,
1117 OPTIMIZE_FOR_SPEED))
1118 return false;
1119 return true;
1122 /* Calculate the maximum number of scalars per iteration for every
1123 rgroup in LOOP_VINFO. */
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1128 unsigned int res = 1;
1129 unsigned int i;
1130 rgroup_controls *rgm;
1131 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132 res = MAX (res, rgm->max_nscalars_per_iter);
1133 return res;
1136 /* Calculate the minimum precision necessary to represent:
1138 MAX_NITERS * FACTOR
1140 as an unsigned integer, where MAX_NITERS is the maximum number of
1141 loop header iterations for the original scalar form of LOOP_VINFO. */
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1148 /* Get the maximum number of iterations that is representable
1149 in the counter type. */
1150 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1153 /* Get a more refined estimate for the number of iterations. */
1154 widest_int max_back_edges;
1155 if (max_loop_iterations (loop, &max_back_edges))
1156 max_ni = wi::smin (max_ni, max_back_edges + 1);
1158 /* Work out how many bits we need to represent the limit. */
1159 return wi::min_precision (max_ni * factor, UNSIGNED);
1162 /* True if the loop needs peeling or partial vectors when vectorized. */
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1167 unsigned HOST_WIDE_INT const_vf;
1168 HOST_WIDE_INT max_niter
1169 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1171 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174 (loop_vinfo));
1176 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1179 /* Work out the (constant) number of iterations that need to be
1180 peeled for reasons other than niters. */
1181 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183 peel_niter += 1;
1184 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186 return true;
1188 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189 /* ??? When peeling for gaps but not alignment, we could
1190 try to check whether the (variable) niters is known to be
1191 VF * N + 1. That's something of a niche case though. */
1192 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195 < (unsigned) exact_log2 (const_vf))
1196 /* In case of versioning, check if the maximum number of
1197 iterations is greater than th. If they are identical,
1198 the epilogue is unnecessary. */
1199 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200 || ((unsigned HOST_WIDE_INT) max_niter
1201 > (th / const_vf) * const_vf))))
1202 return true;
1204 return false;
1207 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1208 whether we can actually generate the masks required. Return true if so,
1209 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1214 unsigned int min_ni_width;
1216 /* Use a normal loop if there are no statements that need masking.
1217 This only happens in rare degenerate cases: it means that the loop
1218 has no loads, no stores, and no live-out values. */
1219 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220 return false;
1222 /* Produce the rgroup controls. */
1223 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1225 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226 tree vectype = mask.first;
1227 unsigned nvectors = mask.second;
1229 if (masks->rgc_vec.length () < nvectors)
1230 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232 /* The number of scalars per iteration and the number of vectors are
1233 both compile-time constants. */
1234 unsigned int nscalars_per_iter
1235 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1238 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1240 rgm->max_nscalars_per_iter = nscalars_per_iter;
1241 rgm->type = truth_type_for (vectype);
1242 rgm->factor = 1;
1246 unsigned int max_nscalars_per_iter
1247 = vect_get_max_nscalars_per_iter (loop_vinfo);
1249 /* Work out how many bits we need to represent the limit. */
1250 min_ni_width
1251 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1253 /* Find a scalar mode for which WHILE_ULT is supported. */
1254 opt_scalar_int_mode cmp_mode_iter;
1255 tree cmp_type = NULL_TREE;
1256 tree iv_type = NULL_TREE;
1257 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258 unsigned int iv_precision = UINT_MAX;
1260 if (iv_limit != -1)
1261 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262 UNSIGNED);
1264 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1266 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267 if (cmp_bits >= min_ni_width
1268 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1270 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271 if (this_type
1272 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1274 /* Although we could stop as soon as we find a valid mode,
1275 there are at least two reasons why that's not always the
1276 best choice:
1278 - An IV that's Pmode or wider is more likely to be reusable
1279 in address calculations than an IV that's narrower than
1280 Pmode.
1282 - Doing the comparison in IV_PRECISION or wider allows
1283 a natural 0-based IV, whereas using a narrower comparison
1284 type requires mitigations against wrap-around.
1286 Conversely, if the IV limit is variable, doing the comparison
1287 in a wider type than the original type can introduce
1288 unnecessary extensions, so picking the widest valid mode
1289 is not always a good choice either.
1291 Here we prefer the first IV type that's Pmode or wider,
1292 and the first comparison type that's IV_PRECISION or wider.
1293 (The comparison type must be no wider than the IV type,
1294 to avoid extensions in the vector loop.)
1296 ??? We might want to try continuing beyond Pmode for ILP32
1297 targets if CMP_BITS < IV_PRECISION. */
1298 iv_type = this_type;
1299 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300 cmp_type = this_type;
1301 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302 break;
1307 if (!cmp_type)
1309 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310 return false;
1313 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316 return true;
1319 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1320 whether we can actually generate AVX512 style masks. Return true if so,
1321 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1326 /* Produce differently organized rgc_vec and differently check
1327 we can produce masks. */
1329 /* Use a normal loop if there are no statements that need masking.
1330 This only happens in rare degenerate cases: it means that the loop
1331 has no loads, no stores, and no live-out values. */
1332 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333 return false;
1335 /* For the decrementing IV we need to represent all values in
1336 [0, niter + niter_skip] where niter_skip is the elements we
1337 skip in the first iteration for prologue peeling. */
1338 tree iv_type = NULL_TREE;
1339 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340 unsigned int iv_precision = UINT_MAX;
1341 if (iv_limit != -1)
1342 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1344 /* First compute the type for the IV we use to track the remaining
1345 scalar iterations. */
1346 opt_scalar_int_mode cmp_mode_iter;
1347 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1349 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350 if (cmp_bits >= iv_precision
1351 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1353 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354 if (iv_type)
1355 break;
1358 if (!iv_type)
1359 return false;
1361 /* Produce the rgroup controls. */
1362 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1364 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365 tree vectype = mask.first;
1366 unsigned nvectors = mask.second;
1368 /* The number of scalars per iteration and the number of vectors are
1369 both compile-time constants. */
1370 unsigned int nscalars_per_iter
1371 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1374 /* We index the rgroup_controls vector with nscalars_per_iter
1375 which we keep constant and instead have a varying nvectors,
1376 remembering the vector mask with the fewest nV. */
1377 if (masks->rgc_vec.length () < nscalars_per_iter)
1378 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1381 if (!rgm->type || rgm->factor > nvectors)
1383 rgm->type = truth_type_for (vectype);
1384 rgm->compare_type = NULL_TREE;
1385 rgm->max_nscalars_per_iter = nscalars_per_iter;
1386 rgm->factor = nvectors;
1387 rgm->bias_adjusted_ctrl = NULL_TREE;
1391 /* There is no fixed compare type we are going to use but we have to
1392 be able to get at one for each mask group. */
1393 unsigned int min_ni_width
1394 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1396 bool ok = true;
1397 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1399 tree mask_type = rgc.type;
1400 if (!mask_type)
1401 continue;
1403 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1405 ok = false;
1406 break;
1409 /* If iv_type is usable as compare type use that - we can elide the
1410 saturation in that case. */
1411 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1413 tree cmp_vectype
1414 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416 rgc.compare_type = cmp_vectype;
1418 if (!rgc.compare_type)
1419 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1421 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422 if (cmp_bits >= min_ni_width
1423 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1425 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426 if (!cmp_type)
1427 continue;
1429 /* Check whether we can produce the mask with cmp_type. */
1430 tree cmp_vectype
1431 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1434 rgc.compare_type = cmp_vectype;
1435 break;
1439 if (!rgc.compare_type)
1441 ok = false;
1442 break;
1445 if (!ok)
1447 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448 return false;
1451 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454 return true;
1457 /* Check whether we can use vector access with length based on precison
1458 comparison. So far, to keep it simple, we only allow the case that the
1459 precision of the target supported length is larger than the precision
1460 required by loop niters. */
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1465 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466 return false;
1468 machine_mode len_load_mode = get_len_load_store_mode
1469 (loop_vinfo->vector_mode, true).require ();
1470 machine_mode len_store_mode = get_len_load_store_mode
1471 (loop_vinfo->vector_mode, false).require ();
1473 signed char partial_load_bias = internal_len_load_store_bias
1474 (IFN_LEN_LOAD, len_load_mode);
1476 signed char partial_store_bias = internal_len_load_store_bias
1477 (IFN_LEN_STORE, len_store_mode);
1479 gcc_assert (partial_load_bias == partial_store_bias);
1481 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482 return false;
1484 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485 len_loads with a length of zero. In order to avoid that we prohibit
1486 more than one loop length here. */
1487 if (partial_load_bias == -1
1488 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489 return false;
1491 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1493 unsigned int max_nitems_per_iter = 1;
1494 unsigned int i;
1495 rgroup_controls *rgl;
1496 /* Find the maximum number of items per iteration for every rgroup. */
1497 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1499 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1503 /* Work out how many bits we need to represent the length limit. */
1504 unsigned int min_ni_prec
1505 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1507 /* Now use the maximum of below precisions for one suitable IV type:
1508 - the IV's natural precision
1509 - the precision needed to hold: the maximum number of scalar
1510 iterations multiplied by the scale factor (min_ni_prec above)
1511 - the Pmode precision
1513 If min_ni_prec is less than the precision of the current niters,
1514 we perfer to still use the niters type. Prefer to use Pmode and
1515 wider IV to avoid narrow conversions. */
1517 unsigned int ni_prec
1518 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519 min_ni_prec = MAX (min_ni_prec, ni_prec);
1520 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1522 tree iv_type = NULL_TREE;
1523 opt_scalar_int_mode tmode_iter;
1524 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1526 scalar_mode tmode = tmode_iter.require ();
1527 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1529 /* ??? Do we really want to construct one IV whose precision exceeds
1530 BITS_PER_WORD? */
1531 if (tbits > BITS_PER_WORD)
1532 break;
1534 /* Find the first available standard integral type. */
1535 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1537 iv_type = build_nonstandard_integer_type (tbits, true);
1538 break;
1542 if (!iv_type)
1544 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "can't vectorize with length-based partial vectors"
1547 " because there is no suitable iv type.\n");
1548 return false;
1551 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1555 return true;
1558 /* Calculate the cost of one scalar iteration of the loop. */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564 int nbbs = loop->num_nodes, factor;
1565 int innerloop_iters, i;
1567 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1569 /* Gather costs for statements in the scalar loop. */
1571 /* FORNOW. */
1572 innerloop_iters = 1;
1573 if (loop->inner)
1574 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1576 for (i = 0; i < nbbs; i++)
1578 gimple_stmt_iterator si;
1579 basic_block bb = bbs[i];
1581 if (bb->loop_father == loop->inner)
1582 factor = innerloop_iters;
1583 else
1584 factor = 1;
1586 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1588 gimple *stmt = gsi_stmt (si);
1589 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1591 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592 continue;
1594 /* Skip stmts that are not vectorized inside the loop. */
1595 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597 && (!STMT_VINFO_LIVE_P (vstmt_info)
1598 || !VECTORIZABLE_CYCLE_DEF
1599 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600 continue;
1602 vect_cost_for_stmt kind;
1603 if (STMT_VINFO_DATA_REF (stmt_info))
1605 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606 kind = scalar_load;
1607 else
1608 kind = scalar_store;
1610 else if (vect_nop_conversion_p (stmt_info))
1611 continue;
1612 else
1613 kind = scalar_stmt;
1615 /* We are using vect_prologue here to avoid scaling twice
1616 by the inner loop factor. */
1617 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618 factor, kind, stmt_info, 0, vect_prologue);
1622 /* Now accumulate cost. */
1623 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624 add_stmt_costs (loop_vinfo->scalar_costs,
1625 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626 loop_vinfo->scalar_costs->finish_cost (nullptr);
1630 /* Function vect_analyze_loop_form.
1632 Verify that certain CFG restrictions hold, including:
1633 - the loop has a pre-header
1634 - the loop has a single entry and exit
1635 - the loop exit condition is simple enough
1636 - the number of iterations can be analyzed, i.e, a countable loop. The
1637 niter could be analyzed under some assumptions. */
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1642 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1644 /* Different restrictions apply when we are considering an inner-most loop,
1645 vs. an outer (nested) loop.
1646 (FORNOW. May want to relax some of these restrictions in the future). */
1648 info->inner_loop_cond = NULL;
1649 if (!loop->inner)
1651 /* Inner-most loop. We currently require that the number of BBs is
1652 exactly 2 (the header and latch). Vectorizable inner-most loops
1653 look like this:
1655 (pre-header)
1657 header <--------+
1658 | | |
1659 | +--> latch --+
1661 (exit-bb) */
1663 if (loop->num_nodes != 2)
1664 return opt_result::failure_at (vect_location,
1665 "not vectorized:"
1666 " control flow in loop.\n");
1668 if (empty_block_p (loop->header))
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized: empty loop.\n");
1672 else
1674 class loop *innerloop = loop->inner;
1675 edge entryedge;
1677 /* Nested loop. We currently require that the loop is doubly-nested,
1678 contains a single inner loop, and the number of BBs is exactly 5.
1679 Vectorizable outer-loops look like this:
1681 (pre-header)
1683 header <---+
1685 inner-loop |
1687 tail ------+
1689 (exit-bb)
1691 The inner-loop has the properties expected of inner-most loops
1692 as described above. */
1694 if ((loop->inner)->inner || (loop->inner)->next)
1695 return opt_result::failure_at (vect_location,
1696 "not vectorized:"
1697 " multiple nested loops.\n");
1699 if (loop->num_nodes != 5)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " control flow in loop.\n");
1704 entryedge = loop_preheader_edge (innerloop);
1705 if (entryedge->src != loop->header
1706 || !single_exit (innerloop)
1707 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708 return opt_result::failure_at (vect_location,
1709 "not vectorized:"
1710 " unsupported outerloop form.\n");
1712 /* Analyze the inner-loop. */
1713 vect_loop_form_info inner;
1714 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715 if (!res)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: Bad inner loop.\n");
1720 return res;
1723 /* Don't support analyzing niter under assumptions for inner
1724 loop. */
1725 if (!integer_onep (inner.assumptions))
1726 return opt_result::failure_at (vect_location,
1727 "not vectorized: Bad inner loop.\n");
1729 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730 return opt_result::failure_at (vect_location,
1731 "not vectorized: inner-loop count not"
1732 " invariant.\n");
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Considering outer-loop vectorization.\n");
1737 info->inner_loop_cond = inner.loop_cond;
1740 if (!single_exit (loop))
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized: multiple exits.\n");
1743 if (EDGE_COUNT (loop->header->preds) != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " too many incoming edges.\n");
1748 /* We assume that the loop exit condition is at the end of the loop. i.e,
1749 that the loop is represented as a do-while (with a proper if-guard
1750 before the loop if needed), where the loop header contains all the
1751 executable statements, and the latch is empty. */
1752 if (!empty_block_p (loop->latch)
1753 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754 return opt_result::failure_at (vect_location,
1755 "not vectorized: latch block not empty.\n");
1757 /* Make sure the exit is not abnormal. */
1758 edge e = single_exit (loop);
1759 if (e->flags & EDGE_ABNORMAL)
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized:"
1762 " abnormal loop exit edge.\n");
1764 info->loop_cond
1765 = vect_get_loop_niters (loop, &info->assumptions,
1766 &info->number_of_iterations,
1767 &info->number_of_iterationsm1);
1768 if (!info->loop_cond)
1769 return opt_result::failure_at
1770 (vect_location,
1771 "not vectorized: complicated exit condition.\n");
1773 if (integer_zerop (info->assumptions)
1774 || !info->number_of_iterations
1775 || chrec_contains_undetermined (info->number_of_iterations))
1776 return opt_result::failure_at
1777 (info->loop_cond,
1778 "not vectorized: number of iterations cannot be computed.\n");
1780 if (integer_zerop (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations = 0.\n");
1785 if (!(tree_fits_shwi_p (info->number_of_iterations)
1786 && tree_to_shwi (info->number_of_iterations) > 0))
1788 if (dump_enabled_p ())
1790 dump_printf_loc (MSG_NOTE, vect_location,
1791 "Symbolic number of iterations is ");
1792 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793 dump_printf (MSG_NOTE, "\n");
1797 return opt_result::success ();
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801 vect_analyze_loop_form result. */
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805 const vect_loop_form_info *info,
1806 loop_vec_info main_loop_info)
1808 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813 /* Also record the assumptions for versioning. */
1814 if (!integer_onep (info->assumptions) && !main_loop_info)
1815 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1817 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819 if (info->inner_loop_cond)
1821 stmt_vec_info inner_loop_cond_info
1822 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 /* If we have an estimate on the number of iterations of the inner
1825 loop use that to limit the scale for costing, otherwise use
1826 --param vect-inner-loop-cost-factor literally. */
1827 widest_int nit;
1828 if (estimated_stmt_executions (loop->inner, &nit))
1829 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1833 return loop_vinfo;
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839 statements update the vectorization factor. */
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846 int nbbs = loop->num_nodes;
1847 poly_uint64 vectorization_factor;
1848 int i;
1850 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1852 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853 gcc_assert (known_ne (vectorization_factor, 0U));
1855 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1856 vectorization factor of the loop is the unrolling factor required by
1857 the SLP instances. If that unrolling factor is 1, we say, that we
1858 perform pure SLP on loop - cross iteration parallelism is not
1859 exploited. */
1860 bool only_slp_in_loop = true;
1861 for (i = 0; i < nbbs; i++)
1863 basic_block bb = bbs[i];
1864 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865 gsi_next (&si))
1867 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868 if (!stmt_info)
1869 continue;
1870 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872 && !PURE_SLP_STMT (stmt_info))
1873 /* STMT needs both SLP and loop-based vectorization. */
1874 only_slp_in_loop = false;
1876 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877 gsi_next (&si))
1879 if (is_gimple_debug (gsi_stmt (si)))
1880 continue;
1881 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882 stmt_info = vect_stmt_to_vectorize (stmt_info);
1883 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885 && !PURE_SLP_STMT (stmt_info))
1886 /* STMT needs both SLP and loop-based vectorization. */
1887 only_slp_in_loop = false;
1891 if (only_slp_in_loop)
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_NOTE, vect_location,
1895 "Loop contains only SLP stmts\n");
1896 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1898 else
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Loop contains SLP and non-SLP stmts\n");
1903 /* Both the vectorization factor and unroll factor have the form
1904 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905 so they must have a common multiple. */
1906 vectorization_factor
1907 = force_common_multiple (vectorization_factor,
1908 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1911 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912 if (dump_enabled_p ())
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "Updating vectorization factor to ");
1916 dump_dec (MSG_NOTE, vectorization_factor);
1917 dump_printf (MSG_NOTE, ".\n");
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922 the other phi in the reduction is also relevant for vectorization.
1923 This rejects cases such as:
1925 outer1:
1926 x_1 = PHI <x_3(outer2), ...>;
1929 inner:
1930 x_2 = ...;
1933 outer2:
1934 x_3 = PHI <x_2(inner)>;
1936 if nothing in x_2 or elsewhere makes x_1 relevant. */
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942 return false;
1944 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1947 /* Function vect_analyze_loop_operations.
1949 Scan the loop stmts and make sure they are all vectorizable. */
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1954 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956 int nbbs = loop->num_nodes;
1957 int i;
1958 stmt_vec_info stmt_info;
1959 bool need_to_vectorize = false;
1960 bool ok;
1962 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1964 auto_vec<stmt_info_for_cost> cost_vec;
1966 for (i = 0; i < nbbs; i++)
1968 basic_block bb = bbs[i];
1970 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971 gsi_next (&si))
1973 gphi *phi = si.phi ();
1974 ok = true;
1976 stmt_info = loop_vinfo->lookup_stmt (phi);
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979 (gimple *) phi);
1980 if (virtual_operand_p (gimple_phi_result (phi)))
1981 continue;
1983 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984 (i.e., a phi in the tail of the outer-loop). */
1985 if (! is_loop_header_bb_p (bb))
1987 /* FORNOW: we currently don't support the case that these phis
1988 are not used in the outerloop (unless it is double reduction,
1989 i.e., this phi is vect_reduction_def), cause this case
1990 requires to actually do something here. */
1991 if (STMT_VINFO_LIVE_P (stmt_info)
1992 && !vect_active_double_reduction_p (stmt_info))
1993 return opt_result::failure_at (phi,
1994 "Unsupported loop-closed phi"
1995 " in outer-loop.\n");
1997 /* If PHI is used in the outer loop, we check that its operand
1998 is defined in the inner loop. */
1999 if (STMT_VINFO_RELEVANT_P (stmt_info))
2001 tree phi_op;
2003 if (gimple_phi_num_args (phi) != 1)
2004 return opt_result::failure_at (phi, "unsupported phi");
2006 phi_op = PHI_ARG_DEF (phi, 0);
2007 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008 if (!op_def_info)
2009 return opt_result::failure_at (phi, "unsupported phi\n");
2011 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012 && (STMT_VINFO_RELEVANT (op_def_info)
2013 != vect_used_in_outer_by_reduction))
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2016 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017 || (STMT_VINFO_DEF_TYPE (stmt_info)
2018 == vect_double_reduction_def))
2019 && !vectorizable_lc_phi (loop_vinfo,
2020 stmt_info, NULL, NULL))
2021 return opt_result::failure_at (phi, "unsupported phi\n");
2024 continue;
2027 gcc_assert (stmt_info);
2029 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030 || STMT_VINFO_LIVE_P (stmt_info))
2031 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033 /* A scalar-dependence cycle that we don't support. */
2034 return opt_result::failure_at (phi,
2035 "not vectorized:"
2036 " scalar dependence cycle.\n");
2038 if (STMT_VINFO_RELEVANT_P (stmt_info))
2040 need_to_vectorize = true;
2041 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042 && ! PURE_SLP_STMT (stmt_info))
2043 ok = vectorizable_induction (loop_vinfo,
2044 stmt_info, NULL, NULL,
2045 &cost_vec);
2046 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047 || (STMT_VINFO_DEF_TYPE (stmt_info)
2048 == vect_double_reduction_def)
2049 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050 && ! PURE_SLP_STMT (stmt_info))
2051 ok = vectorizable_reduction (loop_vinfo,
2052 stmt_info, NULL, NULL, &cost_vec);
2053 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054 == vect_first_order_recurrence)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057 &cost_vec);
2060 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2061 if (ok
2062 && STMT_VINFO_LIVE_P (stmt_info)
2063 && !PURE_SLP_STMT (stmt_info))
2064 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2065 -1, false, &cost_vec);
2067 if (!ok)
2068 return opt_result::failure_at (phi,
2069 "not vectorized: relevant phi not "
2070 "supported: %G",
2071 static_cast <gimple *> (phi));
2074 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2075 gsi_next (&si))
2077 gimple *stmt = gsi_stmt (si);
2078 if (!gimple_clobber_p (stmt)
2079 && !is_gimple_debug (stmt))
2081 opt_result res
2082 = vect_analyze_stmt (loop_vinfo,
2083 loop_vinfo->lookup_stmt (stmt),
2084 &need_to_vectorize,
2085 NULL, NULL, &cost_vec);
2086 if (!res)
2087 return res;
2090 } /* bbs */
2092 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2094 /* All operations in the loop are either irrelevant (deal with loop
2095 control, or dead), or only used outside the loop and can be moved
2096 out of the loop (e.g. invariants, inductions). The loop can be
2097 optimized away by scalar optimizations. We're better off not
2098 touching this loop. */
2099 if (!need_to_vectorize)
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_NOTE, vect_location,
2103 "All the computation can be taken out of the loop.\n");
2104 return opt_result::failure_at
2105 (vect_location,
2106 "not vectorized: redundant loop. no profit to vectorize.\n");
2109 return opt_result::success ();
2112 /* Return true if we know that the iteration count is smaller than the
2113 vectorization factor. Return false if it isn't, or if we can't be sure
2114 either way. */
2116 static bool
2117 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2119 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2121 HOST_WIDE_INT max_niter;
2122 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2123 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2124 else
2125 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2127 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2128 return true;
2130 return false;
2133 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2134 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2135 definitely no, or -1 if it's worth retrying. */
2137 static int
2138 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2139 unsigned *suggested_unroll_factor)
2141 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2142 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2144 /* Only loops that can handle partially-populated vectors can have iteration
2145 counts less than the vectorization factor. */
2146 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2147 && vect_known_niters_smaller_than_vf (loop_vinfo))
2149 if (dump_enabled_p ())
2150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2151 "not vectorized: iteration count smaller than "
2152 "vectorization factor.\n");
2153 return 0;
2156 /* If we know the number of iterations we can do better, for the
2157 epilogue we can also decide whether the main loop leaves us
2158 with enough iterations, prefering a smaller vector epilog then
2159 also possibly used for the case we skip the vector loop. */
2160 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2162 widest_int scalar_niters
2163 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2164 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2166 loop_vec_info orig_loop_vinfo
2167 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2168 unsigned lowest_vf
2169 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2170 int prolog_peeling = 0;
2171 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2172 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2173 if (prolog_peeling >= 0
2174 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2175 lowest_vf))
2177 unsigned gap
2178 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2179 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2180 % lowest_vf + gap);
2183 /* Reject vectorizing for a single scalar iteration, even if
2184 we could in principle implement that using partial vectors. */
2185 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2186 if (scalar_niters <= peeling_gap + 1)
2188 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2190 "not vectorized: loop only has a single "
2191 "scalar iteration.\n");
2192 return 0;
2195 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2197 /* Check that the loop processes at least one full vector. */
2198 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2199 if (known_lt (scalar_niters, vf))
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203 "loop does not have enough iterations "
2204 "to support vectorization.\n");
2205 return 0;
2208 /* If we need to peel an extra epilogue iteration to handle data
2209 accesses with gaps, check that there are enough scalar iterations
2210 available.
2212 The check above is redundant with this one when peeling for gaps,
2213 but the distinction is useful for diagnostics. */
2214 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2215 && known_le (scalar_niters, vf))
2217 if (dump_enabled_p ())
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "loop does not have enough iterations "
2220 "to support peeling for gaps.\n");
2221 return 0;
2226 /* If using the "very cheap" model. reject cases in which we'd keep
2227 a copy of the scalar code (even if we might be able to vectorize it). */
2228 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2229 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2230 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2231 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2233 if (dump_enabled_p ())
2234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2235 "some scalar iterations would need to be peeled\n");
2236 return 0;
2239 int min_profitable_iters, min_profitable_estimate;
2240 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2241 &min_profitable_estimate,
2242 suggested_unroll_factor);
2244 if (min_profitable_iters < 0)
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "not vectorized: vectorization not profitable.\n");
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251 "not vectorized: vector version will never be "
2252 "profitable.\n");
2253 return -1;
2256 int min_scalar_loop_bound = (param_min_vect_loop_bound
2257 * assumed_vf);
2259 /* Use the cost model only if it is more conservative than user specified
2260 threshold. */
2261 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2262 min_profitable_iters);
2264 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2266 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2267 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2269 if (dump_enabled_p ())
2270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2271 "not vectorized: vectorization not profitable.\n");
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_NOTE, vect_location,
2274 "not vectorized: iteration count smaller than user "
2275 "specified loop bound parameter or minimum profitable "
2276 "iterations (whichever is more conservative).\n");
2277 return 0;
2280 /* The static profitablity threshold min_profitable_estimate includes
2281 the cost of having to check at runtime whether the scalar loop
2282 should be used instead. If it turns out that we don't need or want
2283 such a check, the threshold we should use for the static estimate
2284 is simply the point at which the vector loop becomes more profitable
2285 than the scalar loop. */
2286 if (min_profitable_estimate > min_profitable_iters
2287 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2288 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2289 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2290 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2294 " choice between the scalar and vector loops\n");
2295 min_profitable_estimate = min_profitable_iters;
2298 /* If the vector loop needs multiple iterations to be beneficial then
2299 things are probably too close to call, and the conservative thing
2300 would be to stick with the scalar code. */
2301 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2302 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306 "one iteration of the vector loop would be"
2307 " more expensive than the equivalent number of"
2308 " iterations of the scalar loop\n");
2309 return 0;
2312 HOST_WIDE_INT estimated_niter;
2314 /* If we are vectorizing an epilogue then we know the maximum number of
2315 scalar iterations it will cover is at least one lower than the
2316 vectorization factor of the main loop. */
2317 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2318 estimated_niter
2319 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2320 else
2322 estimated_niter = estimated_stmt_executions_int (loop);
2323 if (estimated_niter == -1)
2324 estimated_niter = likely_max_stmt_executions_int (loop);
2326 if (estimated_niter != -1
2327 && ((unsigned HOST_WIDE_INT) estimated_niter
2328 < MAX (th, (unsigned) min_profitable_estimate)))
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332 "not vectorized: estimated iteration count too "
2333 "small.\n");
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "not vectorized: estimated iteration count smaller "
2337 "than specified loop bound parameter or minimum "
2338 "profitable iterations (whichever is more "
2339 "conservative).\n");
2340 return -1;
2343 return 1;
2346 static opt_result
2347 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2348 vec<data_reference_p> *datarefs,
2349 unsigned int *n_stmts)
2351 *n_stmts = 0;
2352 for (unsigned i = 0; i < loop->num_nodes; i++)
2353 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2354 !gsi_end_p (gsi); gsi_next (&gsi))
2356 gimple *stmt = gsi_stmt (gsi);
2357 if (is_gimple_debug (stmt))
2358 continue;
2359 ++(*n_stmts);
2360 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2361 NULL, 0);
2362 if (!res)
2364 if (is_gimple_call (stmt) && loop->safelen)
2366 tree fndecl = gimple_call_fndecl (stmt), op;
2367 if (fndecl == NULL_TREE
2368 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2370 fndecl = gimple_call_arg (stmt, 0);
2371 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2372 fndecl = TREE_OPERAND (fndecl, 0);
2373 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2375 if (fndecl != NULL_TREE)
2377 cgraph_node *node = cgraph_node::get (fndecl);
2378 if (node != NULL && node->simd_clones != NULL)
2380 unsigned int j, n = gimple_call_num_args (stmt);
2381 for (j = 0; j < n; j++)
2383 op = gimple_call_arg (stmt, j);
2384 if (DECL_P (op)
2385 || (REFERENCE_CLASS_P (op)
2386 && get_base_address (op)))
2387 break;
2389 op = gimple_call_lhs (stmt);
2390 /* Ignore #pragma omp declare simd functions
2391 if they don't have data references in the
2392 call stmt itself. */
2393 if (j == n
2394 && !(op
2395 && (DECL_P (op)
2396 || (REFERENCE_CLASS_P (op)
2397 && get_base_address (op)))))
2398 continue;
2402 return res;
2404 /* If dependence analysis will give up due to the limit on the
2405 number of datarefs stop here and fail fatally. */
2406 if (datarefs->length ()
2407 > (unsigned)param_loop_max_datarefs_for_datadeps)
2408 return opt_result::failure_at (stmt, "exceeded param "
2409 "loop-max-datarefs-for-datadeps\n");
2411 return opt_result::success ();
2414 /* Look for SLP-only access groups and turn each individual access into its own
2415 group. */
2416 static void
2417 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2419 unsigned int i;
2420 struct data_reference *dr;
2422 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2424 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2425 FOR_EACH_VEC_ELT (datarefs, i, dr)
2427 gcc_assert (DR_REF (dr));
2428 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2430 /* Check if the load is a part of an interleaving chain. */
2431 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2433 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2434 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2435 unsigned int group_size = DR_GROUP_SIZE (first_element);
2437 /* Check if SLP-only groups. */
2438 if (!STMT_SLP_TYPE (stmt_info)
2439 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2441 /* Dissolve the group. */
2442 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2444 stmt_vec_info vinfo = first_element;
2445 while (vinfo)
2447 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2448 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2449 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2450 DR_GROUP_SIZE (vinfo) = 1;
2451 if (STMT_VINFO_STRIDED_P (first_element))
2452 DR_GROUP_GAP (vinfo) = 0;
2453 else
2454 DR_GROUP_GAP (vinfo) = group_size - 1;
2455 /* Duplicate and adjust alignment info, it needs to
2456 be present on each group leader, see dr_misalignment. */
2457 if (vinfo != first_element)
2459 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2460 dr_info2->target_alignment = dr_info->target_alignment;
2461 int misalignment = dr_info->misalignment;
2462 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2464 HOST_WIDE_INT diff
2465 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2466 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2467 unsigned HOST_WIDE_INT align_c
2468 = dr_info->target_alignment.to_constant ();
2469 misalignment = (misalignment + diff) % align_c;
2471 dr_info2->misalignment = misalignment;
2473 vinfo = next;
2480 /* Determine if operating on full vectors for LOOP_VINFO might leave
2481 some scalar iterations still to do. If so, decide how we should
2482 handle those scalar iterations. The possibilities are:
2484 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2485 In this case:
2487 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2488 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2489 LOOP_VINFO_PEELING_FOR_NITER == false
2491 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2492 to handle the remaining scalar iterations. In this case:
2494 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2495 LOOP_VINFO_PEELING_FOR_NITER == true
2497 There are two choices:
2499 (2a) Consider vectorizing the epilogue loop at the same VF as the
2500 main loop, but using partial vectors instead of full vectors.
2501 In this case:
2503 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2505 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2506 In this case:
2508 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2511 opt_result
2512 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2514 /* Determine whether there would be any scalar iterations left over. */
2515 bool need_peeling_or_partial_vectors_p
2516 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2518 /* Decide whether to vectorize the loop with partial vectors. */
2519 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2520 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2521 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2522 && need_peeling_or_partial_vectors_p)
2524 /* For partial-vector-usage=1, try to push the handling of partial
2525 vectors to the epilogue, with the main loop continuing to operate
2526 on full vectors.
2528 If we are unrolling we also do not want to use partial vectors. This
2529 is to avoid the overhead of generating multiple masks and also to
2530 avoid having to execute entire iterations of FALSE masked instructions
2531 when dealing with one or less full iterations.
2533 ??? We could then end up failing to use partial vectors if we
2534 decide to peel iterations into a prologue, and if the main loop
2535 then ends up processing fewer than VF iterations. */
2536 if ((param_vect_partial_vector_usage == 1
2537 || loop_vinfo->suggested_unroll_factor > 1)
2538 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2539 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2540 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2541 else
2542 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2545 if (dump_enabled_p ())
2546 dump_printf_loc (MSG_NOTE, vect_location,
2547 "operating on %s vectors%s.\n",
2548 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2549 ? "partial" : "full",
2550 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2551 ? " for epilogue loop" : "");
2553 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2554 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2555 && need_peeling_or_partial_vectors_p);
2557 return opt_result::success ();
2560 /* Function vect_analyze_loop_2.
2562 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2563 analyses will record information in some members of LOOP_VINFO. FATAL
2564 indicates if some analysis meets fatal error. If one non-NULL pointer
2565 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2566 worked out suggested unroll factor, while one NULL pointer shows it's
2567 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2568 is to hold the slp decision when the suggested unroll factor is worked
2569 out. */
2570 static opt_result
2571 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2572 unsigned *suggested_unroll_factor,
2573 bool& slp_done_for_suggested_uf)
2575 opt_result ok = opt_result::success ();
2576 int res;
2577 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2578 poly_uint64 min_vf = 2;
2579 loop_vec_info orig_loop_vinfo = NULL;
2581 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2582 loop_vec_info of the first vectorized loop. */
2583 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2584 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2585 else
2586 orig_loop_vinfo = loop_vinfo;
2587 gcc_assert (orig_loop_vinfo);
2589 /* The first group of checks is independent of the vector size. */
2590 fatal = true;
2592 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2593 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2594 return opt_result::failure_at (vect_location,
2595 "not vectorized: simd if(0)\n");
2597 /* Find all data references in the loop (which correspond to vdefs/vuses)
2598 and analyze their evolution in the loop. */
2600 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2602 /* Gather the data references and count stmts in the loop. */
2603 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2605 opt_result res
2606 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2607 &LOOP_VINFO_DATAREFS (loop_vinfo),
2608 &LOOP_VINFO_N_STMTS (loop_vinfo));
2609 if (!res)
2611 if (dump_enabled_p ())
2612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2613 "not vectorized: loop contains function "
2614 "calls or data references that cannot "
2615 "be analyzed\n");
2616 return res;
2618 loop_vinfo->shared->save_datarefs ();
2620 else
2621 loop_vinfo->shared->check_datarefs ();
2623 /* Analyze the data references and also adjust the minimal
2624 vectorization factor according to the loads and stores. */
2626 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2627 if (!ok)
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2631 "bad data references.\n");
2632 return ok;
2635 /* Check if we are applying unroll factor now. */
2636 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2637 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2639 /* If the slp decision is false when suggested unroll factor is worked
2640 out, and we are applying suggested unroll factor, we can simply skip
2641 all slp related analyses this time. */
2642 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2644 /* Classify all cross-iteration scalar data-flow cycles.
2645 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2646 vect_analyze_scalar_cycles (loop_vinfo, slp);
2648 vect_pattern_recog (loop_vinfo);
2650 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2652 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2653 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2655 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2656 if (!ok)
2658 if (dump_enabled_p ())
2659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2660 "bad data access.\n");
2661 return ok;
2664 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2666 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2667 if (!ok)
2669 if (dump_enabled_p ())
2670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2671 "unexpected pattern.\n");
2672 return ok;
2675 /* While the rest of the analysis below depends on it in some way. */
2676 fatal = false;
2678 /* Analyze data dependences between the data-refs in the loop
2679 and adjust the maximum vectorization factor according to
2680 the dependences.
2681 FORNOW: fail at the first data dependence that we encounter. */
2683 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2684 if (!ok)
2686 if (dump_enabled_p ())
2687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2688 "bad data dependence.\n");
2689 return ok;
2691 if (max_vf != MAX_VECTORIZATION_FACTOR
2692 && maybe_lt (max_vf, min_vf))
2693 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2694 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2696 ok = vect_determine_vectorization_factor (loop_vinfo);
2697 if (!ok)
2699 if (dump_enabled_p ())
2700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2701 "can't determine vectorization factor.\n");
2702 return ok;
2704 if (max_vf != MAX_VECTORIZATION_FACTOR
2705 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2706 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2708 /* Compute the scalar iteration cost. */
2709 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2711 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2713 if (slp)
2715 /* Check the SLP opportunities in the loop, analyze and build
2716 SLP trees. */
2717 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2718 if (!ok)
2719 return ok;
2721 /* If there are any SLP instances mark them as pure_slp. */
2722 slp = vect_make_slp_decision (loop_vinfo);
2723 if (slp)
2725 /* Find stmts that need to be both vectorized and SLPed. */
2726 vect_detect_hybrid_slp (loop_vinfo);
2728 /* Update the vectorization factor based on the SLP decision. */
2729 vect_update_vf_for_slp (loop_vinfo);
2731 /* Optimize the SLP graph with the vectorization factor fixed. */
2732 vect_optimize_slp (loop_vinfo);
2734 /* Gather the loads reachable from the SLP graph entries. */
2735 vect_gather_slp_loads (loop_vinfo);
2739 bool saved_can_use_partial_vectors_p
2740 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2742 /* We don't expect to have to roll back to anything other than an empty
2743 set of rgroups. */
2744 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2746 /* This is the point where we can re-start analysis with SLP forced off. */
2747 start_over:
2749 /* Apply the suggested unrolling factor, this was determined by the backend
2750 during finish_cost the first time we ran the analyzis for this
2751 vector mode. */
2752 if (applying_suggested_uf)
2753 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2755 /* Now the vectorization factor is final. */
2756 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2757 gcc_assert (known_ne (vectorization_factor, 0U));
2759 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "vectorization_factor = ");
2763 dump_dec (MSG_NOTE, vectorization_factor);
2764 dump_printf (MSG_NOTE, ", niters = %wd\n",
2765 LOOP_VINFO_INT_NITERS (loop_vinfo));
2768 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2770 /* Analyze the alignment of the data-refs in the loop.
2771 Fail if a data reference is found that cannot be vectorized. */
2773 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2774 if (!ok)
2776 if (dump_enabled_p ())
2777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2778 "bad data alignment.\n");
2779 return ok;
2782 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2783 It is important to call pruning after vect_analyze_data_ref_accesses,
2784 since we use grouping information gathered by interleaving analysis. */
2785 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2786 if (!ok)
2787 return ok;
2789 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2790 vectorization, since we do not want to add extra peeling or
2791 add versioning for alignment. */
2792 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2793 /* This pass will decide on using loop versioning and/or loop peeling in
2794 order to enhance the alignment of data references in the loop. */
2795 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2796 if (!ok)
2797 return ok;
2799 if (slp)
2801 /* Analyze operations in the SLP instances. Note this may
2802 remove unsupported SLP instances which makes the above
2803 SLP kind detection invalid. */
2804 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2805 vect_slp_analyze_operations (loop_vinfo);
2806 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2808 ok = opt_result::failure_at (vect_location,
2809 "unsupported SLP instances\n");
2810 goto again;
2813 /* Check whether any load in ALL SLP instances is possibly permuted. */
2814 slp_tree load_node, slp_root;
2815 unsigned i, x;
2816 slp_instance instance;
2817 bool can_use_lanes = true;
2818 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2820 slp_root = SLP_INSTANCE_TREE (instance);
2821 int group_size = SLP_TREE_LANES (slp_root);
2822 tree vectype = SLP_TREE_VECTYPE (slp_root);
2823 bool loads_permuted = false;
2824 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2826 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2827 continue;
2828 unsigned j;
2829 stmt_vec_info load_info;
2830 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2831 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2833 loads_permuted = true;
2834 break;
2838 /* If the loads and stores can be handled with load/store-lane
2839 instructions record it and move on to the next instance. */
2840 if (loads_permuted
2841 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2842 && vect_store_lanes_supported (vectype, group_size, false)
2843 != IFN_LAST)
2845 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2847 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2848 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2849 /* Use SLP for strided accesses (or if we can't
2850 load-lanes). */
2851 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2852 || vect_load_lanes_supported
2853 (STMT_VINFO_VECTYPE (stmt_vinfo),
2854 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2855 break;
2858 can_use_lanes
2859 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2861 if (can_use_lanes && dump_enabled_p ())
2862 dump_printf_loc (MSG_NOTE, vect_location,
2863 "SLP instance %p can use load/store-lanes\n",
2864 (void *) instance);
2866 else
2868 can_use_lanes = false;
2869 break;
2873 /* If all SLP instances can use load/store-lanes abort SLP and try again
2874 with SLP disabled. */
2875 if (can_use_lanes)
2877 ok = opt_result::failure_at (vect_location,
2878 "Built SLP cancelled: can use "
2879 "load/store-lanes\n");
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882 "Built SLP cancelled: all SLP instances support "
2883 "load/store-lanes\n");
2884 goto again;
2888 /* Dissolve SLP-only groups. */
2889 vect_dissolve_slp_only_groups (loop_vinfo);
2891 /* Scan all the remaining operations in the loop that are not subject
2892 to SLP and make sure they are vectorizable. */
2893 ok = vect_analyze_loop_operations (loop_vinfo);
2894 if (!ok)
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898 "bad operation or unsupported loop bound.\n");
2899 return ok;
2902 /* For now, we don't expect to mix both masking and length approaches for one
2903 loop, disable it if both are recorded. */
2904 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2905 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2906 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "can't vectorize a loop with partial vectors"
2911 " because we don't expect to mix different"
2912 " approaches with partial vectors for the"
2913 " same loop.\n");
2914 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2917 /* If we still have the option of using partial vectors,
2918 check whether we can generate the necessary loop controls. */
2919 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2921 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2923 if (!vect_verify_full_masking (loop_vinfo)
2924 && !vect_verify_full_masking_avx512 (loop_vinfo))
2925 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2927 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2928 if (!vect_verify_loop_lens (loop_vinfo))
2929 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2932 /* If we're vectorizing a loop that uses length "controls" and
2933 can iterate more than once, we apply decrementing IV approach
2934 in loop control. */
2935 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2936 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2937 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2938 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2939 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2940 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2941 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2943 /* If a loop uses length controls and has a decrementing loop control IV,
2944 we will normally pass that IV through a MIN_EXPR to calcaluate the
2945 basis for the length controls. E.g. in a loop that processes one
2946 element per scalar iteration, the number of elements would be
2947 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2949 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2950 step, since only the final iteration of the vector loop can have
2951 inactive lanes.
2953 However, some targets have a dedicated instruction for calculating the
2954 preferred length, given the total number of elements that still need to
2955 be processed. This is encapsulated in the SELECT_VL internal function.
2957 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2958 to determine the basis for the length controls. However, unlike the
2959 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2960 lanes inactive in any iteration of the vector loop, not just the last
2961 iteration. This SELECT_VL approach therefore requires us to use pointer
2962 IVs with variable steps.
2964 Once we've decided how many elements should be processed by one
2965 iteration of the vector loop, we need to populate the rgroup controls.
2966 If a loop has multiple rgroups, we need to make sure that those rgroups
2967 "line up" (that is, they must be consistent about which elements are
2968 active and which aren't). This is done by vect_adjust_loop_lens_control.
2970 In principle, it would be possible to use vect_adjust_loop_lens_control
2971 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2972 However:
2974 (1) In practice, it only makes sense to use SELECT_VL when a vector
2975 operation will be controlled directly by the result. It is not
2976 worth using SELECT_VL if it would only be the input to other
2977 calculations.
2979 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2980 pointer IV will need N updates by a variable amount (N-1 updates
2981 within the iteration and 1 update to move to the next iteration).
2983 Because of this, we prefer to use the MIN_EXPR approach whenever there
2984 is more than one length control.
2986 In addition, SELECT_VL always operates to a granularity of 1 unit.
2987 If we wanted to use it to control an SLP operation on N consecutive
2988 elements, we would need to make the SELECT_VL inputs measure scalar
2989 iterations (rather than elements) and then multiply the SELECT_VL
2990 result by N. But using SELECT_VL this way is inefficient because
2991 of (1) above.
2993 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
2994 satisfied:
2996 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2997 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2999 Since SELECT_VL (variable step) will make SCEV analysis failed and then
3000 we will fail to gain benefits of following unroll optimizations. We prefer
3001 using the MIN_EXPR approach in this situation. */
3002 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3004 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3005 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3006 OPTIMIZE_FOR_SPEED)
3007 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3008 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3009 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3010 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3011 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3014 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3015 assuming that the loop will be used as a main loop. We will redo
3016 this analysis later if we instead decide to use the loop as an
3017 epilogue loop. */
3018 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3019 if (!ok)
3020 return ok;
3022 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3023 to be able to handle fewer than VF scalars, or needs to have a lower VF
3024 than the main loop. */
3025 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3026 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3028 poly_uint64 unscaled_vf
3029 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3030 orig_loop_vinfo->suggested_unroll_factor);
3031 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3032 return opt_result::failure_at (vect_location,
3033 "Vectorization factor too high for"
3034 " epilogue loop.\n");
3037 /* Check the costings of the loop make vectorizing worthwhile. */
3038 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3039 if (res < 0)
3041 ok = opt_result::failure_at (vect_location,
3042 "Loop costings may not be worthwhile.\n");
3043 goto again;
3045 if (!res)
3046 return opt_result::failure_at (vect_location,
3047 "Loop costings not worthwhile.\n");
3049 /* If an epilogue loop is required make sure we can create one. */
3050 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3051 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3055 if (!vect_can_advance_ivs_p (loop_vinfo)
3056 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3057 single_exit (LOOP_VINFO_LOOP
3058 (loop_vinfo))))
3060 ok = opt_result::failure_at (vect_location,
3061 "not vectorized: can't create required "
3062 "epilog loop\n");
3063 goto again;
3067 /* During peeling, we need to check if number of loop iterations is
3068 enough for both peeled prolog loop and vector loop. This check
3069 can be merged along with threshold check of loop versioning, so
3070 increase threshold for this case if necessary.
3072 If we are analyzing an epilogue we still want to check what its
3073 versioning threshold would be. If we decide to vectorize the epilogues we
3074 will want to use the lowest versioning threshold of all epilogues and main
3075 loop. This will enable us to enter a vectorized epilogue even when
3076 versioning the loop. We can't simply check whether the epilogue requires
3077 versioning though since we may have skipped some versioning checks when
3078 analyzing the epilogue. For instance, checks for alias versioning will be
3079 skipped when dealing with epilogues as we assume we already checked them
3080 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3081 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3083 poly_uint64 niters_th = 0;
3084 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3086 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3088 /* Niters for peeled prolog loop. */
3089 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3091 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3092 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3093 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3095 else
3096 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3099 /* Niters for at least one iteration of vectorized loop. */
3100 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3101 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3102 /* One additional iteration because of peeling for gap. */
3103 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3104 niters_th += 1;
3106 /* Use the same condition as vect_transform_loop to decide when to use
3107 the cost to determine a versioning threshold. */
3108 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3109 && ordered_p (th, niters_th))
3110 niters_th = ordered_max (poly_uint64 (th), niters_th);
3112 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3115 gcc_assert (known_eq (vectorization_factor,
3116 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3118 slp_done_for_suggested_uf = slp;
3120 /* Ok to vectorize! */
3121 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3122 return opt_result::success ();
3124 again:
3125 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3126 gcc_assert (!ok);
3128 /* Try again with SLP forced off but if we didn't do any SLP there is
3129 no point in re-trying. */
3130 if (!slp)
3131 return ok;
3133 /* If the slp decision is true when suggested unroll factor is worked
3134 out, and we are applying suggested unroll factor, we don't need to
3135 re-try any more. */
3136 if (applying_suggested_uf && slp_done_for_suggested_uf)
3137 return ok;
3139 /* If there are reduction chains re-trying will fail anyway. */
3140 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3141 return ok;
3143 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3144 via interleaving or lane instructions. */
3145 slp_instance instance;
3146 slp_tree node;
3147 unsigned i, j;
3148 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3150 stmt_vec_info vinfo;
3151 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3152 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3153 continue;
3154 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3155 unsigned int size = DR_GROUP_SIZE (vinfo);
3156 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3157 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3158 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3159 && ! vect_grouped_store_supported (vectype, size))
3160 return opt_result::failure_at (vinfo->stmt,
3161 "unsupported grouped store\n");
3162 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3164 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3165 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3166 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3167 size = DR_GROUP_SIZE (vinfo);
3168 vectype = STMT_VINFO_VECTYPE (vinfo);
3169 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3170 && ! vect_grouped_load_supported (vectype, single_element_p,
3171 size))
3172 return opt_result::failure_at (vinfo->stmt,
3173 "unsupported grouped load\n");
3177 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE, vect_location,
3179 "re-trying with SLP disabled\n");
3181 /* Roll back state appropriately. No SLP this time. */
3182 slp = false;
3183 /* Restore vectorization factor as it were without SLP. */
3184 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3185 /* Free the SLP instances. */
3186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3187 vect_free_slp_instance (instance);
3188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3189 /* Reset SLP type to loop_vect on all stmts. */
3190 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3192 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3193 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3194 !gsi_end_p (si); gsi_next (&si))
3196 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3197 STMT_SLP_TYPE (stmt_info) = loop_vect;
3198 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3199 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3201 /* vectorizable_reduction adjusts reduction stmt def-types,
3202 restore them to that of the PHI. */
3203 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3204 = STMT_VINFO_DEF_TYPE (stmt_info);
3205 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3206 (STMT_VINFO_REDUC_DEF (stmt_info)))
3207 = STMT_VINFO_DEF_TYPE (stmt_info);
3210 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3211 !gsi_end_p (si); gsi_next (&si))
3213 if (is_gimple_debug (gsi_stmt (si)))
3214 continue;
3215 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3216 STMT_SLP_TYPE (stmt_info) = loop_vect;
3217 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3219 stmt_vec_info pattern_stmt_info
3220 = STMT_VINFO_RELATED_STMT (stmt_info);
3221 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3222 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3225 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3226 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3227 !gsi_end_p (pi); gsi_next (&pi))
3228 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3229 = loop_vect;
3233 /* Free optimized alias test DDRS. */
3234 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3235 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3236 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3237 /* Reset target cost data. */
3238 delete loop_vinfo->vector_costs;
3239 loop_vinfo->vector_costs = nullptr;
3240 /* Reset accumulated rgroup information. */
3241 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3242 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3243 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3244 /* Reset assorted flags. */
3245 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3246 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3247 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3248 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3249 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3250 = saved_can_use_partial_vectors_p;
3252 goto start_over;
3255 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3256 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3257 OLD_LOOP_VINFO is better unless something specifically indicates
3258 otherwise.
3260 Note that this deliberately isn't a partial order. */
3262 static bool
3263 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3264 loop_vec_info old_loop_vinfo)
3266 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3267 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3269 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3270 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3272 /* Always prefer a VF of loop->simdlen over any other VF. */
3273 if (loop->simdlen)
3275 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3276 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3277 if (new_simdlen_p != old_simdlen_p)
3278 return new_simdlen_p;
3281 const auto *old_costs = old_loop_vinfo->vector_costs;
3282 const auto *new_costs = new_loop_vinfo->vector_costs;
3283 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3284 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3286 return new_costs->better_main_loop_than_p (old_costs);
3289 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3290 true if we should. */
3292 static bool
3293 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3294 loop_vec_info old_loop_vinfo)
3296 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3297 return false;
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "***** Preferring vector mode %s to vector mode %s\n",
3302 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3303 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3304 return true;
3307 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3308 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3309 MODE_I to the next mode useful to analyze.
3310 Return the loop_vinfo on success and wrapped null on failure. */
3312 static opt_loop_vec_info
3313 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3314 const vect_loop_form_info *loop_form_info,
3315 loop_vec_info main_loop_vinfo,
3316 const vector_modes &vector_modes, unsigned &mode_i,
3317 machine_mode &autodetected_vector_mode,
3318 bool &fatal)
3320 loop_vec_info loop_vinfo
3321 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3323 machine_mode vector_mode = vector_modes[mode_i];
3324 loop_vinfo->vector_mode = vector_mode;
3325 unsigned int suggested_unroll_factor = 1;
3326 bool slp_done_for_suggested_uf = false;
3328 /* Run the main analysis. */
3329 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3330 &suggested_unroll_factor,
3331 slp_done_for_suggested_uf);
3332 if (dump_enabled_p ())
3333 dump_printf_loc (MSG_NOTE, vect_location,
3334 "***** Analysis %s with vector mode %s\n",
3335 res ? "succeeded" : " failed",
3336 GET_MODE_NAME (loop_vinfo->vector_mode));
3338 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3340 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE, vect_location,
3342 "***** Re-trying analysis for unrolling"
3343 " with unroll factor %d and slp %s.\n",
3344 suggested_unroll_factor,
3345 slp_done_for_suggested_uf ? "on" : "off");
3346 loop_vec_info unroll_vinfo
3347 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3348 unroll_vinfo->vector_mode = vector_mode;
3349 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3350 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3351 slp_done_for_suggested_uf);
3352 if (new_res)
3354 delete loop_vinfo;
3355 loop_vinfo = unroll_vinfo;
3357 else
3358 delete unroll_vinfo;
3361 /* Remember the autodetected vector mode. */
3362 if (vector_mode == VOIDmode)
3363 autodetected_vector_mode = loop_vinfo->vector_mode;
3365 /* Advance mode_i, first skipping modes that would result in the
3366 same analysis result. */
3367 while (mode_i + 1 < vector_modes.length ()
3368 && vect_chooses_same_modes_p (loop_vinfo,
3369 vector_modes[mode_i + 1]))
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_NOTE, vect_location,
3373 "***** The result for vector mode %s would"
3374 " be the same\n",
3375 GET_MODE_NAME (vector_modes[mode_i + 1]));
3376 mode_i += 1;
3378 if (mode_i + 1 < vector_modes.length ()
3379 && VECTOR_MODE_P (autodetected_vector_mode)
3380 && (related_vector_mode (vector_modes[mode_i + 1],
3381 GET_MODE_INNER (autodetected_vector_mode))
3382 == autodetected_vector_mode)
3383 && (related_vector_mode (autodetected_vector_mode,
3384 GET_MODE_INNER (vector_modes[mode_i + 1]))
3385 == vector_modes[mode_i + 1]))
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location,
3389 "***** Skipping vector mode %s, which would"
3390 " repeat the analysis for %s\n",
3391 GET_MODE_NAME (vector_modes[mode_i + 1]),
3392 GET_MODE_NAME (autodetected_vector_mode));
3393 mode_i += 1;
3395 mode_i++;
3397 if (!res)
3399 delete loop_vinfo;
3400 if (fatal)
3401 gcc_checking_assert (main_loop_vinfo == NULL);
3402 return opt_loop_vec_info::propagate_failure (res);
3405 return opt_loop_vec_info::success (loop_vinfo);
3408 /* Function vect_analyze_loop.
3410 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3411 for it. The different analyses will record information in the
3412 loop_vec_info struct. */
3413 opt_loop_vec_info
3414 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3416 DUMP_VECT_SCOPE ("analyze_loop_nest");
3418 if (loop_outer (loop)
3419 && loop_vec_info_for_loop (loop_outer (loop))
3420 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3421 return opt_loop_vec_info::failure_at (vect_location,
3422 "outer-loop already vectorized.\n");
3424 if (!find_loop_nest (loop, &shared->loop_nest))
3425 return opt_loop_vec_info::failure_at
3426 (vect_location,
3427 "not vectorized: loop nest containing two or more consecutive inner"
3428 " loops cannot be vectorized\n");
3430 /* Analyze the loop form. */
3431 vect_loop_form_info loop_form_info;
3432 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3433 if (!res)
3435 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3437 "bad loop form.\n");
3438 return opt_loop_vec_info::propagate_failure (res);
3440 if (!integer_onep (loop_form_info.assumptions))
3442 /* We consider to vectorize this loop by versioning it under
3443 some assumptions. In order to do this, we need to clear
3444 existing information computed by scev and niter analyzer. */
3445 scev_reset_htab ();
3446 free_numbers_of_iterations_estimates (loop);
3447 /* Also set flag for this loop so that following scev and niter
3448 analysis are done under the assumptions. */
3449 loop_constraint_set (loop, LOOP_C_FINITE);
3452 auto_vector_modes vector_modes;
3453 /* Autodetect first vector size we try. */
3454 vector_modes.safe_push (VOIDmode);
3455 unsigned int autovec_flags
3456 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3457 loop->simdlen != 0);
3458 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3459 && !unlimited_cost_model (loop));
3460 machine_mode autodetected_vector_mode = VOIDmode;
3461 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3462 unsigned int mode_i = 0;
3463 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3465 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3466 a mode has not been analyzed. */
3467 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3468 for (unsigned i = 0; i < vector_modes.length (); ++i)
3469 cached_vf_per_mode.safe_push (0);
3471 /* First determine the main loop vectorization mode, either the first
3472 one that works, starting with auto-detecting the vector mode and then
3473 following the targets order of preference, or the one with the
3474 lowest cost if pick_lowest_cost_p. */
3475 while (1)
3477 bool fatal;
3478 unsigned int last_mode_i = mode_i;
3479 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3480 failed. */
3481 cached_vf_per_mode[last_mode_i] = -1;
3482 opt_loop_vec_info loop_vinfo
3483 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3484 NULL, vector_modes, mode_i,
3485 autodetected_vector_mode, fatal);
3486 if (fatal)
3487 break;
3489 if (loop_vinfo)
3491 /* Analyzis has been successful so update the VF value. The
3492 VF should always be a multiple of unroll_factor and we want to
3493 capture the original VF here. */
3494 cached_vf_per_mode[last_mode_i]
3495 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3496 loop_vinfo->suggested_unroll_factor);
3497 /* Once we hit the desired simdlen for the first time,
3498 discard any previous attempts. */
3499 if (simdlen
3500 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3502 delete first_loop_vinfo;
3503 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3504 simdlen = 0;
3506 else if (pick_lowest_cost_p
3507 && first_loop_vinfo
3508 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3510 /* Pick loop_vinfo over first_loop_vinfo. */
3511 delete first_loop_vinfo;
3512 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3514 if (first_loop_vinfo == NULL)
3515 first_loop_vinfo = loop_vinfo;
3516 else
3518 delete loop_vinfo;
3519 loop_vinfo = opt_loop_vec_info::success (NULL);
3522 /* Commit to first_loop_vinfo if we have no reason to try
3523 alternatives. */
3524 if (!simdlen && !pick_lowest_cost_p)
3525 break;
3527 if (mode_i == vector_modes.length ()
3528 || autodetected_vector_mode == VOIDmode)
3529 break;
3531 /* Try the next biggest vector size. */
3532 if (dump_enabled_p ())
3533 dump_printf_loc (MSG_NOTE, vect_location,
3534 "***** Re-trying analysis with vector mode %s\n",
3535 GET_MODE_NAME (vector_modes[mode_i]));
3537 if (!first_loop_vinfo)
3538 return opt_loop_vec_info::propagate_failure (res);
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "***** Choosing vector mode %s\n",
3543 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3545 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3546 enabled, SIMDUID is not set, it is the innermost loop and we have
3547 either already found the loop's SIMDLEN or there was no SIMDLEN to
3548 begin with.
3549 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3550 bool vect_epilogues = (!simdlen
3551 && loop->inner == NULL
3552 && param_vect_epilogues_nomask
3553 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3554 && !loop->simduid);
3555 if (!vect_epilogues)
3556 return first_loop_vinfo;
3558 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3559 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3561 /* For epilogues start the analysis from the first mode. The motivation
3562 behind starting from the beginning comes from cases where the VECTOR_MODES
3563 array may contain length-agnostic and length-specific modes. Their
3564 ordering is not guaranteed, so we could end up picking a mode for the main
3565 loop that is after the epilogue's optimal mode. */
3566 vector_modes[0] = autodetected_vector_mode;
3567 mode_i = 0;
3569 bool supports_partial_vectors =
3570 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3571 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3573 while (1)
3575 /* If the target does not support partial vectors we can shorten the
3576 number of modes to analyze for the epilogue as we know we can't pick a
3577 mode that would lead to a VF at least as big as the
3578 FIRST_VINFO_VF. */
3579 if (!supports_partial_vectors
3580 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3582 mode_i++;
3583 if (mode_i == vector_modes.length ())
3584 break;
3585 continue;
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_NOTE, vect_location,
3590 "***** Re-trying epilogue analysis with vector "
3591 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3593 bool fatal;
3594 opt_loop_vec_info loop_vinfo
3595 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3596 first_loop_vinfo,
3597 vector_modes, mode_i,
3598 autodetected_vector_mode, fatal);
3599 if (fatal)
3600 break;
3602 if (loop_vinfo)
3604 if (pick_lowest_cost_p)
3606 /* Keep trying to roll back vectorization attempts while the
3607 loop_vec_infos they produced were worse than this one. */
3608 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3609 while (!vinfos.is_empty ()
3610 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3612 gcc_assert (vect_epilogues);
3613 delete vinfos.pop ();
3616 /* For now only allow one epilogue loop. */
3617 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3619 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3620 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3621 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3622 || maybe_ne (lowest_th, 0U));
3623 /* Keep track of the known smallest versioning
3624 threshold. */
3625 if (ordered_p (lowest_th, th))
3626 lowest_th = ordered_min (lowest_th, th);
3628 else
3630 delete loop_vinfo;
3631 loop_vinfo = opt_loop_vec_info::success (NULL);
3634 /* For now only allow one epilogue loop, but allow
3635 pick_lowest_cost_p to replace it, so commit to the
3636 first epilogue if we have no reason to try alternatives. */
3637 if (!pick_lowest_cost_p)
3638 break;
3641 if (mode_i == vector_modes.length ())
3642 break;
3646 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3648 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3649 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_NOTE, vect_location,
3651 "***** Choosing epilogue vector mode %s\n",
3652 GET_MODE_NAME
3653 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3656 return first_loop_vinfo;
3659 /* Return true if there is an in-order reduction function for CODE, storing
3660 it in *REDUC_FN if so. */
3662 static bool
3663 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3665 if (code == PLUS_EXPR)
3667 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3668 return true;
3670 return false;
3673 /* Function reduction_fn_for_scalar_code
3675 Input:
3676 CODE - tree_code of a reduction operations.
3678 Output:
3679 REDUC_FN - the corresponding internal function to be used to reduce the
3680 vector of partial results into a single scalar result, or IFN_LAST
3681 if the operation is a supported reduction operation, but does not have
3682 such an internal function.
3684 Return FALSE if CODE currently cannot be vectorized as reduction. */
3686 bool
3687 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3689 if (code.is_tree_code ())
3690 switch (tree_code (code))
3692 case MAX_EXPR:
3693 *reduc_fn = IFN_REDUC_MAX;
3694 return true;
3696 case MIN_EXPR:
3697 *reduc_fn = IFN_REDUC_MIN;
3698 return true;
3700 case PLUS_EXPR:
3701 *reduc_fn = IFN_REDUC_PLUS;
3702 return true;
3704 case BIT_AND_EXPR:
3705 *reduc_fn = IFN_REDUC_AND;
3706 return true;
3708 case BIT_IOR_EXPR:
3709 *reduc_fn = IFN_REDUC_IOR;
3710 return true;
3712 case BIT_XOR_EXPR:
3713 *reduc_fn = IFN_REDUC_XOR;
3714 return true;
3716 case MULT_EXPR:
3717 case MINUS_EXPR:
3718 *reduc_fn = IFN_LAST;
3719 return true;
3721 default:
3722 return false;
3724 else
3725 switch (combined_fn (code))
3727 CASE_CFN_FMAX:
3728 *reduc_fn = IFN_REDUC_FMAX;
3729 return true;
3731 CASE_CFN_FMIN:
3732 *reduc_fn = IFN_REDUC_FMIN;
3733 return true;
3735 default:
3736 return false;
3740 /* If there is a neutral value X such that a reduction would not be affected
3741 by the introduction of additional X elements, return that X, otherwise
3742 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3743 of the scalar elements. If the reduction has just a single initial value
3744 then INITIAL_VALUE is that value, otherwise it is null. */
3746 tree
3747 neutral_op_for_reduction (tree scalar_type, code_helper code,
3748 tree initial_value)
3750 if (code.is_tree_code ())
3751 switch (tree_code (code))
3753 case WIDEN_SUM_EXPR:
3754 case DOT_PROD_EXPR:
3755 case SAD_EXPR:
3756 case PLUS_EXPR:
3757 case MINUS_EXPR:
3758 case BIT_IOR_EXPR:
3759 case BIT_XOR_EXPR:
3760 return build_zero_cst (scalar_type);
3762 case MULT_EXPR:
3763 return build_one_cst (scalar_type);
3765 case BIT_AND_EXPR:
3766 return build_all_ones_cst (scalar_type);
3768 case MAX_EXPR:
3769 case MIN_EXPR:
3770 return initial_value;
3772 default:
3773 return NULL_TREE;
3775 else
3776 switch (combined_fn (code))
3778 CASE_CFN_FMIN:
3779 CASE_CFN_FMAX:
3780 return initial_value;
3782 default:
3783 return NULL_TREE;
3787 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3788 STMT is printed with a message MSG. */
3790 static void
3791 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3793 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3796 /* Return true if we need an in-order reduction for operation CODE
3797 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3798 overflow must wrap. */
3800 bool
3801 needs_fold_left_reduction_p (tree type, code_helper code)
3803 /* CHECKME: check for !flag_finite_math_only too? */
3804 if (SCALAR_FLOAT_TYPE_P (type))
3806 if (code.is_tree_code ())
3807 switch (tree_code (code))
3809 case MIN_EXPR:
3810 case MAX_EXPR:
3811 return false;
3813 default:
3814 return !flag_associative_math;
3816 else
3817 switch (combined_fn (code))
3819 CASE_CFN_FMIN:
3820 CASE_CFN_FMAX:
3821 return false;
3823 default:
3824 return !flag_associative_math;
3828 if (INTEGRAL_TYPE_P (type))
3829 return (!code.is_tree_code ()
3830 || !operation_no_trapping_overflow (type, tree_code (code)));
3832 if (SAT_FIXED_POINT_TYPE_P (type))
3833 return true;
3835 return false;
3838 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3839 has a handled computation expression. Store the main reduction
3840 operation in *CODE. */
3842 static bool
3843 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3844 tree loop_arg, code_helper *code,
3845 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3847 auto_bitmap visited;
3848 tree lookfor = PHI_RESULT (phi);
3849 ssa_op_iter curri;
3850 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3851 while (USE_FROM_PTR (curr) != loop_arg)
3852 curr = op_iter_next_use (&curri);
3853 curri.i = curri.numops;
3856 path.safe_push (std::make_pair (curri, curr));
3857 tree use = USE_FROM_PTR (curr);
3858 if (use == lookfor)
3859 break;
3860 gimple *def = SSA_NAME_DEF_STMT (use);
3861 if (gimple_nop_p (def)
3862 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3864 pop:
3867 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3868 curri = x.first;
3869 curr = x.second;
3871 curr = op_iter_next_use (&curri);
3872 /* Skip already visited or non-SSA operands (from iterating
3873 over PHI args). */
3874 while (curr != NULL_USE_OPERAND_P
3875 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3876 || ! bitmap_set_bit (visited,
3877 SSA_NAME_VERSION
3878 (USE_FROM_PTR (curr)))));
3880 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3881 if (curr == NULL_USE_OPERAND_P)
3882 break;
3884 else
3886 if (gimple_code (def) == GIMPLE_PHI)
3887 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3888 else
3889 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3890 while (curr != NULL_USE_OPERAND_P
3891 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3892 || ! bitmap_set_bit (visited,
3893 SSA_NAME_VERSION
3894 (USE_FROM_PTR (curr)))))
3895 curr = op_iter_next_use (&curri);
3896 if (curr == NULL_USE_OPERAND_P)
3897 goto pop;
3900 while (1);
3901 if (dump_file && (dump_flags & TDF_DETAILS))
3903 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3904 unsigned i;
3905 std::pair<ssa_op_iter, use_operand_p> *x;
3906 FOR_EACH_VEC_ELT (path, i, x)
3907 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3908 dump_printf (MSG_NOTE, "\n");
3911 /* Check whether the reduction path detected is valid. */
3912 bool fail = path.length () == 0;
3913 bool neg = false;
3914 int sign = -1;
3915 *code = ERROR_MARK;
3916 for (unsigned i = 1; i < path.length (); ++i)
3918 gimple *use_stmt = USE_STMT (path[i].second);
3919 gimple_match_op op;
3920 if (!gimple_extract_op (use_stmt, &op))
3922 fail = true;
3923 break;
3925 unsigned int opi = op.num_ops;
3926 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3928 /* The following make sure we can compute the operand index
3929 easily plus it mostly disallows chaining via COND_EXPR condition
3930 operands. */
3931 for (opi = 0; opi < op.num_ops; ++opi)
3932 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3933 break;
3935 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3937 for (opi = 0; opi < op.num_ops; ++opi)
3938 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3939 break;
3941 if (opi == op.num_ops)
3943 fail = true;
3944 break;
3946 op.code = canonicalize_code (op.code, op.type);
3947 if (op.code == MINUS_EXPR)
3949 op.code = PLUS_EXPR;
3950 /* Track whether we negate the reduction value each iteration. */
3951 if (op.ops[1] == op.ops[opi])
3952 neg = ! neg;
3954 if (CONVERT_EXPR_CODE_P (op.code)
3955 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3957 else if (*code == ERROR_MARK)
3959 *code = op.code;
3960 sign = TYPE_SIGN (op.type);
3962 else if (op.code != *code)
3964 fail = true;
3965 break;
3967 else if ((op.code == MIN_EXPR
3968 || op.code == MAX_EXPR)
3969 && sign != TYPE_SIGN (op.type))
3971 fail = true;
3972 break;
3974 /* Check there's only a single stmt the op is used on. For the
3975 not value-changing tail and the last stmt allow out-of-loop uses.
3976 ??? We could relax this and handle arbitrary live stmts by
3977 forcing a scalar epilogue for example. */
3978 imm_use_iterator imm_iter;
3979 gimple *op_use_stmt;
3980 unsigned cnt = 0;
3981 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3982 if (!is_gimple_debug (op_use_stmt)
3983 && (*code != ERROR_MARK
3984 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3986 /* We want to allow x + x but not x < 1 ? x : 2. */
3987 if (is_gimple_assign (op_use_stmt)
3988 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3990 use_operand_p use_p;
3991 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3992 cnt++;
3994 else
3995 cnt++;
3997 if (cnt != 1)
3999 fail = true;
4000 break;
4003 return ! fail && ! neg && *code != ERROR_MARK;
4006 bool
4007 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4008 tree loop_arg, enum tree_code code)
4010 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4011 code_helper code_;
4012 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4013 && code_ == code);
4018 /* Function vect_is_simple_reduction
4020 (1) Detect a cross-iteration def-use cycle that represents a simple
4021 reduction computation. We look for the following pattern:
4023 loop_header:
4024 a1 = phi < a0, a2 >
4025 a3 = ...
4026 a2 = operation (a3, a1)
4030 a3 = ...
4031 loop_header:
4032 a1 = phi < a0, a2 >
4033 a2 = operation (a3, a1)
4035 such that:
4036 1. operation is commutative and associative and it is safe to
4037 change the order of the computation
4038 2. no uses for a2 in the loop (a2 is used out of the loop)
4039 3. no uses of a1 in the loop besides the reduction operation
4040 4. no uses of a1 outside the loop.
4042 Conditions 1,4 are tested here.
4043 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4045 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4046 nested cycles.
4048 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4049 reductions:
4051 a1 = phi < a0, a2 >
4052 inner loop (def of a3)
4053 a2 = phi < a3 >
4055 (4) Detect condition expressions, ie:
4056 for (int i = 0; i < N; i++)
4057 if (a[i] < val)
4058 ret_val = a[i];
4062 static stmt_vec_info
4063 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4064 bool *double_reduc, bool *reduc_chain_p, bool slp)
4066 gphi *phi = as_a <gphi *> (phi_info->stmt);
4067 gimple *phi_use_stmt = NULL;
4068 imm_use_iterator imm_iter;
4069 use_operand_p use_p;
4071 *double_reduc = false;
4072 *reduc_chain_p = false;
4073 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4075 tree phi_name = PHI_RESULT (phi);
4076 /* ??? If there are no uses of the PHI result the inner loop reduction
4077 won't be detected as possibly double-reduction by vectorizable_reduction
4078 because that tries to walk the PHI arg from the preheader edge which
4079 can be constant. See PR60382. */
4080 if (has_zero_uses (phi_name))
4081 return NULL;
4082 class loop *loop = (gimple_bb (phi))->loop_father;
4083 unsigned nphi_def_loop_uses = 0;
4084 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4086 gimple *use_stmt = USE_STMT (use_p);
4087 if (is_gimple_debug (use_stmt))
4088 continue;
4090 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4092 if (dump_enabled_p ())
4093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4094 "intermediate value used outside loop.\n");
4096 return NULL;
4099 nphi_def_loop_uses++;
4100 phi_use_stmt = use_stmt;
4103 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4104 if (TREE_CODE (latch_def) != SSA_NAME)
4106 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4108 "reduction: not ssa_name: %T\n", latch_def);
4109 return NULL;
4112 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4113 if (!def_stmt_info
4114 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4115 return NULL;
4117 bool nested_in_vect_loop
4118 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4119 unsigned nlatch_def_loop_uses = 0;
4120 auto_vec<gphi *, 3> lcphis;
4121 bool inner_loop_of_double_reduc = false;
4122 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4124 gimple *use_stmt = USE_STMT (use_p);
4125 if (is_gimple_debug (use_stmt))
4126 continue;
4127 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4128 nlatch_def_loop_uses++;
4129 else
4131 /* We can have more than one loop-closed PHI. */
4132 lcphis.safe_push (as_a <gphi *> (use_stmt));
4133 if (nested_in_vect_loop
4134 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4135 == vect_double_reduction_def))
4136 inner_loop_of_double_reduc = true;
4140 /* If we are vectorizing an inner reduction we are executing that
4141 in the original order only in case we are not dealing with a
4142 double reduction. */
4143 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4145 if (dump_enabled_p ())
4146 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4147 "detected nested cycle: ");
4148 return def_stmt_info;
4151 /* When the inner loop of a double reduction ends up with more than
4152 one loop-closed PHI we have failed to classify alternate such
4153 PHIs as double reduction, leading to wrong code. See PR103237. */
4154 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4156 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4158 "unhandle double reduction\n");
4159 return NULL;
4162 /* If this isn't a nested cycle or if the nested cycle reduction value
4163 is used ouside of the inner loop we cannot handle uses of the reduction
4164 value. */
4165 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4167 if (dump_enabled_p ())
4168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4169 "reduction used in loop.\n");
4170 return NULL;
4173 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4174 defined in the inner loop. */
4175 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4177 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4178 if (gimple_phi_num_args (def_stmt) != 1
4179 || TREE_CODE (op1) != SSA_NAME)
4181 if (dump_enabled_p ())
4182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183 "unsupported phi node definition.\n");
4185 return NULL;
4188 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4189 and the latch definition op1. */
4190 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4191 if (gimple_bb (def1)
4192 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4193 && loop->inner
4194 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4195 && (is_gimple_assign (def1) || is_gimple_call (def1))
4196 && is_a <gphi *> (phi_use_stmt)
4197 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4198 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4199 loop_latch_edge (loop->inner))))
4201 if (dump_enabled_p ())
4202 report_vect_op (MSG_NOTE, def_stmt,
4203 "detected double reduction: ");
4205 *double_reduc = true;
4206 return def_stmt_info;
4209 return NULL;
4212 /* Look for the expression computing latch_def from then loop PHI result. */
4213 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4214 code_helper code;
4215 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4216 path))
4218 STMT_VINFO_REDUC_CODE (phi_info) = code;
4219 if (code == COND_EXPR && !nested_in_vect_loop)
4220 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4222 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4223 reduction chain for which the additional restriction is that
4224 all operations in the chain are the same. */
4225 auto_vec<stmt_vec_info, 8> reduc_chain;
4226 unsigned i;
4227 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4228 for (i = path.length () - 1; i >= 1; --i)
4230 gimple *stmt = USE_STMT (path[i].second);
4231 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4232 gimple_match_op op;
4233 if (!gimple_extract_op (stmt, &op))
4234 gcc_unreachable ();
4235 if (gassign *assign = dyn_cast<gassign *> (stmt))
4236 STMT_VINFO_REDUC_IDX (stmt_info)
4237 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4238 else
4240 gcall *call = as_a<gcall *> (stmt);
4241 STMT_VINFO_REDUC_IDX (stmt_info)
4242 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4244 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4245 && (i == 1 || i == path.length () - 1));
4246 if ((op.code != code && !leading_conversion)
4247 /* We can only handle the final value in epilogue
4248 generation for reduction chains. */
4249 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4250 is_slp_reduc = false;
4251 /* For reduction chains we support a trailing/leading
4252 conversions. We do not store those in the actual chain. */
4253 if (leading_conversion)
4254 continue;
4255 reduc_chain.safe_push (stmt_info);
4257 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4259 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4261 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4262 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4264 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4265 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4267 /* Save the chain for further analysis in SLP detection. */
4268 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4269 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4271 *reduc_chain_p = true;
4272 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_NOTE, vect_location,
4274 "reduction: detected reduction chain\n");
4276 else if (dump_enabled_p ())
4277 dump_printf_loc (MSG_NOTE, vect_location,
4278 "reduction: detected reduction\n");
4280 return def_stmt_info;
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_NOTE, vect_location,
4285 "reduction: unknown pattern\n");
4287 return NULL;
4290 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4291 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4292 or -1 if not known. */
4294 static int
4295 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4297 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4298 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4300 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_NOTE, vect_location,
4302 "cost model: epilogue peel iters set to vf/2 "
4303 "because loop iterations are unknown .\n");
4304 return assumed_vf / 2;
4306 else
4308 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4309 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4310 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4311 /* If we need to peel for gaps, but no peeling is required, we have to
4312 peel VF iterations. */
4313 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4314 peel_iters_epilogue = assumed_vf;
4315 return peel_iters_epilogue;
4319 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4321 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4322 int *peel_iters_epilogue,
4323 stmt_vector_for_cost *scalar_cost_vec,
4324 stmt_vector_for_cost *prologue_cost_vec,
4325 stmt_vector_for_cost *epilogue_cost_vec)
4327 int retval = 0;
4329 *peel_iters_epilogue
4330 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4332 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4334 /* If peeled iterations are known but number of scalar loop
4335 iterations are unknown, count a taken branch per peeled loop. */
4336 if (peel_iters_prologue > 0)
4337 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4338 vect_prologue);
4339 if (*peel_iters_epilogue > 0)
4340 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4341 vect_epilogue);
4344 stmt_info_for_cost *si;
4345 int j;
4346 if (peel_iters_prologue)
4347 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4348 retval += record_stmt_cost (prologue_cost_vec,
4349 si->count * peel_iters_prologue,
4350 si->kind, si->stmt_info, si->misalign,
4351 vect_prologue);
4352 if (*peel_iters_epilogue)
4353 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4354 retval += record_stmt_cost (epilogue_cost_vec,
4355 si->count * *peel_iters_epilogue,
4356 si->kind, si->stmt_info, si->misalign,
4357 vect_epilogue);
4359 return retval;
4362 /* Function vect_estimate_min_profitable_iters
4364 Return the number of iterations required for the vector version of the
4365 loop to be profitable relative to the cost of the scalar version of the
4366 loop.
4368 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4369 of iterations for vectorization. -1 value means loop vectorization
4370 is not profitable. This returned value may be used for dynamic
4371 profitability check.
4373 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4374 for static check against estimated number of iterations. */
4376 static void
4377 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4378 int *ret_min_profitable_niters,
4379 int *ret_min_profitable_estimate,
4380 unsigned *suggested_unroll_factor)
4382 int min_profitable_iters;
4383 int min_profitable_estimate;
4384 int peel_iters_prologue;
4385 int peel_iters_epilogue;
4386 unsigned vec_inside_cost = 0;
4387 int vec_outside_cost = 0;
4388 unsigned vec_prologue_cost = 0;
4389 unsigned vec_epilogue_cost = 0;
4390 int scalar_single_iter_cost = 0;
4391 int scalar_outside_cost = 0;
4392 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4393 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4394 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4396 /* Cost model disabled. */
4397 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4399 if (dump_enabled_p ())
4400 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4401 *ret_min_profitable_niters = 0;
4402 *ret_min_profitable_estimate = 0;
4403 return;
4406 /* Requires loop versioning tests to handle misalignment. */
4407 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4409 /* FIXME: Make cost depend on complexity of individual check. */
4410 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4411 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4412 if (dump_enabled_p ())
4413 dump_printf (MSG_NOTE,
4414 "cost model: Adding cost of checks for loop "
4415 "versioning to treat misalignment.\n");
4418 /* Requires loop versioning with alias checks. */
4419 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4421 /* FIXME: Make cost depend on complexity of individual check. */
4422 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4423 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4424 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4425 if (len)
4426 /* Count LEN - 1 ANDs and LEN comparisons. */
4427 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4428 scalar_stmt, vect_prologue);
4429 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4430 if (len)
4432 /* Count LEN - 1 ANDs and LEN comparisons. */
4433 unsigned int nstmts = len * 2 - 1;
4434 /* +1 for each bias that needs adding. */
4435 for (unsigned int i = 0; i < len; ++i)
4436 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4437 nstmts += 1;
4438 (void) add_stmt_cost (target_cost_data, nstmts,
4439 scalar_stmt, vect_prologue);
4441 if (dump_enabled_p ())
4442 dump_printf (MSG_NOTE,
4443 "cost model: Adding cost of checks for loop "
4444 "versioning aliasing.\n");
4447 /* Requires loop versioning with niter checks. */
4448 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4450 /* FIXME: Make cost depend on complexity of individual check. */
4451 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4452 NULL, NULL, NULL_TREE, 0, vect_prologue);
4453 if (dump_enabled_p ())
4454 dump_printf (MSG_NOTE,
4455 "cost model: Adding cost of checks for loop "
4456 "versioning niters.\n");
4459 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4460 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4461 vect_prologue);
4463 /* Count statements in scalar loop. Using this as scalar cost for a single
4464 iteration for now.
4466 TODO: Add outer loop support.
4468 TODO: Consider assigning different costs to different scalar
4469 statements. */
4471 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4473 /* Add additional cost for the peeled instructions in prologue and epilogue
4474 loop. (For fully-masked loops there will be no peeling.)
4476 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4477 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4479 TODO: Build an expression that represents peel_iters for prologue and
4480 epilogue to be used in a run-time test. */
4482 bool prologue_need_br_taken_cost = false;
4483 bool prologue_need_br_not_taken_cost = false;
4485 /* Calculate peel_iters_prologue. */
4486 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4487 peel_iters_prologue = 0;
4488 else if (npeel < 0)
4490 peel_iters_prologue = assumed_vf / 2;
4491 if (dump_enabled_p ())
4492 dump_printf (MSG_NOTE, "cost model: "
4493 "prologue peel iters set to vf/2.\n");
4495 /* If peeled iterations are unknown, count a taken branch and a not taken
4496 branch per peeled loop. Even if scalar loop iterations are known,
4497 vector iterations are not known since peeled prologue iterations are
4498 not known. Hence guards remain the same. */
4499 prologue_need_br_taken_cost = true;
4500 prologue_need_br_not_taken_cost = true;
4502 else
4504 peel_iters_prologue = npeel;
4505 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4506 /* If peeled iterations are known but number of scalar loop
4507 iterations are unknown, count a taken branch per peeled loop. */
4508 prologue_need_br_taken_cost = true;
4511 bool epilogue_need_br_taken_cost = false;
4512 bool epilogue_need_br_not_taken_cost = false;
4514 /* Calculate peel_iters_epilogue. */
4515 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4516 /* We need to peel exactly one iteration for gaps. */
4517 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4518 else if (npeel < 0)
4520 /* If peeling for alignment is unknown, loop bound of main loop
4521 becomes unknown. */
4522 peel_iters_epilogue = assumed_vf / 2;
4523 if (dump_enabled_p ())
4524 dump_printf (MSG_NOTE, "cost model: "
4525 "epilogue peel iters set to vf/2 because "
4526 "peeling for alignment is unknown.\n");
4528 /* See the same reason above in peel_iters_prologue calculation. */
4529 epilogue_need_br_taken_cost = true;
4530 epilogue_need_br_not_taken_cost = true;
4532 else
4534 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4535 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4536 /* If peeled iterations are known but number of scalar loop
4537 iterations are unknown, count a taken branch per peeled loop. */
4538 epilogue_need_br_taken_cost = true;
4541 stmt_info_for_cost *si;
4542 int j;
4543 /* Add costs associated with peel_iters_prologue. */
4544 if (peel_iters_prologue)
4545 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4547 (void) add_stmt_cost (target_cost_data,
4548 si->count * peel_iters_prologue, si->kind,
4549 si->stmt_info, si->node, si->vectype,
4550 si->misalign, vect_prologue);
4553 /* Add costs associated with peel_iters_epilogue. */
4554 if (peel_iters_epilogue)
4555 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4557 (void) add_stmt_cost (target_cost_data,
4558 si->count * peel_iters_epilogue, si->kind,
4559 si->stmt_info, si->node, si->vectype,
4560 si->misalign, vect_epilogue);
4563 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4565 if (prologue_need_br_taken_cost)
4566 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4567 vect_prologue);
4569 if (prologue_need_br_not_taken_cost)
4570 (void) add_stmt_cost (target_cost_data, 1,
4571 cond_branch_not_taken, vect_prologue);
4573 if (epilogue_need_br_taken_cost)
4574 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4575 vect_epilogue);
4577 if (epilogue_need_br_not_taken_cost)
4578 (void) add_stmt_cost (target_cost_data, 1,
4579 cond_branch_not_taken, vect_epilogue);
4581 /* Take care of special costs for rgroup controls of partial vectors. */
4582 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4583 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4584 == vect_partial_vectors_avx512))
4586 /* Calculate how many masks we need to generate. */
4587 unsigned int num_masks = 0;
4588 bool need_saturation = false;
4589 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4590 if (rgm.type)
4592 unsigned nvectors = rgm.factor;
4593 num_masks += nvectors;
4594 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4595 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4596 need_saturation = true;
4599 /* ??? The target isn't able to identify the costs below as
4600 producing masks so it cannot penaltize cases where we'd run
4601 out of mask registers for example. */
4603 /* ??? We are also failing to account for smaller vector masks
4604 we generate by splitting larger masks in vect_get_loop_mask. */
4606 /* In the worst case, we need to generate each mask in the prologue
4607 and in the loop body. We need one splat per group and one
4608 compare per mask.
4610 Sometimes the prologue mask will fold to a constant,
4611 so the actual prologue cost might be smaller. However, it's
4612 simpler and safer to use the worst-case cost; if this ends up
4613 being the tie-breaker between vectorizing or not, then it's
4614 probably better not to vectorize. */
4615 (void) add_stmt_cost (target_cost_data,
4616 num_masks
4617 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4618 vector_stmt, NULL, NULL, NULL_TREE, 0,
4619 vect_prologue);
4620 (void) add_stmt_cost (target_cost_data,
4621 num_masks
4622 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4623 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4625 /* When we need saturation we need it both in the prologue and
4626 the epilogue. */
4627 if (need_saturation)
4629 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4630 NULL, NULL, NULL_TREE, 0, vect_prologue);
4631 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4632 NULL, NULL, NULL_TREE, 0, vect_body);
4635 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4636 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4637 == vect_partial_vectors_while_ult))
4639 /* Calculate how many masks we need to generate. */
4640 unsigned int num_masks = 0;
4641 rgroup_controls *rgm;
4642 unsigned int num_vectors_m1;
4643 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4644 num_vectors_m1, rgm)
4645 if (rgm->type)
4646 num_masks += num_vectors_m1 + 1;
4647 gcc_assert (num_masks > 0);
4649 /* In the worst case, we need to generate each mask in the prologue
4650 and in the loop body. One of the loop body mask instructions
4651 replaces the comparison in the scalar loop, and since we don't
4652 count the scalar comparison against the scalar body, we shouldn't
4653 count that vector instruction against the vector body either.
4655 Sometimes we can use unpacks instead of generating prologue
4656 masks and sometimes the prologue mask will fold to a constant,
4657 so the actual prologue cost might be smaller. However, it's
4658 simpler and safer to use the worst-case cost; if this ends up
4659 being the tie-breaker between vectorizing or not, then it's
4660 probably better not to vectorize. */
4661 (void) add_stmt_cost (target_cost_data, num_masks,
4662 vector_stmt, NULL, NULL, NULL_TREE, 0,
4663 vect_prologue);
4664 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4665 vector_stmt, NULL, NULL, NULL_TREE, 0,
4666 vect_body);
4668 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4670 /* Referring to the functions vect_set_loop_condition_partial_vectors
4671 and vect_set_loop_controls_directly, we need to generate each
4672 length in the prologue and in the loop body if required. Although
4673 there are some possible optimizations, we consider the worst case
4674 here. */
4676 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4677 signed char partial_load_store_bias
4678 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4679 bool need_iterate_p
4680 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4681 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4683 /* Calculate how many statements to be added. */
4684 unsigned int prologue_stmts = 0;
4685 unsigned int body_stmts = 0;
4687 rgroup_controls *rgc;
4688 unsigned int num_vectors_m1;
4689 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4690 if (rgc->type)
4692 /* May need one SHIFT for nitems_total computation. */
4693 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4694 if (nitems != 1 && !niters_known_p)
4695 prologue_stmts += 1;
4697 /* May need one MAX and one MINUS for wrap around. */
4698 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4699 prologue_stmts += 2;
4701 /* Need one MAX and one MINUS for each batch limit excepting for
4702 the 1st one. */
4703 prologue_stmts += num_vectors_m1 * 2;
4705 unsigned int num_vectors = num_vectors_m1 + 1;
4707 /* Need to set up lengths in prologue, only one MIN required
4708 for each since start index is zero. */
4709 prologue_stmts += num_vectors;
4711 /* If we have a non-zero partial load bias, we need one PLUS
4712 to adjust the load length. */
4713 if (partial_load_store_bias != 0)
4714 body_stmts += 1;
4716 /* Each may need two MINs and one MINUS to update lengths in body
4717 for next iteration. */
4718 if (need_iterate_p)
4719 body_stmts += 3 * num_vectors;
4722 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4723 scalar_stmt, vect_prologue);
4724 (void) add_stmt_cost (target_cost_data, body_stmts,
4725 scalar_stmt, vect_body);
4728 /* FORNOW: The scalar outside cost is incremented in one of the
4729 following ways:
4731 1. The vectorizer checks for alignment and aliasing and generates
4732 a condition that allows dynamic vectorization. A cost model
4733 check is ANDED with the versioning condition. Hence scalar code
4734 path now has the added cost of the versioning check.
4736 if (cost > th & versioning_check)
4737 jmp to vector code
4739 Hence run-time scalar is incremented by not-taken branch cost.
4741 2. The vectorizer then checks if a prologue is required. If the
4742 cost model check was not done before during versioning, it has to
4743 be done before the prologue check.
4745 if (cost <= th)
4746 prologue = scalar_iters
4747 if (prologue == 0)
4748 jmp to vector code
4749 else
4750 execute prologue
4751 if (prologue == num_iters)
4752 go to exit
4754 Hence the run-time scalar cost is incremented by a taken branch,
4755 plus a not-taken branch, plus a taken branch cost.
4757 3. The vectorizer then checks if an epilogue is required. If the
4758 cost model check was not done before during prologue check, it
4759 has to be done with the epilogue check.
4761 if (prologue == 0)
4762 jmp to vector code
4763 else
4764 execute prologue
4765 if (prologue == num_iters)
4766 go to exit
4767 vector code:
4768 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4769 jmp to epilogue
4771 Hence the run-time scalar cost should be incremented by 2 taken
4772 branches.
4774 TODO: The back end may reorder the BBS's differently and reverse
4775 conditions/branch directions. Change the estimates below to
4776 something more reasonable. */
4778 /* If the number of iterations is known and we do not do versioning, we can
4779 decide whether to vectorize at compile time. Hence the scalar version
4780 do not carry cost model guard costs. */
4781 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4782 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4784 /* Cost model check occurs at versioning. */
4785 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4786 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4787 else
4789 /* Cost model check occurs at prologue generation. */
4790 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4791 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4792 + vect_get_stmt_cost (cond_branch_not_taken);
4793 /* Cost model check occurs at epilogue generation. */
4794 else
4795 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4799 /* Complete the target-specific cost calculations. */
4800 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4801 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4802 suggested_unroll_factor);
4804 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4805 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4806 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4807 *suggested_unroll_factor,
4808 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4810 if (dump_enabled_p ())
4811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4812 "can't unroll as unrolled vectorization factor larger"
4813 " than maximum vectorization factor: "
4814 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4815 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4816 *suggested_unroll_factor = 1;
4819 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4821 if (dump_enabled_p ())
4823 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4824 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4825 vec_inside_cost);
4826 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4827 vec_prologue_cost);
4828 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4829 vec_epilogue_cost);
4830 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4831 scalar_single_iter_cost);
4832 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4833 scalar_outside_cost);
4834 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4835 vec_outside_cost);
4836 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4837 peel_iters_prologue);
4838 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4839 peel_iters_epilogue);
4842 /* Calculate number of iterations required to make the vector version
4843 profitable, relative to the loop bodies only. The following condition
4844 must hold true:
4845 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4846 where
4847 SIC = scalar iteration cost, VIC = vector iteration cost,
4848 VOC = vector outside cost, VF = vectorization factor,
4849 NPEEL = prologue iterations + epilogue iterations,
4850 SOC = scalar outside cost for run time cost model check. */
4852 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4853 - vec_inside_cost);
4854 if (saving_per_viter <= 0)
4856 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4857 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4858 "vectorization did not happen for a simd loop");
4860 if (dump_enabled_p ())
4861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4862 "cost model: the vector iteration cost = %d "
4863 "divided by the scalar iteration cost = %d "
4864 "is greater or equal to the vectorization factor = %d"
4865 ".\n",
4866 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4867 *ret_min_profitable_niters = -1;
4868 *ret_min_profitable_estimate = -1;
4869 return;
4872 /* ??? The "if" arm is written to handle all cases; see below for what
4873 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4874 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4876 /* Rewriting the condition above in terms of the number of
4877 vector iterations (vniters) rather than the number of
4878 scalar iterations (niters) gives:
4880 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4882 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4884 For integer N, X and Y when X > 0:
4886 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4887 int outside_overhead = (vec_outside_cost
4888 - scalar_single_iter_cost * peel_iters_prologue
4889 - scalar_single_iter_cost * peel_iters_epilogue
4890 - scalar_outside_cost);
4891 /* We're only interested in cases that require at least one
4892 vector iteration. */
4893 int min_vec_niters = 1;
4894 if (outside_overhead > 0)
4895 min_vec_niters = outside_overhead / saving_per_viter + 1;
4897 if (dump_enabled_p ())
4898 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4899 min_vec_niters);
4901 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4903 /* Now that we know the minimum number of vector iterations,
4904 find the minimum niters for which the scalar cost is larger:
4906 SIC * niters > VIC * vniters + VOC - SOC
4908 We know that the minimum niters is no more than
4909 vniters * VF + NPEEL, but it might be (and often is) less
4910 than that if a partial vector iteration is cheaper than the
4911 equivalent scalar code. */
4912 int threshold = (vec_inside_cost * min_vec_niters
4913 + vec_outside_cost
4914 - scalar_outside_cost);
4915 if (threshold <= 0)
4916 min_profitable_iters = 1;
4917 else
4918 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4920 else
4921 /* Convert the number of vector iterations into a number of
4922 scalar iterations. */
4923 min_profitable_iters = (min_vec_niters * assumed_vf
4924 + peel_iters_prologue
4925 + peel_iters_epilogue);
4927 else
4929 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4930 * assumed_vf
4931 - vec_inside_cost * peel_iters_prologue
4932 - vec_inside_cost * peel_iters_epilogue);
4933 if (min_profitable_iters <= 0)
4934 min_profitable_iters = 0;
4935 else
4937 min_profitable_iters /= saving_per_viter;
4939 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4940 <= (((int) vec_inside_cost * min_profitable_iters)
4941 + (((int) vec_outside_cost - scalar_outside_cost)
4942 * assumed_vf)))
4943 min_profitable_iters++;
4947 if (dump_enabled_p ())
4948 dump_printf (MSG_NOTE,
4949 " Calculated minimum iters for profitability: %d\n",
4950 min_profitable_iters);
4952 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4953 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4954 /* We want the vectorized loop to execute at least once. */
4955 min_profitable_iters = assumed_vf + peel_iters_prologue;
4956 else if (min_profitable_iters < peel_iters_prologue)
4957 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4958 vectorized loop executes at least once. */
4959 min_profitable_iters = peel_iters_prologue;
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_NOTE, vect_location,
4963 " Runtime profitability threshold = %d\n",
4964 min_profitable_iters);
4966 *ret_min_profitable_niters = min_profitable_iters;
4968 /* Calculate number of iterations required to make the vector version
4969 profitable, relative to the loop bodies only.
4971 Non-vectorized variant is SIC * niters and it must win over vector
4972 variant on the expected loop trip count. The following condition must hold true:
4973 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4975 if (vec_outside_cost <= 0)
4976 min_profitable_estimate = 0;
4977 /* ??? This "else if" arm is written to handle all cases; see below for
4978 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4979 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4981 /* This is a repeat of the code above, but with + SOC rather
4982 than - SOC. */
4983 int outside_overhead = (vec_outside_cost
4984 - scalar_single_iter_cost * peel_iters_prologue
4985 - scalar_single_iter_cost * peel_iters_epilogue
4986 + scalar_outside_cost);
4987 int min_vec_niters = 1;
4988 if (outside_overhead > 0)
4989 min_vec_niters = outside_overhead / saving_per_viter + 1;
4991 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4993 int threshold = (vec_inside_cost * min_vec_niters
4994 + vec_outside_cost
4995 + scalar_outside_cost);
4996 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4998 else
4999 min_profitable_estimate = (min_vec_niters * assumed_vf
5000 + peel_iters_prologue
5001 + peel_iters_epilogue);
5003 else
5005 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5006 * assumed_vf
5007 - vec_inside_cost * peel_iters_prologue
5008 - vec_inside_cost * peel_iters_epilogue)
5009 / ((scalar_single_iter_cost * assumed_vf)
5010 - vec_inside_cost);
5012 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_NOTE, vect_location,
5015 " Static estimate profitability threshold = %d\n",
5016 min_profitable_estimate);
5018 *ret_min_profitable_estimate = min_profitable_estimate;
5021 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5022 vector elements (not bits) for a vector with NELT elements. */
5023 static void
5024 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5025 vec_perm_builder *sel)
5027 /* The encoding is a single stepped pattern. Any wrap-around is handled
5028 by vec_perm_indices. */
5029 sel->new_vector (nelt, 1, 3);
5030 for (unsigned int i = 0; i < 3; i++)
5031 sel->quick_push (i + offset);
5034 /* Checks whether the target supports whole-vector shifts for vectors of mode
5035 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5036 it supports vec_perm_const with masks for all necessary shift amounts. */
5037 static bool
5038 have_whole_vector_shift (machine_mode mode)
5040 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5041 return true;
5043 /* Variable-length vectors should be handled via the optab. */
5044 unsigned int nelt;
5045 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5046 return false;
5048 vec_perm_builder sel;
5049 vec_perm_indices indices;
5050 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5052 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5053 indices.new_vector (sel, 2, nelt);
5054 if (!can_vec_perm_const_p (mode, mode, indices, false))
5055 return false;
5057 return true;
5060 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5061 multiplication operands have differing signs and (b) we intend
5062 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5063 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5065 static bool
5066 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5067 stmt_vec_info stmt_info)
5069 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5070 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5071 return false;
5073 tree rhs1 = gimple_assign_rhs1 (assign);
5074 tree rhs2 = gimple_assign_rhs2 (assign);
5075 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5076 return false;
5078 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5079 gcc_assert (reduc_info->is_reduc_info);
5080 return !directly_supported_p (DOT_PROD_EXPR,
5081 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5082 optab_vector_mixed_sign);
5085 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5086 functions. Design better to avoid maintenance issues. */
5088 /* Function vect_model_reduction_cost.
5090 Models cost for a reduction operation, including the vector ops
5091 generated within the strip-mine loop in some cases, the initial
5092 definition before the loop, and the epilogue code that must be generated. */
5094 static void
5095 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5096 stmt_vec_info stmt_info, internal_fn reduc_fn,
5097 vect_reduction_type reduction_type,
5098 int ncopies, stmt_vector_for_cost *cost_vec)
5100 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5101 tree vectype;
5102 machine_mode mode;
5103 class loop *loop = NULL;
5105 if (loop_vinfo)
5106 loop = LOOP_VINFO_LOOP (loop_vinfo);
5108 /* Condition reductions generate two reductions in the loop. */
5109 if (reduction_type == COND_REDUCTION)
5110 ncopies *= 2;
5112 vectype = STMT_VINFO_VECTYPE (stmt_info);
5113 mode = TYPE_MODE (vectype);
5114 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5116 gimple_match_op op;
5117 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5118 gcc_unreachable ();
5120 bool emulated_mixed_dot_prod
5121 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5122 if (reduction_type == EXTRACT_LAST_REDUCTION)
5123 /* No extra instructions are needed in the prologue. The loop body
5124 operations are costed in vectorizable_condition. */
5125 inside_cost = 0;
5126 else if (reduction_type == FOLD_LEFT_REDUCTION)
5128 /* No extra instructions needed in the prologue. */
5129 prologue_cost = 0;
5131 if (reduc_fn != IFN_LAST)
5132 /* Count one reduction-like operation per vector. */
5133 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5134 stmt_info, 0, vect_body);
5135 else
5137 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5138 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5139 inside_cost = record_stmt_cost (cost_vec, nelements,
5140 vec_to_scalar, stmt_info, 0,
5141 vect_body);
5142 inside_cost += record_stmt_cost (cost_vec, nelements,
5143 scalar_stmt, stmt_info, 0,
5144 vect_body);
5147 else
5149 /* Add in the cost of the initial definitions. */
5150 int prologue_stmts;
5151 if (reduction_type == COND_REDUCTION)
5152 /* For cond reductions we have four vectors: initial index, step,
5153 initial result of the data reduction, initial value of the index
5154 reduction. */
5155 prologue_stmts = 4;
5156 else if (emulated_mixed_dot_prod)
5157 /* We need the initial reduction value and two invariants:
5158 one that contains the minimum signed value and one that
5159 contains half of its negative. */
5160 prologue_stmts = 3;
5161 else
5162 prologue_stmts = 1;
5163 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5164 scalar_to_vec, stmt_info, 0,
5165 vect_prologue);
5168 /* Determine cost of epilogue code.
5170 We have a reduction operator that will reduce the vector in one statement.
5171 Also requires scalar extract. */
5173 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5175 if (reduc_fn != IFN_LAST)
5177 if (reduction_type == COND_REDUCTION)
5179 /* An EQ stmt and an COND_EXPR stmt. */
5180 epilogue_cost += record_stmt_cost (cost_vec, 2,
5181 vector_stmt, stmt_info, 0,
5182 vect_epilogue);
5183 /* Reduction of the max index and a reduction of the found
5184 values. */
5185 epilogue_cost += record_stmt_cost (cost_vec, 2,
5186 vec_to_scalar, stmt_info, 0,
5187 vect_epilogue);
5188 /* A broadcast of the max value. */
5189 epilogue_cost += record_stmt_cost (cost_vec, 1,
5190 scalar_to_vec, stmt_info, 0,
5191 vect_epilogue);
5193 else
5195 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5196 stmt_info, 0, vect_epilogue);
5197 epilogue_cost += record_stmt_cost (cost_vec, 1,
5198 vec_to_scalar, stmt_info, 0,
5199 vect_epilogue);
5202 else if (reduction_type == COND_REDUCTION)
5204 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5205 /* Extraction of scalar elements. */
5206 epilogue_cost += record_stmt_cost (cost_vec,
5207 2 * estimated_nunits,
5208 vec_to_scalar, stmt_info, 0,
5209 vect_epilogue);
5210 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5211 epilogue_cost += record_stmt_cost (cost_vec,
5212 2 * estimated_nunits - 3,
5213 scalar_stmt, stmt_info, 0,
5214 vect_epilogue);
5216 else if (reduction_type == EXTRACT_LAST_REDUCTION
5217 || reduction_type == FOLD_LEFT_REDUCTION)
5218 /* No extra instructions need in the epilogue. */
5220 else
5222 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5223 tree bitsize = TYPE_SIZE (op.type);
5224 int element_bitsize = tree_to_uhwi (bitsize);
5225 int nelements = vec_size_in_bits / element_bitsize;
5227 if (op.code == COND_EXPR)
5228 op.code = MAX_EXPR;
5230 /* We have a whole vector shift available. */
5231 if (VECTOR_MODE_P (mode)
5232 && directly_supported_p (op.code, vectype)
5233 && have_whole_vector_shift (mode))
5235 /* Final reduction via vector shifts and the reduction operator.
5236 Also requires scalar extract. */
5237 epilogue_cost += record_stmt_cost (cost_vec,
5238 exact_log2 (nelements) * 2,
5239 vector_stmt, stmt_info, 0,
5240 vect_epilogue);
5241 epilogue_cost += record_stmt_cost (cost_vec, 1,
5242 vec_to_scalar, stmt_info, 0,
5243 vect_epilogue);
5245 else
5246 /* Use extracts and reduction op for final reduction. For N
5247 elements, we have N extracts and N-1 reduction ops. */
5248 epilogue_cost += record_stmt_cost (cost_vec,
5249 nelements + nelements - 1,
5250 vector_stmt, stmt_info, 0,
5251 vect_epilogue);
5255 if (dump_enabled_p ())
5256 dump_printf (MSG_NOTE,
5257 "vect_model_reduction_cost: inside_cost = %d, "
5258 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5259 prologue_cost, epilogue_cost);
5262 /* SEQ is a sequence of instructions that initialize the reduction
5263 described by REDUC_INFO. Emit them in the appropriate place. */
5265 static void
5266 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5267 stmt_vec_info reduc_info, gimple *seq)
5269 if (reduc_info->reused_accumulator)
5271 /* When reusing an accumulator from the main loop, we only need
5272 initialization instructions if the main loop can be skipped.
5273 In that case, emit the initialization instructions at the end
5274 of the guard block that does the skip. */
5275 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5276 gcc_assert (skip_edge);
5277 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5278 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5280 else
5282 /* The normal case: emit the initialization instructions on the
5283 preheader edge. */
5284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5285 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5289 /* Function get_initial_def_for_reduction
5291 Input:
5292 REDUC_INFO - the info_for_reduction
5293 INIT_VAL - the initial value of the reduction variable
5294 NEUTRAL_OP - a value that has no effect on the reduction, as per
5295 neutral_op_for_reduction
5297 Output:
5298 Return a vector variable, initialized according to the operation that
5299 STMT_VINFO performs. This vector will be used as the initial value
5300 of the vector of partial results.
5302 The value we need is a vector in which element 0 has value INIT_VAL
5303 and every other element has value NEUTRAL_OP. */
5305 static tree
5306 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5307 stmt_vec_info reduc_info,
5308 tree init_val, tree neutral_op)
5310 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5311 tree scalar_type = TREE_TYPE (init_val);
5312 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5313 tree init_def;
5314 gimple_seq stmts = NULL;
5316 gcc_assert (vectype);
5318 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5319 || SCALAR_FLOAT_TYPE_P (scalar_type));
5321 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5322 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5324 if (operand_equal_p (init_val, neutral_op))
5326 /* If both elements are equal then the vector described above is
5327 just a splat. */
5328 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5329 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5331 else
5333 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5334 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5335 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5337 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5338 element 0. */
5339 init_def = gimple_build_vector_from_val (&stmts, vectype,
5340 neutral_op);
5341 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5342 vectype, init_def, init_val);
5344 else
5346 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5347 tree_vector_builder elts (vectype, 1, 2);
5348 elts.quick_push (init_val);
5349 elts.quick_push (neutral_op);
5350 init_def = gimple_build_vector (&stmts, &elts);
5354 if (stmts)
5355 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5356 return init_def;
5359 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5360 which performs a reduction involving GROUP_SIZE scalar statements.
5361 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5362 is nonnull, introducing extra elements of that value will not change the
5363 result. */
5365 static void
5366 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5367 stmt_vec_info reduc_info,
5368 vec<tree> *vec_oprnds,
5369 unsigned int number_of_vectors,
5370 unsigned int group_size, tree neutral_op)
5372 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5373 unsigned HOST_WIDE_INT nunits;
5374 unsigned j, number_of_places_left_in_vector;
5375 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5376 unsigned int i;
5378 gcc_assert (group_size == initial_values.length () || neutral_op);
5380 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5381 created vectors. It is greater than 1 if unrolling is performed.
5383 For example, we have two scalar operands, s1 and s2 (e.g., group of
5384 strided accesses of size two), while NUNITS is four (i.e., four scalars
5385 of this type can be packed in a vector). The output vector will contain
5386 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5387 will be 2).
5389 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5390 vectors containing the operands.
5392 For example, NUNITS is four as before, and the group size is 8
5393 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5394 {s5, s6, s7, s8}. */
5396 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5397 nunits = group_size;
5399 number_of_places_left_in_vector = nunits;
5400 bool constant_p = true;
5401 tree_vector_builder elts (vector_type, nunits, 1);
5402 elts.quick_grow (nunits);
5403 gimple_seq ctor_seq = NULL;
5404 for (j = 0; j < nunits * number_of_vectors; ++j)
5406 tree op;
5407 i = j % group_size;
5409 /* Get the def before the loop. In reduction chain we have only
5410 one initial value. Else we have as many as PHIs in the group. */
5411 if (i >= initial_values.length () || (j > i && neutral_op))
5412 op = neutral_op;
5413 else
5414 op = initial_values[i];
5416 /* Create 'vect_ = {op0,op1,...,opn}'. */
5417 number_of_places_left_in_vector--;
5418 elts[nunits - number_of_places_left_in_vector - 1] = op;
5419 if (!CONSTANT_CLASS_P (op))
5420 constant_p = false;
5422 if (number_of_places_left_in_vector == 0)
5424 tree init;
5425 if (constant_p && !neutral_op
5426 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5427 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5428 /* Build the vector directly from ELTS. */
5429 init = gimple_build_vector (&ctor_seq, &elts);
5430 else if (neutral_op)
5432 /* Build a vector of the neutral value and shift the
5433 other elements into place. */
5434 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5435 neutral_op);
5436 int k = nunits;
5437 while (k > 0 && elts[k - 1] == neutral_op)
5438 k -= 1;
5439 while (k > 0)
5441 k -= 1;
5442 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5443 vector_type, init, elts[k]);
5446 else
5448 /* First time round, duplicate ELTS to fill the
5449 required number of vectors. */
5450 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5451 elts, number_of_vectors, *vec_oprnds);
5452 break;
5454 vec_oprnds->quick_push (init);
5456 number_of_places_left_in_vector = nunits;
5457 elts.new_vector (vector_type, nunits, 1);
5458 elts.quick_grow (nunits);
5459 constant_p = true;
5462 if (ctor_seq != NULL)
5463 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5466 /* For a statement STMT_INFO taking part in a reduction operation return
5467 the stmt_vec_info the meta information is stored on. */
5469 stmt_vec_info
5470 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5472 stmt_info = vect_orig_stmt (stmt_info);
5473 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5474 if (!is_a <gphi *> (stmt_info->stmt)
5475 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5476 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5477 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5478 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5480 if (gimple_phi_num_args (phi) == 1)
5481 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5483 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5485 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5486 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5487 stmt_info = info;
5489 return stmt_info;
5492 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5493 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5494 return false. */
5496 static bool
5497 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5498 stmt_vec_info reduc_info)
5500 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5501 if (!main_loop_vinfo)
5502 return false;
5504 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5505 return false;
5507 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5508 auto_vec<tree, 16> main_loop_results (num_phis);
5509 auto_vec<tree, 16> initial_values (num_phis);
5510 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5512 /* The epilogue loop can be entered either from the main loop or
5513 from an earlier guard block. */
5514 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5515 for (tree incoming_value : reduc_info->reduc_initial_values)
5517 /* Look for:
5519 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5520 INITIAL_VALUE(guard block)>. */
5521 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5523 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5524 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5526 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5527 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5529 main_loop_results.quick_push (from_main_loop);
5530 initial_values.quick_push (from_skip);
5533 else
5534 /* The main loop dominates the epilogue loop. */
5535 main_loop_results.splice (reduc_info->reduc_initial_values);
5537 /* See if the main loop has the kind of accumulator we need. */
5538 vect_reusable_accumulator *accumulator
5539 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5540 if (!accumulator
5541 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5542 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5543 accumulator->reduc_info->reduc_scalar_results.begin ()))
5544 return false;
5546 /* Handle the case where we can reduce wider vectors to narrower ones. */
5547 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5548 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5549 unsigned HOST_WIDE_INT m;
5550 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5551 TYPE_VECTOR_SUBPARTS (vectype), &m))
5552 return false;
5553 /* Check the intermediate vector types and operations are available. */
5554 tree prev_vectype = old_vectype;
5555 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5556 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5558 intermediate_nunits = exact_div (intermediate_nunits, 2);
5559 tree intermediate_vectype = get_related_vectype_for_scalar_type
5560 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5561 if (!intermediate_vectype
5562 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5563 intermediate_vectype)
5564 || !can_vec_extract (TYPE_MODE (prev_vectype),
5565 TYPE_MODE (intermediate_vectype)))
5566 return false;
5567 prev_vectype = intermediate_vectype;
5570 /* Non-SLP reductions might apply an adjustment after the reduction
5571 operation, in order to simplify the initialization of the accumulator.
5572 If the epilogue loop carries on from where the main loop left off,
5573 it should apply the same adjustment to the final reduction result.
5575 If the epilogue loop can also be entered directly (rather than via
5576 the main loop), we need to be able to handle that case in the same way,
5577 with the same adjustment. (In principle we could add a PHI node
5578 to select the correct adjustment, but in practice that shouldn't be
5579 necessary.) */
5580 tree main_adjustment
5581 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5582 if (loop_vinfo->main_loop_edge && main_adjustment)
5584 gcc_assert (num_phis == 1);
5585 tree initial_value = initial_values[0];
5586 /* Check that we can use INITIAL_VALUE as the adjustment and
5587 initialize the accumulator with a neutral value instead. */
5588 if (!operand_equal_p (initial_value, main_adjustment))
5589 return false;
5590 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5591 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5592 code, initial_value);
5594 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5595 reduc_info->reduc_initial_values.truncate (0);
5596 reduc_info->reduc_initial_values.splice (initial_values);
5597 reduc_info->reused_accumulator = accumulator;
5598 return true;
5601 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5602 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5604 static tree
5605 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5606 gimple_seq *seq)
5608 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5609 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5610 tree stype = TREE_TYPE (vectype);
5611 tree new_temp = vec_def;
5612 while (nunits > nunits1)
5614 nunits /= 2;
5615 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5616 stype, nunits);
5617 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5619 /* The target has to make sure we support lowpart/highpart
5620 extraction, either via direct vector extract or through
5621 an integer mode punning. */
5622 tree dst1, dst2;
5623 gimple *epilog_stmt;
5624 if (convert_optab_handler (vec_extract_optab,
5625 TYPE_MODE (TREE_TYPE (new_temp)),
5626 TYPE_MODE (vectype1))
5627 != CODE_FOR_nothing)
5629 /* Extract sub-vectors directly once vec_extract becomes
5630 a conversion optab. */
5631 dst1 = make_ssa_name (vectype1);
5632 epilog_stmt
5633 = gimple_build_assign (dst1, BIT_FIELD_REF,
5634 build3 (BIT_FIELD_REF, vectype1,
5635 new_temp, TYPE_SIZE (vectype1),
5636 bitsize_int (0)));
5637 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5638 dst2 = make_ssa_name (vectype1);
5639 epilog_stmt
5640 = gimple_build_assign (dst2, BIT_FIELD_REF,
5641 build3 (BIT_FIELD_REF, vectype1,
5642 new_temp, TYPE_SIZE (vectype1),
5643 bitsize_int (bitsize)));
5644 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5646 else
5648 /* Extract via punning to appropriately sized integer mode
5649 vector. */
5650 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5651 tree etype = build_vector_type (eltype, 2);
5652 gcc_assert (convert_optab_handler (vec_extract_optab,
5653 TYPE_MODE (etype),
5654 TYPE_MODE (eltype))
5655 != CODE_FOR_nothing);
5656 tree tem = make_ssa_name (etype);
5657 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5658 build1 (VIEW_CONVERT_EXPR,
5659 etype, new_temp));
5660 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5661 new_temp = tem;
5662 tem = make_ssa_name (eltype);
5663 epilog_stmt
5664 = gimple_build_assign (tem, BIT_FIELD_REF,
5665 build3 (BIT_FIELD_REF, eltype,
5666 new_temp, TYPE_SIZE (eltype),
5667 bitsize_int (0)));
5668 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5669 dst1 = make_ssa_name (vectype1);
5670 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5671 build1 (VIEW_CONVERT_EXPR,
5672 vectype1, tem));
5673 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5674 tem = make_ssa_name (eltype);
5675 epilog_stmt
5676 = gimple_build_assign (tem, BIT_FIELD_REF,
5677 build3 (BIT_FIELD_REF, eltype,
5678 new_temp, TYPE_SIZE (eltype),
5679 bitsize_int (bitsize)));
5680 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5681 dst2 = make_ssa_name (vectype1);
5682 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5683 build1 (VIEW_CONVERT_EXPR,
5684 vectype1, tem));
5685 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5688 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5691 return new_temp;
5694 /* Function vect_create_epilog_for_reduction
5696 Create code at the loop-epilog to finalize the result of a reduction
5697 computation.
5699 STMT_INFO is the scalar reduction stmt that is being vectorized.
5700 SLP_NODE is an SLP node containing a group of reduction statements. The
5701 first one in this group is STMT_INFO.
5702 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5703 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5704 (counting from 0)
5706 This function:
5707 1. Completes the reduction def-use cycles.
5708 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5709 by calling the function specified by REDUC_FN if available, or by
5710 other means (whole-vector shifts or a scalar loop).
5711 The function also creates a new phi node at the loop exit to preserve
5712 loop-closed form, as illustrated below.
5714 The flow at the entry to this function:
5716 loop:
5717 vec_def = phi <vec_init, null> # REDUCTION_PHI
5718 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5719 s_loop = scalar_stmt # (scalar) STMT_INFO
5720 loop_exit:
5721 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5722 use <s_out0>
5723 use <s_out0>
5725 The above is transformed by this function into:
5727 loop:
5728 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5729 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5730 s_loop = scalar_stmt # (scalar) STMT_INFO
5731 loop_exit:
5732 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5733 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5734 v_out2 = reduce <v_out1>
5735 s_out3 = extract_field <v_out2, 0>
5736 s_out4 = adjust_result <s_out3>
5737 use <s_out4>
5738 use <s_out4>
5741 static void
5742 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5743 stmt_vec_info stmt_info,
5744 slp_tree slp_node,
5745 slp_instance slp_node_instance)
5747 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5748 gcc_assert (reduc_info->is_reduc_info);
5749 /* For double reductions we need to get at the inner loop reduction
5750 stmt which has the meta info attached. Our stmt_info is that of the
5751 loop-closed PHI of the inner loop which we remember as
5752 def for the reduction PHI generation. */
5753 bool double_reduc = false;
5754 stmt_vec_info rdef_info = stmt_info;
5755 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5757 gcc_assert (!slp_node);
5758 double_reduc = true;
5759 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5760 (stmt_info->stmt, 0));
5761 stmt_info = vect_stmt_to_vectorize (stmt_info);
5763 gphi *reduc_def_stmt
5764 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5765 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5766 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5767 tree vectype;
5768 machine_mode mode;
5769 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5770 basic_block exit_bb;
5771 tree scalar_dest;
5772 tree scalar_type;
5773 gimple *new_phi = NULL, *phi;
5774 gimple_stmt_iterator exit_gsi;
5775 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5776 gimple *epilog_stmt = NULL;
5777 gimple *exit_phi;
5778 tree bitsize;
5779 tree def;
5780 tree orig_name, scalar_result;
5781 imm_use_iterator imm_iter, phi_imm_iter;
5782 use_operand_p use_p, phi_use_p;
5783 gimple *use_stmt;
5784 auto_vec<tree> reduc_inputs;
5785 int j, i;
5786 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5787 unsigned int group_size = 1, k;
5788 auto_vec<gimple *> phis;
5789 /* SLP reduction without reduction chain, e.g.,
5790 # a1 = phi <a2, a0>
5791 # b1 = phi <b2, b0>
5792 a2 = operation (a1)
5793 b2 = operation (b1) */
5794 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5795 bool direct_slp_reduc;
5796 tree induction_index = NULL_TREE;
5798 if (slp_node)
5799 group_size = SLP_TREE_LANES (slp_node);
5801 if (nested_in_vect_loop_p (loop, stmt_info))
5803 outer_loop = loop;
5804 loop = loop->inner;
5805 gcc_assert (!slp_node && double_reduc);
5808 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5809 gcc_assert (vectype);
5810 mode = TYPE_MODE (vectype);
5812 tree induc_val = NULL_TREE;
5813 tree adjustment_def = NULL;
5814 if (slp_node)
5816 else
5818 /* Optimize: for induction condition reduction, if we can't use zero
5819 for induc_val, use initial_def. */
5820 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5821 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5822 else if (double_reduc)
5824 else
5825 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5828 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5829 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5830 if (slp_reduc)
5831 /* All statements produce live-out values. */
5832 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5833 else if (slp_node)
5835 /* The last statement in the reduction chain produces the live-out
5836 value. Note SLP optimization can shuffle scalar stmts to
5837 optimize permutations so we have to search for the last stmt. */
5838 for (k = 0; k < group_size; ++k)
5839 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5841 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5842 break;
5846 unsigned vec_num;
5847 int ncopies;
5848 if (slp_node)
5850 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5851 ncopies = 1;
5853 else
5855 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5856 vec_num = 1;
5857 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5860 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5861 which is updated with the current index of the loop for every match of
5862 the original loop's cond_expr (VEC_STMT). This results in a vector
5863 containing the last time the condition passed for that vector lane.
5864 The first match will be a 1 to allow 0 to be used for non-matching
5865 indexes. If there are no matches at all then the vector will be all
5866 zeroes.
5868 PR92772: This algorithm is broken for architectures that support
5869 masked vectors, but do not provide fold_extract_last. */
5870 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5872 auto_vec<std::pair<tree, bool>, 2> ccompares;
5873 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5874 cond_info = vect_stmt_to_vectorize (cond_info);
5875 while (cond_info != reduc_info)
5877 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5879 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5880 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5881 ccompares.safe_push
5882 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5883 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5885 cond_info
5886 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5887 1 + STMT_VINFO_REDUC_IDX
5888 (cond_info)));
5889 cond_info = vect_stmt_to_vectorize (cond_info);
5891 gcc_assert (ccompares.length () != 0);
5893 tree indx_before_incr, indx_after_incr;
5894 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5895 int scalar_precision
5896 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5897 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5898 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5899 (TYPE_MODE (vectype), cr_index_scalar_type,
5900 TYPE_VECTOR_SUBPARTS (vectype));
5902 /* First we create a simple vector induction variable which starts
5903 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5904 vector size (STEP). */
5906 /* Create a {1,2,3,...} vector. */
5907 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5909 /* Create a vector of the step value. */
5910 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5911 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5913 /* Create an induction variable. */
5914 gimple_stmt_iterator incr_gsi;
5915 bool insert_after;
5916 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5917 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5918 insert_after, &indx_before_incr, &indx_after_incr);
5920 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5921 filled with zeros (VEC_ZERO). */
5923 /* Create a vector of 0s. */
5924 tree zero = build_zero_cst (cr_index_scalar_type);
5925 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5927 /* Create a vector phi node. */
5928 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5929 new_phi = create_phi_node (new_phi_tree, loop->header);
5930 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5931 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5933 /* Now take the condition from the loops original cond_exprs
5934 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5935 every match uses values from the induction variable
5936 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5937 (NEW_PHI_TREE).
5938 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5939 the new cond_expr (INDEX_COND_EXPR). */
5940 gimple_seq stmts = NULL;
5941 for (int i = ccompares.length () - 1; i != -1; --i)
5943 tree ccompare = ccompares[i].first;
5944 if (ccompares[i].second)
5945 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5946 cr_index_vector_type,
5947 ccompare,
5948 indx_before_incr, new_phi_tree);
5949 else
5950 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5951 cr_index_vector_type,
5952 ccompare,
5953 new_phi_tree, indx_before_incr);
5955 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5957 /* Update the phi with the vec cond. */
5958 induction_index = new_phi_tree;
5959 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5960 loop_latch_edge (loop), UNKNOWN_LOCATION);
5963 /* 2. Create epilog code.
5964 The reduction epilog code operates across the elements of the vector
5965 of partial results computed by the vectorized loop.
5966 The reduction epilog code consists of:
5968 step 1: compute the scalar result in a vector (v_out2)
5969 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5970 step 3: adjust the scalar result (s_out3) if needed.
5972 Step 1 can be accomplished using one the following three schemes:
5973 (scheme 1) using reduc_fn, if available.
5974 (scheme 2) using whole-vector shifts, if available.
5975 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5976 combined.
5978 The overall epilog code looks like this:
5980 s_out0 = phi <s_loop> # original EXIT_PHI
5981 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5982 v_out2 = reduce <v_out1> # step 1
5983 s_out3 = extract_field <v_out2, 0> # step 2
5984 s_out4 = adjust_result <s_out3> # step 3
5986 (step 3 is optional, and steps 1 and 2 may be combined).
5987 Lastly, the uses of s_out0 are replaced by s_out4. */
5990 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5991 v_out1 = phi <VECT_DEF>
5992 Store them in NEW_PHIS. */
5993 if (double_reduc)
5994 loop = outer_loop;
5995 exit_bb = single_exit (loop)->dest;
5996 exit_gsi = gsi_after_labels (exit_bb);
5997 reduc_inputs.create (slp_node ? vec_num : ncopies);
5998 for (unsigned i = 0; i < vec_num; i++)
6000 gimple_seq stmts = NULL;
6001 if (slp_node)
6002 def = vect_get_slp_vect_def (slp_node, i);
6003 else
6004 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6005 for (j = 0; j < ncopies; j++)
6007 tree new_def = copy_ssa_name (def);
6008 phi = create_phi_node (new_def, exit_bb);
6009 if (j)
6010 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6011 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
6012 new_def = gimple_convert (&stmts, vectype, new_def);
6013 reduc_inputs.quick_push (new_def);
6015 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6018 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6019 (i.e. when reduc_fn is not available) and in the final adjustment
6020 code (if needed). Also get the original scalar reduction variable as
6021 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6022 represents a reduction pattern), the tree-code and scalar-def are
6023 taken from the original stmt that the pattern-stmt (STMT) replaces.
6024 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6025 are taken from STMT. */
6027 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6028 if (orig_stmt_info != stmt_info)
6030 /* Reduction pattern */
6031 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6032 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6035 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6036 scalar_type = TREE_TYPE (scalar_dest);
6037 scalar_results.truncate (0);
6038 scalar_results.reserve_exact (group_size);
6039 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6040 bitsize = TYPE_SIZE (scalar_type);
6042 /* True if we should implement SLP_REDUC using native reduction operations
6043 instead of scalar operations. */
6044 direct_slp_reduc = (reduc_fn != IFN_LAST
6045 && slp_reduc
6046 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6048 /* In case of reduction chain, e.g.,
6049 # a1 = phi <a3, a0>
6050 a2 = operation (a1)
6051 a3 = operation (a2),
6053 we may end up with more than one vector result. Here we reduce them
6054 to one vector.
6056 The same is true for a SLP reduction, e.g.,
6057 # a1 = phi <a2, a0>
6058 # b1 = phi <b2, b0>
6059 a2 = operation (a1)
6060 b2 = operation (a2),
6062 where we can end up with more than one vector as well. We can
6063 easily accumulate vectors when the number of vector elements is
6064 a multiple of the SLP group size.
6066 The same is true if we couldn't use a single defuse cycle. */
6067 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6068 || direct_slp_reduc
6069 || (slp_reduc
6070 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6071 || ncopies > 1)
6073 gimple_seq stmts = NULL;
6074 tree single_input = reduc_inputs[0];
6075 for (k = 1; k < reduc_inputs.length (); k++)
6076 single_input = gimple_build (&stmts, code, vectype,
6077 single_input, reduc_inputs[k]);
6078 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6080 reduc_inputs.truncate (0);
6081 reduc_inputs.safe_push (single_input);
6084 tree orig_reduc_input = reduc_inputs[0];
6086 /* If this loop is an epilogue loop that can be skipped after the
6087 main loop, we can only share a reduction operation between the
6088 main loop and the epilogue if we put it at the target of the
6089 skip edge.
6091 We can still reuse accumulators if this check fails. Doing so has
6092 the minor(?) benefit of making the epilogue loop's scalar result
6093 independent of the main loop's scalar result. */
6094 bool unify_with_main_loop_p = false;
6095 if (reduc_info->reused_accumulator
6096 && loop_vinfo->skip_this_loop_edge
6097 && single_succ_p (exit_bb)
6098 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6100 unify_with_main_loop_p = true;
6102 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6103 reduc_inputs[0] = make_ssa_name (vectype);
6104 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6105 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6106 UNKNOWN_LOCATION);
6107 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6108 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6109 exit_gsi = gsi_after_labels (reduc_block);
6112 /* Shouldn't be used beyond this point. */
6113 exit_bb = nullptr;
6115 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6116 && reduc_fn != IFN_LAST)
6118 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6119 various data values where the condition matched and another vector
6120 (INDUCTION_INDEX) containing all the indexes of those matches. We
6121 need to extract the last matching index (which will be the index with
6122 highest value) and use this to index into the data vector.
6123 For the case where there were no matches, the data vector will contain
6124 all default values and the index vector will be all zeros. */
6126 /* Get various versions of the type of the vector of indexes. */
6127 tree index_vec_type = TREE_TYPE (induction_index);
6128 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6129 tree index_scalar_type = TREE_TYPE (index_vec_type);
6130 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6132 /* Get an unsigned integer version of the type of the data vector. */
6133 int scalar_precision
6134 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6135 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6136 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6137 vectype);
6139 /* First we need to create a vector (ZERO_VEC) of zeros and another
6140 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6141 can create using a MAX reduction and then expanding.
6142 In the case where the loop never made any matches, the max index will
6143 be zero. */
6145 /* Vector of {0, 0, 0,...}. */
6146 tree zero_vec = build_zero_cst (vectype);
6148 /* Find maximum value from the vector of found indexes. */
6149 tree max_index = make_ssa_name (index_scalar_type);
6150 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6151 1, induction_index);
6152 gimple_call_set_lhs (max_index_stmt, max_index);
6153 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6155 /* Vector of {max_index, max_index, max_index,...}. */
6156 tree max_index_vec = make_ssa_name (index_vec_type);
6157 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6158 max_index);
6159 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6160 max_index_vec_rhs);
6161 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6163 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6164 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6165 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6166 otherwise. Only one value should match, resulting in a vector
6167 (VEC_COND) with one data value and the rest zeros.
6168 In the case where the loop never made any matches, every index will
6169 match, resulting in a vector with all data values (which will all be
6170 the default value). */
6172 /* Compare the max index vector to the vector of found indexes to find
6173 the position of the max value. */
6174 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6175 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6176 induction_index,
6177 max_index_vec);
6178 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6180 /* Use the compare to choose either values from the data vector or
6181 zero. */
6182 tree vec_cond = make_ssa_name (vectype);
6183 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6184 vec_compare,
6185 reduc_inputs[0],
6186 zero_vec);
6187 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6189 /* Finally we need to extract the data value from the vector (VEC_COND)
6190 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6191 reduction, but because this doesn't exist, we can use a MAX reduction
6192 instead. The data value might be signed or a float so we need to cast
6193 it first.
6194 In the case where the loop never made any matches, the data values are
6195 all identical, and so will reduce down correctly. */
6197 /* Make the matched data values unsigned. */
6198 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6199 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6200 vec_cond);
6201 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6202 VIEW_CONVERT_EXPR,
6203 vec_cond_cast_rhs);
6204 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6206 /* Reduce down to a scalar value. */
6207 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6208 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6209 1, vec_cond_cast);
6210 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6211 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6213 /* Convert the reduced value back to the result type and set as the
6214 result. */
6215 gimple_seq stmts = NULL;
6216 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6217 data_reduc);
6218 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6219 scalar_results.safe_push (new_temp);
6221 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6222 && reduc_fn == IFN_LAST)
6224 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6225 idx = 0;
6226 idx_val = induction_index[0];
6227 val = data_reduc[0];
6228 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6229 if (induction_index[i] > idx_val)
6230 val = data_reduc[i], idx_val = induction_index[i];
6231 return val; */
6233 tree data_eltype = TREE_TYPE (vectype);
6234 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6235 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6236 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6237 /* Enforced by vectorizable_reduction, which ensures we have target
6238 support before allowing a conditional reduction on variable-length
6239 vectors. */
6240 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6241 tree idx_val = NULL_TREE, val = NULL_TREE;
6242 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6244 tree old_idx_val = idx_val;
6245 tree old_val = val;
6246 idx_val = make_ssa_name (idx_eltype);
6247 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6248 build3 (BIT_FIELD_REF, idx_eltype,
6249 induction_index,
6250 bitsize_int (el_size),
6251 bitsize_int (off)));
6252 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6253 val = make_ssa_name (data_eltype);
6254 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6255 build3 (BIT_FIELD_REF,
6256 data_eltype,
6257 reduc_inputs[0],
6258 bitsize_int (el_size),
6259 bitsize_int (off)));
6260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6261 if (off != 0)
6263 tree new_idx_val = idx_val;
6264 if (off != v_size - el_size)
6266 new_idx_val = make_ssa_name (idx_eltype);
6267 epilog_stmt = gimple_build_assign (new_idx_val,
6268 MAX_EXPR, idx_val,
6269 old_idx_val);
6270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6272 tree cond = make_ssa_name (boolean_type_node);
6273 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6274 idx_val, old_idx_val);
6275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6276 tree new_val = make_ssa_name (data_eltype);
6277 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6278 cond, val, old_val);
6279 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6280 idx_val = new_idx_val;
6281 val = new_val;
6284 /* Convert the reduced value back to the result type and set as the
6285 result. */
6286 gimple_seq stmts = NULL;
6287 val = gimple_convert (&stmts, scalar_type, val);
6288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6289 scalar_results.safe_push (val);
6292 /* 2.3 Create the reduction code, using one of the three schemes described
6293 above. In SLP we simply need to extract all the elements from the
6294 vector (without reducing them), so we use scalar shifts. */
6295 else if (reduc_fn != IFN_LAST && !slp_reduc)
6297 tree tmp;
6298 tree vec_elem_type;
6300 /* Case 1: Create:
6301 v_out2 = reduc_expr <v_out1> */
6303 if (dump_enabled_p ())
6304 dump_printf_loc (MSG_NOTE, vect_location,
6305 "Reduce using direct vector reduction.\n");
6307 gimple_seq stmts = NULL;
6308 vec_elem_type = TREE_TYPE (vectype);
6309 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6310 vec_elem_type, reduc_inputs[0]);
6311 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6312 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6314 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6315 && induc_val)
6317 /* Earlier we set the initial value to be a vector if induc_val
6318 values. Check the result and if it is induc_val then replace
6319 with the original initial value, unless induc_val is
6320 the same as initial_def already. */
6321 tree zcompare = make_ssa_name (boolean_type_node);
6322 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6323 new_temp, induc_val);
6324 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6325 tree initial_def = reduc_info->reduc_initial_values[0];
6326 tmp = make_ssa_name (new_scalar_dest);
6327 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6328 initial_def, new_temp);
6329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6330 new_temp = tmp;
6333 scalar_results.safe_push (new_temp);
6335 else if (direct_slp_reduc)
6337 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6338 with the elements for other SLP statements replaced with the
6339 neutral value. We can then do a normal reduction on each vector. */
6341 /* Enforced by vectorizable_reduction. */
6342 gcc_assert (reduc_inputs.length () == 1);
6343 gcc_assert (pow2p_hwi (group_size));
6345 gimple_seq seq = NULL;
6347 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6348 and the same element size as VECTYPE. */
6349 tree index = build_index_vector (vectype, 0, 1);
6350 tree index_type = TREE_TYPE (index);
6351 tree index_elt_type = TREE_TYPE (index_type);
6352 tree mask_type = truth_type_for (index_type);
6354 /* Create a vector that, for each element, identifies which of
6355 the REDUC_GROUP_SIZE results should use it. */
6356 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6357 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6358 build_vector_from_val (index_type, index_mask));
6360 /* Get a neutral vector value. This is simply a splat of the neutral
6361 scalar value if we have one, otherwise the initial scalar value
6362 is itself a neutral value. */
6363 tree vector_identity = NULL_TREE;
6364 tree neutral_op = NULL_TREE;
6365 if (slp_node)
6367 tree initial_value = NULL_TREE;
6368 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6369 initial_value = reduc_info->reduc_initial_values[0];
6370 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6371 initial_value);
6373 if (neutral_op)
6374 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6375 neutral_op);
6376 for (unsigned int i = 0; i < group_size; ++i)
6378 /* If there's no univeral neutral value, we can use the
6379 initial scalar value from the original PHI. This is used
6380 for MIN and MAX reduction, for example. */
6381 if (!neutral_op)
6383 tree scalar_value = reduc_info->reduc_initial_values[i];
6384 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6385 scalar_value);
6386 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6387 scalar_value);
6390 /* Calculate the equivalent of:
6392 sel[j] = (index[j] == i);
6394 which selects the elements of REDUC_INPUTS[0] that should
6395 be included in the result. */
6396 tree compare_val = build_int_cst (index_elt_type, i);
6397 compare_val = build_vector_from_val (index_type, compare_val);
6398 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6399 index, compare_val);
6401 /* Calculate the equivalent of:
6403 vec = seq ? reduc_inputs[0] : vector_identity;
6405 VEC is now suitable for a full vector reduction. */
6406 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6407 sel, reduc_inputs[0], vector_identity);
6409 /* Do the reduction and convert it to the appropriate type. */
6410 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6411 TREE_TYPE (vectype), vec);
6412 scalar = gimple_convert (&seq, scalar_type, scalar);
6413 scalar_results.safe_push (scalar);
6415 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6417 else
6419 bool reduce_with_shift;
6420 tree vec_temp;
6422 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6424 /* See if the target wants to do the final (shift) reduction
6425 in a vector mode of smaller size and first reduce upper/lower
6426 halves against each other. */
6427 enum machine_mode mode1 = mode;
6428 tree stype = TREE_TYPE (vectype);
6429 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6430 unsigned nunits1 = nunits;
6431 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6432 && reduc_inputs.length () == 1)
6434 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6435 /* For SLP reductions we have to make sure lanes match up, but
6436 since we're doing individual element final reduction reducing
6437 vector width here is even more important.
6438 ??? We can also separate lanes with permutes, for the common
6439 case of power-of-two group-size odd/even extracts would work. */
6440 if (slp_reduc && nunits != nunits1)
6442 nunits1 = least_common_multiple (nunits1, group_size);
6443 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6446 if (!slp_reduc
6447 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6448 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6450 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6451 stype, nunits1);
6452 reduce_with_shift = have_whole_vector_shift (mode1);
6453 if (!VECTOR_MODE_P (mode1)
6454 || !directly_supported_p (code, vectype1))
6455 reduce_with_shift = false;
6457 /* First reduce the vector to the desired vector size we should
6458 do shift reduction on by combining upper and lower halves. */
6459 gimple_seq stmts = NULL;
6460 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6461 code, &stmts);
6462 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6463 reduc_inputs[0] = new_temp;
6465 if (reduce_with_shift && !slp_reduc)
6467 int element_bitsize = tree_to_uhwi (bitsize);
6468 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6469 for variable-length vectors and also requires direct target support
6470 for loop reductions. */
6471 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6472 int nelements = vec_size_in_bits / element_bitsize;
6473 vec_perm_builder sel;
6474 vec_perm_indices indices;
6476 int elt_offset;
6478 tree zero_vec = build_zero_cst (vectype1);
6479 /* Case 2: Create:
6480 for (offset = nelements/2; offset >= 1; offset/=2)
6482 Create: va' = vec_shift <va, offset>
6483 Create: va = vop <va, va'>
6484 } */
6486 tree rhs;
6488 if (dump_enabled_p ())
6489 dump_printf_loc (MSG_NOTE, vect_location,
6490 "Reduce using vector shifts\n");
6492 gimple_seq stmts = NULL;
6493 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6494 for (elt_offset = nelements / 2;
6495 elt_offset >= 1;
6496 elt_offset /= 2)
6498 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6499 indices.new_vector (sel, 2, nelements);
6500 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6501 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6502 new_temp, zero_vec, mask);
6503 new_temp = gimple_build (&stmts, code,
6504 vectype1, new_name, new_temp);
6506 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6508 /* 2.4 Extract the final scalar result. Create:
6509 s_out3 = extract_field <v_out2, bitpos> */
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_NOTE, vect_location,
6513 "extract scalar result\n");
6515 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6516 bitsize, bitsize_zero_node);
6517 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6518 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6519 gimple_assign_set_lhs (epilog_stmt, new_temp);
6520 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6521 scalar_results.safe_push (new_temp);
6523 else
6525 /* Case 3: Create:
6526 s = extract_field <v_out2, 0>
6527 for (offset = element_size;
6528 offset < vector_size;
6529 offset += element_size;)
6531 Create: s' = extract_field <v_out2, offset>
6532 Create: s = op <s, s'> // For non SLP cases
6533 } */
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_NOTE, vect_location,
6537 "Reduce using scalar code.\n");
6539 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6540 int element_bitsize = tree_to_uhwi (bitsize);
6541 tree compute_type = TREE_TYPE (vectype);
6542 gimple_seq stmts = NULL;
6543 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6545 int bit_offset;
6546 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6547 vec_temp, bitsize, bitsize_zero_node);
6549 /* In SLP we don't need to apply reduction operation, so we just
6550 collect s' values in SCALAR_RESULTS. */
6551 if (slp_reduc)
6552 scalar_results.safe_push (new_temp);
6554 for (bit_offset = element_bitsize;
6555 bit_offset < vec_size_in_bits;
6556 bit_offset += element_bitsize)
6558 tree bitpos = bitsize_int (bit_offset);
6559 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6560 compute_type, vec_temp,
6561 bitsize, bitpos);
6562 if (slp_reduc)
6564 /* In SLP we don't need to apply reduction operation, so
6565 we just collect s' values in SCALAR_RESULTS. */
6566 new_temp = new_name;
6567 scalar_results.safe_push (new_name);
6569 else
6570 new_temp = gimple_build (&stmts, code, compute_type,
6571 new_name, new_temp);
6575 /* The only case where we need to reduce scalar results in SLP, is
6576 unrolling. If the size of SCALAR_RESULTS is greater than
6577 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6578 REDUC_GROUP_SIZE. */
6579 if (slp_reduc)
6581 tree res, first_res, new_res;
6583 /* Reduce multiple scalar results in case of SLP unrolling. */
6584 for (j = group_size; scalar_results.iterate (j, &res);
6585 j++)
6587 first_res = scalar_results[j % group_size];
6588 new_res = gimple_build (&stmts, code, compute_type,
6589 first_res, res);
6590 scalar_results[j % group_size] = new_res;
6592 scalar_results.truncate (group_size);
6593 for (k = 0; k < group_size; k++)
6594 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6595 scalar_results[k]);
6597 else
6599 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6600 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6601 scalar_results.safe_push (new_temp);
6604 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6607 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6608 && induc_val)
6610 /* Earlier we set the initial value to be a vector if induc_val
6611 values. Check the result and if it is induc_val then replace
6612 with the original initial value, unless induc_val is
6613 the same as initial_def already. */
6614 tree zcompare = make_ssa_name (boolean_type_node);
6615 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6616 induc_val);
6617 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6618 tree initial_def = reduc_info->reduc_initial_values[0];
6619 tree tmp = make_ssa_name (new_scalar_dest);
6620 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6621 initial_def, new_temp);
6622 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6623 scalar_results[0] = tmp;
6627 /* 2.5 Adjust the final result by the initial value of the reduction
6628 variable. (When such adjustment is not needed, then
6629 'adjustment_def' is zero). For example, if code is PLUS we create:
6630 new_temp = loop_exit_def + adjustment_def */
6632 if (adjustment_def)
6634 gcc_assert (!slp_reduc);
6635 gimple_seq stmts = NULL;
6636 if (double_reduc)
6638 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6639 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6640 new_temp = gimple_build (&stmts, code, vectype,
6641 reduc_inputs[0], adjustment_def);
6643 else
6645 new_temp = scalar_results[0];
6646 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6647 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6648 adjustment_def);
6649 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6650 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6651 new_temp, adjustment_def);
6652 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6655 epilog_stmt = gimple_seq_last_stmt (stmts);
6656 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6657 scalar_results[0] = new_temp;
6660 /* Record this operation if it could be reused by the epilogue loop. */
6661 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6662 && reduc_inputs.length () == 1)
6663 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6664 { orig_reduc_input, reduc_info });
6666 if (double_reduc)
6667 loop = outer_loop;
6669 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6670 phis with new adjusted scalar results, i.e., replace use <s_out0>
6671 with use <s_out4>.
6673 Transform:
6674 loop_exit:
6675 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6676 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6677 v_out2 = reduce <v_out1>
6678 s_out3 = extract_field <v_out2, 0>
6679 s_out4 = adjust_result <s_out3>
6680 use <s_out0>
6681 use <s_out0>
6683 into:
6685 loop_exit:
6686 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6687 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6688 v_out2 = reduce <v_out1>
6689 s_out3 = extract_field <v_out2, 0>
6690 s_out4 = adjust_result <s_out3>
6691 use <s_out4>
6692 use <s_out4> */
6694 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6695 for (k = 0; k < live_out_stmts.size (); k++)
6697 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6698 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6700 phis.create (3);
6701 /* Find the loop-closed-use at the loop exit of the original scalar
6702 result. (The reduction result is expected to have two immediate uses,
6703 one at the latch block, and one at the loop exit). For double
6704 reductions we are looking for exit phis of the outer loop. */
6705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6707 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6709 if (!is_gimple_debug (USE_STMT (use_p)))
6710 phis.safe_push (USE_STMT (use_p));
6712 else
6714 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6716 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6718 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6720 if (!flow_bb_inside_loop_p (loop,
6721 gimple_bb (USE_STMT (phi_use_p)))
6722 && !is_gimple_debug (USE_STMT (phi_use_p)))
6723 phis.safe_push (USE_STMT (phi_use_p));
6729 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6731 /* Replace the uses: */
6732 orig_name = PHI_RESULT (exit_phi);
6734 /* Look for a single use at the target of the skip edge. */
6735 if (unify_with_main_loop_p)
6737 use_operand_p use_p;
6738 gimple *user;
6739 if (!single_imm_use (orig_name, &use_p, &user))
6740 gcc_unreachable ();
6741 orig_name = gimple_get_lhs (user);
6744 scalar_result = scalar_results[k];
6745 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6747 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6748 SET_USE (use_p, scalar_result);
6749 update_stmt (use_stmt);
6753 phis.release ();
6757 /* Return a vector of type VECTYPE that is equal to the vector select
6758 operation "MASK ? VEC : IDENTITY". Insert the select statements
6759 before GSI. */
6761 static tree
6762 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6763 tree vec, tree identity)
6765 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6766 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6767 mask, vec, identity);
6768 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6769 return cond;
6772 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6773 order, starting with LHS. Insert the extraction statements before GSI and
6774 associate the new scalar SSA names with variable SCALAR_DEST.
6775 Return the SSA name for the result. */
6777 static tree
6778 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6779 tree_code code, tree lhs, tree vector_rhs)
6781 tree vectype = TREE_TYPE (vector_rhs);
6782 tree scalar_type = TREE_TYPE (vectype);
6783 tree bitsize = TYPE_SIZE (scalar_type);
6784 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6785 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6787 for (unsigned HOST_WIDE_INT bit_offset = 0;
6788 bit_offset < vec_size_in_bits;
6789 bit_offset += element_bitsize)
6791 tree bitpos = bitsize_int (bit_offset);
6792 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6793 bitsize, bitpos);
6795 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6796 rhs = make_ssa_name (scalar_dest, stmt);
6797 gimple_assign_set_lhs (stmt, rhs);
6798 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6800 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6801 tree new_name = make_ssa_name (scalar_dest, stmt);
6802 gimple_assign_set_lhs (stmt, new_name);
6803 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6804 lhs = new_name;
6806 return lhs;
6809 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6810 type of the vector input. */
6812 static internal_fn
6813 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6815 internal_fn mask_reduc_fn;
6816 internal_fn mask_len_reduc_fn;
6818 switch (reduc_fn)
6820 case IFN_FOLD_LEFT_PLUS:
6821 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6822 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6823 break;
6825 default:
6826 return IFN_LAST;
6829 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6830 OPTIMIZE_FOR_SPEED))
6831 return mask_reduc_fn;
6832 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6833 OPTIMIZE_FOR_SPEED))
6834 return mask_len_reduc_fn;
6835 return IFN_LAST;
6838 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6839 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6840 statement. CODE is the operation performed by STMT_INFO and OPS are
6841 its scalar operands. REDUC_INDEX is the index of the operand in
6842 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6843 implements in-order reduction, or IFN_LAST if we should open-code it.
6844 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6845 that should be used to control the operation in a fully-masked loop. */
6847 static bool
6848 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6849 stmt_vec_info stmt_info,
6850 gimple_stmt_iterator *gsi,
6851 gimple **vec_stmt, slp_tree slp_node,
6852 gimple *reduc_def_stmt,
6853 tree_code code, internal_fn reduc_fn,
6854 tree ops[3], tree vectype_in,
6855 int reduc_index, vec_loop_masks *masks,
6856 vec_loop_lens *lens)
6858 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6859 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6860 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6862 int ncopies;
6863 if (slp_node)
6864 ncopies = 1;
6865 else
6866 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6868 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6869 gcc_assert (ncopies == 1);
6870 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6872 if (slp_node)
6873 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6874 TYPE_VECTOR_SUBPARTS (vectype_in)));
6876 tree op0 = ops[1 - reduc_index];
6878 int group_size = 1;
6879 stmt_vec_info scalar_dest_def_info;
6880 auto_vec<tree> vec_oprnds0;
6881 if (slp_node)
6883 auto_vec<vec<tree> > vec_defs (2);
6884 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6885 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6886 vec_defs[0].release ();
6887 vec_defs[1].release ();
6888 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6889 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6891 else
6893 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6894 op0, &vec_oprnds0);
6895 scalar_dest_def_info = stmt_info;
6898 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6899 tree scalar_type = TREE_TYPE (scalar_dest);
6900 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6902 int vec_num = vec_oprnds0.length ();
6903 gcc_assert (vec_num == 1 || slp_node);
6904 tree vec_elem_type = TREE_TYPE (vectype_out);
6905 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6907 tree vector_identity = NULL_TREE;
6908 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6910 vector_identity = build_zero_cst (vectype_out);
6911 if (!HONOR_SIGNED_ZEROS (vectype_out))
6913 else
6915 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6916 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6917 vector_identity);
6921 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6922 int i;
6923 tree def0;
6924 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6926 gimple *new_stmt;
6927 tree mask = NULL_TREE;
6928 tree len = NULL_TREE;
6929 tree bias = NULL_TREE;
6930 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6931 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6932 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6934 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6935 i, 1);
6936 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6937 bias = build_int_cst (intQI_type_node, biasval);
6938 mask = build_minus_one_cst (truth_type_for (vectype_in));
6941 /* Handle MINUS by adding the negative. */
6942 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6944 tree negated = make_ssa_name (vectype_out);
6945 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6946 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6947 def0 = negated;
6950 if (mask && mask_reduc_fn == IFN_LAST)
6951 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6952 vector_identity);
6954 /* On the first iteration the input is simply the scalar phi
6955 result, and for subsequent iterations it is the output of
6956 the preceding operation. */
6957 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6959 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6960 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6961 def0, mask, len, bias);
6962 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6963 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6964 def0, mask);
6965 else
6966 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6967 def0);
6968 /* For chained SLP reductions the output of the previous reduction
6969 operation serves as the input of the next. For the final statement
6970 the output cannot be a temporary - we reuse the original
6971 scalar destination of the last statement. */
6972 if (i != vec_num - 1)
6974 gimple_set_lhs (new_stmt, scalar_dest_var);
6975 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6976 gimple_set_lhs (new_stmt, reduc_var);
6979 else
6981 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6982 reduc_var, def0);
6983 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6984 /* Remove the statement, so that we can use the same code paths
6985 as for statements that we've just created. */
6986 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6987 gsi_remove (&tmp_gsi, true);
6990 if (i == vec_num - 1)
6992 gimple_set_lhs (new_stmt, scalar_dest);
6993 vect_finish_replace_stmt (loop_vinfo,
6994 scalar_dest_def_info,
6995 new_stmt);
6997 else
6998 vect_finish_stmt_generation (loop_vinfo,
6999 scalar_dest_def_info,
7000 new_stmt, gsi);
7002 if (slp_node)
7003 slp_node->push_vec_def (new_stmt);
7004 else
7006 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7007 *vec_stmt = new_stmt;
7011 return true;
7014 /* Function is_nonwrapping_integer_induction.
7016 Check if STMT_VINO (which is part of loop LOOP) both increments and
7017 does not cause overflow. */
7019 static bool
7020 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7022 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7023 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7024 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7025 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7026 widest_int ni, max_loop_value, lhs_max;
7027 wi::overflow_type overflow = wi::OVF_NONE;
7029 /* Make sure the loop is integer based. */
7030 if (TREE_CODE (base) != INTEGER_CST
7031 || TREE_CODE (step) != INTEGER_CST)
7032 return false;
7034 /* Check that the max size of the loop will not wrap. */
7036 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7037 return true;
7039 if (! max_stmt_executions (loop, &ni))
7040 return false;
7042 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7043 &overflow);
7044 if (overflow)
7045 return false;
7047 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7048 TYPE_SIGN (lhs_type), &overflow);
7049 if (overflow)
7050 return false;
7052 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7053 <= TYPE_PRECISION (lhs_type));
7056 /* Check if masking can be supported by inserting a conditional expression.
7057 CODE is the code for the operation. COND_FN is the conditional internal
7058 function, if it exists. VECTYPE_IN is the type of the vector input. */
7059 static bool
7060 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7061 tree vectype_in)
7063 if (cond_fn != IFN_LAST
7064 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7065 OPTIMIZE_FOR_SPEED))
7066 return false;
7068 if (code.is_tree_code ())
7069 switch (tree_code (code))
7071 case DOT_PROD_EXPR:
7072 case SAD_EXPR:
7073 return true;
7075 default:
7076 break;
7078 return false;
7081 /* Insert a conditional expression to enable masked vectorization. CODE is the
7082 code for the operation. VOP is the array of operands. MASK is the loop
7083 mask. GSI is a statement iterator used to place the new conditional
7084 expression. */
7085 static void
7086 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7087 gimple_stmt_iterator *gsi)
7089 switch (tree_code (code))
7091 case DOT_PROD_EXPR:
7093 tree vectype = TREE_TYPE (vop[1]);
7094 tree zero = build_zero_cst (vectype);
7095 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7096 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7097 mask, vop[1], zero);
7098 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7099 vop[1] = masked_op1;
7100 break;
7103 case SAD_EXPR:
7105 tree vectype = TREE_TYPE (vop[1]);
7106 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7107 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7108 mask, vop[1], vop[0]);
7109 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7110 vop[1] = masked_op1;
7111 break;
7114 default:
7115 gcc_unreachable ();
7119 /* Function vectorizable_reduction.
7121 Check if STMT_INFO performs a reduction operation that can be vectorized.
7122 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7123 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7124 Return true if STMT_INFO is vectorizable in this way.
7126 This function also handles reduction idioms (patterns) that have been
7127 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7128 may be of this form:
7129 X = pattern_expr (arg0, arg1, ..., X)
7130 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7131 sequence that had been detected and replaced by the pattern-stmt
7132 (STMT_INFO).
7134 This function also handles reduction of condition expressions, for example:
7135 for (int i = 0; i < N; i++)
7136 if (a[i] < value)
7137 last = a[i];
7138 This is handled by vectorising the loop and creating an additional vector
7139 containing the loop indexes for which "a[i] < value" was true. In the
7140 function epilogue this is reduced to a single max value and then used to
7141 index into the vector of results.
7143 In some cases of reduction patterns, the type of the reduction variable X is
7144 different than the type of the other arguments of STMT_INFO.
7145 In such cases, the vectype that is used when transforming STMT_INFO into
7146 a vector stmt is different than the vectype that is used to determine the
7147 vectorization factor, because it consists of a different number of elements
7148 than the actual number of elements that are being operated upon in parallel.
7150 For example, consider an accumulation of shorts into an int accumulator.
7151 On some targets it's possible to vectorize this pattern operating on 8
7152 shorts at a time (hence, the vectype for purposes of determining the
7153 vectorization factor should be V8HI); on the other hand, the vectype that
7154 is used to create the vector form is actually V4SI (the type of the result).
7156 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7157 indicates what is the actual level of parallelism (V8HI in the example), so
7158 that the right vectorization factor would be derived. This vectype
7159 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7160 be used to create the vectorized stmt. The right vectype for the vectorized
7161 stmt is obtained from the type of the result X:
7162 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7164 This means that, contrary to "regular" reductions (or "regular" stmts in
7165 general), the following equation:
7166 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7167 does *NOT* necessarily hold for reduction patterns. */
7169 bool
7170 vectorizable_reduction (loop_vec_info loop_vinfo,
7171 stmt_vec_info stmt_info, slp_tree slp_node,
7172 slp_instance slp_node_instance,
7173 stmt_vector_for_cost *cost_vec)
7175 tree vectype_in = NULL_TREE;
7176 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7177 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7178 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7179 stmt_vec_info cond_stmt_vinfo = NULL;
7180 int i;
7181 int ncopies;
7182 bool single_defuse_cycle = false;
7183 bool nested_cycle = false;
7184 bool double_reduc = false;
7185 int vec_num;
7186 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7187 tree cond_reduc_val = NULL_TREE;
7189 /* Make sure it was already recognized as a reduction computation. */
7190 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7191 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7192 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7193 return false;
7195 /* The stmt we store reduction analysis meta on. */
7196 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7197 reduc_info->is_reduc_info = true;
7199 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7201 if (is_a <gphi *> (stmt_info->stmt))
7203 if (slp_node)
7205 /* We eventually need to set a vector type on invariant
7206 arguments. */
7207 unsigned j;
7208 slp_tree child;
7209 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7210 if (!vect_maybe_update_slp_op_vectype
7211 (child, SLP_TREE_VECTYPE (slp_node)))
7213 if (dump_enabled_p ())
7214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7215 "incompatible vector types for "
7216 "invariants\n");
7217 return false;
7220 /* Analysis for double-reduction is done on the outer
7221 loop PHI, nested cycles have no further restrictions. */
7222 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7224 else
7225 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7226 return true;
7229 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7230 stmt_vec_info phi_info = stmt_info;
7231 if (!is_a <gphi *> (stmt_info->stmt))
7233 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7234 return true;
7236 if (slp_node)
7238 slp_node_instance->reduc_phis = slp_node;
7239 /* ??? We're leaving slp_node to point to the PHIs, we only
7240 need it to get at the number of vector stmts which wasn't
7241 yet initialized for the instance root. */
7243 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7245 use_operand_p use_p;
7246 gimple *use_stmt;
7247 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7248 &use_p, &use_stmt);
7249 gcc_assert (res);
7250 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7253 /* PHIs should not participate in patterns. */
7254 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7255 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7257 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7258 and compute the reduction chain length. Discover the real
7259 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7260 tree reduc_def
7261 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7262 loop_latch_edge
7263 (gimple_bb (reduc_def_phi)->loop_father));
7264 unsigned reduc_chain_length = 0;
7265 bool only_slp_reduc_chain = true;
7266 stmt_info = NULL;
7267 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7268 while (reduc_def != PHI_RESULT (reduc_def_phi))
7270 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7271 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7272 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7274 if (dump_enabled_p ())
7275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7276 "reduction chain broken by patterns.\n");
7277 return false;
7279 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7280 only_slp_reduc_chain = false;
7281 /* For epilogue generation live members of the chain need
7282 to point back to the PHI via their original stmt for
7283 info_for_reduction to work. For SLP we need to look at
7284 all lanes here - even though we only will vectorize from
7285 the SLP node with live lane zero the other live lanes also
7286 need to be identified as part of a reduction to be able
7287 to skip code generation for them. */
7288 if (slp_for_stmt_info)
7290 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7291 if (STMT_VINFO_LIVE_P (s))
7292 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7294 else if (STMT_VINFO_LIVE_P (vdef))
7295 STMT_VINFO_REDUC_DEF (def) = phi_info;
7296 gimple_match_op op;
7297 if (!gimple_extract_op (vdef->stmt, &op))
7299 if (dump_enabled_p ())
7300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7301 "reduction chain includes unsupported"
7302 " statement type.\n");
7303 return false;
7305 if (CONVERT_EXPR_CODE_P (op.code))
7307 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7309 if (dump_enabled_p ())
7310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7311 "conversion in the reduction chain.\n");
7312 return false;
7315 else if (!stmt_info)
7316 /* First non-conversion stmt. */
7317 stmt_info = vdef;
7318 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7319 reduc_chain_length++;
7320 if (!stmt_info && slp_node)
7321 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7323 /* PHIs should not participate in patterns. */
7324 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7326 if (nested_in_vect_loop_p (loop, stmt_info))
7328 loop = loop->inner;
7329 nested_cycle = true;
7332 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7333 element. */
7334 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7336 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7337 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7339 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7340 gcc_assert (slp_node
7341 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7343 /* 1. Is vectorizable reduction? */
7344 /* Not supportable if the reduction variable is used in the loop, unless
7345 it's a reduction chain. */
7346 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7347 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7348 return false;
7350 /* Reductions that are not used even in an enclosing outer-loop,
7351 are expected to be "live" (used out of the loop). */
7352 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7353 && !STMT_VINFO_LIVE_P (stmt_info))
7354 return false;
7356 /* 2. Has this been recognized as a reduction pattern?
7358 Check if STMT represents a pattern that has been recognized
7359 in earlier analysis stages. For stmts that represent a pattern,
7360 the STMT_VINFO_RELATED_STMT field records the last stmt in
7361 the original sequence that constitutes the pattern. */
7363 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7364 if (orig_stmt_info)
7366 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7367 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7370 /* 3. Check the operands of the operation. The first operands are defined
7371 inside the loop body. The last operand is the reduction variable,
7372 which is defined by the loop-header-phi. */
7374 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7375 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7376 gimple_match_op op;
7377 if (!gimple_extract_op (stmt_info->stmt, &op))
7378 gcc_unreachable ();
7379 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7380 || op.code == WIDEN_SUM_EXPR
7381 || op.code == SAD_EXPR);
7383 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7384 && !SCALAR_FLOAT_TYPE_P (op.type))
7385 return false;
7387 /* Do not try to vectorize bit-precision reductions. */
7388 if (!type_has_mode_precision_p (op.type))
7389 return false;
7391 /* For lane-reducing ops we're reducing the number of reduction PHIs
7392 which means the only use of that may be in the lane-reducing operation. */
7393 if (lane_reduc_code_p
7394 && reduc_chain_length != 1
7395 && !only_slp_reduc_chain)
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399 "lane-reducing reduction with extra stmts.\n");
7400 return false;
7403 /* All uses but the last are expected to be defined in the loop.
7404 The last use is the reduction variable. In case of nested cycle this
7405 assumption is not true: we use reduc_index to record the index of the
7406 reduction variable. */
7407 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7408 /* We need to skip an extra operand for COND_EXPRs with embedded
7409 comparison. */
7410 unsigned opno_adjust = 0;
7411 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7412 opno_adjust = 1;
7413 for (i = 0; i < (int) op.num_ops; i++)
7415 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7416 if (i == 0 && op.code == COND_EXPR)
7417 continue;
7419 stmt_vec_info def_stmt_info;
7420 enum vect_def_type dt;
7421 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7422 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7423 &vectype_op[i], &def_stmt_info))
7425 if (dump_enabled_p ())
7426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7427 "use not simple.\n");
7428 return false;
7430 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7431 continue;
7433 /* There should be only one cycle def in the stmt, the one
7434 leading to reduc_def. */
7435 if (VECTORIZABLE_CYCLE_DEF (dt))
7436 return false;
7438 if (!vectype_op[i])
7439 vectype_op[i]
7440 = get_vectype_for_scalar_type (loop_vinfo,
7441 TREE_TYPE (op.ops[i]), slp_op[i]);
7443 /* To properly compute ncopies we are interested in the widest
7444 non-reduction input type in case we're looking at a widening
7445 accumulation that we later handle in vect_transform_reduction. */
7446 if (lane_reduc_code_p
7447 && vectype_op[i]
7448 && (!vectype_in
7449 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7450 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7451 vectype_in = vectype_op[i];
7453 if (op.code == COND_EXPR)
7455 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7456 if (dt == vect_constant_def)
7458 cond_reduc_dt = dt;
7459 cond_reduc_val = op.ops[i];
7461 if (dt == vect_induction_def
7462 && def_stmt_info
7463 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7465 cond_reduc_dt = dt;
7466 cond_stmt_vinfo = def_stmt_info;
7470 if (!vectype_in)
7471 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7472 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7474 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7475 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7476 /* If we have a condition reduction, see if we can simplify it further. */
7477 if (v_reduc_type == COND_REDUCTION)
7479 if (slp_node)
7480 return false;
7482 /* When the condition uses the reduction value in the condition, fail. */
7483 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7485 if (dump_enabled_p ())
7486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7487 "condition depends on previous iteration\n");
7488 return false;
7491 if (reduc_chain_length == 1
7492 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7493 vectype_in, OPTIMIZE_FOR_SPEED))
7495 if (dump_enabled_p ())
7496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7497 "optimizing condition reduction with"
7498 " FOLD_EXTRACT_LAST.\n");
7499 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7501 else if (cond_reduc_dt == vect_induction_def)
7503 tree base
7504 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7505 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7507 gcc_assert (TREE_CODE (base) == INTEGER_CST
7508 && TREE_CODE (step) == INTEGER_CST);
7509 cond_reduc_val = NULL_TREE;
7510 enum tree_code cond_reduc_op_code = ERROR_MARK;
7511 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7512 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7514 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7515 above base; punt if base is the minimum value of the type for
7516 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7517 else if (tree_int_cst_sgn (step) == -1)
7519 cond_reduc_op_code = MIN_EXPR;
7520 if (tree_int_cst_sgn (base) == -1)
7521 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7522 else if (tree_int_cst_lt (base,
7523 TYPE_MAX_VALUE (TREE_TYPE (base))))
7524 cond_reduc_val
7525 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7527 else
7529 cond_reduc_op_code = MAX_EXPR;
7530 if (tree_int_cst_sgn (base) == 1)
7531 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7532 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7533 base))
7534 cond_reduc_val
7535 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7537 if (cond_reduc_val)
7539 if (dump_enabled_p ())
7540 dump_printf_loc (MSG_NOTE, vect_location,
7541 "condition expression based on "
7542 "integer induction.\n");
7543 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7544 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7545 = cond_reduc_val;
7546 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7549 else if (cond_reduc_dt == vect_constant_def)
7551 enum vect_def_type cond_initial_dt;
7552 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7553 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7554 if (cond_initial_dt == vect_constant_def
7555 && types_compatible_p (TREE_TYPE (cond_initial_val),
7556 TREE_TYPE (cond_reduc_val)))
7558 tree e = fold_binary (LE_EXPR, boolean_type_node,
7559 cond_initial_val, cond_reduc_val);
7560 if (e && (integer_onep (e) || integer_zerop (e)))
7562 if (dump_enabled_p ())
7563 dump_printf_loc (MSG_NOTE, vect_location,
7564 "condition expression based on "
7565 "compile time constant.\n");
7566 /* Record reduction code at analysis stage. */
7567 STMT_VINFO_REDUC_CODE (reduc_info)
7568 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7569 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7575 if (STMT_VINFO_LIVE_P (phi_info))
7576 return false;
7578 if (slp_node)
7579 ncopies = 1;
7580 else
7581 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7583 gcc_assert (ncopies >= 1);
7585 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7587 if (nested_cycle)
7589 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7590 == vect_double_reduction_def);
7591 double_reduc = true;
7594 /* 4.2. Check support for the epilog operation.
7596 If STMT represents a reduction pattern, then the type of the
7597 reduction variable may be different than the type of the rest
7598 of the arguments. For example, consider the case of accumulation
7599 of shorts into an int accumulator; The original code:
7600 S1: int_a = (int) short_a;
7601 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7603 was replaced with:
7604 STMT: int_acc = widen_sum <short_a, int_acc>
7606 This means that:
7607 1. The tree-code that is used to create the vector operation in the
7608 epilog code (that reduces the partial results) is not the
7609 tree-code of STMT, but is rather the tree-code of the original
7610 stmt from the pattern that STMT is replacing. I.e, in the example
7611 above we want to use 'widen_sum' in the loop, but 'plus' in the
7612 epilog.
7613 2. The type (mode) we use to check available target support
7614 for the vector operation to be created in the *epilog*, is
7615 determined by the type of the reduction variable (in the example
7616 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7617 However the type (mode) we use to check available target support
7618 for the vector operation to be created *inside the loop*, is
7619 determined by the type of the other arguments to STMT (in the
7620 example we'd check this: optab_handler (widen_sum_optab,
7621 vect_short_mode)).
7623 This is contrary to "regular" reductions, in which the types of all
7624 the arguments are the same as the type of the reduction variable.
7625 For "regular" reductions we can therefore use the same vector type
7626 (and also the same tree-code) when generating the epilog code and
7627 when generating the code inside the loop. */
7629 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7630 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7632 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7633 if (reduction_type == TREE_CODE_REDUCTION)
7635 /* Check whether it's ok to change the order of the computation.
7636 Generally, when vectorizing a reduction we change the order of the
7637 computation. This may change the behavior of the program in some
7638 cases, so we need to check that this is ok. One exception is when
7639 vectorizing an outer-loop: the inner-loop is executed sequentially,
7640 and therefore vectorizing reductions in the inner-loop during
7641 outer-loop vectorization is safe. Likewise when we are vectorizing
7642 a series of reductions using SLP and the VF is one the reductions
7643 are performed in scalar order. */
7644 if (slp_node
7645 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7646 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7648 else if (needs_fold_left_reduction_p (op.type, orig_code))
7650 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7651 is not directy used in stmt. */
7652 if (!only_slp_reduc_chain
7653 && reduc_chain_length != 1)
7655 if (dump_enabled_p ())
7656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7657 "in-order reduction chain without SLP.\n");
7658 return false;
7660 STMT_VINFO_REDUC_TYPE (reduc_info)
7661 = reduction_type = FOLD_LEFT_REDUCTION;
7663 else if (!commutative_binary_op_p (orig_code, op.type)
7664 || !associative_binary_op_p (orig_code, op.type))
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "reduction: not commutative/associative");
7669 return false;
7673 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7674 && ncopies > 1)
7676 if (dump_enabled_p ())
7677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7678 "multiple types in double reduction or condition "
7679 "reduction or fold-left reduction.\n");
7680 return false;
7683 internal_fn reduc_fn = IFN_LAST;
7684 if (reduction_type == TREE_CODE_REDUCTION
7685 || reduction_type == FOLD_LEFT_REDUCTION
7686 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7687 || reduction_type == CONST_COND_REDUCTION)
7689 if (reduction_type == FOLD_LEFT_REDUCTION
7690 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7691 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7693 if (reduc_fn != IFN_LAST
7694 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7695 OPTIMIZE_FOR_SPEED))
7697 if (dump_enabled_p ())
7698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7699 "reduc op not supported by target.\n");
7701 reduc_fn = IFN_LAST;
7704 else
7706 if (!nested_cycle || double_reduc)
7708 if (dump_enabled_p ())
7709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7710 "no reduc code for scalar code.\n");
7712 return false;
7716 else if (reduction_type == COND_REDUCTION)
7718 int scalar_precision
7719 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7720 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7721 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7722 vectype_out);
7724 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7725 OPTIMIZE_FOR_SPEED))
7726 reduc_fn = IFN_REDUC_MAX;
7728 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7730 if (reduction_type != EXTRACT_LAST_REDUCTION
7731 && (!nested_cycle || double_reduc)
7732 && reduc_fn == IFN_LAST
7733 && !nunits_out.is_constant ())
7735 if (dump_enabled_p ())
7736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7737 "missing target support for reduction on"
7738 " variable-length vectors.\n");
7739 return false;
7742 /* For SLP reductions, see if there is a neutral value we can use. */
7743 tree neutral_op = NULL_TREE;
7744 if (slp_node)
7746 tree initial_value = NULL_TREE;
7747 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7748 initial_value = vect_phi_initial_value (reduc_def_phi);
7749 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7750 orig_code, initial_value);
7753 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7755 /* We can't support in-order reductions of code such as this:
7757 for (int i = 0; i < n1; ++i)
7758 for (int j = 0; j < n2; ++j)
7759 l += a[j];
7761 since GCC effectively transforms the loop when vectorizing:
7763 for (int i = 0; i < n1 / VF; ++i)
7764 for (int j = 0; j < n2; ++j)
7765 for (int k = 0; k < VF; ++k)
7766 l += a[j];
7768 which is a reassociation of the original operation. */
7769 if (dump_enabled_p ())
7770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7771 "in-order double reduction not supported.\n");
7773 return false;
7776 if (reduction_type == FOLD_LEFT_REDUCTION
7777 && slp_node
7778 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7780 /* We cannot use in-order reductions in this case because there is
7781 an implicit reassociation of the operations involved. */
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7784 "in-order unchained SLP reductions not supported.\n");
7785 return false;
7788 /* For double reductions, and for SLP reductions with a neutral value,
7789 we construct a variable-length initial vector by loading a vector
7790 full of the neutral value and then shift-and-inserting the start
7791 values into the low-numbered elements. */
7792 if ((double_reduc || neutral_op)
7793 && !nunits_out.is_constant ()
7794 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7795 vectype_out, OPTIMIZE_FOR_SPEED))
7797 if (dump_enabled_p ())
7798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7799 "reduction on variable-length vectors requires"
7800 " target support for a vector-shift-and-insert"
7801 " operation.\n");
7802 return false;
7805 /* Check extra constraints for variable-length unchained SLP reductions. */
7806 if (slp_node
7807 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7808 && !nunits_out.is_constant ())
7810 /* We checked above that we could build the initial vector when
7811 there's a neutral element value. Check here for the case in
7812 which each SLP statement has its own initial value and in which
7813 that value needs to be repeated for every instance of the
7814 statement within the initial vector. */
7815 unsigned int group_size = SLP_TREE_LANES (slp_node);
7816 if (!neutral_op
7817 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7818 TREE_TYPE (vectype_out)))
7820 if (dump_enabled_p ())
7821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7822 "unsupported form of SLP reduction for"
7823 " variable-length vectors: cannot build"
7824 " initial vector.\n");
7825 return false;
7827 /* The epilogue code relies on the number of elements being a multiple
7828 of the group size. The duplicate-and-interleave approach to setting
7829 up the initial vector does too. */
7830 if (!multiple_p (nunits_out, group_size))
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7834 "unsupported form of SLP reduction for"
7835 " variable-length vectors: the vector size"
7836 " is not a multiple of the number of results.\n");
7837 return false;
7841 if (reduction_type == COND_REDUCTION)
7843 widest_int ni;
7845 if (! max_loop_iterations (loop, &ni))
7847 if (dump_enabled_p ())
7848 dump_printf_loc (MSG_NOTE, vect_location,
7849 "loop count not known, cannot create cond "
7850 "reduction.\n");
7851 return false;
7853 /* Convert backedges to iterations. */
7854 ni += 1;
7856 /* The additional index will be the same type as the condition. Check
7857 that the loop can fit into this less one (because we'll use up the
7858 zero slot for when there are no matches). */
7859 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7860 if (wi::geu_p (ni, wi::to_widest (max_index)))
7862 if (dump_enabled_p ())
7863 dump_printf_loc (MSG_NOTE, vect_location,
7864 "loop size is greater than data size.\n");
7865 return false;
7869 /* In case the vectorization factor (VF) is bigger than the number
7870 of elements that we can fit in a vectype (nunits), we have to generate
7871 more than one vector stmt - i.e - we need to "unroll" the
7872 vector stmt by a factor VF/nunits. For more details see documentation
7873 in vectorizable_operation. */
7875 /* If the reduction is used in an outer loop we need to generate
7876 VF intermediate results, like so (e.g. for ncopies=2):
7877 r0 = phi (init, r0)
7878 r1 = phi (init, r1)
7879 r0 = x0 + r0;
7880 r1 = x1 + r1;
7881 (i.e. we generate VF results in 2 registers).
7882 In this case we have a separate def-use cycle for each copy, and therefore
7883 for each copy we get the vector def for the reduction variable from the
7884 respective phi node created for this copy.
7886 Otherwise (the reduction is unused in the loop nest), we can combine
7887 together intermediate results, like so (e.g. for ncopies=2):
7888 r = phi (init, r)
7889 r = x0 + r;
7890 r = x1 + r;
7891 (i.e. we generate VF/2 results in a single register).
7892 In this case for each copy we get the vector def for the reduction variable
7893 from the vectorized reduction operation generated in the previous iteration.
7895 This only works when we see both the reduction PHI and its only consumer
7896 in vectorizable_reduction and there are no intermediate stmts
7897 participating. When unrolling we want each unrolled iteration to have its
7898 own reduction accumulator since one of the main goals of unrolling a
7899 reduction is to reduce the aggregate loop-carried latency. */
7900 if (ncopies > 1
7901 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7902 && reduc_chain_length == 1
7903 && loop_vinfo->suggested_unroll_factor == 1)
7904 single_defuse_cycle = true;
7906 if (single_defuse_cycle || lane_reduc_code_p)
7908 gcc_assert (op.code != COND_EXPR);
7910 /* 4. Supportable by target? */
7911 bool ok = true;
7913 /* 4.1. check support for the operation in the loop
7915 This isn't necessary for the lane reduction codes, since they
7916 can only be produced by pattern matching, and it's up to the
7917 pattern matcher to test for support. The main reason for
7918 specifically skipping this step is to avoid rechecking whether
7919 mixed-sign dot-products can be implemented using signed
7920 dot-products. */
7921 machine_mode vec_mode = TYPE_MODE (vectype_in);
7922 if (!lane_reduc_code_p
7923 && !directly_supported_p (op.code, vectype_in, optab_vector))
7925 if (dump_enabled_p ())
7926 dump_printf (MSG_NOTE, "op not supported by target.\n");
7927 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7928 || !vect_can_vectorize_without_simd_p (op.code))
7929 ok = false;
7930 else
7931 if (dump_enabled_p ())
7932 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7935 if (vect_emulated_vector_p (vectype_in)
7936 && !vect_can_vectorize_without_simd_p (op.code))
7938 if (dump_enabled_p ())
7939 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7940 return false;
7943 /* lane-reducing operations have to go through vect_transform_reduction.
7944 For the other cases try without the single cycle optimization. */
7945 if (!ok)
7947 if (lane_reduc_code_p)
7948 return false;
7949 else
7950 single_defuse_cycle = false;
7953 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7955 /* If the reduction stmt is one of the patterns that have lane
7956 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7957 if ((ncopies > 1 && ! single_defuse_cycle)
7958 && lane_reduc_code_p)
7960 if (dump_enabled_p ())
7961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7962 "multi def-use cycle not possible for lane-reducing "
7963 "reduction operation\n");
7964 return false;
7967 if (slp_node
7968 && !(!single_defuse_cycle
7969 && !lane_reduc_code_p
7970 && reduction_type != FOLD_LEFT_REDUCTION))
7971 for (i = 0; i < (int) op.num_ops; i++)
7972 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7974 if (dump_enabled_p ())
7975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7976 "incompatible vector types for invariants\n");
7977 return false;
7980 if (slp_node)
7981 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7982 else
7983 vec_num = 1;
7985 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7986 reduction_type, ncopies, cost_vec);
7987 /* Cost the reduction op inside the loop if transformed via
7988 vect_transform_reduction. Otherwise this is costed by the
7989 separate vectorizable_* routines. */
7990 if (single_defuse_cycle || lane_reduc_code_p)
7992 int factor = 1;
7993 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7994 /* Three dot-products and a subtraction. */
7995 factor = 4;
7996 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7997 stmt_info, 0, vect_body);
8000 if (dump_enabled_p ()
8001 && reduction_type == FOLD_LEFT_REDUCTION)
8002 dump_printf_loc (MSG_NOTE, vect_location,
8003 "using an in-order (fold-left) reduction.\n");
8004 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8005 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8006 reductions go through their own vectorizable_* routines. */
8007 if (!single_defuse_cycle
8008 && !lane_reduc_code_p
8009 && reduction_type != FOLD_LEFT_REDUCTION)
8011 stmt_vec_info tem
8012 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8013 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8015 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8016 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8018 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8019 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8021 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8023 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8024 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8025 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8027 if (reduction_type != FOLD_LEFT_REDUCTION
8028 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8029 && (cond_fn == IFN_LAST
8030 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8031 OPTIMIZE_FOR_SPEED)))
8033 if (dump_enabled_p ())
8034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8035 "can't operate on partial vectors because"
8036 " no conditional operation is available.\n");
8037 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8039 else if (reduction_type == FOLD_LEFT_REDUCTION
8040 && reduc_fn == IFN_LAST
8041 && !expand_vec_cond_expr_p (vectype_in,
8042 truth_type_for (vectype_in),
8043 SSA_NAME))
8045 if (dump_enabled_p ())
8046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8047 "can't operate on partial vectors because"
8048 " no conditional operation is available.\n");
8049 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8051 else if (reduction_type == FOLD_LEFT_REDUCTION
8052 && reduc_fn == IFN_LAST
8053 && FLOAT_TYPE_P (vectype_in)
8054 && HONOR_SIGNED_ZEROS (vectype_in)
8055 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8057 if (dump_enabled_p ())
8058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8059 "can't operate on partial vectors because"
8060 " signed zeros cannot be preserved.\n");
8061 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8063 else
8065 internal_fn mask_reduc_fn
8066 = get_masked_reduction_fn (reduc_fn, vectype_in);
8068 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8069 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8070 vectype_in, 1);
8071 else
8072 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8073 vectype_in, NULL);
8076 return true;
8079 /* STMT_INFO is a dot-product reduction whose multiplication operands
8080 have different signs. Emit a sequence to emulate the operation
8081 using a series of signed DOT_PROD_EXPRs and return the last
8082 statement generated. VEC_DEST is the result of the vector operation
8083 and VOP lists its inputs. */
8085 static gassign *
8086 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8087 gimple_stmt_iterator *gsi, tree vec_dest,
8088 tree vop[3])
8090 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8091 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8092 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8093 gimple *new_stmt;
8095 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8096 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8097 std::swap (vop[0], vop[1]);
8099 /* Convert all inputs to signed types. */
8100 for (int i = 0; i < 3; ++i)
8101 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8103 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8104 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8105 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8106 vop[i] = tmp;
8109 /* In the comments below we assume 8-bit inputs for simplicity,
8110 but the approach works for any full integer type. */
8112 /* Create a vector of -128. */
8113 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8114 tree min_narrow = build_vector_from_val (narrow_vectype,
8115 min_narrow_elttype);
8117 /* Create a vector of 64. */
8118 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8119 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8120 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8122 /* Emit: SUB_RES = VOP[0] - 128. */
8123 tree sub_res = make_ssa_name (narrow_vectype);
8124 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8125 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8127 /* Emit:
8129 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8130 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8131 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8133 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8134 Doing the two 64 * y steps first allows more time to compute x. */
8135 tree stage1 = make_ssa_name (wide_vectype);
8136 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8137 vop[1], half_narrow, vop[2]);
8138 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8140 tree stage2 = make_ssa_name (wide_vectype);
8141 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8142 vop[1], half_narrow, stage1);
8143 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8145 tree stage3 = make_ssa_name (wide_vectype);
8146 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8147 sub_res, vop[1], stage2);
8148 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8150 /* Convert STAGE3 to the reduction type. */
8151 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8154 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8155 value. */
8157 bool
8158 vect_transform_reduction (loop_vec_info loop_vinfo,
8159 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8160 gimple **vec_stmt, slp_tree slp_node)
8162 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8163 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8164 int i;
8165 int ncopies;
8166 int vec_num;
8168 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8169 gcc_assert (reduc_info->is_reduc_info);
8171 if (nested_in_vect_loop_p (loop, stmt_info))
8173 loop = loop->inner;
8174 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8177 gimple_match_op op;
8178 if (!gimple_extract_op (stmt_info->stmt, &op))
8179 gcc_unreachable ();
8181 /* All uses but the last are expected to be defined in the loop.
8182 The last use is the reduction variable. In case of nested cycle this
8183 assumption is not true: we use reduc_index to record the index of the
8184 reduction variable. */
8185 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8186 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8187 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8188 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8190 if (slp_node)
8192 ncopies = 1;
8193 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8195 else
8197 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8198 vec_num = 1;
8201 code_helper code = canonicalize_code (op.code, op.type);
8202 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8203 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8204 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8205 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8207 /* Transform. */
8208 tree new_temp = NULL_TREE;
8209 auto_vec<tree> vec_oprnds0;
8210 auto_vec<tree> vec_oprnds1;
8211 auto_vec<tree> vec_oprnds2;
8212 tree def0;
8214 if (dump_enabled_p ())
8215 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8217 /* FORNOW: Multiple types are not supported for condition. */
8218 if (code == COND_EXPR)
8219 gcc_assert (ncopies == 1);
8221 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8223 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8224 if (reduction_type == FOLD_LEFT_REDUCTION)
8226 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8227 gcc_assert (code.is_tree_code ());
8228 return vectorize_fold_left_reduction
8229 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8230 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8231 lens);
8234 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8235 gcc_assert (single_defuse_cycle
8236 || code == DOT_PROD_EXPR
8237 || code == WIDEN_SUM_EXPR
8238 || code == SAD_EXPR);
8240 /* Create the destination vector */
8241 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8242 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8244 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8245 single_defuse_cycle && reduc_index == 0
8246 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8247 single_defuse_cycle && reduc_index == 1
8248 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8249 op.num_ops == 3
8250 && !(single_defuse_cycle && reduc_index == 2)
8251 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8252 if (single_defuse_cycle)
8254 gcc_assert (!slp_node);
8255 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8256 op.ops[reduc_index],
8257 reduc_index == 0 ? &vec_oprnds0
8258 : (reduc_index == 1 ? &vec_oprnds1
8259 : &vec_oprnds2));
8262 bool emulated_mixed_dot_prod
8263 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8264 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8266 gimple *new_stmt;
8267 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8268 if (masked_loop_p && !mask_by_cond_expr)
8270 /* No conditional ifns have been defined for dot-product yet. */
8271 gcc_assert (code != DOT_PROD_EXPR);
8273 /* Make sure that the reduction accumulator is vop[0]. */
8274 if (reduc_index == 1)
8276 gcc_assert (commutative_binary_op_p (code, op.type));
8277 std::swap (vop[0], vop[1]);
8279 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8280 vec_num * ncopies, vectype_in, i);
8281 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8282 vop[0], vop[1], vop[0]);
8283 new_temp = make_ssa_name (vec_dest, call);
8284 gimple_call_set_lhs (call, new_temp);
8285 gimple_call_set_nothrow (call, true);
8286 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8287 new_stmt = call;
8289 else
8291 if (op.num_ops == 3)
8292 vop[2] = vec_oprnds2[i];
8294 if (masked_loop_p && mask_by_cond_expr)
8296 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8297 vec_num * ncopies, vectype_in, i);
8298 build_vect_cond_expr (code, vop, mask, gsi);
8301 if (emulated_mixed_dot_prod)
8302 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8303 vec_dest, vop);
8304 else if (code.is_internal_fn ())
8305 new_stmt = gimple_build_call_internal (internal_fn (code),
8306 op.num_ops,
8307 vop[0], vop[1], vop[2]);
8308 else
8309 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8310 vop[0], vop[1], vop[2]);
8311 new_temp = make_ssa_name (vec_dest, new_stmt);
8312 gimple_set_lhs (new_stmt, new_temp);
8313 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8316 if (slp_node)
8317 slp_node->push_vec_def (new_stmt);
8318 else if (single_defuse_cycle
8319 && i < ncopies - 1)
8321 if (reduc_index == 0)
8322 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8323 else if (reduc_index == 1)
8324 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8325 else if (reduc_index == 2)
8326 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8328 else
8329 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8332 if (!slp_node)
8333 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8335 return true;
8338 /* Transform phase of a cycle PHI. */
8340 bool
8341 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8342 stmt_vec_info stmt_info, gimple **vec_stmt,
8343 slp_tree slp_node, slp_instance slp_node_instance)
8345 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8346 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8347 int i;
8348 int ncopies;
8349 int j;
8350 bool nested_cycle = false;
8351 int vec_num;
8353 if (nested_in_vect_loop_p (loop, stmt_info))
8355 loop = loop->inner;
8356 nested_cycle = true;
8359 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8360 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8361 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8362 gcc_assert (reduc_info->is_reduc_info);
8364 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8365 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8366 /* Leave the scalar phi in place. */
8367 return true;
8369 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8370 /* For a nested cycle we do not fill the above. */
8371 if (!vectype_in)
8372 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8373 gcc_assert (vectype_in);
8375 if (slp_node)
8377 /* The size vect_schedule_slp_instance computes is off for us. */
8378 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8379 * SLP_TREE_LANES (slp_node), vectype_in);
8380 ncopies = 1;
8382 else
8384 vec_num = 1;
8385 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8388 /* Check whether we should use a single PHI node and accumulate
8389 vectors to one before the backedge. */
8390 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8391 ncopies = 1;
8393 /* Create the destination vector */
8394 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8395 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8396 vectype_out);
8398 /* Get the loop-entry arguments. */
8399 tree vec_initial_def = NULL_TREE;
8400 auto_vec<tree> vec_initial_defs;
8401 if (slp_node)
8403 vec_initial_defs.reserve (vec_num);
8404 if (nested_cycle)
8406 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8407 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8408 &vec_initial_defs);
8410 else
8412 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8413 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8414 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8416 unsigned int num_phis = stmts.length ();
8417 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8418 num_phis = 1;
8419 initial_values.reserve (num_phis);
8420 for (unsigned int i = 0; i < num_phis; ++i)
8422 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8423 initial_values.quick_push (vect_phi_initial_value (this_phi));
8425 if (vec_num == 1)
8426 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8427 if (!initial_values.is_empty ())
8429 tree initial_value
8430 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8431 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8432 tree neutral_op
8433 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8434 code, initial_value);
8435 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8436 &vec_initial_defs, vec_num,
8437 stmts.length (), neutral_op);
8441 else
8443 /* Get at the scalar def before the loop, that defines the initial
8444 value of the reduction variable. */
8445 tree initial_def = vect_phi_initial_value (phi);
8446 reduc_info->reduc_initial_values.safe_push (initial_def);
8447 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8448 and we can't use zero for induc_val, use initial_def. Similarly
8449 for REDUC_MIN and initial_def larger than the base. */
8450 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8452 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8453 if (TREE_CODE (initial_def) == INTEGER_CST
8454 && !integer_zerop (induc_val)
8455 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8456 && tree_int_cst_lt (initial_def, induc_val))
8457 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8458 && tree_int_cst_lt (induc_val, initial_def))))
8460 induc_val = initial_def;
8461 /* Communicate we used the initial_def to epilouge
8462 generation. */
8463 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8465 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8467 else if (nested_cycle)
8469 /* Do not use an adjustment def as that case is not supported
8470 correctly if ncopies is not one. */
8471 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8472 ncopies, initial_def,
8473 &vec_initial_defs);
8475 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8476 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8477 /* Fill the initial vector with the initial scalar value. */
8478 vec_initial_def
8479 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8480 initial_def, initial_def);
8481 else
8483 if (ncopies == 1)
8484 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8485 if (!reduc_info->reduc_initial_values.is_empty ())
8487 initial_def = reduc_info->reduc_initial_values[0];
8488 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8489 tree neutral_op
8490 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8491 code, initial_def);
8492 gcc_assert (neutral_op);
8493 /* Try to simplify the vector initialization by applying an
8494 adjustment after the reduction has been performed. */
8495 if (!reduc_info->reused_accumulator
8496 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8497 && !operand_equal_p (neutral_op, initial_def))
8499 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8500 = initial_def;
8501 initial_def = neutral_op;
8503 vec_initial_def
8504 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8505 initial_def, neutral_op);
8510 if (vec_initial_def)
8512 vec_initial_defs.create (ncopies);
8513 for (i = 0; i < ncopies; ++i)
8514 vec_initial_defs.quick_push (vec_initial_def);
8517 if (auto *accumulator = reduc_info->reused_accumulator)
8519 tree def = accumulator->reduc_input;
8520 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8522 unsigned int nreduc;
8523 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8524 (TREE_TYPE (def)),
8525 TYPE_VECTOR_SUBPARTS (vectype_out),
8526 &nreduc);
8527 gcc_assert (res);
8528 gimple_seq stmts = NULL;
8529 /* Reduce the single vector to a smaller one. */
8530 if (nreduc != 1)
8532 /* Perform the reduction in the appropriate type. */
8533 tree rvectype = vectype_out;
8534 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8535 TREE_TYPE (TREE_TYPE (def))))
8536 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8537 TYPE_VECTOR_SUBPARTS
8538 (vectype_out));
8539 def = vect_create_partial_epilog (def, rvectype,
8540 STMT_VINFO_REDUC_CODE
8541 (reduc_info),
8542 &stmts);
8544 /* The epilogue loop might use a different vector mode, like
8545 VNx2DI vs. V2DI. */
8546 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8548 tree reduc_type = build_vector_type_for_mode
8549 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8550 def = gimple_convert (&stmts, reduc_type, def);
8552 /* Adjust the input so we pick up the partially reduced value
8553 for the skip edge in vect_create_epilog_for_reduction. */
8554 accumulator->reduc_input = def;
8555 /* And the reduction could be carried out using a different sign. */
8556 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8557 def = gimple_convert (&stmts, vectype_out, def);
8558 if (loop_vinfo->main_loop_edge)
8560 /* While we'd like to insert on the edge this will split
8561 blocks and disturb bookkeeping, we also will eventually
8562 need this on the skip edge. Rely on sinking to
8563 fixup optimal placement and insert in the pred. */
8564 gimple_stmt_iterator gsi
8565 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8566 /* Insert before a cond that eventually skips the
8567 epilogue. */
8568 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8569 gsi_prev (&gsi);
8570 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8572 else
8573 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8574 stmts);
8576 if (loop_vinfo->main_loop_edge)
8577 vec_initial_defs[0]
8578 = vect_get_main_loop_result (loop_vinfo, def,
8579 vec_initial_defs[0]);
8580 else
8581 vec_initial_defs.safe_push (def);
8584 /* Generate the reduction PHIs upfront. */
8585 for (i = 0; i < vec_num; i++)
8587 tree vec_init_def = vec_initial_defs[i];
8588 for (j = 0; j < ncopies; j++)
8590 /* Create the reduction-phi that defines the reduction
8591 operand. */
8592 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8594 /* Set the loop-entry arg of the reduction-phi. */
8595 if (j != 0 && nested_cycle)
8596 vec_init_def = vec_initial_defs[j];
8597 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8598 UNKNOWN_LOCATION);
8600 /* The loop-latch arg is set in epilogue processing. */
8602 if (slp_node)
8603 slp_node->push_vec_def (new_phi);
8604 else
8606 if (j == 0)
8607 *vec_stmt = new_phi;
8608 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8613 return true;
8616 /* Vectorizes LC PHIs. */
8618 bool
8619 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8620 stmt_vec_info stmt_info, gimple **vec_stmt,
8621 slp_tree slp_node)
8623 if (!loop_vinfo
8624 || !is_a <gphi *> (stmt_info->stmt)
8625 || gimple_phi_num_args (stmt_info->stmt) != 1)
8626 return false;
8628 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8629 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8630 return false;
8632 if (!vec_stmt) /* transformation not required. */
8634 /* Deal with copies from externs or constants that disguise as
8635 loop-closed PHI nodes (PR97886). */
8636 if (slp_node
8637 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8638 SLP_TREE_VECTYPE (slp_node)))
8640 if (dump_enabled_p ())
8641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8642 "incompatible vector types for invariants\n");
8643 return false;
8645 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8646 return true;
8649 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8650 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8651 basic_block bb = gimple_bb (stmt_info->stmt);
8652 edge e = single_pred_edge (bb);
8653 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8654 auto_vec<tree> vec_oprnds;
8655 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8656 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8657 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8658 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8660 /* Create the vectorized LC PHI node. */
8661 gphi *new_phi = create_phi_node (vec_dest, bb);
8662 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8663 if (slp_node)
8664 slp_node->push_vec_def (new_phi);
8665 else
8666 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8668 if (!slp_node)
8669 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8671 return true;
8674 /* Vectorizes PHIs. */
8676 bool
8677 vectorizable_phi (vec_info *,
8678 stmt_vec_info stmt_info, gimple **vec_stmt,
8679 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8681 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8682 return false;
8684 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8685 return false;
8687 tree vectype = SLP_TREE_VECTYPE (slp_node);
8689 if (!vec_stmt) /* transformation not required. */
8691 slp_tree child;
8692 unsigned i;
8693 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8694 if (!child)
8696 if (dump_enabled_p ())
8697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8698 "PHI node with unvectorized backedge def\n");
8699 return false;
8701 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8703 if (dump_enabled_p ())
8704 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8705 "incompatible vector types for invariants\n");
8706 return false;
8708 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8709 && !useless_type_conversion_p (vectype,
8710 SLP_TREE_VECTYPE (child)))
8712 /* With bools we can have mask and non-mask precision vectors
8713 or different non-mask precisions. while pattern recog is
8714 supposed to guarantee consistency here bugs in it can cause
8715 mismatches (PR103489 and PR103800 for example).
8716 Deal with them here instead of ICEing later. */
8717 if (dump_enabled_p ())
8718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8719 "incompatible vector type setup from "
8720 "bool pattern detection\n");
8721 return false;
8724 /* For single-argument PHIs assume coalescing which means zero cost
8725 for the scalar and the vector PHIs. This avoids artificially
8726 favoring the vector path (but may pessimize it in some cases). */
8727 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8728 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8729 vector_stmt, stmt_info, vectype, 0, vect_body);
8730 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8731 return true;
8734 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8735 basic_block bb = gimple_bb (stmt_info->stmt);
8736 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8737 auto_vec<gphi *> new_phis;
8738 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8740 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8742 /* Skip not yet vectorized defs. */
8743 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8744 && SLP_TREE_VEC_DEFS (child).is_empty ())
8745 continue;
8747 auto_vec<tree> vec_oprnds;
8748 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8749 if (!new_phis.exists ())
8751 new_phis.create (vec_oprnds.length ());
8752 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8754 /* Create the vectorized LC PHI node. */
8755 new_phis.quick_push (create_phi_node (vec_dest, bb));
8756 slp_node->push_vec_def (new_phis[j]);
8759 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8760 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8761 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8763 /* We should have at least one already vectorized child. */
8764 gcc_assert (new_phis.exists ());
8766 return true;
8769 /* Vectorizes first order recurrences. An overview of the transformation
8770 is described below. Suppose we have the following loop.
8772 int t = 0;
8773 for (int i = 0; i < n; ++i)
8775 b[i] = a[i] - t;
8776 t = a[i];
8779 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8780 looks (simplified) like:
8782 scalar.preheader:
8783 init = 0;
8785 scalar.body:
8786 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8787 _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8788 _1 = a[i]
8789 b[i] = _1 - _2
8790 if (i < n) goto scalar.body
8792 In this example, _2 is a recurrence because it's value depends on the
8793 previous iteration. We vectorize this as (VF = 4)
8795 vector.preheader:
8796 vect_init = vect_cst(..., ..., ..., 0)
8798 vector.body
8799 i = PHI <0(vector.preheader), i+4(vector.body)>
8800 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8801 vect_2 = a[i, i+1, i+2, i+3];
8802 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8803 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8804 if (..) goto vector.body
8806 In this function, vectorizable_recurr, we code generate both the
8807 vector PHI node and the permute since those together compute the
8808 vectorized value of the scalar PHI. We do not yet have the
8809 backedge value to fill in there nor into the vec_perm. Those
8810 are filled in maybe_set_vectorized_backedge_value and
8811 vect_schedule_scc.
8813 TODO: Since the scalar loop does not have a use of the recurrence
8814 outside of the loop the natural way to implement peeling via
8815 vectorizing the live value doesn't work. For now peeling of loops
8816 with a recurrence is not implemented. For SLP the supported cases
8817 are restricted to those requiring a single vector recurrence PHI. */
8819 bool
8820 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8821 gimple **vec_stmt, slp_tree slp_node,
8822 stmt_vector_for_cost *cost_vec)
8824 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8825 return false;
8827 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8829 /* So far we only support first-order recurrence auto-vectorization. */
8830 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8831 return false;
8833 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8834 unsigned ncopies;
8835 if (slp_node)
8836 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8837 else
8838 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8839 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8840 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8841 /* We need to be able to make progress with a single vector. */
8842 if (maybe_gt (dist * 2, nunits))
8844 if (dump_enabled_p ())
8845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8846 "first order recurrence exceeds half of "
8847 "a vector\n");
8848 return false;
8851 /* First-order recurrence autovectorization needs to handle permutation
8852 with indices = [nunits-1, nunits, nunits+1, ...]. */
8853 vec_perm_builder sel (nunits, 1, 3);
8854 for (int i = 0; i < 3; ++i)
8855 sel.quick_push (nunits - dist + i);
8856 vec_perm_indices indices (sel, 2, nunits);
8858 if (!vec_stmt) /* transformation not required. */
8860 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8861 indices))
8862 return false;
8864 if (slp_node)
8866 /* We eventually need to set a vector type on invariant
8867 arguments. */
8868 unsigned j;
8869 slp_tree child;
8870 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8871 if (!vect_maybe_update_slp_op_vectype
8872 (child, SLP_TREE_VECTYPE (slp_node)))
8874 if (dump_enabled_p ())
8875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8876 "incompatible vector types for "
8877 "invariants\n");
8878 return false;
8881 /* The recurrence costs the initialization vector and one permute
8882 for each copy. */
8883 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8884 stmt_info, 0, vect_prologue);
8885 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8886 stmt_info, 0, vect_body);
8887 if (dump_enabled_p ())
8888 dump_printf_loc (MSG_NOTE, vect_location,
8889 "vectorizable_recurr: inside_cost = %d, "
8890 "prologue_cost = %d .\n", inside_cost,
8891 prologue_cost);
8893 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8894 return true;
8897 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8898 basic_block bb = gimple_bb (phi);
8899 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8900 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8902 gimple_seq stmts = NULL;
8903 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8904 gsi_insert_seq_on_edge_immediate (pe, stmts);
8906 tree vec_init = build_vector_from_val (vectype, preheader);
8907 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8909 /* Create the vectorized first-order PHI node. */
8910 tree vec_dest = vect_get_new_vect_var (vectype,
8911 vect_simple_var, "vec_recur_");
8912 gphi *new_phi = create_phi_node (vec_dest, bb);
8913 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8915 /* Insert shuffles the first-order recurrence autovectorization.
8916 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8917 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8919 /* Insert the required permute after the latch definition. The
8920 second and later operands are tentative and will be updated when we have
8921 vectorized the latch definition. */
8922 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8923 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8924 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8925 gsi_next (&gsi2);
8927 for (unsigned i = 0; i < ncopies; ++i)
8929 vec_dest = make_ssa_name (vectype);
8930 gassign *vperm
8931 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8932 i == 0 ? gimple_phi_result (new_phi) : NULL,
8933 NULL, perm);
8934 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8936 if (slp_node)
8937 slp_node->push_vec_def (vperm);
8938 else
8939 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8942 if (!slp_node)
8943 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8944 return true;
8947 /* Return true if VECTYPE represents a vector that requires lowering
8948 by the vector lowering pass. */
8950 bool
8951 vect_emulated_vector_p (tree vectype)
8953 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8954 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8955 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8958 /* Return true if we can emulate CODE on an integer mode representation
8959 of a vector. */
8961 bool
8962 vect_can_vectorize_without_simd_p (tree_code code)
8964 switch (code)
8966 case PLUS_EXPR:
8967 case MINUS_EXPR:
8968 case NEGATE_EXPR:
8969 case BIT_AND_EXPR:
8970 case BIT_IOR_EXPR:
8971 case BIT_XOR_EXPR:
8972 case BIT_NOT_EXPR:
8973 return true;
8975 default:
8976 return false;
8980 /* Likewise, but taking a code_helper. */
8982 bool
8983 vect_can_vectorize_without_simd_p (code_helper code)
8985 return (code.is_tree_code ()
8986 && vect_can_vectorize_without_simd_p (tree_code (code)));
8989 /* Create vector init for vectorized iv. */
8990 static tree
8991 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8992 tree step_expr, poly_uint64 nunits,
8993 tree vectype,
8994 enum vect_induction_op_type induction_type)
8996 unsigned HOST_WIDE_INT const_nunits;
8997 tree vec_shift, vec_init, new_name;
8998 unsigned i;
8999 tree itype = TREE_TYPE (vectype);
9001 /* iv_loop is the loop to be vectorized. Create:
9002 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9003 new_name = gimple_convert (stmts, itype, init_expr);
9004 switch (induction_type)
9006 case vect_step_op_shr:
9007 case vect_step_op_shl:
9008 /* Build the Initial value from shift_expr. */
9009 vec_init = gimple_build_vector_from_val (stmts,
9010 vectype,
9011 new_name);
9012 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9013 build_zero_cst (itype), step_expr);
9014 vec_init = gimple_build (stmts,
9015 (induction_type == vect_step_op_shr
9016 ? RSHIFT_EXPR : LSHIFT_EXPR),
9017 vectype, vec_init, vec_shift);
9018 break;
9020 case vect_step_op_neg:
9022 vec_init = gimple_build_vector_from_val (stmts,
9023 vectype,
9024 new_name);
9025 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9026 vectype, vec_init);
9027 /* The encoding has 2 interleaved stepped patterns. */
9028 vec_perm_builder sel (nunits, 2, 3);
9029 sel.quick_grow (6);
9030 for (i = 0; i < 3; i++)
9032 sel[2 * i] = i;
9033 sel[2 * i + 1] = i + nunits;
9035 vec_perm_indices indices (sel, 2, nunits);
9036 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9037 fail when vec_init is const vector. In that situation vec_perm is not
9038 really needed. */
9039 tree perm_mask_even
9040 = vect_gen_perm_mask_any (vectype, indices);
9041 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9042 vectype,
9043 vec_init, vec_neg,
9044 perm_mask_even);
9046 break;
9048 case vect_step_op_mul:
9050 /* Use unsigned mult to avoid UD integer overflow. */
9051 gcc_assert (nunits.is_constant (&const_nunits));
9052 tree utype = unsigned_type_for (itype);
9053 tree uvectype = build_vector_type (utype,
9054 TYPE_VECTOR_SUBPARTS (vectype));
9055 new_name = gimple_convert (stmts, utype, new_name);
9056 vec_init = gimple_build_vector_from_val (stmts,
9057 uvectype,
9058 new_name);
9059 tree_vector_builder elts (uvectype, const_nunits, 1);
9060 tree elt_step = build_one_cst (utype);
9062 elts.quick_push (elt_step);
9063 for (i = 1; i < const_nunits; i++)
9065 /* Create: new_name_i = new_name + step_expr. */
9066 elt_step = gimple_build (stmts, MULT_EXPR,
9067 utype, elt_step, step_expr);
9068 elts.quick_push (elt_step);
9070 /* Create a vector from [new_name_0, new_name_1, ...,
9071 new_name_nunits-1]. */
9072 tree vec_mul = gimple_build_vector (stmts, &elts);
9073 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9074 vec_init, vec_mul);
9075 vec_init = gimple_convert (stmts, vectype, vec_init);
9077 break;
9079 default:
9080 gcc_unreachable ();
9083 return vec_init;
9086 /* Peel init_expr by skip_niter for induction_type. */
9087 tree
9088 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9089 tree skip_niters, tree step_expr,
9090 enum vect_induction_op_type induction_type)
9092 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9093 tree type = TREE_TYPE (init_expr);
9094 unsigned prec = TYPE_PRECISION (type);
9095 switch (induction_type)
9097 case vect_step_op_neg:
9098 if (TREE_INT_CST_LOW (skip_niters) % 2)
9099 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9100 /* else no change. */
9101 break;
9103 case vect_step_op_shr:
9104 case vect_step_op_shl:
9105 skip_niters = gimple_convert (stmts, type, skip_niters);
9106 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9107 /* When shift mount >= precision, need to avoid UD.
9108 In the original loop, there's no UD, and according to semantic,
9109 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9110 if (!tree_fits_uhwi_p (step_expr)
9111 || tree_to_uhwi (step_expr) >= prec)
9113 if (induction_type == vect_step_op_shl
9114 || TYPE_UNSIGNED (type))
9115 init_expr = build_zero_cst (type);
9116 else
9117 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9118 init_expr,
9119 wide_int_to_tree (type, prec - 1));
9121 else
9122 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9123 ? RSHIFT_EXPR : LSHIFT_EXPR),
9124 type, init_expr, step_expr);
9125 break;
9127 case vect_step_op_mul:
9129 tree utype = unsigned_type_for (type);
9130 init_expr = gimple_convert (stmts, utype, init_expr);
9131 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9132 wide_int begin = wi::to_wide (step_expr);
9133 for (unsigned i = 0; i != skipn - 1; i++)
9134 begin = wi::mul (begin, wi::to_wide (step_expr));
9135 tree mult_expr = wide_int_to_tree (utype, begin);
9136 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9137 init_expr = gimple_convert (stmts, type, init_expr);
9139 break;
9141 default:
9142 gcc_unreachable ();
9145 return init_expr;
9148 /* Create vector step for vectorized iv. */
9149 static tree
9150 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9151 poly_uint64 vf,
9152 enum vect_induction_op_type induction_type)
9154 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9155 tree new_name = NULL;
9156 /* Step should be pow (step, vf) for mult induction. */
9157 if (induction_type == vect_step_op_mul)
9159 gcc_assert (vf.is_constant ());
9160 wide_int begin = wi::to_wide (step_expr);
9162 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9163 begin = wi::mul (begin, wi::to_wide (step_expr));
9165 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9167 else if (induction_type == vect_step_op_neg)
9168 /* Do nothing. */
9170 else
9171 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9172 expr, step_expr);
9173 return new_name;
9176 static tree
9177 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9178 stmt_vec_info stmt_info,
9179 tree new_name, tree vectype,
9180 enum vect_induction_op_type induction_type)
9182 /* No step is needed for neg induction. */
9183 if (induction_type == vect_step_op_neg)
9184 return NULL;
9186 tree t = unshare_expr (new_name);
9187 gcc_assert (CONSTANT_CLASS_P (new_name)
9188 || TREE_CODE (new_name) == SSA_NAME);
9189 tree new_vec = build_vector_from_val (vectype, t);
9190 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9191 new_vec, vectype, NULL);
9192 return vec_step;
9195 /* Update vectorized iv with vect_step, induc_def is init. */
9196 static tree
9197 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9198 tree induc_def, tree vec_step,
9199 enum vect_induction_op_type induction_type)
9201 tree vec_def = induc_def;
9202 switch (induction_type)
9204 case vect_step_op_mul:
9206 /* Use unsigned mult to avoid UD integer overflow. */
9207 tree uvectype
9208 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9209 TYPE_VECTOR_SUBPARTS (vectype));
9210 vec_def = gimple_convert (stmts, uvectype, vec_def);
9211 vec_step = gimple_convert (stmts, uvectype, vec_step);
9212 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9213 vec_def, vec_step);
9214 vec_def = gimple_convert (stmts, vectype, vec_def);
9216 break;
9218 case vect_step_op_shr:
9219 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9220 vec_def, vec_step);
9221 break;
9223 case vect_step_op_shl:
9224 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9225 vec_def, vec_step);
9226 break;
9227 case vect_step_op_neg:
9228 vec_def = induc_def;
9229 /* Do nothing. */
9230 break;
9231 default:
9232 gcc_unreachable ();
9235 return vec_def;
9239 /* Function vectorizable_induction
9241 Check if STMT_INFO performs an nonlinear induction computation that can be
9242 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9243 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9244 basic block.
9245 Return true if STMT_INFO is vectorizable in this way. */
9247 static bool
9248 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9249 stmt_vec_info stmt_info,
9250 gimple **vec_stmt, slp_tree slp_node,
9251 stmt_vector_for_cost *cost_vec)
9253 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9254 unsigned ncopies;
9255 bool nested_in_vect_loop = false;
9256 class loop *iv_loop;
9257 tree vec_def;
9258 edge pe = loop_preheader_edge (loop);
9259 basic_block new_bb;
9260 tree vec_init, vec_step;
9261 tree new_name;
9262 gimple *new_stmt;
9263 gphi *induction_phi;
9264 tree induc_def, vec_dest;
9265 tree init_expr, step_expr;
9266 tree niters_skip;
9267 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9268 unsigned i;
9269 gimple_stmt_iterator si;
9271 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9273 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9274 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9275 enum vect_induction_op_type induction_type
9276 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9278 gcc_assert (induction_type > vect_step_op_add);
9280 if (slp_node)
9281 ncopies = 1;
9282 else
9283 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9284 gcc_assert (ncopies >= 1);
9286 /* FORNOW. Only handle nonlinear induction in the same loop. */
9287 if (nested_in_vect_loop_p (loop, stmt_info))
9289 if (dump_enabled_p ())
9290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9291 "nonlinear induction in nested loop.\n");
9292 return false;
9295 iv_loop = loop;
9296 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9298 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9299 update for each iv and a permutation to generate wanted vector iv. */
9300 if (slp_node)
9302 if (dump_enabled_p ())
9303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9304 "SLP induction not supported for nonlinear"
9305 " induction.\n");
9306 return false;
9309 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9311 if (dump_enabled_p ())
9312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9313 "floating point nonlinear induction vectorization"
9314 " not supported.\n");
9315 return false;
9318 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9319 init_expr = vect_phi_initial_value (phi);
9320 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9321 && TREE_CODE (step_expr) == INTEGER_CST);
9322 /* step_expr should be aligned with init_expr,
9323 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9324 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9326 if (TREE_CODE (init_expr) == INTEGER_CST)
9327 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9328 else
9329 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9330 TREE_TYPE (init_expr)));
9332 switch (induction_type)
9334 case vect_step_op_neg:
9335 if (TREE_CODE (init_expr) != INTEGER_CST
9336 && TREE_CODE (init_expr) != REAL_CST)
9338 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9339 if (!directly_supported_p (NEGATE_EXPR, vectype))
9340 return false;
9342 /* The encoding has 2 interleaved stepped patterns. */
9343 vec_perm_builder sel (nunits, 2, 3);
9344 machine_mode mode = TYPE_MODE (vectype);
9345 sel.quick_grow (6);
9346 for (i = 0; i < 3; i++)
9348 sel[i * 2] = i;
9349 sel[i * 2 + 1] = i + nunits;
9351 vec_perm_indices indices (sel, 2, nunits);
9352 if (!can_vec_perm_const_p (mode, mode, indices))
9353 return false;
9355 break;
9357 case vect_step_op_mul:
9359 /* Check for backend support of MULT_EXPR. */
9360 if (!directly_supported_p (MULT_EXPR, vectype))
9361 return false;
9363 /* ?? How to construct vector step for variable number vector.
9364 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9365 if (!vf.is_constant ())
9366 return false;
9368 break;
9370 case vect_step_op_shr:
9371 /* Check for backend support of RSHIFT_EXPR. */
9372 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9373 return false;
9375 /* Don't shift more than type precision to avoid UD. */
9376 if (!tree_fits_uhwi_p (step_expr)
9377 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9378 TYPE_PRECISION (TREE_TYPE (init_expr))))
9379 return false;
9380 break;
9382 case vect_step_op_shl:
9383 /* Check for backend support of RSHIFT_EXPR. */
9384 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9385 return false;
9387 /* Don't shift more than type precision to avoid UD. */
9388 if (!tree_fits_uhwi_p (step_expr)
9389 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9390 TYPE_PRECISION (TREE_TYPE (init_expr))))
9391 return false;
9393 break;
9395 default:
9396 gcc_unreachable ();
9399 if (!vec_stmt) /* transformation not required. */
9401 unsigned inside_cost = 0, prologue_cost = 0;
9402 /* loop cost for vec_loop. Neg induction doesn't have any
9403 inside_cost. */
9404 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9405 stmt_info, 0, vect_body);
9407 /* loop cost for vec_loop. Neg induction doesn't have any
9408 inside_cost. */
9409 if (induction_type == vect_step_op_neg)
9410 inside_cost = 0;
9412 /* prologue cost for vec_init and vec_step. */
9413 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9414 stmt_info, 0, vect_prologue);
9416 if (dump_enabled_p ())
9417 dump_printf_loc (MSG_NOTE, vect_location,
9418 "vect_model_induction_cost: inside_cost = %d, "
9419 "prologue_cost = %d. \n", inside_cost,
9420 prologue_cost);
9422 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9423 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9424 return true;
9427 /* Transform. */
9429 /* Compute a vector variable, initialized with the first VF values of
9430 the induction variable. E.g., for an iv with IV_PHI='X' and
9431 evolution S, for a vector of 4 units, we want to compute:
9432 [X, X + S, X + 2*S, X + 3*S]. */
9434 if (dump_enabled_p ())
9435 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9437 pe = loop_preheader_edge (iv_loop);
9438 /* Find the first insertion point in the BB. */
9439 basic_block bb = gimple_bb (phi);
9440 si = gsi_after_labels (bb);
9442 gimple_seq stmts = NULL;
9444 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9445 /* If we are using the loop mask to "peel" for alignment then we need
9446 to adjust the start value here. */
9447 if (niters_skip != NULL_TREE)
9448 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9449 step_expr, induction_type);
9451 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9452 step_expr, nunits, vectype,
9453 induction_type);
9454 if (stmts)
9456 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9457 gcc_assert (!new_bb);
9460 stmts = NULL;
9461 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9462 vf, induction_type);
9463 if (stmts)
9465 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9466 gcc_assert (!new_bb);
9469 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9470 new_name, vectype,
9471 induction_type);
9472 /* Create the following def-use cycle:
9473 loop prolog:
9474 vec_init = ...
9475 vec_step = ...
9476 loop:
9477 vec_iv = PHI <vec_init, vec_loop>
9479 STMT
9481 vec_loop = vec_iv + vec_step; */
9483 /* Create the induction-phi that defines the induction-operand. */
9484 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9485 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9486 induc_def = PHI_RESULT (induction_phi);
9488 /* Create the iv update inside the loop. */
9489 stmts = NULL;
9490 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9491 induc_def, vec_step,
9492 induction_type);
9494 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9495 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9497 /* Set the arguments of the phi node: */
9498 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9499 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9500 UNKNOWN_LOCATION);
9502 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9503 *vec_stmt = induction_phi;
9505 /* In case that vectorization factor (VF) is bigger than the number
9506 of elements that we can fit in a vectype (nunits), we have to generate
9507 more than one vector stmt - i.e - we need to "unroll" the
9508 vector stmt by a factor VF/nunits. For more details see documentation
9509 in vectorizable_operation. */
9511 if (ncopies > 1)
9513 stmts = NULL;
9514 /* FORNOW. This restriction should be relaxed. */
9515 gcc_assert (!nested_in_vect_loop);
9517 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9518 nunits, induction_type);
9520 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9521 new_name, vectype,
9522 induction_type);
9523 vec_def = induc_def;
9524 for (i = 1; i < ncopies; i++)
9526 /* vec_i = vec_prev + vec_step. */
9527 stmts = NULL;
9528 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9529 vec_def, vec_step,
9530 induction_type);
9531 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9532 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9533 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9537 if (dump_enabled_p ())
9538 dump_printf_loc (MSG_NOTE, vect_location,
9539 "transform induction: created def-use cycle: %G%G",
9540 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9542 return true;
9545 /* Function vectorizable_induction
9547 Check if STMT_INFO performs an induction computation that can be vectorized.
9548 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9549 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9550 Return true if STMT_INFO is vectorizable in this way. */
9552 bool
9553 vectorizable_induction (loop_vec_info loop_vinfo,
9554 stmt_vec_info stmt_info,
9555 gimple **vec_stmt, slp_tree slp_node,
9556 stmt_vector_for_cost *cost_vec)
9558 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9559 unsigned ncopies;
9560 bool nested_in_vect_loop = false;
9561 class loop *iv_loop;
9562 tree vec_def;
9563 edge pe = loop_preheader_edge (loop);
9564 basic_block new_bb;
9565 tree new_vec, vec_init, vec_step, t;
9566 tree new_name;
9567 gimple *new_stmt;
9568 gphi *induction_phi;
9569 tree induc_def, vec_dest;
9570 tree init_expr, step_expr;
9571 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9572 unsigned i;
9573 tree expr;
9574 gimple_stmt_iterator si;
9575 enum vect_induction_op_type induction_type
9576 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9578 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9579 if (!phi)
9580 return false;
9582 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9583 return false;
9585 /* Make sure it was recognized as induction computation. */
9586 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9587 return false;
9589 /* Handle nonlinear induction in a separate place. */
9590 if (induction_type != vect_step_op_add)
9591 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9592 vec_stmt, slp_node, cost_vec);
9594 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9595 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9597 if (slp_node)
9598 ncopies = 1;
9599 else
9600 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9601 gcc_assert (ncopies >= 1);
9603 /* FORNOW. These restrictions should be relaxed. */
9604 if (nested_in_vect_loop_p (loop, stmt_info))
9606 imm_use_iterator imm_iter;
9607 use_operand_p use_p;
9608 gimple *exit_phi;
9609 edge latch_e;
9610 tree loop_arg;
9612 if (ncopies > 1)
9614 if (dump_enabled_p ())
9615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9616 "multiple types in nested loop.\n");
9617 return false;
9620 exit_phi = NULL;
9621 latch_e = loop_latch_edge (loop->inner);
9622 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9623 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9625 gimple *use_stmt = USE_STMT (use_p);
9626 if (is_gimple_debug (use_stmt))
9627 continue;
9629 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9631 exit_phi = use_stmt;
9632 break;
9635 if (exit_phi)
9637 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9638 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9639 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9641 if (dump_enabled_p ())
9642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9643 "inner-loop induction only used outside "
9644 "of the outer vectorized loop.\n");
9645 return false;
9649 nested_in_vect_loop = true;
9650 iv_loop = loop->inner;
9652 else
9653 iv_loop = loop;
9654 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9656 if (slp_node && !nunits.is_constant ())
9658 /* The current SLP code creates the step value element-by-element. */
9659 if (dump_enabled_p ())
9660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9661 "SLP induction not supported for variable-length"
9662 " vectors.\n");
9663 return false;
9666 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9668 if (dump_enabled_p ())
9669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670 "floating point induction vectorization disabled\n");
9671 return false;
9674 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9675 gcc_assert (step_expr != NULL_TREE);
9676 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9678 /* Check for backend support of PLUS/MINUS_EXPR. */
9679 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9680 || !directly_supported_p (MINUS_EXPR, step_vectype))
9681 return false;
9683 if (!vec_stmt) /* transformation not required. */
9685 unsigned inside_cost = 0, prologue_cost = 0;
9686 if (slp_node)
9688 /* We eventually need to set a vector type on invariant
9689 arguments. */
9690 unsigned j;
9691 slp_tree child;
9692 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9693 if (!vect_maybe_update_slp_op_vectype
9694 (child, SLP_TREE_VECTYPE (slp_node)))
9696 if (dump_enabled_p ())
9697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9698 "incompatible vector types for "
9699 "invariants\n");
9700 return false;
9702 /* loop cost for vec_loop. */
9703 inside_cost
9704 = record_stmt_cost (cost_vec,
9705 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9706 vector_stmt, stmt_info, 0, vect_body);
9707 /* prologue cost for vec_init (if not nested) and step. */
9708 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9709 scalar_to_vec,
9710 stmt_info, 0, vect_prologue);
9712 else /* if (!slp_node) */
9714 /* loop cost for vec_loop. */
9715 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9716 stmt_info, 0, vect_body);
9717 /* prologue cost for vec_init and vec_step. */
9718 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9719 stmt_info, 0, vect_prologue);
9721 if (dump_enabled_p ())
9722 dump_printf_loc (MSG_NOTE, vect_location,
9723 "vect_model_induction_cost: inside_cost = %d, "
9724 "prologue_cost = %d .\n", inside_cost,
9725 prologue_cost);
9727 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9728 DUMP_VECT_SCOPE ("vectorizable_induction");
9729 return true;
9732 /* Transform. */
9734 /* Compute a vector variable, initialized with the first VF values of
9735 the induction variable. E.g., for an iv with IV_PHI='X' and
9736 evolution S, for a vector of 4 units, we want to compute:
9737 [X, X + S, X + 2*S, X + 3*S]. */
9739 if (dump_enabled_p ())
9740 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9742 pe = loop_preheader_edge (iv_loop);
9743 /* Find the first insertion point in the BB. */
9744 basic_block bb = gimple_bb (phi);
9745 si = gsi_after_labels (bb);
9747 /* For SLP induction we have to generate several IVs as for example
9748 with group size 3 we need
9749 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9750 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9751 if (slp_node)
9753 /* Enforced above. */
9754 unsigned int const_nunits = nunits.to_constant ();
9756 /* The initial values are vectorized, but any lanes > group_size
9757 need adjustment. */
9758 slp_tree init_node
9759 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9761 /* Gather steps. Since we do not vectorize inductions as
9762 cycles we have to reconstruct the step from SCEV data. */
9763 unsigned group_size = SLP_TREE_LANES (slp_node);
9764 tree *steps = XALLOCAVEC (tree, group_size);
9765 tree *inits = XALLOCAVEC (tree, group_size);
9766 stmt_vec_info phi_info;
9767 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9769 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9770 if (!init_node)
9771 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9772 pe->dest_idx);
9775 /* Now generate the IVs. */
9776 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9777 gcc_assert ((const_nunits * nvects) % group_size == 0);
9778 unsigned nivs;
9779 if (nested_in_vect_loop)
9780 nivs = nvects;
9781 else
9783 /* Compute the number of distinct IVs we need. First reduce
9784 group_size if it is a multiple of const_nunits so we get
9785 one IV for a group_size of 4 but const_nunits 2. */
9786 unsigned group_sizep = group_size;
9787 if (group_sizep % const_nunits == 0)
9788 group_sizep = group_sizep / const_nunits;
9789 nivs = least_common_multiple (group_sizep,
9790 const_nunits) / const_nunits;
9792 tree stept = TREE_TYPE (step_vectype);
9793 tree lupdate_mul = NULL_TREE;
9794 if (!nested_in_vect_loop)
9796 /* The number of iterations covered in one vector iteration. */
9797 unsigned lup_mul = (nvects * const_nunits) / group_size;
9798 lupdate_mul
9799 = build_vector_from_val (step_vectype,
9800 SCALAR_FLOAT_TYPE_P (stept)
9801 ? build_real_from_wide (stept, lup_mul,
9802 UNSIGNED)
9803 : build_int_cstu (stept, lup_mul));
9805 tree peel_mul = NULL_TREE;
9806 gimple_seq init_stmts = NULL;
9807 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9809 if (SCALAR_FLOAT_TYPE_P (stept))
9810 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9811 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9812 else
9813 peel_mul = gimple_convert (&init_stmts, stept,
9814 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9815 peel_mul = gimple_build_vector_from_val (&init_stmts,
9816 step_vectype, peel_mul);
9818 unsigned ivn;
9819 auto_vec<tree> vec_steps;
9820 for (ivn = 0; ivn < nivs; ++ivn)
9822 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9823 tree_vector_builder init_elts (vectype, const_nunits, 1);
9824 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9825 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9827 /* The scalar steps of the IVs. */
9828 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9829 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9830 step_elts.quick_push (elt);
9831 if (!init_node)
9833 /* The scalar inits of the IVs if not vectorized. */
9834 elt = inits[(ivn*const_nunits + eltn) % group_size];
9835 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9836 TREE_TYPE (elt)))
9837 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9838 TREE_TYPE (vectype), elt);
9839 init_elts.quick_push (elt);
9841 /* The number of steps to add to the initial values. */
9842 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9843 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9844 ? build_real_from_wide (stept,
9845 mul_elt, UNSIGNED)
9846 : build_int_cstu (stept, mul_elt));
9848 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9849 vec_steps.safe_push (vec_step);
9850 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9851 if (peel_mul)
9852 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9853 step_mul, peel_mul);
9854 if (!init_node)
9855 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9857 /* Create the induction-phi that defines the induction-operand. */
9858 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9859 "vec_iv_");
9860 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9861 induc_def = PHI_RESULT (induction_phi);
9863 /* Create the iv update inside the loop */
9864 tree up = vec_step;
9865 if (lupdate_mul)
9866 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9867 vec_step, lupdate_mul);
9868 gimple_seq stmts = NULL;
9869 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9870 vec_def = gimple_build (&stmts,
9871 PLUS_EXPR, step_vectype, vec_def, up);
9872 vec_def = gimple_convert (&stmts, vectype, vec_def);
9873 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9874 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9875 UNKNOWN_LOCATION);
9877 if (init_node)
9878 vec_init = vect_get_slp_vect_def (init_node, ivn);
9879 if (!nested_in_vect_loop
9880 && !integer_zerop (step_mul))
9882 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9883 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9884 vec_step, step_mul);
9885 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9886 vec_def, up);
9887 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9890 /* Set the arguments of the phi node: */
9891 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9893 slp_node->push_vec_def (induction_phi);
9895 if (!nested_in_vect_loop)
9897 /* Fill up to the number of vectors we need for the whole group. */
9898 nivs = least_common_multiple (group_size,
9899 const_nunits) / const_nunits;
9900 vec_steps.reserve (nivs-ivn);
9901 for (; ivn < nivs; ++ivn)
9903 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9904 vec_steps.quick_push (vec_steps[0]);
9908 /* Re-use IVs when we can. We are generating further vector
9909 stmts by adding VF' * stride to the IVs generated above. */
9910 if (ivn < nvects)
9912 unsigned vfp
9913 = least_common_multiple (group_size, const_nunits) / group_size;
9914 tree lupdate_mul
9915 = build_vector_from_val (step_vectype,
9916 SCALAR_FLOAT_TYPE_P (stept)
9917 ? build_real_from_wide (stept,
9918 vfp, UNSIGNED)
9919 : build_int_cstu (stept, vfp));
9920 for (; ivn < nvects; ++ivn)
9922 gimple *iv
9923 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9924 tree def = gimple_get_lhs (iv);
9925 if (ivn < 2*nivs)
9926 vec_steps[ivn - nivs]
9927 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9928 vec_steps[ivn - nivs], lupdate_mul);
9929 gimple_seq stmts = NULL;
9930 def = gimple_convert (&stmts, step_vectype, def);
9931 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9932 def, vec_steps[ivn % nivs]);
9933 def = gimple_convert (&stmts, vectype, def);
9934 if (gimple_code (iv) == GIMPLE_PHI)
9935 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9936 else
9938 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9939 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9941 slp_node->push_vec_def (def);
9945 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9946 gcc_assert (!new_bb);
9948 return true;
9951 init_expr = vect_phi_initial_value (phi);
9953 gimple_seq stmts = NULL;
9954 if (!nested_in_vect_loop)
9956 /* Convert the initial value to the IV update type. */
9957 tree new_type = TREE_TYPE (step_expr);
9958 init_expr = gimple_convert (&stmts, new_type, init_expr);
9960 /* If we are using the loop mask to "peel" for alignment then we need
9961 to adjust the start value here. */
9962 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9963 if (skip_niters != NULL_TREE)
9965 if (FLOAT_TYPE_P (vectype))
9966 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9967 skip_niters);
9968 else
9969 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9970 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9971 skip_niters, step_expr);
9972 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9973 init_expr, skip_step);
9977 if (stmts)
9979 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9980 gcc_assert (!new_bb);
9983 /* Create the vector that holds the initial_value of the induction. */
9984 if (nested_in_vect_loop)
9986 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9987 been created during vectorization of previous stmts. We obtain it
9988 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9989 auto_vec<tree> vec_inits;
9990 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9991 init_expr, &vec_inits);
9992 vec_init = vec_inits[0];
9993 /* If the initial value is not of proper type, convert it. */
9994 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9996 new_stmt
9997 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9998 vect_simple_var,
9999 "vec_iv_"),
10000 VIEW_CONVERT_EXPR,
10001 build1 (VIEW_CONVERT_EXPR, vectype,
10002 vec_init));
10003 vec_init = gimple_assign_lhs (new_stmt);
10004 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10005 new_stmt);
10006 gcc_assert (!new_bb);
10009 else
10011 /* iv_loop is the loop to be vectorized. Create:
10012 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10013 stmts = NULL;
10014 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10016 unsigned HOST_WIDE_INT const_nunits;
10017 if (nunits.is_constant (&const_nunits))
10019 tree_vector_builder elts (step_vectype, const_nunits, 1);
10020 elts.quick_push (new_name);
10021 for (i = 1; i < const_nunits; i++)
10023 /* Create: new_name_i = new_name + step_expr */
10024 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10025 new_name, step_expr);
10026 elts.quick_push (new_name);
10028 /* Create a vector from [new_name_0, new_name_1, ...,
10029 new_name_nunits-1] */
10030 vec_init = gimple_build_vector (&stmts, &elts);
10032 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10033 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10034 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10035 new_name, step_expr);
10036 else
10038 /* Build:
10039 [base, base, base, ...]
10040 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10041 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10042 gcc_assert (flag_associative_math);
10043 tree index = build_index_vector (step_vectype, 0, 1);
10044 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10045 new_name);
10046 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10047 step_expr);
10048 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10049 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10050 vec_init, step_vec);
10051 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10052 vec_init, base_vec);
10054 vec_init = gimple_convert (&stmts, vectype, vec_init);
10056 if (stmts)
10058 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10059 gcc_assert (!new_bb);
10064 /* Create the vector that holds the step of the induction. */
10065 if (nested_in_vect_loop)
10066 /* iv_loop is nested in the loop to be vectorized. Generate:
10067 vec_step = [S, S, S, S] */
10068 new_name = step_expr;
10069 else
10071 /* iv_loop is the loop to be vectorized. Generate:
10072 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10073 gimple_seq seq = NULL;
10074 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10076 expr = build_int_cst (integer_type_node, vf);
10077 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10079 else
10080 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10081 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10082 expr, step_expr);
10083 if (seq)
10085 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10086 gcc_assert (!new_bb);
10090 t = unshare_expr (new_name);
10091 gcc_assert (CONSTANT_CLASS_P (new_name)
10092 || TREE_CODE (new_name) == SSA_NAME);
10093 new_vec = build_vector_from_val (step_vectype, t);
10094 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10095 new_vec, step_vectype, NULL);
10098 /* Create the following def-use cycle:
10099 loop prolog:
10100 vec_init = ...
10101 vec_step = ...
10102 loop:
10103 vec_iv = PHI <vec_init, vec_loop>
10105 STMT
10107 vec_loop = vec_iv + vec_step; */
10109 /* Create the induction-phi that defines the induction-operand. */
10110 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10111 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10112 induc_def = PHI_RESULT (induction_phi);
10114 /* Create the iv update inside the loop */
10115 stmts = NULL;
10116 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10117 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10118 vec_def = gimple_convert (&stmts, vectype, vec_def);
10119 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10120 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10122 /* Set the arguments of the phi node: */
10123 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10124 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10125 UNKNOWN_LOCATION);
10127 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10128 *vec_stmt = induction_phi;
10130 /* In case that vectorization factor (VF) is bigger than the number
10131 of elements that we can fit in a vectype (nunits), we have to generate
10132 more than one vector stmt - i.e - we need to "unroll" the
10133 vector stmt by a factor VF/nunits. For more details see documentation
10134 in vectorizable_operation. */
10136 if (ncopies > 1)
10138 gimple_seq seq = NULL;
10139 /* FORNOW. This restriction should be relaxed. */
10140 gcc_assert (!nested_in_vect_loop);
10142 /* Create the vector that holds the step of the induction. */
10143 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10145 expr = build_int_cst (integer_type_node, nunits);
10146 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10148 else
10149 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10150 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10151 expr, step_expr);
10152 if (seq)
10154 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10155 gcc_assert (!new_bb);
10158 t = unshare_expr (new_name);
10159 gcc_assert (CONSTANT_CLASS_P (new_name)
10160 || TREE_CODE (new_name) == SSA_NAME);
10161 new_vec = build_vector_from_val (step_vectype, t);
10162 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10163 new_vec, step_vectype, NULL);
10165 vec_def = induc_def;
10166 for (i = 1; i < ncopies + 1; i++)
10168 /* vec_i = vec_prev + vec_step */
10169 gimple_seq stmts = NULL;
10170 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10171 vec_def = gimple_build (&stmts,
10172 PLUS_EXPR, step_vectype, vec_def, vec_step);
10173 vec_def = gimple_convert (&stmts, vectype, vec_def);
10175 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10176 if (i < ncopies)
10178 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10179 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10181 else
10183 /* vec_1 = vec_iv + (VF/n * S)
10184 vec_2 = vec_1 + (VF/n * S)
10186 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10188 vec_n is used as vec_loop to save the large step register and
10189 related operations. */
10190 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10191 UNKNOWN_LOCATION);
10196 if (dump_enabled_p ())
10197 dump_printf_loc (MSG_NOTE, vect_location,
10198 "transform induction: created def-use cycle: %G%G",
10199 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10201 return true;
10204 /* Function vectorizable_live_operation.
10206 STMT_INFO computes a value that is used outside the loop. Check if
10207 it can be supported. */
10209 bool
10210 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10211 slp_tree slp_node, slp_instance slp_node_instance,
10212 int slp_index, bool vec_stmt_p,
10213 stmt_vector_for_cost *cost_vec)
10215 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10216 imm_use_iterator imm_iter;
10217 tree lhs, lhs_type, bitsize;
10218 tree vectype = (slp_node
10219 ? SLP_TREE_VECTYPE (slp_node)
10220 : STMT_VINFO_VECTYPE (stmt_info));
10221 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10222 int ncopies;
10223 gimple *use_stmt;
10224 auto_vec<tree> vec_oprnds;
10225 int vec_entry = 0;
10226 poly_uint64 vec_index = 0;
10228 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10230 /* If a stmt of a reduction is live, vectorize it via
10231 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10232 validity so just trigger the transform here. */
10233 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10235 if (!vec_stmt_p)
10236 return true;
10237 if (slp_node)
10239 /* For reduction chains the meta-info is attached to
10240 the group leader. */
10241 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10242 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10243 /* For SLP reductions we vectorize the epilogue for
10244 all involved stmts together. */
10245 else if (slp_index != 0)
10246 return true;
10248 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10249 gcc_assert (reduc_info->is_reduc_info);
10250 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10251 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10252 return true;
10253 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10254 slp_node_instance);
10255 return true;
10258 /* If STMT is not relevant and it is a simple assignment and its inputs are
10259 invariant then it can remain in place, unvectorized. The original last
10260 scalar value that it computes will be used. */
10261 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10263 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10264 if (dump_enabled_p ())
10265 dump_printf_loc (MSG_NOTE, vect_location,
10266 "statement is simple and uses invariant. Leaving in "
10267 "place.\n");
10268 return true;
10271 if (slp_node)
10272 ncopies = 1;
10273 else
10274 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10276 if (slp_node)
10278 gcc_assert (slp_index >= 0);
10280 /* Get the last occurrence of the scalar index from the concatenation of
10281 all the slp vectors. Calculate which slp vector it is and the index
10282 within. */
10283 int num_scalar = SLP_TREE_LANES (slp_node);
10284 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10285 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10287 /* Calculate which vector contains the result, and which lane of
10288 that vector we need. */
10289 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10291 if (dump_enabled_p ())
10292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10293 "Cannot determine which vector holds the"
10294 " final result.\n");
10295 return false;
10299 if (!vec_stmt_p)
10301 /* No transformation required. */
10302 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10304 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10305 OPTIMIZE_FOR_SPEED))
10307 if (dump_enabled_p ())
10308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10309 "can't operate on partial vectors "
10310 "because the target doesn't support extract "
10311 "last reduction.\n");
10312 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10314 else if (slp_node)
10316 if (dump_enabled_p ())
10317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10318 "can't operate on partial vectors "
10319 "because an SLP statement is live after "
10320 "the loop.\n");
10321 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10323 else if (ncopies > 1)
10325 if (dump_enabled_p ())
10326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10327 "can't operate on partial vectors "
10328 "because ncopies is greater than 1.\n");
10329 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10331 else
10333 gcc_assert (ncopies == 1 && !slp_node);
10334 vect_record_loop_mask (loop_vinfo,
10335 &LOOP_VINFO_MASKS (loop_vinfo),
10336 1, vectype, NULL);
10339 /* ??? Enable for loop costing as well. */
10340 if (!loop_vinfo)
10341 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10342 0, vect_epilogue);
10343 return true;
10346 /* Use the lhs of the original scalar statement. */
10347 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10348 if (dump_enabled_p ())
10349 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10350 "stmt %G", stmt);
10352 lhs = gimple_get_lhs (stmt);
10353 lhs_type = TREE_TYPE (lhs);
10355 bitsize = vector_element_bits_tree (vectype);
10357 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10358 tree vec_lhs, bitstart;
10359 gimple *vec_stmt;
10360 if (slp_node)
10362 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10364 /* Get the correct slp vectorized stmt. */
10365 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10366 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10368 /* Get entry to use. */
10369 bitstart = bitsize_int (vec_index);
10370 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10372 else
10374 /* For multiple copies, get the last copy. */
10375 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10376 vec_lhs = gimple_get_lhs (vec_stmt);
10378 /* Get the last lane in the vector. */
10379 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10382 if (loop_vinfo)
10384 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10385 requirement, insert one phi node for it. It looks like:
10386 loop;
10388 # lhs' = PHI <lhs>
10390 loop;
10392 # vec_lhs' = PHI <vec_lhs>
10393 new_tree = lane_extract <vec_lhs', ...>;
10394 lhs' = new_tree; */
10396 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10397 basic_block exit_bb = single_exit (loop)->dest;
10398 gcc_assert (single_pred_p (exit_bb));
10400 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10401 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10402 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10404 gimple_seq stmts = NULL;
10405 tree new_tree;
10406 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10408 /* Emit:
10410 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10412 where VEC_LHS is the vectorized live-out result and MASK is
10413 the loop mask for the final iteration. */
10414 gcc_assert (ncopies == 1 && !slp_node);
10415 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10416 gimple_seq tem = NULL;
10417 gimple_stmt_iterator gsi = gsi_last (tem);
10418 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10419 &LOOP_VINFO_MASKS (loop_vinfo),
10420 1, vectype, 0);
10421 gimple_seq_add_seq (&stmts, tem);
10422 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10423 mask, vec_lhs_phi);
10425 /* Convert the extracted vector element to the scalar type. */
10426 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10428 else
10430 tree bftype = TREE_TYPE (vectype);
10431 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10432 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10433 new_tree = build3 (BIT_FIELD_REF, bftype,
10434 vec_lhs_phi, bitsize, bitstart);
10435 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10436 &stmts, true, NULL_TREE);
10439 if (stmts)
10441 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10442 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10444 /* Remove existing phi from lhs and create one copy from new_tree. */
10445 tree lhs_phi = NULL_TREE;
10446 gimple_stmt_iterator gsi;
10447 for (gsi = gsi_start_phis (exit_bb);
10448 !gsi_end_p (gsi); gsi_next (&gsi))
10450 gimple *phi = gsi_stmt (gsi);
10451 if ((gimple_phi_arg_def (phi, 0) == lhs))
10453 remove_phi_node (&gsi, false);
10454 lhs_phi = gimple_phi_result (phi);
10455 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10456 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10457 break;
10462 /* Replace use of lhs with newly computed result. If the use stmt is a
10463 single arg PHI, just replace all uses of PHI result. It's necessary
10464 because lcssa PHI defining lhs may be before newly inserted stmt. */
10465 use_operand_p use_p;
10466 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10467 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10468 && !is_gimple_debug (use_stmt))
10470 if (gimple_code (use_stmt) == GIMPLE_PHI
10471 && gimple_phi_num_args (use_stmt) == 1)
10473 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10475 else
10477 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10478 SET_USE (use_p, new_tree);
10480 update_stmt (use_stmt);
10483 else
10485 /* For basic-block vectorization simply insert the lane-extraction. */
10486 tree bftype = TREE_TYPE (vectype);
10487 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10488 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10489 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10490 vec_lhs, bitsize, bitstart);
10491 gimple_seq stmts = NULL;
10492 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10493 &stmts, true, NULL_TREE);
10494 if (TREE_CODE (new_tree) == SSA_NAME
10495 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10496 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10497 if (is_a <gphi *> (vec_stmt))
10499 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10500 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10502 else
10504 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10505 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10508 /* Replace use of lhs with newly computed result. If the use stmt is a
10509 single arg PHI, just replace all uses of PHI result. It's necessary
10510 because lcssa PHI defining lhs may be before newly inserted stmt. */
10511 use_operand_p use_p;
10512 stmt_vec_info use_stmt_info;
10513 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10514 if (!is_gimple_debug (use_stmt)
10515 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10516 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10518 /* ??? This can happen when the live lane ends up being
10519 used in a vector construction code-generated by an
10520 external SLP node (and code-generation for that already
10521 happened). See gcc.dg/vect/bb-slp-47.c.
10522 Doing this is what would happen if that vector CTOR
10523 were not code-generated yet so it is not too bad.
10524 ??? In fact we'd likely want to avoid this situation
10525 in the first place. */
10526 if (TREE_CODE (new_tree) == SSA_NAME
10527 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10528 && gimple_code (use_stmt) != GIMPLE_PHI
10529 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10530 use_stmt))
10532 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10533 gcc_checking_assert (code == SSA_NAME
10534 || code == CONSTRUCTOR
10535 || code == VIEW_CONVERT_EXPR
10536 || CONVERT_EXPR_CODE_P (code));
10537 if (dump_enabled_p ())
10538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10539 "Using original scalar computation for "
10540 "live lane because use preceeds vector "
10541 "def\n");
10542 continue;
10544 /* ??? It can also happen that we end up pulling a def into
10545 a loop where replacing out-of-loop uses would require
10546 a new LC SSA PHI node. Retain the original scalar in
10547 those cases as well. PR98064. */
10548 if (TREE_CODE (new_tree) == SSA_NAME
10549 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10550 && (gimple_bb (use_stmt)->loop_father
10551 != gimple_bb (vec_stmt)->loop_father)
10552 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10553 gimple_bb (use_stmt)->loop_father))
10555 if (dump_enabled_p ())
10556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10557 "Using original scalar computation for "
10558 "live lane because there is an out-of-loop "
10559 "definition for it\n");
10560 continue;
10562 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10563 SET_USE (use_p, new_tree);
10564 update_stmt (use_stmt);
10568 return true;
10571 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10573 static void
10574 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10576 ssa_op_iter op_iter;
10577 imm_use_iterator imm_iter;
10578 def_operand_p def_p;
10579 gimple *ustmt;
10581 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10583 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10585 basic_block bb;
10587 if (!is_gimple_debug (ustmt))
10588 continue;
10590 bb = gimple_bb (ustmt);
10592 if (!flow_bb_inside_loop_p (loop, bb))
10594 if (gimple_debug_bind_p (ustmt))
10596 if (dump_enabled_p ())
10597 dump_printf_loc (MSG_NOTE, vect_location,
10598 "killing debug use\n");
10600 gimple_debug_bind_reset_value (ustmt);
10601 update_stmt (ustmt);
10603 else
10604 gcc_unreachable ();
10610 /* Given loop represented by LOOP_VINFO, return true if computation of
10611 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10612 otherwise. */
10614 static bool
10615 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10617 /* Constant case. */
10618 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10620 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10621 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10623 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10624 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10625 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10626 return true;
10629 widest_int max;
10630 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10631 /* Check the upper bound of loop niters. */
10632 if (get_max_loop_iterations (loop, &max))
10634 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10635 signop sgn = TYPE_SIGN (type);
10636 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10637 if (max < type_max)
10638 return true;
10640 return false;
10643 /* Return a mask type with half the number of elements as OLD_TYPE,
10644 given that it should have mode NEW_MODE. */
10646 tree
10647 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10649 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10650 return build_truth_vector_type_for_mode (nunits, new_mode);
10653 /* Return a mask type with twice as many elements as OLD_TYPE,
10654 given that it should have mode NEW_MODE. */
10656 tree
10657 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10659 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10660 return build_truth_vector_type_for_mode (nunits, new_mode);
10663 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10664 contain a sequence of NVECTORS masks that each control a vector of type
10665 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10666 these vector masks with the vector version of SCALAR_MASK. */
10668 void
10669 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10670 unsigned int nvectors, tree vectype, tree scalar_mask)
10672 gcc_assert (nvectors != 0);
10674 if (scalar_mask)
10676 scalar_cond_masked_key cond (scalar_mask, nvectors);
10677 loop_vinfo->scalar_cond_masked_set.add (cond);
10680 masks->mask_set.add (std::make_pair (vectype, nvectors));
10683 /* Given a complete set of masks MASKS, extract mask number INDEX
10684 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10685 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10687 See the comment above vec_loop_masks for more details about the mask
10688 arrangement. */
10690 tree
10691 vect_get_loop_mask (loop_vec_info loop_vinfo,
10692 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10693 unsigned int nvectors, tree vectype, unsigned int index)
10695 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10696 == vect_partial_vectors_while_ult)
10698 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10699 tree mask_type = rgm->type;
10701 /* Populate the rgroup's mask array, if this is the first time we've
10702 used it. */
10703 if (rgm->controls.is_empty ())
10705 rgm->controls.safe_grow_cleared (nvectors, true);
10706 for (unsigned int i = 0; i < nvectors; ++i)
10708 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10709 /* Provide a dummy definition until the real one is available. */
10710 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10711 rgm->controls[i] = mask;
10715 tree mask = rgm->controls[index];
10716 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10717 TYPE_VECTOR_SUBPARTS (vectype)))
10719 /* A loop mask for data type X can be reused for data type Y
10720 if X has N times more elements than Y and if Y's elements
10721 are N times bigger than X's. In this case each sequence
10722 of N elements in the loop mask will be all-zero or all-one.
10723 We can then view-convert the mask so that each sequence of
10724 N elements is replaced by a single element. */
10725 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10726 TYPE_VECTOR_SUBPARTS (vectype)));
10727 gimple_seq seq = NULL;
10728 mask_type = truth_type_for (vectype);
10729 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10730 if (seq)
10731 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10733 return mask;
10735 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10736 == vect_partial_vectors_avx512)
10738 /* The number of scalars per iteration and the number of vectors are
10739 both compile-time constants. */
10740 unsigned int nscalars_per_iter
10741 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10742 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10744 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10746 /* The stored nV is dependent on the mask type produced. */
10747 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10748 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10749 == rgm->factor);
10750 nvectors = rgm->factor;
10752 /* Populate the rgroup's mask array, if this is the first time we've
10753 used it. */
10754 if (rgm->controls.is_empty ())
10756 rgm->controls.safe_grow_cleared (nvectors, true);
10757 for (unsigned int i = 0; i < nvectors; ++i)
10759 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10760 /* Provide a dummy definition until the real one is available. */
10761 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10762 rgm->controls[i] = mask;
10765 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10766 TYPE_VECTOR_SUBPARTS (vectype)))
10767 return rgm->controls[index];
10769 /* Split the vector if needed. Since we are dealing with integer mode
10770 masks with AVX512 we can operate on the integer representation
10771 performing the whole vector shifting. */
10772 unsigned HOST_WIDE_INT factor;
10773 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10774 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10775 gcc_assert (ok);
10776 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10777 tree mask_type = truth_type_for (vectype);
10778 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10779 unsigned vi = index / factor;
10780 unsigned vpart = index % factor;
10781 tree vec = rgm->controls[vi];
10782 gimple_seq seq = NULL;
10783 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10784 lang_hooks.types.type_for_mode
10785 (TYPE_MODE (rgm->type), 1), vec);
10786 /* For integer mode masks simply shift the right bits into position. */
10787 if (vpart != 0)
10788 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10789 build_int_cst (integer_type_node,
10790 (TYPE_VECTOR_SUBPARTS (vectype)
10791 * vpart)));
10792 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10793 (TYPE_MODE (mask_type), 1), vec);
10794 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10795 if (seq)
10796 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10797 return vec;
10799 else
10800 gcc_unreachable ();
10803 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10804 lengths for controlling an operation on VECTYPE. The operation splits
10805 each element of VECTYPE into FACTOR separate subelements, measuring the
10806 length as a number of these subelements. */
10808 void
10809 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10810 unsigned int nvectors, tree vectype, unsigned int factor)
10812 gcc_assert (nvectors != 0);
10813 if (lens->length () < nvectors)
10814 lens->safe_grow_cleared (nvectors, true);
10815 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10817 /* The number of scalars per iteration, scalar occupied bytes and
10818 the number of vectors are both compile-time constants. */
10819 unsigned int nscalars_per_iter
10820 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10821 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10823 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10825 /* For now, we only support cases in which all loads and stores fall back
10826 to VnQI or none do. */
10827 gcc_assert (!rgl->max_nscalars_per_iter
10828 || (rgl->factor == 1 && factor == 1)
10829 || (rgl->max_nscalars_per_iter * rgl->factor
10830 == nscalars_per_iter * factor));
10831 rgl->max_nscalars_per_iter = nscalars_per_iter;
10832 rgl->type = vectype;
10833 rgl->factor = factor;
10837 /* Given a complete set of lengths LENS, extract length number INDEX
10838 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10839 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10840 multipled by the number of elements that should be processed.
10841 Insert any set-up statements before GSI. */
10843 tree
10844 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10845 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10846 unsigned int index, unsigned int factor)
10848 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10849 bool use_bias_adjusted_len =
10850 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10852 /* Populate the rgroup's len array, if this is the first time we've
10853 used it. */
10854 if (rgl->controls.is_empty ())
10856 rgl->controls.safe_grow_cleared (nvectors, true);
10857 for (unsigned int i = 0; i < nvectors; ++i)
10859 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10860 gcc_assert (len_type != NULL_TREE);
10862 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10864 /* Provide a dummy definition until the real one is available. */
10865 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10866 rgl->controls[i] = len;
10868 if (use_bias_adjusted_len)
10870 gcc_assert (i == 0);
10871 tree adjusted_len =
10872 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10873 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10874 rgl->bias_adjusted_ctrl = adjusted_len;
10879 if (use_bias_adjusted_len)
10880 return rgl->bias_adjusted_ctrl;
10882 tree loop_len = rgl->controls[index];
10883 if (rgl->factor == 1 && factor == 1)
10885 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10886 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10887 if (maybe_ne (nunits1, nunits2))
10889 /* A loop len for data type X can be reused for data type Y
10890 if X has N times more elements than Y and if Y's elements
10891 are N times bigger than X's. */
10892 gcc_assert (multiple_p (nunits1, nunits2));
10893 factor = exact_div (nunits1, nunits2).to_constant ();
10894 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10895 gimple_seq seq = NULL;
10896 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10897 build_int_cst (iv_type, factor));
10898 if (seq)
10899 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10902 return loop_len;
10905 /* Scale profiling counters by estimation for LOOP which is vectorized
10906 by factor VF.
10907 If FLAT is true, the loop we started with had unrealistically flat
10908 profile. */
10910 static void
10911 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10913 /* For flat profiles do not scale down proportionally by VF and only
10914 cap by known iteration count bounds. */
10915 if (flat)
10917 if (dump_file && (dump_flags & TDF_DETAILS))
10918 fprintf (dump_file,
10919 "Vectorized loop profile seems flat; not scaling iteration "
10920 "count down by the vectorization factor %i\n", vf);
10921 scale_loop_profile (loop, profile_probability::always (),
10922 get_likely_max_loop_iterations_int (loop));
10923 return;
10925 /* Loop body executes VF fewer times and exit increases VF times. */
10926 edge exit_e = single_exit (loop);
10927 profile_count entry_count = loop_preheader_edge (loop)->count ();
10929 /* If we have unreliable loop profile avoid dropping entry
10930 count bellow header count. This can happen since loops
10931 has unrealistically low trip counts. */
10932 while (vf > 1
10933 && loop->header->count > entry_count
10934 && loop->header->count < entry_count * vf)
10936 if (dump_file && (dump_flags & TDF_DETAILS))
10937 fprintf (dump_file,
10938 "Vectorization factor %i seems too large for profile "
10939 "prevoiusly believed to be consistent; reducing.\n", vf);
10940 vf /= 2;
10943 if (entry_count.nonzero_p ())
10944 set_edge_probability_and_rescale_others
10945 (exit_e,
10946 entry_count.probability_in (loop->header->count / vf));
10947 /* Avoid producing very large exit probability when we do not have
10948 sensible profile. */
10949 else if (exit_e->probability < profile_probability::always () / (vf * 2))
10950 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10951 loop->latch->count = single_pred_edge (loop->latch)->count ();
10953 scale_loop_profile (loop, profile_probability::always () / vf,
10954 get_likely_max_loop_iterations_int (loop));
10957 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10958 latch edge values originally defined by it. */
10960 static void
10961 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10962 stmt_vec_info def_stmt_info)
10964 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10965 if (!def || TREE_CODE (def) != SSA_NAME)
10966 return;
10967 stmt_vec_info phi_info;
10968 imm_use_iterator iter;
10969 use_operand_p use_p;
10970 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10972 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10973 if (!phi)
10974 continue;
10975 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10976 && (phi_info = loop_vinfo->lookup_stmt (phi))
10977 && STMT_VINFO_RELEVANT_P (phi_info)))
10978 continue;
10979 loop_p loop = gimple_bb (phi)->loop_father;
10980 edge e = loop_latch_edge (loop);
10981 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10982 continue;
10984 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10985 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10986 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10988 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10989 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10990 gcc_assert (phi_defs.length () == latch_defs.length ());
10991 for (unsigned i = 0; i < phi_defs.length (); ++i)
10992 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10993 gimple_get_lhs (latch_defs[i]), e,
10994 gimple_phi_arg_location (phi, e->dest_idx));
10996 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10998 /* For first order recurrences we have to update both uses of
10999 the latch definition, the one in the PHI node and the one
11000 in the generated VEC_PERM_EXPR. */
11001 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11002 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11003 gcc_assert (phi_defs.length () == latch_defs.length ());
11004 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11005 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11006 for (unsigned i = 0; i < phi_defs.length (); ++i)
11008 gassign *perm = as_a <gassign *> (phi_defs[i]);
11009 if (i > 0)
11010 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11011 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11012 update_stmt (perm);
11014 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11015 gimple_phi_arg_location (phi, e->dest_idx));
11020 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11021 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11022 stmt_vec_info. */
11024 static bool
11025 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11026 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11028 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11029 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11031 if (dump_enabled_p ())
11032 dump_printf_loc (MSG_NOTE, vect_location,
11033 "------>vectorizing statement: %G", stmt_info->stmt);
11035 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11036 vect_loop_kill_debug_uses (loop, stmt_info);
11038 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11039 && !STMT_VINFO_LIVE_P (stmt_info))
11040 return false;
11042 if (STMT_VINFO_VECTYPE (stmt_info))
11044 poly_uint64 nunits
11045 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11046 if (!STMT_SLP_TYPE (stmt_info)
11047 && maybe_ne (nunits, vf)
11048 && dump_enabled_p ())
11049 /* For SLP VF is set according to unrolling factor, and not
11050 to vector size, hence for SLP this print is not valid. */
11051 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11054 /* Pure SLP statements have already been vectorized. We still need
11055 to apply loop vectorization to hybrid SLP statements. */
11056 if (PURE_SLP_STMT (stmt_info))
11057 return false;
11059 if (dump_enabled_p ())
11060 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11062 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11063 *seen_store = stmt_info;
11065 return true;
11068 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11069 in the hash_map with its corresponding values. */
11071 static tree
11072 find_in_mapping (tree t, void *context)
11074 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11076 tree *value = mapping->get (t);
11077 return value ? *value : t;
11080 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11081 original loop that has now been vectorized.
11083 The inits of the data_references need to be advanced with the number of
11084 iterations of the main loop. This has been computed in vect_do_peeling and
11085 is stored in parameter ADVANCE. We first restore the data_references
11086 initial offset with the values recored in ORIG_DRS_INIT.
11088 Since the loop_vec_info of this EPILOGUE was constructed for the original
11089 loop, its stmt_vec_infos all point to the original statements. These need
11090 to be updated to point to their corresponding copies as well as the SSA_NAMES
11091 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11093 The data_reference's connections also need to be updated. Their
11094 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11095 stmt_vec_infos, their statements need to point to their corresponding copy,
11096 if they are gather loads or scatter stores then their reference needs to be
11097 updated to point to its corresponding copy and finally we set
11098 'base_misaligned' to false as we have already peeled for alignment in the
11099 prologue of the main loop. */
11101 static void
11102 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11104 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11105 auto_vec<gimple *> stmt_worklist;
11106 hash_map<tree,tree> mapping;
11107 gimple *orig_stmt, *new_stmt;
11108 gimple_stmt_iterator epilogue_gsi;
11109 gphi_iterator epilogue_phi_gsi;
11110 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11111 basic_block *epilogue_bbs = get_loop_body (epilogue);
11112 unsigned i;
11114 free (LOOP_VINFO_BBS (epilogue_vinfo));
11115 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11117 /* Advance data_reference's with the number of iterations of the previous
11118 loop and its prologue. */
11119 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11122 /* The EPILOGUE loop is a copy of the original loop so they share the same
11123 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11124 point to the copied statements. We also create a mapping of all LHS' in
11125 the original loop and all the LHS' in the EPILOGUE and create worklists to
11126 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11127 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11129 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11130 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11132 new_stmt = epilogue_phi_gsi.phi ();
11134 gcc_assert (gimple_uid (new_stmt) > 0);
11135 stmt_vinfo
11136 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11138 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11139 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11141 mapping.put (gimple_phi_result (orig_stmt),
11142 gimple_phi_result (new_stmt));
11143 /* PHI nodes can not have patterns or related statements. */
11144 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11145 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11148 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11149 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11151 new_stmt = gsi_stmt (epilogue_gsi);
11152 if (is_gimple_debug (new_stmt))
11153 continue;
11155 gcc_assert (gimple_uid (new_stmt) > 0);
11156 stmt_vinfo
11157 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11159 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11160 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11162 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11163 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11165 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11167 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11168 for (gimple_stmt_iterator gsi = gsi_start (seq);
11169 !gsi_end_p (gsi); gsi_next (&gsi))
11170 stmt_worklist.safe_push (gsi_stmt (gsi));
11173 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11174 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11176 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11177 stmt_worklist.safe_push (stmt);
11178 /* Set BB such that the assert in
11179 'get_initial_def_for_reduction' is able to determine that
11180 the BB of the related stmt is inside this loop. */
11181 gimple_set_bb (stmt,
11182 gimple_bb (new_stmt));
11183 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11184 gcc_assert (related_vinfo == NULL
11185 || related_vinfo == stmt_vinfo);
11190 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11191 using the original main loop and thus need to be updated to refer to the
11192 cloned variables used in the epilogue. */
11193 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11195 gimple *stmt = stmt_worklist[i];
11196 tree *new_op;
11198 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11200 tree op = gimple_op (stmt, j);
11201 if ((new_op = mapping.get(op)))
11202 gimple_set_op (stmt, j, *new_op);
11203 else
11205 /* PR92429: The last argument of simplify_replace_tree disables
11206 folding when replacing arguments. This is required as
11207 otherwise you might end up with different statements than the
11208 ones analyzed in vect_loop_analyze, leading to different
11209 vectorization. */
11210 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11211 &find_in_mapping, &mapping, false);
11212 gimple_set_op (stmt, j, op);
11217 struct data_reference *dr;
11218 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11219 FOR_EACH_VEC_ELT (datarefs, i, dr)
11221 orig_stmt = DR_STMT (dr);
11222 gcc_assert (gimple_uid (orig_stmt) > 0);
11223 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11224 /* Data references for gather loads and scatter stores do not use the
11225 updated offset we set using ADVANCE. Instead we have to make sure the
11226 reference in the data references point to the corresponding copy of
11227 the original in the epilogue. */
11228 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11229 == VMAT_GATHER_SCATTER)
11231 DR_REF (dr)
11232 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11233 &find_in_mapping, &mapping);
11234 DR_BASE_ADDRESS (dr)
11235 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11236 &find_in_mapping, &mapping);
11238 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11239 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11240 /* The vector size of the epilogue is smaller than that of the main loop
11241 so the alignment is either the same or lower. This means the dr will
11242 thus by definition be aligned. */
11243 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11246 epilogue_vinfo->shared->datarefs_copy.release ();
11247 epilogue_vinfo->shared->save_datarefs ();
11250 /* Function vect_transform_loop.
11252 The analysis phase has determined that the loop is vectorizable.
11253 Vectorize the loop - created vectorized stmts to replace the scalar
11254 stmts in the loop, and update the loop exit condition.
11255 Returns scalar epilogue loop if any. */
11257 class loop *
11258 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11260 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11261 class loop *epilogue = NULL;
11262 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11263 int nbbs = loop->num_nodes;
11264 int i;
11265 tree niters_vector = NULL_TREE;
11266 tree step_vector = NULL_TREE;
11267 tree niters_vector_mult_vf = NULL_TREE;
11268 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11269 unsigned int lowest_vf = constant_lower_bound (vf);
11270 gimple *stmt;
11271 bool check_profitability = false;
11272 unsigned int th;
11273 bool flat = maybe_flat_loop_profile (loop);
11275 DUMP_VECT_SCOPE ("vec_transform_loop");
11277 loop_vinfo->shared->check_datarefs ();
11279 /* Use the more conservative vectorization threshold. If the number
11280 of iterations is constant assume the cost check has been performed
11281 by our caller. If the threshold makes all loops profitable that
11282 run at least the (estimated) vectorization factor number of times
11283 checking is pointless, too. */
11284 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11285 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11287 if (dump_enabled_p ())
11288 dump_printf_loc (MSG_NOTE, vect_location,
11289 "Profitability threshold is %d loop iterations.\n",
11290 th);
11291 check_profitability = true;
11294 /* Make sure there exists a single-predecessor exit bb. Do this before
11295 versioning. */
11296 edge e = single_exit (loop);
11297 if (! single_pred_p (e->dest))
11299 split_loop_exit_edge (e, true);
11300 if (dump_enabled_p ())
11301 dump_printf (MSG_NOTE, "split exit edge\n");
11304 /* Version the loop first, if required, so the profitability check
11305 comes first. */
11307 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11309 class loop *sloop
11310 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11311 sloop->force_vectorize = false;
11312 check_profitability = false;
11315 /* Make sure there exists a single-predecessor exit bb also on the
11316 scalar loop copy. Do this after versioning but before peeling
11317 so CFG structure is fine for both scalar and if-converted loop
11318 to make slpeel_duplicate_current_defs_from_edges face matched
11319 loop closed PHI nodes on the exit. */
11320 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11322 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11323 if (! single_pred_p (e->dest))
11325 split_loop_exit_edge (e, true);
11326 if (dump_enabled_p ())
11327 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11331 tree niters = vect_build_loop_niters (loop_vinfo);
11332 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11333 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11334 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11335 tree advance;
11336 drs_init_vec orig_drs_init;
11338 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11339 &step_vector, &niters_vector_mult_vf, th,
11340 check_profitability, niters_no_overflow,
11341 &advance);
11342 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11343 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11345 /* Ifcvt duplicates loop preheader, loop body and produces an basic
11346 block after loop exit. We need to scale all that. */
11347 basic_block preheader
11348 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11349 preheader->count
11350 = preheader->count.apply_probability
11351 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11352 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11353 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11354 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11355 = preheader->count;
11358 if (niters_vector == NULL_TREE)
11360 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11361 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11362 && known_eq (lowest_vf, vf))
11364 niters_vector
11365 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11366 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11367 step_vector = build_one_cst (TREE_TYPE (niters));
11369 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11370 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11371 &step_vector, niters_no_overflow);
11372 else
11373 /* vect_do_peeling subtracted the number of peeled prologue
11374 iterations from LOOP_VINFO_NITERS. */
11375 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11376 &niters_vector, &step_vector,
11377 niters_no_overflow);
11380 /* 1) Make sure the loop header has exactly two entries
11381 2) Make sure we have a preheader basic block. */
11383 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11385 split_edge (loop_preheader_edge (loop));
11387 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11388 /* This will deal with any possible peeling. */
11389 vect_prepare_for_masked_peels (loop_vinfo);
11391 /* Schedule the SLP instances first, then handle loop vectorization
11392 below. */
11393 if (!loop_vinfo->slp_instances.is_empty ())
11395 DUMP_VECT_SCOPE ("scheduling SLP instances");
11396 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11399 /* FORNOW: the vectorizer supports only loops which body consist
11400 of one basic block (header + empty latch). When the vectorizer will
11401 support more involved loop forms, the order by which the BBs are
11402 traversed need to be reconsidered. */
11404 for (i = 0; i < nbbs; i++)
11406 basic_block bb = bbs[i];
11407 stmt_vec_info stmt_info;
11409 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11410 gsi_next (&si))
11412 gphi *phi = si.phi ();
11413 if (dump_enabled_p ())
11414 dump_printf_loc (MSG_NOTE, vect_location,
11415 "------>vectorizing phi: %G", (gimple *) phi);
11416 stmt_info = loop_vinfo->lookup_stmt (phi);
11417 if (!stmt_info)
11418 continue;
11420 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11421 vect_loop_kill_debug_uses (loop, stmt_info);
11423 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11424 && !STMT_VINFO_LIVE_P (stmt_info))
11425 continue;
11427 if (STMT_VINFO_VECTYPE (stmt_info)
11428 && (maybe_ne
11429 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11430 && dump_enabled_p ())
11431 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11433 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11434 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11435 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11436 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11437 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11438 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11439 && ! PURE_SLP_STMT (stmt_info))
11441 if (dump_enabled_p ())
11442 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11443 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11447 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11448 gsi_next (&si))
11450 gphi *phi = si.phi ();
11451 stmt_info = loop_vinfo->lookup_stmt (phi);
11452 if (!stmt_info)
11453 continue;
11455 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11456 && !STMT_VINFO_LIVE_P (stmt_info))
11457 continue;
11459 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11460 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11461 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11462 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11463 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11464 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11465 && ! PURE_SLP_STMT (stmt_info))
11466 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11469 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11470 !gsi_end_p (si);)
11472 stmt = gsi_stmt (si);
11473 /* During vectorization remove existing clobber stmts. */
11474 if (gimple_clobber_p (stmt))
11476 unlink_stmt_vdef (stmt);
11477 gsi_remove (&si, true);
11478 release_defs (stmt);
11480 else
11482 /* Ignore vector stmts created in the outer loop. */
11483 stmt_info = loop_vinfo->lookup_stmt (stmt);
11485 /* vector stmts created in the outer-loop during vectorization of
11486 stmts in an inner-loop may not have a stmt_info, and do not
11487 need to be vectorized. */
11488 stmt_vec_info seen_store = NULL;
11489 if (stmt_info)
11491 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11493 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11494 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11495 !gsi_end_p (subsi); gsi_next (&subsi))
11497 stmt_vec_info pat_stmt_info
11498 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11499 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11500 &si, &seen_store);
11502 stmt_vec_info pat_stmt_info
11503 = STMT_VINFO_RELATED_STMT (stmt_info);
11504 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11505 &si, &seen_store))
11506 maybe_set_vectorized_backedge_value (loop_vinfo,
11507 pat_stmt_info);
11509 else
11511 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11512 &seen_store))
11513 maybe_set_vectorized_backedge_value (loop_vinfo,
11514 stmt_info);
11517 gsi_next (&si);
11518 if (seen_store)
11520 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11521 /* Interleaving. If IS_STORE is TRUE, the
11522 vectorization of the interleaving chain was
11523 completed - free all the stores in the chain. */
11524 vect_remove_stores (loop_vinfo,
11525 DR_GROUP_FIRST_ELEMENT (seen_store));
11526 else
11527 /* Free the attached stmt_vec_info and remove the stmt. */
11528 loop_vinfo->remove_stmt (stmt_info);
11533 /* Stub out scalar statements that must not survive vectorization.
11534 Doing this here helps with grouped statements, or statements that
11535 are involved in patterns. */
11536 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11537 !gsi_end_p (gsi); gsi_next (&gsi))
11539 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11540 if (!call || !gimple_call_internal_p (call))
11541 continue;
11542 internal_fn ifn = gimple_call_internal_fn (call);
11543 if (ifn == IFN_MASK_LOAD)
11545 tree lhs = gimple_get_lhs (call);
11546 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11548 tree zero = build_zero_cst (TREE_TYPE (lhs));
11549 gimple *new_stmt = gimple_build_assign (lhs, zero);
11550 gsi_replace (&gsi, new_stmt, true);
11553 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11555 tree lhs = gimple_get_lhs (call);
11556 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11558 tree else_arg
11559 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11560 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11561 gsi_replace (&gsi, new_stmt, true);
11565 } /* BBs in loop */
11567 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11568 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11569 if (integer_onep (step_vector))
11570 niters_no_overflow = true;
11571 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11572 niters_vector_mult_vf, !niters_no_overflow);
11574 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11576 /* True if the final iteration might not handle a full vector's
11577 worth of scalar iterations. */
11578 bool final_iter_may_be_partial
11579 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11580 /* The minimum number of iterations performed by the epilogue. This
11581 is 1 when peeling for gaps because we always need a final scalar
11582 iteration. */
11583 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11584 /* +1 to convert latch counts to loop iteration counts,
11585 -min_epilogue_iters to remove iterations that cannot be performed
11586 by the vector code. */
11587 int bias_for_lowest = 1 - min_epilogue_iters;
11588 int bias_for_assumed = bias_for_lowest;
11589 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11590 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11592 /* When the amount of peeling is known at compile time, the first
11593 iteration will have exactly alignment_npeels active elements.
11594 In the worst case it will have at least one. */
11595 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11596 bias_for_lowest += lowest_vf - min_first_active;
11597 bias_for_assumed += assumed_vf - min_first_active;
11599 /* In these calculations the "- 1" converts loop iteration counts
11600 back to latch counts. */
11601 if (loop->any_upper_bound)
11603 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11604 loop->nb_iterations_upper_bound
11605 = (final_iter_may_be_partial
11606 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11607 lowest_vf) - 1
11608 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11609 lowest_vf) - 1);
11610 if (main_vinfo
11611 /* Both peeling for alignment and peeling for gaps can end up
11612 with the scalar epilogue running for more than VF-1 iterations. */
11613 && !main_vinfo->peeling_for_alignment
11614 && !main_vinfo->peeling_for_gaps)
11616 unsigned int bound;
11617 poly_uint64 main_iters
11618 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11619 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11620 main_iters
11621 = upper_bound (main_iters,
11622 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11623 if (can_div_away_from_zero_p (main_iters,
11624 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11625 &bound))
11626 loop->nb_iterations_upper_bound
11627 = wi::umin ((widest_int) (bound - 1),
11628 loop->nb_iterations_upper_bound);
11631 if (loop->any_likely_upper_bound)
11632 loop->nb_iterations_likely_upper_bound
11633 = (final_iter_may_be_partial
11634 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11635 + bias_for_lowest, lowest_vf) - 1
11636 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11637 + bias_for_lowest, lowest_vf) - 1);
11638 if (loop->any_estimate)
11639 loop->nb_iterations_estimate
11640 = (final_iter_may_be_partial
11641 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11642 assumed_vf) - 1
11643 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11644 assumed_vf) - 1);
11645 scale_profile_for_vect_loop (loop, assumed_vf, flat);
11647 if (dump_enabled_p ())
11649 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11651 dump_printf_loc (MSG_NOTE, vect_location,
11652 "LOOP VECTORIZED\n");
11653 if (loop->inner)
11654 dump_printf_loc (MSG_NOTE, vect_location,
11655 "OUTER LOOP VECTORIZED\n");
11656 dump_printf (MSG_NOTE, "\n");
11658 else
11659 dump_printf_loc (MSG_NOTE, vect_location,
11660 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11661 GET_MODE_NAME (loop_vinfo->vector_mode));
11664 /* Loops vectorized with a variable factor won't benefit from
11665 unrolling/peeling. */
11666 if (!vf.is_constant ())
11668 loop->unroll = 1;
11669 if (dump_enabled_p ())
11670 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11671 " variable-length vectorization factor\n");
11673 /* Free SLP instances here because otherwise stmt reference counting
11674 won't work. */
11675 slp_instance instance;
11676 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11677 vect_free_slp_instance (instance);
11678 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11679 /* Clear-up safelen field since its value is invalid after vectorization
11680 since vectorized loop can have loop-carried dependencies. */
11681 loop->safelen = 0;
11683 if (epilogue)
11685 update_epilogue_loop_vinfo (epilogue, advance);
11687 epilogue->simduid = loop->simduid;
11688 epilogue->force_vectorize = loop->force_vectorize;
11689 epilogue->dont_vectorize = false;
11692 return epilogue;
11695 /* The code below is trying to perform simple optimization - revert
11696 if-conversion for masked stores, i.e. if the mask of a store is zero
11697 do not perform it and all stored value producers also if possible.
11698 For example,
11699 for (i=0; i<n; i++)
11700 if (c[i])
11702 p1[i] += 1;
11703 p2[i] = p3[i] +2;
11705 this transformation will produce the following semi-hammock:
11707 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11709 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11710 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11711 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11712 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11713 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11714 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11718 void
11719 optimize_mask_stores (class loop *loop)
11721 basic_block *bbs = get_loop_body (loop);
11722 unsigned nbbs = loop->num_nodes;
11723 unsigned i;
11724 basic_block bb;
11725 class loop *bb_loop;
11726 gimple_stmt_iterator gsi;
11727 gimple *stmt;
11728 auto_vec<gimple *> worklist;
11729 auto_purge_vect_location sentinel;
11731 vect_location = find_loop_location (loop);
11732 /* Pick up all masked stores in loop if any. */
11733 for (i = 0; i < nbbs; i++)
11735 bb = bbs[i];
11736 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11737 gsi_next (&gsi))
11739 stmt = gsi_stmt (gsi);
11740 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11741 worklist.safe_push (stmt);
11745 free (bbs);
11746 if (worklist.is_empty ())
11747 return;
11749 /* Loop has masked stores. */
11750 while (!worklist.is_empty ())
11752 gimple *last, *last_store;
11753 edge e, efalse;
11754 tree mask;
11755 basic_block store_bb, join_bb;
11756 gimple_stmt_iterator gsi_to;
11757 tree vdef, new_vdef;
11758 gphi *phi;
11759 tree vectype;
11760 tree zero;
11762 last = worklist.pop ();
11763 mask = gimple_call_arg (last, 2);
11764 bb = gimple_bb (last);
11765 /* Create then_bb and if-then structure in CFG, then_bb belongs to
11766 the same loop as if_bb. It could be different to LOOP when two
11767 level loop-nest is vectorized and mask_store belongs to the inner
11768 one. */
11769 e = split_block (bb, last);
11770 bb_loop = bb->loop_father;
11771 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11772 join_bb = e->dest;
11773 store_bb = create_empty_bb (bb);
11774 add_bb_to_loop (store_bb, bb_loop);
11775 e->flags = EDGE_TRUE_VALUE;
11776 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11777 /* Put STORE_BB to likely part. */
11778 efalse->probability = profile_probability::likely ();
11779 e->probability = efalse->probability.invert ();
11780 store_bb->count = efalse->count ();
11781 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11782 if (dom_info_available_p (CDI_DOMINATORS))
11783 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11784 if (dump_enabled_p ())
11785 dump_printf_loc (MSG_NOTE, vect_location,
11786 "Create new block %d to sink mask stores.",
11787 store_bb->index);
11788 /* Create vector comparison with boolean result. */
11789 vectype = TREE_TYPE (mask);
11790 zero = build_zero_cst (vectype);
11791 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11792 gsi = gsi_last_bb (bb);
11793 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11794 /* Create new PHI node for vdef of the last masked store:
11795 .MEM_2 = VDEF <.MEM_1>
11796 will be converted to
11797 .MEM.3 = VDEF <.MEM_1>
11798 and new PHI node will be created in join bb
11799 .MEM_2 = PHI <.MEM_1, .MEM_3>
11801 vdef = gimple_vdef (last);
11802 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11803 gimple_set_vdef (last, new_vdef);
11804 phi = create_phi_node (vdef, join_bb);
11805 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11807 /* Put all masked stores with the same mask to STORE_BB if possible. */
11808 while (true)
11810 gimple_stmt_iterator gsi_from;
11811 gimple *stmt1 = NULL;
11813 /* Move masked store to STORE_BB. */
11814 last_store = last;
11815 gsi = gsi_for_stmt (last);
11816 gsi_from = gsi;
11817 /* Shift GSI to the previous stmt for further traversal. */
11818 gsi_prev (&gsi);
11819 gsi_to = gsi_start_bb (store_bb);
11820 gsi_move_before (&gsi_from, &gsi_to);
11821 /* Setup GSI_TO to the non-empty block start. */
11822 gsi_to = gsi_start_bb (store_bb);
11823 if (dump_enabled_p ())
11824 dump_printf_loc (MSG_NOTE, vect_location,
11825 "Move stmt to created bb\n%G", last);
11826 /* Move all stored value producers if possible. */
11827 while (!gsi_end_p (gsi))
11829 tree lhs;
11830 imm_use_iterator imm_iter;
11831 use_operand_p use_p;
11832 bool res;
11834 /* Skip debug statements. */
11835 if (is_gimple_debug (gsi_stmt (gsi)))
11837 gsi_prev (&gsi);
11838 continue;
11840 stmt1 = gsi_stmt (gsi);
11841 /* Do not consider statements writing to memory or having
11842 volatile operand. */
11843 if (gimple_vdef (stmt1)
11844 || gimple_has_volatile_ops (stmt1))
11845 break;
11846 gsi_from = gsi;
11847 gsi_prev (&gsi);
11848 lhs = gimple_get_lhs (stmt1);
11849 if (!lhs)
11850 break;
11852 /* LHS of vectorized stmt must be SSA_NAME. */
11853 if (TREE_CODE (lhs) != SSA_NAME)
11854 break;
11856 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11858 /* Remove dead scalar statement. */
11859 if (has_zero_uses (lhs))
11861 gsi_remove (&gsi_from, true);
11862 continue;
11866 /* Check that LHS does not have uses outside of STORE_BB. */
11867 res = true;
11868 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11870 gimple *use_stmt;
11871 use_stmt = USE_STMT (use_p);
11872 if (is_gimple_debug (use_stmt))
11873 continue;
11874 if (gimple_bb (use_stmt) != store_bb)
11876 res = false;
11877 break;
11880 if (!res)
11881 break;
11883 if (gimple_vuse (stmt1)
11884 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11885 break;
11887 /* Can move STMT1 to STORE_BB. */
11888 if (dump_enabled_p ())
11889 dump_printf_loc (MSG_NOTE, vect_location,
11890 "Move stmt to created bb\n%G", stmt1);
11891 gsi_move_before (&gsi_from, &gsi_to);
11892 /* Shift GSI_TO for further insertion. */
11893 gsi_prev (&gsi_to);
11895 /* Put other masked stores with the same mask to STORE_BB. */
11896 if (worklist.is_empty ()
11897 || gimple_call_arg (worklist.last (), 2) != mask
11898 || worklist.last () != stmt1)
11899 break;
11900 last = worklist.pop ();
11902 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11906 /* Decide whether it is possible to use a zero-based induction variable
11907 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11908 the value that the induction variable must be able to hold in order
11909 to ensure that the rgroups eventually have no active vector elements.
11910 Return -1 otherwise. */
11912 widest_int
11913 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11915 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11916 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11917 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11919 /* Calculate the value that the induction variable must be able
11920 to hit in order to ensure that we end the loop with an all-false mask.
11921 This involves adding the maximum number of inactive trailing scalar
11922 iterations. */
11923 widest_int iv_limit = -1;
11924 if (max_loop_iterations (loop, &iv_limit))
11926 if (niters_skip)
11928 /* Add the maximum number of skipped iterations to the
11929 maximum iteration count. */
11930 if (TREE_CODE (niters_skip) == INTEGER_CST)
11931 iv_limit += wi::to_widest (niters_skip);
11932 else
11933 iv_limit += max_vf - 1;
11935 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11936 /* Make a conservatively-correct assumption. */
11937 iv_limit += max_vf - 1;
11939 /* IV_LIMIT is the maximum number of latch iterations, which is also
11940 the maximum in-range IV value. Round this value down to the previous
11941 vector alignment boundary and then add an extra full iteration. */
11942 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11943 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11945 return iv_limit;
11948 /* For the given rgroup_controls RGC, check whether an induction variable
11949 would ever hit a value that produces a set of all-false masks or zero
11950 lengths before wrapping around. Return true if it's possible to wrap
11951 around before hitting the desirable value, otherwise return false. */
11953 bool
11954 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11956 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11958 if (iv_limit == -1)
11959 return true;
11961 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11962 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11963 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11965 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11966 return true;
11968 return false;