tree-optimization/113026 - avoid vector epilog in more cases
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype has already been set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
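/* For example (illustrative only, not taken from this file), latch
   updates of the forms

     x = -x;          neg
     x = x * 3;       mul by constant
     x = x << 1;      lshift by constant
     x = x >> 1;      rshift by constant

   give rise to the nonlinear evolutions classified below.  */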
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
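/* At the source level a double reduction typically comes from a nested
   summation, e.g. (illustrative only, not taken from this file):

     int sum = 0;
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];

   where the outer and inner loop-carried values of SUM correspond to
   x_1 and x_2 above.  */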
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
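/* A typical source form of a first-order recurrence is (illustrative
   only, not taken from this file):

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   where each iteration uses the value of T computed in the previous
   iteration.  */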
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as a first-order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs a vector shuffle. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses.
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
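/* For example (illustrative only), when vectorizing the outer loop of

     for (i = 0; i < n; i++)
       {
         s = 0;
         for (j = 0; j < m; j++)
           s += a[i][j];
         b[i] = s;
       }

   the accumulation of S is such an inner-loop reduction: it is executed
   sequentially per outer iteration and is classified as a nested cycle
   rather than as a regular reduction.  */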
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns, adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment. Analyze
968 all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 early_breaks (false),
1044 no_data_dependencies (false),
1045 has_mask_store (false),
1046 scalar_loop_scaling (profile_probability::uninitialized ()),
1047 scalar_loop (NULL),
1048 orig_loop_info (NULL),
1049 vec_loop_iv_exit (NULL),
1050 vec_epilogue_loop_iv_exit (NULL),
1051 scalar_loop_iv_exit (NULL)
1053 /* CHECKME: We want to visit all BBs before their successors (except for
1054 latch blocks, for which this assertion wouldn't hold). In the simple
1055 case of the loop forms we allow, a dfs order of the BBs would be the same
1056 as reversed postorder traversal, so we are safe. */
1058 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1059 bbs, loop->num_nodes, loop);
1060 gcc_assert (nbbs == loop->num_nodes);
1062 for (unsigned int i = 0; i < nbbs; i++)
1064 basic_block bb = bbs[i];
1065 gimple_stmt_iterator si;
1067 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1069 gimple *phi = gsi_stmt (si);
1070 gimple_set_uid (phi, 0);
1071 add_stmt (phi);
1074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1076 gimple *stmt = gsi_stmt (si);
1077 gimple_set_uid (stmt, 0);
1078 if (is_gimple_debug (stmt))
1079 continue;
1080 add_stmt (stmt);
1081 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1082 third argument is the #pragma omp simd if (x) condition: when it is 0,
1083 the loop shouldn't be vectorized; when it is a non-zero constant, it
1084 should be vectorized normally; otherwise the loop is versioned, with the
1085 vectorized version executed if the condition is non-zero at runtime. */
1086 if (loop_in->simduid
1087 && is_gimple_call (stmt)
1088 && gimple_call_internal_p (stmt)
1089 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1090 && gimple_call_num_args (stmt) >= 3
1091 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1092 && (loop_in->simduid
1093 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1095 tree arg = gimple_call_arg (stmt, 2);
1096 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1097 simd_if_cond = arg;
1098 else
1099 gcc_assert (integer_nonzerop (arg));
1104 epilogue_vinfos.create (6);
1107 /* Free all levels of rgroup CONTROLS. */
1109 void
1110 release_vec_loop_controls (vec<rgroup_controls> *controls)
1112 rgroup_controls *rgc;
1113 unsigned int i;
1114 FOR_EACH_VEC_ELT (*controls, i, rgc)
1115 rgc->controls.release ();
1116 controls->release ();
1119 /* Free all memory used by the _loop_vec_info, as well as all the
1120 stmt_vec_info structs of all the stmts in the loop. */
1122 _loop_vec_info::~_loop_vec_info ()
1124 free (bbs);
1126 release_vec_loop_controls (&masks.rgc_vec);
1127 release_vec_loop_controls (&lens);
1128 delete ivexpr_map;
1129 delete scan_map;
1130 epilogue_vinfos.release ();
1131 delete scalar_costs;
1132 delete vector_costs;
1134 /* When we release an epilogue vinfo that we do not intend to use,
1135 avoid clearing AUX of the main loop, which should continue to
1136 point to the main loop vinfo since otherwise we'll leak that. */
1137 if (loop->aux == this)
1138 loop->aux = NULL;
1141 /* Return an invariant or register for EXPR and emit necessary
1142 computations in the LOOP_VINFO loop preheader. */
1144 tree
1145 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1147 if (is_gimple_reg (expr)
1148 || is_gimple_min_invariant (expr))
1149 return expr;
1151 if (! loop_vinfo->ivexpr_map)
1152 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1153 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1154 if (! cached)
1156 gimple_seq stmts = NULL;
1157 cached = force_gimple_operand (unshare_expr (expr),
1158 &stmts, true, NULL_TREE);
1159 if (stmts)
1161 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1162 gsi_insert_seq_on_edge_immediate (e, stmts);
1165 return cached;
1168 /* Return true if we can use CMP_TYPE as the comparison type to produce
1169 all masks required to mask LOOP_VINFO. */
1171 static bool
1172 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1174 rgroup_controls *rgm;
1175 unsigned int i;
1176 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1177 if (rgm->type != NULL_TREE
1178 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1179 cmp_type, rgm->type,
1180 OPTIMIZE_FOR_SPEED))
1181 return false;
1182 return true;
1185 /* Calculate the maximum number of scalars per iteration for every
1186 rgroup in LOOP_VINFO. */
1188 static unsigned int
1189 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1191 unsigned int res = 1;
1192 unsigned int i;
1193 rgroup_controls *rgm;
1194 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1195 res = MAX (res, rgm->max_nscalars_per_iter);
1196 return res;
1199 /* Calculate the minimum precision necessary to represent:
1201 MAX_NITERS * FACTOR
1203 as an unsigned integer, where MAX_NITERS is the maximum number of
1204 loop header iterations for the original scalar form of LOOP_VINFO. */
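/* For example (illustrative only): with a maximum of 1000 loop header
   iterations and FACTOR == 4 the product is 4000, which needs 12 bits
   as an unsigned integer (4000 < 2^12 == 4096).  */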
1206 static unsigned
1207 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1209 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1211 /* Get the maximum number of iterations that is representable
1212 in the counter type. */
1213 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1214 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1216 /* Get a more refined estimate for the number of iterations. */
1217 widest_int max_back_edges;
1218 if (max_loop_iterations (loop, &max_back_edges))
1219 max_ni = wi::smin (max_ni, max_back_edges + 1);
1221 /* Work out how many bits we need to represent the limit. */
1222 return wi::min_precision (max_ni * factor, UNSIGNED);
1225 /* True if the loop needs peeling or partial vectors when vectorized. */
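/* For example (illustrative only): a loop with 10 known iterations and
   a vectorization factor of 4 leaves 2 iterations that do not fill a
   vector, so it needs either an epilogue or partial vectors.  */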
1227 static bool
1228 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1230 unsigned HOST_WIDE_INT const_vf;
1231 HOST_WIDE_INT max_niter
1232 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1234 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1235 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1236 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1237 (loop_vinfo));
1239 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1240 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1242 /* Work out the (constant) number of iterations that need to be
1243 peeled for reasons other than niters. */
1244 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1245 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1246 peel_niter += 1;
1247 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1248 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1249 return true;
1251 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1252 /* ??? When peeling for gaps but not alignment, we could
1253 try to check whether the (variable) niters is known to be
1254 VF * N + 1. That's something of a niche case though. */
1255 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1256 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1257 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1258 < (unsigned) exact_log2 (const_vf))
1259 /* In case of versioning, check if the maximum number of
1260 iterations is greater than th. If they are identical,
1261 the epilogue is unnecessary. */
1262 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1263 || ((unsigned HOST_WIDE_INT) max_niter
1264 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1265 but that's only computed later based on our result.
1266 The following is the most conservative approximation. */
1267 > (std::max ((unsigned HOST_WIDE_INT) th,
1268 const_vf) / const_vf) * const_vf))))
1269 return true;
1271 return false;
1274 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1275 whether we can actually generate the masks required. Return true if so,
1276 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
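/* Conceptually (an illustrative sketch, not the exact code generated),
   a fully-masked loop compares a scalar IV against the number of scalar
   iterations to produce each control mask:

     for (i = 0; i < niters; i += VF)
       {
         mask = WHILE_ULT (i, niters);    lane L is active iff i + L < niters
         va = .MASK_LOAD (&b[i], ..., mask);
         .MASK_STORE (&a[i], ..., mask, va);
       }

   with inactive lanes neither loaded nor stored.  */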
1278 static bool
1279 vect_verify_full_masking (loop_vec_info loop_vinfo)
1281 unsigned int min_ni_width;
1283 /* Use a normal loop if there are no statements that need masking.
1284 This only happens in rare degenerate cases: it means that the loop
1285 has no loads, no stores, and no live-out values. */
1286 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1287 return false;
1289 /* Produce the rgroup controls. */
1290 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1292 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1293 tree vectype = mask.first;
1294 unsigned nvectors = mask.second;
1296 if (masks->rgc_vec.length () < nvectors)
1297 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1298 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1299 /* The number of scalars per iteration and the number of vectors are
1300 both compile-time constants. */
1301 unsigned int nscalars_per_iter
1302 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1303 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1305 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1307 rgm->max_nscalars_per_iter = nscalars_per_iter;
1308 rgm->type = truth_type_for (vectype);
1309 rgm->factor = 1;
1313 unsigned int max_nscalars_per_iter
1314 = vect_get_max_nscalars_per_iter (loop_vinfo);
1316 /* Work out how many bits we need to represent the limit. */
1317 min_ni_width
1318 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1320 /* Find a scalar mode for which WHILE_ULT is supported. */
1321 opt_scalar_int_mode cmp_mode_iter;
1322 tree cmp_type = NULL_TREE;
1323 tree iv_type = NULL_TREE;
1324 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1325 unsigned int iv_precision = UINT_MAX;
1327 if (iv_limit != -1)
1328 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1329 UNSIGNED);
1331 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1333 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1334 if (cmp_bits >= min_ni_width
1335 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1337 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1338 if (this_type
1339 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1341 /* Although we could stop as soon as we find a valid mode,
1342 there are at least two reasons why that's not always the
1343 best choice:
1345 - An IV that's Pmode or wider is more likely to be reusable
1346 in address calculations than an IV that's narrower than
1347 Pmode.
1349 - Doing the comparison in IV_PRECISION or wider allows
1350 a natural 0-based IV, whereas using a narrower comparison
1351 type requires mitigations against wrap-around.
1353 Conversely, if the IV limit is variable, doing the comparison
1354 in a wider type than the original type can introduce
1355 unnecessary extensions, so picking the widest valid mode
1356 is not always a good choice either.
1358 Here we prefer the first IV type that's Pmode or wider,
1359 and the first comparison type that's IV_PRECISION or wider.
1360 (The comparison type must be no wider than the IV type,
1361 to avoid extensions in the vector loop.)
1363 ??? We might want to try continuing beyond Pmode for ILP32
1364 targets if CMP_BITS < IV_PRECISION. */
1365 iv_type = this_type;
1366 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1367 cmp_type = this_type;
1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 break;
1374 if (!cmp_type)
1376 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1377 return false;
1380 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1381 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1382 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1383 return true;
1386 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1387 whether we can actually generate AVX512 style masks. Return true if so,
1388 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
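/* Conceptually (an illustrative sketch), instead of WHILE_ULT this
   style produces each mask with a vector comparison of the lane
   indices against the remaining number of scalar iterations:

     mask = { 0, 1, ..., VF-1 } < { remain, remain, ..., remain };

   which is why a suitable integer vector compare type is searched for
   below.  */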
1390 static bool
1391 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1393 /* Produce a differently organized rgc_vec and check differently
1394 whether we can produce the masks. */
1396 /* Use a normal loop if there are no statements that need masking.
1397 This only happens in rare degenerate cases: it means that the loop
1398 has no loads, no stores, and no live-out values. */
1399 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1400 return false;
1402 /* For the decrementing IV we need to represent all values in
1403 [0, niter + niter_skip] where niter_skip is the number of elements we
1404 skip in the first iteration for prologue peeling. */
1405 tree iv_type = NULL_TREE;
1406 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1407 unsigned int iv_precision = UINT_MAX;
1408 if (iv_limit != -1)
1409 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1411 /* First compute the type for the IV we use to track the remaining
1412 scalar iterations. */
1413 opt_scalar_int_mode cmp_mode_iter;
1414 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1416 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1417 if (cmp_bits >= iv_precision
1418 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1420 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1421 if (iv_type)
1422 break;
1425 if (!iv_type)
1426 return false;
1428 /* Produce the rgroup controls. */
1429 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1431 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1432 tree vectype = mask.first;
1433 unsigned nvectors = mask.second;
1435 /* The number of scalars per iteration and the number of vectors are
1436 both compile-time constants. */
1437 unsigned int nscalars_per_iter
1438 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1439 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1441 /* We index the rgroup_controls vector with nscalars_per_iter
1442 which we keep constant and instead have a varying nvectors,
1443 remembering the vector mask with the fewest nV. */
1444 if (masks->rgc_vec.length () < nscalars_per_iter)
1445 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1446 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1448 if (!rgm->type || rgm->factor > nvectors)
1450 rgm->type = truth_type_for (vectype);
1451 rgm->compare_type = NULL_TREE;
1452 rgm->max_nscalars_per_iter = nscalars_per_iter;
1453 rgm->factor = nvectors;
1454 rgm->bias_adjusted_ctrl = NULL_TREE;
1458 /* There is no fixed compare type we are going to use but we have to
1459 be able to get at one for each mask group. */
1460 unsigned int min_ni_width
1461 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1463 bool ok = true;
1464 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1466 tree mask_type = rgc.type;
1467 if (!mask_type)
1468 continue;
1470 /* For now vect_get_loop_mask only supports integer mode masks
1471 when we need to split it. */
1472 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1473 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1475 ok = false;
1476 break;
1479 /* If iv_type is usable as compare type use that - we can elide the
1480 saturation in that case. */
1481 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1483 tree cmp_vectype
1484 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1485 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1486 rgc.compare_type = cmp_vectype;
1488 if (!rgc.compare_type)
1489 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1491 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1492 if (cmp_bits >= min_ni_width
1493 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1495 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1496 if (!cmp_type)
1497 continue;
1499 /* Check whether we can produce the mask with cmp_type. */
1500 tree cmp_vectype
1501 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1502 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1504 rgc.compare_type = cmp_vectype;
1505 break;
1509 if (!rgc.compare_type)
1511 ok = false;
1512 break;
1515 if (!ok)
1517 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1518 return false;
1521 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1522 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1523 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1524 return true;
1527 /* Check whether we can use vector access with length based on precision
1528 comparison. So far, to keep it simple, we only allow the case that the
1529 precision of the target supported length is larger than the precision
1530 required by loop niters. */
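/* Conceptually (an illustrative sketch, ignoring any load/store bias),
   a length-controlled loop clamps the number of active lanes in each
   iteration:

     for (i = 0; i < niters; i += VF)
       {
         len = MIN (niters - i, VF);
         va = .LEN_LOAD (&b[i], ..., len);
         .LEN_STORE (&a[i], ..., len, va);
       }

   so the final iteration processes only the leftover elements.  */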
1532 static bool
1533 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1535 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1536 return false;
1538 machine_mode len_load_mode, len_store_mode;
1539 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1540 .exists (&len_load_mode))
1541 return false;
1542 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1543 .exists (&len_store_mode))
1544 return false;
1546 signed char partial_load_bias = internal_len_load_store_bias
1547 (IFN_LEN_LOAD, len_load_mode);
1549 signed char partial_store_bias = internal_len_load_store_bias
1550 (IFN_LEN_STORE, len_store_mode);
1552 gcc_assert (partial_load_bias == partial_store_bias);
1554 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1555 return false;
1557 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1558 len_loads with a length of zero. In order to avoid that we prohibit
1559 more than one loop length here. */
1560 if (partial_load_bias == -1
1561 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1562 return false;
1564 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1566 unsigned int max_nitems_per_iter = 1;
1567 unsigned int i;
1568 rgroup_controls *rgl;
1569 /* Find the maximum number of items per iteration for every rgroup. */
1570 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1572 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1573 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1576 /* Work out how many bits we need to represent the length limit. */
1577 unsigned int min_ni_prec
1578 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1580 /* Now use the maximum of the precisions below for one suitable IV type:
1581 - the IV's natural precision
1582 - the precision needed to hold: the maximum number of scalar
1583 iterations multiplied by the scale factor (min_ni_prec above)
1584 - the Pmode precision
1586 If min_ni_prec is less than the precision of the current niters,
1587 we prefer to still use the niters type. Prefer to use Pmode and
1588 wider IV to avoid narrow conversions. */
1590 unsigned int ni_prec
1591 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1592 min_ni_prec = MAX (min_ni_prec, ni_prec);
1593 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1595 tree iv_type = NULL_TREE;
1596 opt_scalar_int_mode tmode_iter;
1597 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1599 scalar_mode tmode = tmode_iter.require ();
1600 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1602 /* ??? Do we really want to construct one IV whose precision exceeds
1603 BITS_PER_WORD? */
1604 if (tbits > BITS_PER_WORD)
1605 break;
1607 /* Find the first available standard integral type. */
1608 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1610 iv_type = build_nonstandard_integer_type (tbits, true);
1611 break;
1615 if (!iv_type)
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1619 "can't vectorize with length-based partial vectors"
1620 " because there is no suitable iv type.\n");
1621 return false;
1624 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1625 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1626 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1628 return true;
1631 /* Calculate the cost of one scalar iteration of the loop. */
1632 static void
1633 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1635 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1636 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1637 int nbbs = loop->num_nodes, factor;
1638 int innerloop_iters, i;
1640 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1642 /* Gather costs for statements in the scalar loop. */
1644 /* FORNOW. */
1645 innerloop_iters = 1;
1646 if (loop->inner)
1647 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1649 for (i = 0; i < nbbs; i++)
1651 gimple_stmt_iterator si;
1652 basic_block bb = bbs[i];
1654 if (bb->loop_father == loop->inner)
1655 factor = innerloop_iters;
1656 else
1657 factor = 1;
1659 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1661 gimple *stmt = gsi_stmt (si);
1662 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1664 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1665 continue;
1667 /* Skip stmts that are not vectorized inside the loop. */
1668 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1669 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1670 && (!STMT_VINFO_LIVE_P (vstmt_info)
1671 || !VECTORIZABLE_CYCLE_DEF
1672 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1673 continue;
1675 vect_cost_for_stmt kind;
1676 if (STMT_VINFO_DATA_REF (stmt_info))
1678 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1679 kind = scalar_load;
1680 else
1681 kind = scalar_store;
1683 else if (vect_nop_conversion_p (stmt_info))
1684 continue;
1685 else
1686 kind = scalar_stmt;
1688 /* We are using vect_prologue here to avoid scaling twice
1689 by the inner loop factor. */
1690 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1691 factor, kind, stmt_info, 0, vect_prologue);
1695 /* Now accumulate cost. */
1696 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1697 add_stmt_costs (loop_vinfo->scalar_costs,
1698 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1699 loop_vinfo->scalar_costs->finish_cost (nullptr);
1702 /* Function vect_analyze_loop_form.
1704 Verify that certain CFG restrictions hold, including:
1705 - the loop has a pre-header
1706 - the loop has a single entry
1707 - nested loops can have only a single exit.
1708 - the loop exit condition is simple enough
1709 - the number of iterations can be analyzed, i.e, a countable loop. The
1710 niter could be analyzed under some assumptions. */
1712 opt_result
1713 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1715 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1717 edge exit_e = vec_init_loop_exit_info (loop);
1718 if (!exit_e)
1719 return opt_result::failure_at (vect_location,
1720 "not vectorized:"
1721 " could not determine main exit from"
1722 " loop with multiple exits.\n");
1723 info->loop_exit = exit_e;
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "using as main loop exit: %d -> %d [AUX: %p]\n",
1727 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1729 /* Check if we have any control flow that doesn't leave the loop. */
1730 class loop *v_loop = loop->inner ? loop->inner : loop;
1731 basic_block *bbs= get_loop_body (v_loop);
1732 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1733 if (EDGE_COUNT (bbs[i]->succs) != 1
1734 && (EDGE_COUNT (bbs[i]->succs) != 2
1735 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1736 return opt_result::failure_at (vect_location,
1737 "not vectorized:"
1738 " unsupported control flow in loop.\n");
1740 /* Different restrictions apply when we are considering an inner-most loop,
1741 vs. an outer (nested) loop.
1742 (FORNOW. May want to relax some of these restrictions in the future). */
1744 info->inner_loop_cond = NULL;
1745 if (!loop->inner)
1747 /* Inner-most loop. We currently require that the number of BBs is
1748 exactly 2 (the header and latch). Vectorizable inner-most loops
1749 look like this:
1751 (pre-header)
1753 header <--------+
1754 | | |
1755 | +--> latch --+
1757 (exit-bb) */
1759 if (empty_block_p (loop->header))
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized: empty loop.\n");
1763 else
1765 class loop *innerloop = loop->inner;
1766 edge entryedge;
1768 /* Nested loop. We currently require that the loop is doubly-nested,
1769 contains a single inner loop, and the number of BBs is exactly 5.
1770 Vectorizable outer-loops look like this:
1772 (pre-header)
1774 header <---+
1776 inner-loop |
1778 tail ------+
1780 (exit-bb)
1782 The inner-loop has the properties expected of inner-most loops
1783 as described above. */
1785 if ((loop->inner)->inner || (loop->inner)->next)
1786 return opt_result::failure_at (vect_location,
1787 "not vectorized:"
1788 " multiple nested loops.\n");
1790 entryedge = loop_preheader_edge (innerloop);
1791 if (entryedge->src != loop->header
1792 || !single_exit (innerloop)
1793 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1794 return opt_result::failure_at (vect_location,
1795 "not vectorized:"
1796 " unsupported outerloop form.\n");
1798 /* Analyze the inner-loop. */
1799 vect_loop_form_info inner;
1800 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1801 if (!res)
1803 if (dump_enabled_p ())
1804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805 "not vectorized: Bad inner loop.\n");
1806 return res;
1809 /* Don't support analyzing niter under assumptions for inner
1810 loop. */
1811 if (!integer_onep (inner.assumptions))
1812 return opt_result::failure_at (vect_location,
1813 "not vectorized: Bad inner loop.\n");
1815 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1816 return opt_result::failure_at (vect_location,
1817 "not vectorized: inner-loop count not"
1818 " invariant.\n");
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_NOTE, vect_location,
1822 "Considering outer-loop vectorization.\n");
1823 info->inner_loop_cond = inner.conds[0];
1826 if (EDGE_COUNT (loop->header->preds) != 2)
1827 return opt_result::failure_at (vect_location,
1828 "not vectorized:"
1829 " too many incoming edges.\n");
1831 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1832 that the loop is represented as a do-while (with a proper if-guard
1833 before the loop if needed), where the loop header contains all the
1834 executable statements, and the latch is empty. */
1835 if (!empty_block_p (loop->latch)
1836 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1837 return opt_result::failure_at (vect_location,
1838 "not vectorized: latch block not empty.\n");
1840 /* Make sure the exit is not abnormal. */
1841 auto_vec<edge> exits = get_loop_exit_edges (loop);
1842 for (edge e : exits)
1844 if (e->flags & EDGE_ABNORMAL)
1845 return opt_result::failure_at (vect_location,
1846 "not vectorized:"
1847 " abnormal loop exit edge.\n");
1850 info->conds
1851 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1852 &info->number_of_iterations,
1853 &info->number_of_iterationsm1);
1855 if (info->conds.is_empty ())
1856 return opt_result::failure_at
1857 (vect_location,
1858 "not vectorized: complicated exit condition.\n");
1860 /* Determine what the primary and alternate exit conds are. */
1861 for (unsigned i = 0; i < info->conds.length (); i++)
1863 gcond *cond = info->conds[i];
1864 if (exit_e->src == gimple_bb (cond))
1865 std::swap (info->conds[0], info->conds[i]);
1868 if (integer_zerop (info->assumptions)
1869 || !info->number_of_iterations
1870 || chrec_contains_undetermined (info->number_of_iterations))
1871 return opt_result::failure_at
1872 (info->conds[0],
1873 "not vectorized: number of iterations cannot be computed.\n");
1875 if (integer_zerop (info->number_of_iterations))
1876 return opt_result::failure_at
1877 (info->conds[0],
1878 "not vectorized: number of iterations = 0.\n");
1880 if (!(tree_fits_shwi_p (info->number_of_iterations)
1881 && tree_to_shwi (info->number_of_iterations) > 0))
1883 if (dump_enabled_p ())
1885 dump_printf_loc (MSG_NOTE, vect_location,
1886 "Symbolic number of iterations is ");
1887 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1888 dump_printf (MSG_NOTE, "\n");
1892 return opt_result::success ();
1895 /* Create a loop_vec_info for LOOP with SHARED and the
1896 vect_analyze_loop_form result. */
1898 loop_vec_info
1899 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1900 const vect_loop_form_info *info,
1901 loop_vec_info main_loop_info)
1903 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1904 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1905 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1906 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1907 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1908 /* Also record the assumptions for versioning. */
1909 if (!integer_onep (info->assumptions) && !main_loop_info)
1910 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1912 for (gcond *cond : info->conds)
1914 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1915 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1916 /* Mark the statement as a condition. */
1917 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1920 for (unsigned i = 1; i < info->conds.length (); i ++)
1921 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1922 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1924 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1926 /* Check to see if we're vectorizing multiple exits. */
1927 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1928 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1930 if (info->inner_loop_cond)
1932 stmt_vec_info inner_loop_cond_info
1933 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1934 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1935 /* If we have an estimate on the number of iterations of the inner
1936 loop, use that to limit the scale for costing; otherwise use
1937 --param vect-inner-loop-cost-factor literally. */
1938 widest_int nit;
1939 if (estimated_stmt_executions (loop->inner, &nit))
1940 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1941 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1944 return loop_vinfo;
1949 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1950 statements, update the vectorization factor. */
1952 static void
1953 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1955 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1956 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1957 int nbbs = loop->num_nodes;
1958 poly_uint64 vectorization_factor;
1959 int i;
1961 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1963 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1964 gcc_assert (known_ne (vectorization_factor, 0U));
1966 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1967 the vectorization factor of the loop is the unrolling factor required by
1968 the SLP instances. If that unrolling factor is 1, we say that we
1969 perform pure SLP on the loop - cross-iteration parallelism is not
1970 exploited. */
1971 bool only_slp_in_loop = true;
1972 for (i = 0; i < nbbs; i++)
1974 basic_block bb = bbs[i];
1975 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1976 gsi_next (&si))
1978 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1979 if (!stmt_info)
1980 continue;
1981 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983 && !PURE_SLP_STMT (stmt_info))
1984 /* STMT needs both SLP and loop-based vectorization. */
1985 only_slp_in_loop = false;
1987 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1988 gsi_next (&si))
1990 if (is_gimple_debug (gsi_stmt (si)))
1991 continue;
1992 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1993 stmt_info = vect_stmt_to_vectorize (stmt_info);
1994 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1995 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1996 && !PURE_SLP_STMT (stmt_info))
1997 /* STMT needs both SLP and loop-based vectorization. */
1998 only_slp_in_loop = false;
2002 if (only_slp_in_loop)
2004 if (dump_enabled_p ())
2005 dump_printf_loc (MSG_NOTE, vect_location,
2006 "Loop contains only SLP stmts\n");
2007 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2009 else
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Loop contains SLP and non-SLP stmts\n");
2014 /* Both the vectorization factor and unroll factor have the form
2015 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2016 so they must have a common multiple. */
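/* As a purely arithmetic illustration, factors of 4 and 6 would be
   combined to a common multiple of 12.  */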
2017 vectorization_factor
2018 = force_common_multiple (vectorization_factor,
2019 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2022 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2023 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_NOTE, vect_location,
2026 "Updating vectorization factor to ");
2027 dump_dec (MSG_NOTE, vectorization_factor);
2028 dump_printf (MSG_NOTE, ".\n");
2032 /* Return true if STMT_INFO describes a double reduction phi and if
2033 the other phi in the reduction is also relevant for vectorization.
2034 This rejects cases such as:
2036 outer1:
2037 x_1 = PHI <x_3(outer2), ...>;
2040 inner:
2041 x_2 = ...;
2044 outer2:
2045 x_3 = PHI <x_2(inner)>;
2047 if nothing in x_2 or elsewhere makes x_1 relevant. */
2049 static bool
2050 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2052 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2053 return false;
2055 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2058 /* Function vect_analyze_loop_operations.
2060 Scan the loop stmts and make sure they are all vectorizable. */
2062 static opt_result
2063 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2065 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2066 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2067 int nbbs = loop->num_nodes;
2068 int i;
2069 stmt_vec_info stmt_info;
2070 bool need_to_vectorize = false;
2071 bool ok;
2073 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2075 auto_vec<stmt_info_for_cost> cost_vec;
2077 for (i = 0; i < nbbs; i++)
2079 basic_block bb = bbs[i];
2081 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2082 gsi_next (&si))
2084 gphi *phi = si.phi ();
2085 ok = true;
2087 stmt_info = loop_vinfo->lookup_stmt (phi);
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2090 (gimple *) phi);
2091 if (virtual_operand_p (gimple_phi_result (phi)))
2092 continue;
2094 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2095 (i.e., a phi in the tail of the outer-loop). */
2096 if (! is_loop_header_bb_p (bb))
2098 /* FORNOW: we currently don't support the case that these phis
2099 are not used in the outerloop (unless it is a double reduction,
2100 i.e., this phi is vect_reduction_def), because this case
2101 requires us to actually do something here. */
2102 if (STMT_VINFO_LIVE_P (stmt_info)
2103 && !vect_active_double_reduction_p (stmt_info))
2104 return opt_result::failure_at (phi,
2105 "Unsupported loop-closed phi"
2106 " in outer-loop.\n");
2108 /* If PHI is used in the outer loop, we check that its operand
2109 is defined in the inner loop. */
2110 if (STMT_VINFO_RELEVANT_P (stmt_info))
2112 tree phi_op;
2114 if (gimple_phi_num_args (phi) != 1)
2115 return opt_result::failure_at (phi, "unsupported phi");
2117 phi_op = PHI_ARG_DEF (phi, 0);
2118 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2119 if (!op_def_info)
2120 return opt_result::failure_at (phi, "unsupported phi\n");
2122 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2123 && (STMT_VINFO_RELEVANT (op_def_info)
2124 != vect_used_in_outer_by_reduction))
2125 return opt_result::failure_at (phi, "unsupported phi\n");
2127 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2128 || (STMT_VINFO_DEF_TYPE (stmt_info)
2129 == vect_double_reduction_def))
2130 && !vectorizable_lc_phi (loop_vinfo,
2131 stmt_info, NULL, NULL))
2132 return opt_result::failure_at (phi, "unsupported phi\n");
2135 continue;
2138 gcc_assert (stmt_info);
2140 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2141 || STMT_VINFO_LIVE_P (stmt_info))
2142 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2143 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2144 /* A scalar-dependence cycle that we don't support. */
2145 return opt_result::failure_at (phi,
2146 "not vectorized:"
2147 " scalar dependence cycle.\n");
2149 if (STMT_VINFO_RELEVANT_P (stmt_info))
2151 need_to_vectorize = true;
2152 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2153 && ! PURE_SLP_STMT (stmt_info))
2154 ok = vectorizable_induction (loop_vinfo,
2155 stmt_info, NULL, NULL,
2156 &cost_vec);
2157 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2158 || (STMT_VINFO_DEF_TYPE (stmt_info)
2159 == vect_double_reduction_def)
2160 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2161 && ! PURE_SLP_STMT (stmt_info))
2162 ok = vectorizable_reduction (loop_vinfo,
2163 stmt_info, NULL, NULL, &cost_vec);
2164 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2165 == vect_first_order_recurrence)
2166 && ! PURE_SLP_STMT (stmt_info))
2167 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2168 &cost_vec);
2171 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2172 if (ok
2173 && STMT_VINFO_LIVE_P (stmt_info)
2174 && !PURE_SLP_STMT (stmt_info))
2175 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2176 -1, false, &cost_vec);
2178 if (!ok)
2179 return opt_result::failure_at (phi,
2180 "not vectorized: relevant phi not "
2181 "supported: %G",
2182 static_cast <gimple *> (phi));
2185 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2186 gsi_next (&si))
2188 gimple *stmt = gsi_stmt (si);
2189 if (!gimple_clobber_p (stmt)
2190 && !is_gimple_debug (stmt))
2192 opt_result res
2193 = vect_analyze_stmt (loop_vinfo,
2194 loop_vinfo->lookup_stmt (stmt),
2195 &need_to_vectorize,
2196 NULL, NULL, &cost_vec);
2197 if (!res)
2198 return res;
2201 } /* bbs */
2203 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2205 /* All operations in the loop are either irrelevant (they deal with loop
2206 control, or are dead), or are only used outside the loop and can be moved
2207 out of the loop (e.g. invariants, inductions). The loop can be
2208 optimized away by scalar optimizations. We're better off not
2209 touching this loop. */
2210 if (!need_to_vectorize)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "All the computation can be taken out of the loop.\n");
2215 return opt_result::failure_at
2216 (vect_location,
2217 "not vectorized: redundant loop. no profit to vectorize.\n");
2220 return opt_result::success ();
2223 /* Return true if we know that the iteration count is smaller than the
2224 vectorization factor. Return false if it isn't, or if we can't be sure
2225 either way. */
2227 static bool
2228 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2230 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2232 HOST_WIDE_INT max_niter;
2233 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2234 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2235 else
2236 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2238 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2239 return true;
2241 return false;
2244 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2245 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2246 definitely no, or -1 if it's worth retrying. */
2248 static int
2249 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2250 unsigned *suggested_unroll_factor)
2252 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2253 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2255 /* Only loops that can handle partially-populated vectors can have iteration
2256 counts less than the vectorization factor. */
2257 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2258 && vect_known_niters_smaller_than_vf (loop_vinfo))
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "not vectorized: iteration count smaller than "
2263 "vectorization factor.\n");
2264 return 0;
2267 /* If we know the number of iterations we can do better: for the
2268 epilogue we can also decide whether the main loop leaves us
2269 with enough iterations, preferring a smaller vector epilogue that is
2270 then also possibly used for the case in which we skip the vector loop. */
2271 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2273 widest_int scalar_niters
2274 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2275 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2277 loop_vec_info orig_loop_vinfo
2278 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2279 unsigned lowest_vf
2280 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2281 int prolog_peeling = 0;
2282 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2283 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2284 if (prolog_peeling >= 0
2285 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2286 lowest_vf))
2288 unsigned gap
2289 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2290 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2291 % lowest_vf + gap);
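/* I.e. the epilogue handles the scalar iterations that remain after
   prologue peeling and the main loop's full vector iterations, plus
   the single iteration reserved for gaps, if any.  */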
2294 /* Reject vectorizing for a single scalar iteration, even if
2295 we could in principle implement that using partial vectors. */
2296 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2297 if (scalar_niters <= peeling_gap + 1)
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "not vectorized: loop only has a single "
2302 "scalar iteration.\n");
2303 return 0;
2306 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2308 /* Check that the loop processes at least one full vector. */
2309 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2310 if (known_lt (scalar_niters, vf))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "loop does not have enough iterations "
2315 "to support vectorization.\n");
2316 return 0;
2319 /* If we need to peel an extra epilogue iteration to handle data
2320 accesses with gaps, check that there are enough scalar iterations
2321 available.
2323 The check above is redundant with this one when peeling for gaps,
2324 but the distinction is useful for diagnostics. */
2325 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326 && known_le (scalar_niters, vf))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "loop does not have enough iterations "
2331 "to support peeling for gaps.\n");
2332 return 0;
2337 /* If using the "very cheap" model, reject cases in which we'd keep
2338 a copy of the scalar code (even if we might be able to vectorize it). */
2339 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2340 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2341 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2342 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "some scalar iterations would need to be peeled\n");
2347 return 0;
2350 int min_profitable_iters, min_profitable_estimate;
2351 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2352 &min_profitable_estimate,
2353 suggested_unroll_factor);
2355 if (min_profitable_iters < 0)
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "not vectorized: vectorization not profitable.\n");
2360 if (dump_enabled_p ())
2361 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2362 "not vectorized: vector version will never be "
2363 "profitable.\n");
2364 return -1;
2367 int min_scalar_loop_bound = (param_min_vect_loop_bound
2368 * assumed_vf);
2370 /* Use the cost model only if it is more conservative than the
2371 user-specified threshold. */
2372 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2373 min_profitable_iters);
2375 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
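/* Illustrative example (not a statement about defaults): with
   --param min-vect-loop-bound=2 and an assumed VF of 4, loops known
   to run fewer than MAX (8, min_profitable_iters) times are rejected
   by the check below.  */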
2377 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2378 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2380 if (dump_enabled_p ())
2381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2382 "not vectorized: vectorization not profitable.\n");
2383 if (dump_enabled_p ())
2384 dump_printf_loc (MSG_NOTE, vect_location,
2385 "not vectorized: iteration count smaller than user "
2386 "specified loop bound parameter or minimum profitable "
2387 "iterations (whichever is more conservative).\n");
2388 return 0;
2391 /* The static profitability threshold min_profitable_estimate includes
2392 the cost of having to check at runtime whether the scalar loop
2393 should be used instead. If it turns out that we don't need or want
2394 such a check, the threshold we should use for the static estimate
2395 is simply the point at which the vector loop becomes more profitable
2396 than the scalar loop. */
2397 if (min_profitable_estimate > min_profitable_iters
2398 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2399 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2400 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2401 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2403 if (dump_enabled_p ())
2404 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2405 " choice between the scalar and vector loops\n");
2406 min_profitable_estimate = min_profitable_iters;
2409 /* If the vector loop needs multiple iterations to be beneficial then
2410 things are probably too close to call, and the conservative thing
2411 would be to stick with the scalar code. */
2412 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2413 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2415 if (dump_enabled_p ())
2416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2417 "one iteration of the vector loop would be"
2418 " more expensive than the equivalent number of"
2419 " iterations of the scalar loop\n");
2420 return 0;
2423 HOST_WIDE_INT estimated_niter;
2425 /* If we are vectorizing an epilogue then we know the maximum number of
2426 scalar iterations it will cover is at least one lower than the
2427 vectorization factor of the main loop. */
2428 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2429 estimated_niter
2430 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2431 else
2433 estimated_niter = estimated_stmt_executions_int (loop);
2434 if (estimated_niter == -1)
2435 estimated_niter = likely_max_stmt_executions_int (loop);
2437 if (estimated_niter != -1
2438 && ((unsigned HOST_WIDE_INT) estimated_niter
2439 < MAX (th, (unsigned) min_profitable_estimate)))
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2443 "not vectorized: estimated iteration count too "
2444 "small.\n");
2445 if (dump_enabled_p ())
2446 dump_printf_loc (MSG_NOTE, vect_location,
2447 "not vectorized: estimated iteration count smaller "
2448 "than specified loop bound parameter or minimum "
2449 "profitable iterations (whichever is more "
2450 "conservative).\n");
2451 return -1;
2454 return 1;
2457 static opt_result
2458 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2459 vec<data_reference_p> *datarefs,
2460 unsigned int *n_stmts)
2462 *n_stmts = 0;
2463 for (unsigned i = 0; i < loop->num_nodes; i++)
2464 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2465 !gsi_end_p (gsi); gsi_next (&gsi))
2467 gimple *stmt = gsi_stmt (gsi);
2468 if (is_gimple_debug (stmt))
2469 continue;
2470 ++(*n_stmts);
2471 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2472 NULL, 0);
2473 if (!res)
2475 if (is_gimple_call (stmt) && loop->safelen)
2477 tree fndecl = gimple_call_fndecl (stmt), op;
2478 if (fndecl == NULL_TREE
2479 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2481 fndecl = gimple_call_arg (stmt, 0);
2482 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2483 fndecl = TREE_OPERAND (fndecl, 0);
2484 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2486 if (fndecl != NULL_TREE)
2488 cgraph_node *node = cgraph_node::get (fndecl);
2489 if (node != NULL && node->simd_clones != NULL)
2491 unsigned int j, n = gimple_call_num_args (stmt);
2492 for (j = 0; j < n; j++)
2494 op = gimple_call_arg (stmt, j);
2495 if (DECL_P (op)
2496 || (REFERENCE_CLASS_P (op)
2497 && get_base_address (op)))
2498 break;
2500 op = gimple_call_lhs (stmt);
2501 /* Ignore #pragma omp declare simd functions
2502 if they don't have data references in the
2503 call stmt itself. */
2504 if (j == n
2505 && !(op
2506 && (DECL_P (op)
2507 || (REFERENCE_CLASS_P (op)
2508 && get_base_address (op)))))
2509 continue;
2513 return res;
2515 /* If dependence analysis will give up due to the limit on the
2516 number of datarefs, stop here and fail fatally. */
2517 if (datarefs->length ()
2518 > (unsigned)param_loop_max_datarefs_for_datadeps)
2519 return opt_result::failure_at (stmt, "exceeded param "
2520 "loop-max-datarefs-for-datadeps\n");
2522 return opt_result::success ();
2525 /* Look for SLP-only access groups and turn each individual access into its own
2526 group. */
2527 static void
2528 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2530 unsigned int i;
2531 struct data_reference *dr;
2533 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2535 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2536 FOR_EACH_VEC_ELT (datarefs, i, dr)
2538 gcc_assert (DR_REF (dr));
2539 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2541 /* Check if the access is part of an interleaving chain. */
2542 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2544 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2545 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2546 unsigned int group_size = DR_GROUP_SIZE (first_element);
2548 /* Check if this is an SLP-only group. */
2549 if (!STMT_SLP_TYPE (stmt_info)
2550 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2552 /* Dissolve the group. */
2553 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2555 stmt_vec_info vinfo = first_element;
2556 while (vinfo)
2558 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2559 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2560 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2561 DR_GROUP_SIZE (vinfo) = 1;
2562 if (STMT_VINFO_STRIDED_P (first_element)
2563 /* We cannot handle stores with gaps. */
2564 || DR_IS_WRITE (dr_info->dr))
2566 STMT_VINFO_STRIDED_P (vinfo) = true;
2567 DR_GROUP_GAP (vinfo) = 0;
2569 else
2570 DR_GROUP_GAP (vinfo) = group_size - 1;
2571 /* Duplicate and adjust the alignment info; it needs to
2572 be present on each group leader, see dr_misalignment. */
2573 if (vinfo != first_element)
2575 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2576 dr_info2->target_alignment = dr_info->target_alignment;
2577 int misalignment = dr_info->misalignment;
2578 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2580 HOST_WIDE_INT diff
2581 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2582 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2583 unsigned HOST_WIDE_INT align_c
2584 = dr_info->target_alignment.to_constant ();
2585 misalignment = (misalignment + diff) % align_c;
2587 dr_info2->misalignment = misalignment;
2589 vinfo = next;
2596 /* Determine if operating on full vectors for LOOP_VINFO might leave
2597 some scalar iterations still to do. If so, decide how we should
2598 handle those scalar iterations. The possibilities are:
2600 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2601 In this case:
2603 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2604 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2605 LOOP_VINFO_PEELING_FOR_NITER == false
2607 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2608 to handle the remaining scalar iterations. In this case:
2610 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2611 LOOP_VINFO_PEELING_FOR_NITER == true
2613 There are two choices:
2615 (2a) Consider vectorizing the epilogue loop at the same VF as the
2616 main loop, but using partial vectors instead of full vectors.
2617 In this case:
2619 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2621 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2622 In this case:
2624 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2627 opt_result
2628 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2630 /* Determine whether there would be any scalar iterations left over. */
2631 bool need_peeling_or_partial_vectors_p
2632 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2634 /* Decide whether to vectorize the loop with partial vectors. */
2635 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2636 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2637 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2638 && need_peeling_or_partial_vectors_p)
2640 /* For partial-vector-usage=1, try to push the handling of partial
2641 vectors to the epilogue, with the main loop continuing to operate
2642 on full vectors.
2644 If we are unrolling we also do not want to use partial vectors. This
2645 is to avoid the overhead of generating multiple masks and also to
2646 avoid having to execute entire iterations of FALSE masked instructions
2647 when dealing with one or fewer full iterations.
2649 ??? We could then end up failing to use partial vectors if we
2650 decide to peel iterations into a prologue, and if the main loop
2651 then ends up processing fewer than VF iterations. */
2652 if ((param_vect_partial_vector_usage == 1
2653 || loop_vinfo->suggested_unroll_factor > 1)
2654 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2655 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2656 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2657 else
2658 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2661 if (dump_enabled_p ())
2662 dump_printf_loc (MSG_NOTE, vect_location,
2663 "operating on %s vectors%s.\n",
2664 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2665 ? "partial" : "full",
2666 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2667 ? " for epilogue loop" : "");
2669 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2670 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2671 && need_peeling_or_partial_vectors_p);
2673 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2674 analysis, when we don't yet know whether the loop will be vectorized
2675 using partial vectors (for details see tree-vect-loop-manip.cc).
2677 However, the SELECT_VL vectorization style should only be applied to
2678 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2679 the number of elements to be processed in each iteration.
2681 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2682 if the loop is not vectorized with partial vectors. */
2683 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2684 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2686 return opt_result::success ();
2689 /* Function vect_analyze_loop_2.
2691 Apply a set of analyses on the LOOP specified by LOOP_VINFO; the different
2692 analyses will record information in some members of LOOP_VINFO. FATAL
2693 indicates whether some analysis hit a fatal error. If a non-NULL pointer
2694 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with a
2695 worked-out suggested unroll factor, while a NULL pointer indicates we are
2696 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2697 holds the SLP decision made when the suggested unroll factor was worked
2698 out. */
2699 static opt_result
2700 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2701 unsigned *suggested_unroll_factor,
2702 bool& slp_done_for_suggested_uf)
2704 opt_result ok = opt_result::success ();
2705 int res;
2706 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2707 poly_uint64 min_vf = 2;
2708 loop_vec_info orig_loop_vinfo = NULL;
2710 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2711 loop_vec_info of the first vectorized loop. */
2712 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2713 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2714 else
2715 orig_loop_vinfo = loop_vinfo;
2716 gcc_assert (orig_loop_vinfo);
2718 /* The first group of checks is independent of the vector size. */
2719 fatal = true;
2721 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2722 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2723 return opt_result::failure_at (vect_location,
2724 "not vectorized: simd if(0)\n");
2726 /* Find all data references in the loop (which correspond to vdefs/vuses)
2727 and analyze their evolution in the loop. */
2729 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2731 /* Gather the data references and count stmts in the loop. */
2732 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2734 opt_result res
2735 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2736 &LOOP_VINFO_DATAREFS (loop_vinfo),
2737 &LOOP_VINFO_N_STMTS (loop_vinfo));
2738 if (!res)
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2742 "not vectorized: loop contains function "
2743 "calls or data references that cannot "
2744 "be analyzed\n");
2745 return res;
2747 loop_vinfo->shared->save_datarefs ();
2749 else
2750 loop_vinfo->shared->check_datarefs ();
2752 /* Analyze the data references and also adjust the minimal
2753 vectorization factor according to the loads and stores. */
2755 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2756 if (!ok)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data references.\n");
2761 return ok;
2764 /* Check if we are applying unroll factor now. */
2765 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2766 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
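/* I.e. when applying a previously suggested unroll factor the caller
   must not also be asking for a new suggestion (SUGGESTED_UNROLL_FACTOR
   is then NULL).  */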
2768 /* If the SLP decision was false when the suggested unroll factor was
2769 worked out, and we are applying that suggested unroll factor, we can
2770 simply skip all SLP-related analyses this time. */
2771 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2773 /* Classify all cross-iteration scalar data-flow cycles.
2774 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2775 vect_analyze_scalar_cycles (loop_vinfo, slp);
2777 vect_pattern_recog (loop_vinfo);
2779 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2781 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2782 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2784 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2785 if (!ok)
2787 if (dump_enabled_p ())
2788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2789 "bad data access.\n");
2790 return ok;
2793 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2795 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2796 if (!ok)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "unexpected pattern.\n");
2801 return ok;
2804 /* Whereas the rest of the analysis below depends on the vector size in some way. */
2805 fatal = false;
2807 /* Analyze data dependences between the data-refs in the loop
2808 and adjust the maximum vectorization factor according to
2809 the dependences.
2810 FORNOW: fail at the first data dependence that we encounter. */
2812 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "bad data dependence.\n");
2818 return ok;
2820 if (max_vf != MAX_VECTORIZATION_FACTOR
2821 && maybe_lt (max_vf, min_vf))
2822 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2823 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2825 ok = vect_determine_vectorization_factor (loop_vinfo);
2826 if (!ok)
2828 if (dump_enabled_p ())
2829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2830 "can't determine vectorization factor.\n");
2831 return ok;
2834 /* Compute the scalar iteration cost. */
2835 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2837 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2839 if (slp)
2841 /* Check the SLP opportunities in the loop, analyze and build
2842 SLP trees. */
2843 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2844 if (!ok)
2845 return ok;
2847 /* If there are any SLP instances mark them as pure_slp. */
2848 slp = vect_make_slp_decision (loop_vinfo);
2849 if (slp)
2851 /* Find stmts that need to be both vectorized and SLPed. */
2852 vect_detect_hybrid_slp (loop_vinfo);
2854 /* Update the vectorization factor based on the SLP decision. */
2855 vect_update_vf_for_slp (loop_vinfo);
2857 /* Optimize the SLP graph with the vectorization factor fixed. */
2858 vect_optimize_slp (loop_vinfo);
2860 /* Gather the loads reachable from the SLP graph entries. */
2861 vect_gather_slp_loads (loop_vinfo);
2865 bool saved_can_use_partial_vectors_p
2866 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2868 /* We don't expect to have to roll back to anything other than an empty
2869 set of rgroups. */
2870 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2872 /* This is the point where we can re-start analysis with SLP forced off. */
2873 start_over:
2875 /* Apply the suggested unrolling factor; this was determined by the backend
2876 during finish_cost the first time we ran the analysis for this
2877 vector mode. */
2878 if (applying_suggested_uf)
2879 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2881 /* Now the vectorization factor is final. */
2882 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2883 gcc_assert (known_ne (vectorization_factor, 0U));
2885 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2887 dump_printf_loc (MSG_NOTE, vect_location,
2888 "vectorization_factor = ");
2889 dump_dec (MSG_NOTE, vectorization_factor);
2890 dump_printf (MSG_NOTE, ", niters = %wd\n",
2891 LOOP_VINFO_INT_NITERS (loop_vinfo));
2894 if (max_vf != MAX_VECTORIZATION_FACTOR
2895 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2896 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2898 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2900 /* Analyze the alignment of the data-refs in the loop.
2901 Fail if a data reference is found that cannot be vectorized. */
2903 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2904 if (!ok)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "bad data alignment.\n");
2909 return ok;
2912 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2913 It is important to call pruning after vect_analyze_data_ref_accesses,
2914 since we use grouping information gathered by interleaving analysis. */
2915 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2916 if (!ok)
2917 return ok;
2919 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2920 vectorization, since we do not want to add extra peeling or
2921 add versioning for alignment. */
2922 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2923 /* This pass will decide on using loop versioning and/or loop peeling in
2924 order to enhance the alignment of data references in the loop. */
2925 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2926 if (!ok)
2927 return ok;
2929 if (slp)
2931 /* Analyze operations in the SLP instances. Note this may
2932 remove unsupported SLP instances which makes the above
2933 SLP kind detection invalid. */
2934 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2935 vect_slp_analyze_operations (loop_vinfo);
2936 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2938 ok = opt_result::failure_at (vect_location,
2939 "unsupported SLP instances\n");
2940 goto again;
2943 /* Check whether any load in ALL SLP instances is possibly permuted. */
2944 slp_tree load_node, slp_root;
2945 unsigned i, x;
2946 slp_instance instance;
2947 bool can_use_lanes = true;
2948 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2950 slp_root = SLP_INSTANCE_TREE (instance);
2951 int group_size = SLP_TREE_LANES (slp_root);
2952 tree vectype = SLP_TREE_VECTYPE (slp_root);
2953 bool loads_permuted = false;
2954 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2956 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2957 continue;
2958 unsigned j;
2959 stmt_vec_info load_info;
2960 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2961 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2963 loads_permuted = true;
2964 break;
2968 /* If the loads and stores can be handled with load/store-lane
2969 instructions, record it and move on to the next instance. */
2970 if (loads_permuted
2971 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2972 && vect_store_lanes_supported (vectype, group_size, false)
2973 != IFN_LAST)
2975 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2976 if (STMT_VINFO_GROUPED_ACCESS
2977 (SLP_TREE_REPRESENTATIVE (load_node)))
2979 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2980 (SLP_TREE_REPRESENTATIVE (load_node));
2981 /* Use SLP for strided accesses (or if we can't use
2982 load-lanes). */
2983 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2984 || vect_load_lanes_supported
2985 (STMT_VINFO_VECTYPE (stmt_vinfo),
2986 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2987 break;
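/* The walk above only reaches the end (i equal to the number of loads)
   if it did not break out, i.e. if every grouped load in this instance
   can use load-lanes.  */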
2990 can_use_lanes
2991 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2993 if (can_use_lanes && dump_enabled_p ())
2994 dump_printf_loc (MSG_NOTE, vect_location,
2995 "SLP instance %p can use load/store-lanes\n",
2996 (void *) instance);
2998 else
3000 can_use_lanes = false;
3001 break;
3005 /* If all SLP instances can use load/store-lanes, abort SLP and try again
3006 with SLP disabled. */
3007 if (can_use_lanes)
3009 ok = opt_result::failure_at (vect_location,
3010 "Built SLP cancelled: can use "
3011 "load/store-lanes\n");
3012 if (dump_enabled_p ())
3013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3014 "Built SLP cancelled: all SLP instances support "
3015 "load/store-lanes\n");
3016 goto again;
3020 /* Dissolve SLP-only groups. */
3021 vect_dissolve_slp_only_groups (loop_vinfo);
3023 /* Scan all the remaining operations in the loop that are not subject
3024 to SLP and make sure they are vectorizable. */
3025 ok = vect_analyze_loop_operations (loop_vinfo);
3026 if (!ok)
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 "bad operation or unsupported loop bound.\n");
3031 return ok;
3034 /* For now, we don't expect to mix both masking and length approaches for
3035 one loop; disable the use of partial vectors if both are recorded. */
3036 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3037 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3038 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3042 "can't vectorize a loop with partial vectors"
3043 " because we don't expect to mix different"
3044 " approaches with partial vectors for the"
3045 " same loop.\n");
3046 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3049 /* If we still have the option of using partial vectors,
3050 check whether we can generate the necessary loop controls. */
3051 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3053 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3055 if (!vect_verify_full_masking (loop_vinfo)
3056 && !vect_verify_full_masking_avx512 (loop_vinfo))
3057 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3059 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3060 if (!vect_verify_loop_lens (loop_vinfo))
3061 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3064 /* If we're vectorizing a loop that uses length "controls" and
3065 can iterate more than once, we apply the decrementing IV approach
3066 to the loop control. */
3067 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3068 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3069 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3070 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3071 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3072 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3073 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3075 /* If a loop uses length controls and has a decrementing loop control IV,
3076 we will normally pass that IV through a MIN_EXPR to calculate the
3077 basis for the length controls. E.g. in a loop that processes one
3078 element per scalar iteration, the number of elements would be
3079 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3081 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3082 step, since only the final iteration of the vector loop can have
3083 inactive lanes.
3085 However, some targets have a dedicated instruction for calculating the
3086 preferred length, given the total number of elements that still need to
3087 be processed. This is encapsulated in the SELECT_VL internal function.
3089 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3090 to determine the basis for the length controls. However, unlike the
3091 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3092 lanes inactive in any iteration of the vector loop, not just the last
3093 iteration. This SELECT_VL approach therefore requires us to use pointer
3094 IVs with variable steps.
3096 Once we've decided how many elements should be processed by one
3097 iteration of the vector loop, we need to populate the rgroup controls.
3098 If a loop has multiple rgroups, we need to make sure that those rgroups
3099 "line up" (that is, they must be consistent about which elements are
3100 active and which aren't). This is done by vect_adjust_loop_lens_control.
3102 In principle, it would be possible to use vect_adjust_loop_lens_control
3103 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3104 However:
3106 (1) In practice, it only makes sense to use SELECT_VL when a vector
3107 operation will be controlled directly by the result. It is not
3108 worth using SELECT_VL if it would only be the input to other
3109 calculations.
3111 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3112 pointer IV will need N updates by a variable amount (N-1 updates
3113 within the iteration and 1 update to move to the next iteration).
3115 Because of this, we prefer to use the MIN_EXPR approach whenever there
3116 is more than one length control.
3118 In addition, SELECT_VL always operates to a granularity of 1 unit.
3119 If we wanted to use it to control an SLP operation on N consecutive
3120 elements, we would need to make the SELECT_VL inputs measure scalar
3121 iterations (rather than elements) and then multiply the SELECT_VL
3122 result by N. But using SELECT_VL this way is inefficient because
3123 of (1) above.
3125 Finally, we don't apply SELECT_VL on a single rgroup when both of the
3126 following are satisfied:
3128 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3129 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3131 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3132 we would lose the benefit of subsequent unrolling optimizations, so we
3133 prefer using the MIN_EXPR approach in this situation. */
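/* A rough sketch of the two styles (not the literal IR we generate):
   with MIN_EXPR the per-iteration count is len = MIN_EXPR <remaining, VF>,
   so only the final vector iteration can have len < VF; with
   len = SELECT_VL (remaining, VF) the target may return fewer than VF
   elements in any iteration, which is why pointer IVs must then advance
   by the variable len.  */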
3134 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3136 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3137 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3138 OPTIMIZE_FOR_SPEED)
3139 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3140 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3141 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3142 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3143 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3146 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3147 assuming that the loop will be used as a main loop. We will redo
3148 this analysis later if we instead decide to use the loop as an
3149 epilogue loop. */
3150 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3151 if (!ok)
3152 return ok;
3154 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3155 to be able to handle fewer than VF scalars, or needs to have a lower VF
3156 than the main loop. */
3157 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3158 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3160 poly_uint64 unscaled_vf
3161 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3162 orig_loop_vinfo->suggested_unroll_factor);
3163 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3164 return opt_result::failure_at (vect_location,
3165 "Vectorization factor too high for"
3166 " epilogue loop.\n");
3169 /* Check the costings of the loop make vectorizing worthwhile. */
3170 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3171 if (res < 0)
3173 ok = opt_result::failure_at (vect_location,
3174 "Loop costings may not be worthwhile.\n");
3175 goto again;
3177 if (!res)
3178 return opt_result::failure_at (vect_location,
3179 "Loop costings not worthwhile.\n");
3181 /* If an epilogue loop is required make sure we can create one. */
3182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3183 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3184 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3186 if (dump_enabled_p ())
3187 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3188 if (!vect_can_advance_ivs_p (loop_vinfo)
3189 || !slpeel_can_duplicate_loop_p (loop,
3190 LOOP_VINFO_IV_EXIT (loop_vinfo),
3191 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3193 ok = opt_result::failure_at (vect_location,
3194 "not vectorized: can't create required "
3195 "epilog loop\n");
3196 goto again;
3200 /* During peeling, we need to check whether the number of loop iterations is
3201 enough for both the peeled prolog loop and the vector loop. This check
3202 can be merged with the threshold check of loop versioning, so
3203 increase the threshold for this case if necessary.
3205 If we are analyzing an epilogue we still want to check what its
3206 versioning threshold would be. If we decide to vectorize the epilogues we
3207 will want to use the lowest versioning threshold of all epilogues and main
3208 loop. This will enable us to enter a vectorized epilogue even when
3209 versioning the loop. We can't simply check whether the epilogue requires
3210 versioning though since we may have skipped some versioning checks when
3211 analyzing the epilogue. For instance, checks for alias versioning will be
3212 skipped when dealing with epilogues as we assume we already checked them
3213 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3214 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3216 poly_uint64 niters_th = 0;
3217 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3219 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3221 /* Niters for peeled prolog loop. */
3222 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
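/* A negative value means the exact peel count is unknown at compile
   time, so assume the worst case of nunits - 1 prologue iterations.  */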
3224 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3225 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3226 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3228 else
3229 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3232 /* Niters for at least one iteration of vectorized loop. */
3233 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3234 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3235 /* One additional iteration because of peeling for gap. */
3236 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3237 niters_th += 1;
3239 /* Use the same condition as vect_transform_loop to decide when to use
3240 the cost to determine a versioning threshold. */
3241 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3242 && ordered_p (th, niters_th))
3243 niters_th = ordered_max (poly_uint64 (th), niters_th);
3245 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3248 gcc_assert (known_eq (vectorization_factor,
3249 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3251 slp_done_for_suggested_uf = slp;
3253 /* Ok to vectorize! */
3254 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3255 return opt_result::success ();
3257 again:
3258 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3259 gcc_assert (!ok);
3261 /* Try again with SLP forced off, but if we didn't do any SLP there is
3262 no point in re-trying. */
3263 if (!slp)
3264 return ok;
3266 /* If the SLP decision was true when the suggested unroll factor was
3267 worked out, and we are applying that suggested unroll factor, we don't
3268 need to re-try any more. */
3269 if (applying_suggested_uf && slp_done_for_suggested_uf)
3270 return ok;
3272 /* If there are reduction chains re-trying will fail anyway. */
3273 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3274 return ok;
3276 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3277 via interleaving or lane instructions. */
3278 slp_instance instance;
3279 slp_tree node;
3280 unsigned i, j;
3281 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3283 stmt_vec_info vinfo;
3284 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3285 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3286 continue;
3287 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3288 unsigned int size = DR_GROUP_SIZE (vinfo);
3289 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3290 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3291 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3292 && ! vect_grouped_store_supported (vectype, size))
3293 return opt_result::failure_at (vinfo->stmt,
3294 "unsupported grouped store\n");
3295 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3297 vinfo = SLP_TREE_REPRESENTATIVE (node);
3298 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3300 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3301 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3302 size = DR_GROUP_SIZE (vinfo);
3303 vectype = STMT_VINFO_VECTYPE (vinfo);
3304 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3305 && ! vect_grouped_load_supported (vectype, single_element_p,
3306 size))
3307 return opt_result::failure_at (vinfo->stmt,
3308 "unsupported grouped load\n");
3313 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_NOTE, vect_location,
3315 "re-trying with SLP disabled\n");
3317 /* Roll back state appropriately. No SLP this time. */
3318 slp = false;
3320 /* Restore the vectorization factor as it was without SLP. */
3320 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3321 /* Free the SLP instances. */
3322 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3323 vect_free_slp_instance (instance);
3324 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3325 /* Reset SLP type to loop_vect on all stmts. */
3326 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3328 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3329 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3330 !gsi_end_p (si); gsi_next (&si))
3332 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3333 STMT_SLP_TYPE (stmt_info) = loop_vect;
3334 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3335 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3337 /* vectorizable_reduction adjusts reduction stmt def-types;
3338 restore them to that of the PHI. */
3339 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3340 = STMT_VINFO_DEF_TYPE (stmt_info);
3341 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3342 (STMT_VINFO_REDUC_DEF (stmt_info)))
3343 = STMT_VINFO_DEF_TYPE (stmt_info);
3346 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3347 !gsi_end_p (si); gsi_next (&si))
3349 if (is_gimple_debug (gsi_stmt (si)))
3350 continue;
3351 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3352 STMT_SLP_TYPE (stmt_info) = loop_vect;
3353 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3355 stmt_vec_info pattern_stmt_info
3356 = STMT_VINFO_RELATED_STMT (stmt_info);
3357 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3358 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3360 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3361 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3362 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3363 !gsi_end_p (pi); gsi_next (&pi))
3364 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3365 = loop_vect;
3369 /* Free optimized alias test DDRS. */
3370 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3371 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3372 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3373 /* Reset target cost data. */
3374 delete loop_vinfo->vector_costs;
3375 loop_vinfo->vector_costs = nullptr;
3376 /* Reset accumulated rgroup information. */
3377 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3378 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3379 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3380 /* Reset assorted flags. */
3381 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3382 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3383 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3384 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3385 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3386 = saved_can_use_partial_vectors_p;
3388 goto start_over;
3391 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3392 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3393 OLD_LOOP_VINFO is better unless something specifically indicates
3394 otherwise.
3396 Note that this deliberately isn't a partial order. */
3398 static bool
3399 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3400 loop_vec_info old_loop_vinfo)
3402 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3403 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3405 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3406 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3408 /* Always prefer a VF of loop->simdlen over any other VF. */
3409 if (loop->simdlen)
3411 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3412 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3413 if (new_simdlen_p != old_simdlen_p)
3414 return new_simdlen_p;
3417 const auto *old_costs = old_loop_vinfo->vector_costs;
3418 const auto *new_costs = new_loop_vinfo->vector_costs;
3419 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3420 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3422 return new_costs->better_main_loop_than_p (old_costs);
3425 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3426 true if we should. */
3428 static bool
3429 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3430 loop_vec_info old_loop_vinfo)
3432 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3433 return false;
3435 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_NOTE, vect_location,
3437 "***** Preferring vector mode %s to vector mode %s\n",
3438 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3439 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3440 return true;
3443 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3444 not NULL.  Set AUTODETECTED_VECTOR_MODE if the analyzed mode was VOIDmode
3445 and advance MODE_I to the next mode useful to analyze.
3446 Return the loop_vinfo on success and wrapped null on failure. */
3448 static opt_loop_vec_info
3449 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3450 const vect_loop_form_info *loop_form_info,
3451 loop_vec_info main_loop_vinfo,
3452 const vector_modes &vector_modes, unsigned &mode_i,
3453 machine_mode &autodetected_vector_mode,
3454 bool &fatal)
3456 loop_vec_info loop_vinfo
3457 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3459 machine_mode vector_mode = vector_modes[mode_i];
3460 loop_vinfo->vector_mode = vector_mode;
3461 unsigned int suggested_unroll_factor = 1;
3462 bool slp_done_for_suggested_uf = false;
3464 /* Run the main analysis. */
3465 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3466 &suggested_unroll_factor,
3467 slp_done_for_suggested_uf);
3468 if (dump_enabled_p ())
3469 dump_printf_loc (MSG_NOTE, vect_location,
3470 "***** Analysis %s with vector mode %s\n",
3471 res ? "succeeded" : "failed",
3472 GET_MODE_NAME (loop_vinfo->vector_mode));
3474 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_NOTE, vect_location,
3478 "***** Re-trying analysis for unrolling"
3479 " with unroll factor %d and slp %s.\n",
3480 suggested_unroll_factor,
3481 slp_done_for_suggested_uf ? "on" : "off");
3482 loop_vec_info unroll_vinfo
3483 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3484 unroll_vinfo->vector_mode = vector_mode;
3485 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3486 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3487 slp_done_for_suggested_uf);
3488 if (new_res)
3490 delete loop_vinfo;
3491 loop_vinfo = unroll_vinfo;
3493 else
3494 delete unroll_vinfo;
3497 /* Remember the autodetected vector mode. */
3498 if (vector_mode == VOIDmode)
3499 autodetected_vector_mode = loop_vinfo->vector_mode;
3501 /* Advance mode_i, first skipping modes that would result in the
3502 same analysis result. */
3503 while (mode_i + 1 < vector_modes.length ()
3504 && vect_chooses_same_modes_p (loop_vinfo,
3505 vector_modes[mode_i + 1]))
3507 if (dump_enabled_p ())
3508 dump_printf_loc (MSG_NOTE, vect_location,
3509 "***** The result for vector mode %s would"
3510 " be the same\n",
3511 GET_MODE_NAME (vector_modes[mode_i + 1]));
3512 mode_i += 1;
3514 if (mode_i + 1 < vector_modes.length ()
3515 && VECTOR_MODE_P (autodetected_vector_mode)
3516 && (related_vector_mode (vector_modes[mode_i + 1],
3517 GET_MODE_INNER (autodetected_vector_mode))
3518 == autodetected_vector_mode)
3519 && (related_vector_mode (autodetected_vector_mode,
3520 GET_MODE_INNER (vector_modes[mode_i + 1]))
3521 == vector_modes[mode_i + 1]))
3523 if (dump_enabled_p ())
3524 dump_printf_loc (MSG_NOTE, vect_location,
3525 "***** Skipping vector mode %s, which would"
3526 " repeat the analysis for %s\n",
3527 GET_MODE_NAME (vector_modes[mode_i + 1]),
3528 GET_MODE_NAME (autodetected_vector_mode));
3529 mode_i += 1;
3531 mode_i++;
3533 if (!res)
3535 delete loop_vinfo;
3536 if (fatal)
3537 gcc_checking_assert (main_loop_vinfo == NULL);
3538 return opt_loop_vec_info::propagate_failure (res);
3541 return opt_loop_vec_info::success (loop_vinfo);
3544 /* Function vect_analyze_loop.
3546 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3547 for it. The different analyses will record information in the
3548 loop_vec_info struct. */
3549 opt_loop_vec_info
3550 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3552 DUMP_VECT_SCOPE ("analyze_loop_nest");
3554 if (loop_outer (loop)
3555 && loop_vec_info_for_loop (loop_outer (loop))
3556 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3557 return opt_loop_vec_info::failure_at (vect_location,
3558 "outer-loop already vectorized.\n");
3560 if (!find_loop_nest (loop, &shared->loop_nest))
3561 return opt_loop_vec_info::failure_at
3562 (vect_location,
3563 "not vectorized: loop nest containing two or more consecutive inner"
3564 " loops cannot be vectorized\n");
3566 /* Analyze the loop form. */
3567 vect_loop_form_info loop_form_info;
3568 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3569 if (!res)
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3573 "bad loop form.\n");
3574 return opt_loop_vec_info::propagate_failure (res);
3576 if (!integer_onep (loop_form_info.assumptions))
3578 /* We consider vectorizing this loop by versioning it under
3579 some assumptions. In order to do this, we need to clear
3580 existing information computed by scev and niter analyzer. */
3581 scev_reset_htab ();
3582 free_numbers_of_iterations_estimates (loop);
3583 /* Also set a flag for this loop so that the following scev and niter
3584 analyses are done under the assumptions. */
3585 loop_constraint_set (loop, LOOP_C_FINITE);
3587 else
3588 /* Clear the existing niter information to make sure the nonwrapping flag
3589 will be calculated and set appropriately. */
3590 free_numbers_of_iterations_estimates (loop);
3592 auto_vector_modes vector_modes;
3593 /* Autodetect first vector size we try. */
3594 vector_modes.safe_push (VOIDmode);
3595 unsigned int autovec_flags
3596 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3597 loop->simdlen != 0);
3598 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3599 && !unlimited_cost_model (loop));
3600 machine_mode autodetected_vector_mode = VOIDmode;
3601 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3602 unsigned int mode_i = 0;
3603 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3605 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3606 a mode has not been analyzed. */
3607 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3608 for (unsigned i = 0; i < vector_modes.length (); ++i)
3609 cached_vf_per_mode.safe_push (0);
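/* Encoding used for CACHED_VF_PER_MODE below: 0 means the mode has not
   been analyzed yet, -1 means its analysis failed, and any other value
   is the VF that analysis produced, divided by the suggested unroll
   factor.  */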
3611 /* First determine the main loop vectorization mode, either the first
3612 one that works, starting with auto-detecting the vector mode and then
3613 following the target's order of preference, or the one with the
3614 lowest cost if pick_lowest_cost_p. */
3615 while (1)
3617 bool fatal;
3618 unsigned int last_mode_i = mode_i;
3619 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3620 failed. */
3621 cached_vf_per_mode[last_mode_i] = -1;
3622 opt_loop_vec_info loop_vinfo
3623 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3624 NULL, vector_modes, mode_i,
3625 autodetected_vector_mode, fatal);
3626 if (fatal)
3627 break;
3629 if (loop_vinfo)
3631 /* Analysis has been successful so update the VF value.  The
3632 VF should always be a multiple of unroll_factor and we want to
3633 capture the original VF here. */
3634 cached_vf_per_mode[last_mode_i]
3635 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3636 loop_vinfo->suggested_unroll_factor);
3637 /* Once we hit the desired simdlen for the first time,
3638 discard any previous attempts. */
3639 if (simdlen
3640 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3642 delete first_loop_vinfo;
3643 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3644 simdlen = 0;
3646 else if (pick_lowest_cost_p
3647 && first_loop_vinfo
3648 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3650 /* Pick loop_vinfo over first_loop_vinfo. */
3651 delete first_loop_vinfo;
3652 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3654 if (first_loop_vinfo == NULL)
3655 first_loop_vinfo = loop_vinfo;
3656 else
3658 delete loop_vinfo;
3659 loop_vinfo = opt_loop_vec_info::success (NULL);
3662 /* Commit to first_loop_vinfo if we have no reason to try
3663 alternatives. */
3664 if (!simdlen && !pick_lowest_cost_p)
3665 break;
3667 if (mode_i == vector_modes.length ()
3668 || autodetected_vector_mode == VOIDmode)
3669 break;
3671 /* Try the next biggest vector size. */
3672 if (dump_enabled_p ())
3673 dump_printf_loc (MSG_NOTE, vect_location,
3674 "***** Re-trying analysis with vector mode %s\n",
3675 GET_MODE_NAME (vector_modes[mode_i]));
3677 if (!first_loop_vinfo)
3678 return opt_loop_vec_info::propagate_failure (res);
3680 if (dump_enabled_p ())
3681 dump_printf_loc (MSG_NOTE, vect_location,
3682 "***** Choosing vector mode %s\n",
3683 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3685 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3686 enabled, SIMDUID is not set, it is the innermost loop and we have
3687 either already found the loop's SIMDLEN or there was no SIMDLEN to
3688 begin with.
3689 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3690 bool vect_epilogues = (!simdlen
3691 && loop->inner == NULL
3692 && param_vect_epilogues_nomask
3693 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3694 /* No code motion support for multiple epilogues, so for now this is
3695 not supported when the loop has multiple exits. */
3696 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3697 && !loop->simduid);
3698 if (!vect_epilogues)
3699 return first_loop_vinfo;
3701 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3702 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3704 /* For epilogues start the analysis from the first mode. The motivation
3705 behind starting from the beginning comes from cases where the VECTOR_MODES
3706 array may contain length-agnostic and length-specific modes. Their
3707 ordering is not guaranteed, so we could end up picking a mode for the main
3708 loop that is after the epilogue's optimal mode. */
3709 vector_modes[0] = autodetected_vector_mode;
3710 mode_i = 0;
3712 bool supports_partial_vectors =
3713 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3714 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3716 while (1)
3718 /* If the target does not support partial vectors we can shorten the
3719 number of modes to analyze for the epilogue as we know we can't pick a
3720 mode that would lead to a VF at least as big as the
3721 FIRST_VINFO_VF. */
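/* For illustration (made-up numbers): if the main loop was vectorized
   with VF 16 and this mode's cached VF is also 16, a fixed-length
   epilogue using this mode could never run a full vector iteration,
   so the mode is skipped.  */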
3722 if (!supports_partial_vectors
3723 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3725 mode_i++;
3726 if (mode_i == vector_modes.length ())
3727 break;
3728 continue;
3731 if (dump_enabled_p ())
3732 dump_printf_loc (MSG_NOTE, vect_location,
3733 "***** Re-trying epilogue analysis with vector "
3734 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3736 bool fatal;
3737 opt_loop_vec_info loop_vinfo
3738 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3739 first_loop_vinfo,
3740 vector_modes, mode_i,
3741 autodetected_vector_mode, fatal);
3742 if (fatal)
3743 break;
3745 if (loop_vinfo)
3747 if (pick_lowest_cost_p)
3749 /* Keep trying to roll back vectorization attempts while the
3750 loop_vec_infos they produced were worse than this one. */
3751 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3752 while (!vinfos.is_empty ()
3753 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3755 gcc_assert (vect_epilogues);
3756 delete vinfos.pop ();
3759 /* For now only allow one epilogue loop. */
3760 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3762 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3763 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3764 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3765 || maybe_ne (lowest_th, 0U));
3766 /* Keep track of the known smallest versioning
3767 threshold. */
3768 if (ordered_p (lowest_th, th))
3769 lowest_th = ordered_min (lowest_th, th);
3771 else
3773 delete loop_vinfo;
3774 loop_vinfo = opt_loop_vec_info::success (NULL);
3777 /* For now only allow one epilogue loop, but allow
3778 pick_lowest_cost_p to replace it, so commit to the
3779 first epilogue if we have no reason to try alternatives. */
3780 if (!pick_lowest_cost_p)
3781 break;
3784 if (mode_i == vector_modes.length ())
3785 break;
3789 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3791 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3792 if (dump_enabled_p ())
3793 dump_printf_loc (MSG_NOTE, vect_location,
3794 "***** Choosing epilogue vector mode %s\n",
3795 GET_MODE_NAME
3796 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3799 return first_loop_vinfo;
3802 /* Return true if there is an in-order reduction function for CODE, storing
3803 it in *REDUC_FN if so. */
3805 static bool
3806 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3808 /* We support MINUS_EXPR by negating the operand. This also preserves an
3809 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3810 (-0.0) = -0.0. */
3811 if (code == PLUS_EXPR || code == MINUS_EXPR)
3813 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3814 return true;
3816 return false;
3819 /* Function reduction_fn_for_scalar_code
3821 Input:
3822 CODE - the code (tree code or internal function) of a reduction operation.
3824 Output:
3825 REDUC_FN - the corresponding internal function to be used to reduce the
3826 vector of partial results into a single scalar result, or IFN_LAST
3827 if the operation is a supported reduction operation, but does not have
3828 such an internal function.
3830 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3832 bool
3833 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3835 if (code.is_tree_code ())
3836 switch (tree_code (code))
3838 case MAX_EXPR:
3839 *reduc_fn = IFN_REDUC_MAX;
3840 return true;
3842 case MIN_EXPR:
3843 *reduc_fn = IFN_REDUC_MIN;
3844 return true;
3846 case PLUS_EXPR:
3847 *reduc_fn = IFN_REDUC_PLUS;
3848 return true;
3850 case BIT_AND_EXPR:
3851 *reduc_fn = IFN_REDUC_AND;
3852 return true;
3854 case BIT_IOR_EXPR:
3855 *reduc_fn = IFN_REDUC_IOR;
3856 return true;
3858 case BIT_XOR_EXPR:
3859 *reduc_fn = IFN_REDUC_XOR;
3860 return true;
3862 case MULT_EXPR:
3863 case MINUS_EXPR:
3864 *reduc_fn = IFN_LAST;
3865 return true;
3867 default:
3868 return false;
3870 else
3871 switch (combined_fn (code))
3873 CASE_CFN_FMAX:
3874 *reduc_fn = IFN_REDUC_FMAX;
3875 return true;
3877 CASE_CFN_FMIN:
3878 *reduc_fn = IFN_REDUC_FMIN;
3879 return true;
3881 default:
3882 return false;
3886 /* If there is a neutral value X such that a reduction would not be affected
3887 by the introduction of additional X elements, return that X, otherwise
3888 return null.  CODE is the code of the reduction and SCALAR_TYPE is the type
3889 of the scalar elements. If the reduction has just a single initial value
3890 then INITIAL_VALUE is that value, otherwise it is null.
3891 If AS_INITIAL is TRUE the value is supposed to be used as the initial value.
3892 In that case no signed zero is returned. */
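/* For example, a PLUS_EXPR reduction has neutral value 0 (or -0.0 for
   a float reduction that honors signed zeros and is not used as the
   initial value), MULT_EXPR has 1, BIT_AND_EXPR has all ones, while
   MIN_EXPR/MAX_EXPR and FMIN/FMAX have no neutral constant and simply
   reuse INITIAL_VALUE.  */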
3894 tree
3895 neutral_op_for_reduction (tree scalar_type, code_helper code,
3896 tree initial_value, bool as_initial)
3898 if (code.is_tree_code ())
3899 switch (tree_code (code))
3901 case DOT_PROD_EXPR:
3902 case SAD_EXPR:
3903 case MINUS_EXPR:
3904 case BIT_IOR_EXPR:
3905 case BIT_XOR_EXPR:
3906 return build_zero_cst (scalar_type);
3907 case WIDEN_SUM_EXPR:
3908 case PLUS_EXPR:
3909 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3910 return build_real (scalar_type, dconstm0);
3911 else
3912 return build_zero_cst (scalar_type);
3914 case MULT_EXPR:
3915 return build_one_cst (scalar_type);
3917 case BIT_AND_EXPR:
3918 return build_all_ones_cst (scalar_type);
3920 case MAX_EXPR:
3921 case MIN_EXPR:
3922 return initial_value;
3924 default:
3925 return NULL_TREE;
3927 else
3928 switch (combined_fn (code))
3930 CASE_CFN_FMIN:
3931 CASE_CFN_FMAX:
3932 return initial_value;
3934 default:
3935 return NULL_TREE;
3939 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3940 STMT is printed with a message MSG. */
3942 static void
3943 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3945 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3948 /* Return true if we need an in-order reduction for operation CODE
3949 on type TYPE. */
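/* For example, a float or double summation compiled without
   -fassociative-math must be computed in order (fold-left), whereas an
   integer BIT_IOR_EXPR reduction never needs this because the operation
   cannot trap or overflow.  */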
3952 bool
3953 needs_fold_left_reduction_p (tree type, code_helper code)
3955 /* CHECKME: check for !flag_finite_math_only too? */
3956 if (SCALAR_FLOAT_TYPE_P (type))
3958 if (code.is_tree_code ())
3959 switch (tree_code (code))
3961 case MIN_EXPR:
3962 case MAX_EXPR:
3963 return false;
3965 default:
3966 return !flag_associative_math;
3968 else
3969 switch (combined_fn (code))
3971 CASE_CFN_FMIN:
3972 CASE_CFN_FMAX:
3973 return false;
3975 default:
3976 return !flag_associative_math;
3980 if (INTEGRAL_TYPE_P (type))
3981 return (!code.is_tree_code ()
3982 || !operation_no_trapping_overflow (type, tree_code (code)));
3984 if (SAT_FIXED_POINT_TYPE_P (type))
3985 return true;
3987 return false;
3990 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3991 has a handled computation expression. Store the main reduction
3992 operation in *CODE. */
3994 static bool
3995 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3996 tree loop_arg, code_helper *code,
3997 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3999 auto_bitmap visited;
4000 tree lookfor = PHI_RESULT (phi);
4001 ssa_op_iter curri;
4002 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4003 while (USE_FROM_PTR (curr) != loop_arg)
4004 curr = op_iter_next_use (&curri);
4005 curri.i = curri.numops;
4008 path.safe_push (std::make_pair (curri, curr));
4009 tree use = USE_FROM_PTR (curr);
4010 if (use == lookfor)
4011 break;
4012 gimple *def = SSA_NAME_DEF_STMT (use);
4013 if (gimple_nop_p (def)
4014 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4016 pop:
4019 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4020 curri = x.first;
4021 curr = x.second;
4023 curr = op_iter_next_use (&curri);
4024 /* Skip already visited or non-SSA operands (from iterating
4025 over PHI args). */
4026 while (curr != NULL_USE_OPERAND_P
4027 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4028 || ! bitmap_set_bit (visited,
4029 SSA_NAME_VERSION
4030 (USE_FROM_PTR (curr)))));
4032 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4033 if (curr == NULL_USE_OPERAND_P)
4034 break;
4036 else
4038 if (gimple_code (def) == GIMPLE_PHI)
4039 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4040 else
4041 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4042 while (curr != NULL_USE_OPERAND_P
4043 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4044 || ! bitmap_set_bit (visited,
4045 SSA_NAME_VERSION
4046 (USE_FROM_PTR (curr)))))
4047 curr = op_iter_next_use (&curri);
4048 if (curr == NULL_USE_OPERAND_P)
4049 goto pop;
4052 while (1);
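/* For illustration: for a simple accumulation
     sum_1 = PHI <sum_0(preheader), sum_2(latch)>
     sum_2 = sum_1 + _3;
   the path consists of the latch use of sum_2 in the PHI followed by the
   use of sum_1 in the addition, and the code detected below is
   PLUS_EXPR.  */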
4053 if (dump_file && (dump_flags & TDF_DETAILS))
4055 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4056 unsigned i;
4057 std::pair<ssa_op_iter, use_operand_p> *x;
4058 FOR_EACH_VEC_ELT (path, i, x)
4059 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4060 dump_printf (MSG_NOTE, "\n");
4063 /* Check whether the reduction path detected is valid. */
4064 bool fail = path.length () == 0;
4065 bool neg = false;
4066 int sign = -1;
4067 *code = ERROR_MARK;
4068 for (unsigned i = 1; i < path.length (); ++i)
4070 gimple *use_stmt = USE_STMT (path[i].second);
4071 gimple_match_op op;
4072 if (!gimple_extract_op (use_stmt, &op))
4074 fail = true;
4075 break;
4077 unsigned int opi = op.num_ops;
4078 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4080 /* The following makes sure we can compute the operand index
4081 easily; it also mostly disallows chaining via COND_EXPR condition
4082 operands. */
4083 for (opi = 0; opi < op.num_ops; ++opi)
4084 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4085 break;
4087 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4089 for (opi = 0; opi < op.num_ops; ++opi)
4090 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4091 break;
4093 if (opi == op.num_ops)
4095 fail = true;
4096 break;
4098 op.code = canonicalize_code (op.code, op.type);
4099 if (op.code == MINUS_EXPR)
4101 op.code = PLUS_EXPR;
4102 /* Track whether we negate the reduction value each iteration. */
4103 if (op.ops[1] == op.ops[opi])
4104 neg = ! neg;
4106 if (CONVERT_EXPR_CODE_P (op.code)
4107 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4109 else if (*code == ERROR_MARK)
4111 *code = op.code;
4112 sign = TYPE_SIGN (op.type);
4114 else if (op.code != *code)
4116 fail = true;
4117 break;
4119 else if ((op.code == MIN_EXPR
4120 || op.code == MAX_EXPR)
4121 && sign != TYPE_SIGN (op.type))
4123 fail = true;
4124 break;
4126 /* Check there's only a single stmt the op is used on. For the
4127 non-value-changing tail and the last stmt, allow out-of-loop uses.
4128 ??? We could relax this and handle arbitrary live stmts by
4129 forcing a scalar epilogue for example. */
4130 imm_use_iterator imm_iter;
4131 use_operand_p use_p;
4132 gimple *op_use_stmt;
4133 unsigned cnt = 0;
4134 bool cond_fn_p = op.code.is_internal_fn ()
4135 && (conditional_internal_fn_code (internal_fn (op.code))
4136 != ERROR_MARK);
4138 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4140 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4141 op1 twice (once as definition, once as else) in the same operation.
4142 Allow this. */
4143 if (cond_fn_p && op_use_stmt == use_stmt)
4145 gcall *call = as_a<gcall *> (use_stmt);
4146 unsigned else_pos
4147 = internal_fn_else_index (internal_fn (op.code));
4149 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4151 if (j == else_pos)
4152 continue;
4153 if (gimple_call_arg (call, j) == op.ops[opi])
4154 cnt++;
4157 else if (!is_gimple_debug (op_use_stmt)
4158 && (*code != ERROR_MARK
4159 || flow_bb_inside_loop_p (loop,
4160 gimple_bb (op_use_stmt))))
4161 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4162 cnt++;
4165 if (cnt != 1)
4167 fail = true;
4168 break;
4171 return ! fail && ! neg && *code != ERROR_MARK;
4174 bool
4175 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4176 tree loop_arg, enum tree_code code)
4178 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4179 code_helper code_;
4180 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4181 && code_ == code);
4186 /* Function vect_is_simple_reduction
4188 (1) Detect a cross-iteration def-use cycle that represents a simple
4189 reduction computation. We look for the following pattern:
4191 loop_header:
4192 a1 = phi < a0, a2 >
4193 a3 = ...
4194 a2 = operation (a3, a1)
4198 a3 = ...
4199 loop_header:
4200 a1 = phi < a0, a2 >
4201 a2 = operation (a3, a1)
4203 such that:
4204 1. operation is commutative and associative and it is safe to
4205 change the order of the computation
4206 2. no uses for a2 in the loop (a2 is used out of the loop)
4207 3. no uses of a1 in the loop besides the reduction operation
4208 4. no uses of a1 outside the loop.
4210 Conditions 1,4 are tested here.
4211 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4213 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4214 nested cycles.
4216 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4217 reductions:
4219 a1 = phi < a0, a2 >
4220 inner loop (def of a3)
4221 a2 = phi < a3 >
4223 (4) Detect condition expressions, i.e.:
4224 for (int i = 0; i < N; i++)
4225 if (a[i] < val)
4226 ret_val = a[i];
4230 static stmt_vec_info
4231 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4232 bool *double_reduc, bool *reduc_chain_p, bool slp)
4234 gphi *phi = as_a <gphi *> (phi_info->stmt);
4235 gimple *phi_use_stmt = NULL;
4236 imm_use_iterator imm_iter;
4237 use_operand_p use_p;
4239 *double_reduc = false;
4240 *reduc_chain_p = false;
4241 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4243 tree phi_name = PHI_RESULT (phi);
4244 /* ??? If there are no uses of the PHI result the inner loop reduction
4245 won't be detected as possibly double-reduction by vectorizable_reduction
4246 because that tries to walk the PHI arg from the preheader edge which
4247 can be constant. See PR60382. */
4248 if (has_zero_uses (phi_name))
4249 return NULL;
4250 class loop *loop = (gimple_bb (phi))->loop_father;
4251 unsigned nphi_def_loop_uses = 0;
4252 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4254 gimple *use_stmt = USE_STMT (use_p);
4255 if (is_gimple_debug (use_stmt))
4256 continue;
4258 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4260 if (dump_enabled_p ())
4261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4262 "intermediate value used outside loop.\n");
4264 return NULL;
4267 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4268 op1 twice (once as definition, once as else) in the same operation.
4269 Only count it as one. */
4270 if (use_stmt != phi_use_stmt)
4272 nphi_def_loop_uses++;
4273 phi_use_stmt = use_stmt;
4277 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4278 if (TREE_CODE (latch_def) != SSA_NAME)
4280 if (dump_enabled_p ())
4281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4282 "reduction: not ssa_name: %T\n", latch_def);
4283 return NULL;
4286 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4287 if (!def_stmt_info
4288 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4289 return NULL;
4291 bool nested_in_vect_loop
4292 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4293 unsigned nlatch_def_loop_uses = 0;
4294 auto_vec<gphi *, 3> lcphis;
4295 bool inner_loop_of_double_reduc = false;
4296 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4298 gimple *use_stmt = USE_STMT (use_p);
4299 if (is_gimple_debug (use_stmt))
4300 continue;
4301 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4302 nlatch_def_loop_uses++;
4303 else
4305 /* We can have more than one loop-closed PHI. */
4306 lcphis.safe_push (as_a <gphi *> (use_stmt));
4307 if (nested_in_vect_loop
4308 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4309 == vect_double_reduction_def))
4310 inner_loop_of_double_reduc = true;
4314 /* If we are vectorizing an inner reduction, we execute it in the
4315 original order only when we are not dealing with a double
4316 reduction. */
4317 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4319 if (dump_enabled_p ())
4320 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4321 "detected nested cycle: ");
4322 return def_stmt_info;
4325 /* When the inner loop of a double reduction ends up with more than
4326 one loop-closed PHI we have failed to classify alternate such
4327 PHIs as double reduction, leading to wrong code. See PR103237. */
4328 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4330 if (dump_enabled_p ())
4331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4332 "unhandle double reduction\n");
4333 return NULL;
4336 /* If this isn't a nested cycle or if the nested cycle reduction value
4337 is used outside of the inner loop we cannot handle uses of the reduction
4338 value. */
4339 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4341 if (dump_enabled_p ())
4342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4343 "reduction used in loop.\n");
4344 return NULL;
4347 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4348 defined in the inner loop. */
4349 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4351 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4352 if (gimple_phi_num_args (def_stmt) != 1
4353 || TREE_CODE (op1) != SSA_NAME)
4355 if (dump_enabled_p ())
4356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4357 "unsupported phi node definition.\n");
4359 return NULL;
4362 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4363 and the latch definition op1. */
4364 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4365 if (gimple_bb (def1)
4366 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4367 && loop->inner
4368 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4369 && (is_gimple_assign (def1) || is_gimple_call (def1))
4370 && is_a <gphi *> (phi_use_stmt)
4371 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4372 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4373 loop_latch_edge (loop->inner))))
4375 if (dump_enabled_p ())
4376 report_vect_op (MSG_NOTE, def_stmt,
4377 "detected double reduction: ");
4379 *double_reduc = true;
4380 return def_stmt_info;
4383 return NULL;
4386 /* Look for the expression computing latch_def from the loop PHI result. */
4387 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4388 code_helper code;
4389 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4390 path))
4392 STMT_VINFO_REDUC_CODE (phi_info) = code;
4393 if (code == COND_EXPR && !nested_in_vect_loop)
4394 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4396 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4397 reduction chain for which the additional restriction is that
4398 all operations in the chain are the same. */
4399 auto_vec<stmt_vec_info, 8> reduc_chain;
4400 unsigned i;
4401 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4402 for (i = path.length () - 1; i >= 1; --i)
4404 gimple *stmt = USE_STMT (path[i].second);
4405 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4406 gimple_match_op op;
4407 if (!gimple_extract_op (stmt, &op))
4408 gcc_unreachable ();
4409 if (gassign *assign = dyn_cast<gassign *> (stmt))
4410 STMT_VINFO_REDUC_IDX (stmt_info)
4411 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4412 else
4414 gcall *call = as_a<gcall *> (stmt);
4415 STMT_VINFO_REDUC_IDX (stmt_info)
4416 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4418 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4419 && (i == 1 || i == path.length () - 1));
4420 if ((op.code != code && !leading_conversion)
4421 /* We can only handle the final value in epilogue
4422 generation for reduction chains. */
4423 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4424 is_slp_reduc = false;
4425 /* For reduction chains we support trailing/leading
4426 conversions. We do not store those in the actual chain. */
4427 if (leading_conversion)
4428 continue;
4429 reduc_chain.safe_push (stmt_info);
4431 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4433 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4435 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4436 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4438 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4439 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4441 /* Save the chain for further analysis in SLP detection. */
4442 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4443 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4445 *reduc_chain_p = true;
4446 if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "reduction: detected reduction chain\n");
4450 else if (dump_enabled_p ())
4451 dump_printf_loc (MSG_NOTE, vect_location,
4452 "reduction: detected reduction\n");
4454 return def_stmt_info;
4457 if (dump_enabled_p ())
4458 dump_printf_loc (MSG_NOTE, vect_location,
4459 "reduction: unknown pattern\n");
4461 return NULL;
4464 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4465 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4466 or -1 if not known. */
4468 static int
4469 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4471 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4472 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4474 if (dump_enabled_p ())
4475 dump_printf_loc (MSG_NOTE, vect_location,
4476 "cost model: epilogue peel iters set to vf/2 "
4477 "because loop iterations are unknown .\n");
4478 return assumed_vf / 2;
4480 else
4482 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4483 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4484 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
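/* For illustration: with NITERS = 100, PEEL_ITERS_PROLOGUE = 3 and an
   assumed VF of 8, the epilogue handles (100 - 3) % 8 = 1 iteration.  */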
4485 /* If we need to peel for gaps but no epilogue peeling would otherwise
4486 be required, we have to peel VF iterations. */
4487 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4488 peel_iters_epilogue = assumed_vf;
4489 return peel_iters_epilogue;
4493 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4495 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4496 int *peel_iters_epilogue,
4497 stmt_vector_for_cost *scalar_cost_vec,
4498 stmt_vector_for_cost *prologue_cost_vec,
4499 stmt_vector_for_cost *epilogue_cost_vec)
4501 int retval = 0;
4503 *peel_iters_epilogue
4504 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4506 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4508 /* If peeled iterations are known but the number of scalar loop
4509 iterations is unknown, count a taken branch per peeled loop. */
4510 if (peel_iters_prologue > 0)
4511 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4512 vect_prologue);
4513 if (*peel_iters_epilogue > 0)
4514 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4515 vect_epilogue);
4518 stmt_info_for_cost *si;
4519 int j;
4520 if (peel_iters_prologue)
4521 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4522 retval += record_stmt_cost (prologue_cost_vec,
4523 si->count * peel_iters_prologue,
4524 si->kind, si->stmt_info, si->misalign,
4525 vect_prologue);
4526 if (*peel_iters_epilogue)
4527 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4528 retval += record_stmt_cost (epilogue_cost_vec,
4529 si->count * *peel_iters_epilogue,
4530 si->kind, si->stmt_info, si->misalign,
4531 vect_epilogue);
4533 return retval;
4536 /* Function vect_estimate_min_profitable_iters
4538 Return the number of iterations required for the vector version of the
4539 loop to be profitable relative to the cost of the scalar version of the
4540 loop.
4542 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4543 of iterations for vectorization.  A value of -1 means loop vectorization
4544 is not profitable. This returned value may be used for dynamic
4545 profitability check.
4547 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4548 for static check against estimated number of iterations. */
4550 static void
4551 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4552 int *ret_min_profitable_niters,
4553 int *ret_min_profitable_estimate,
4554 unsigned *suggested_unroll_factor)
4556 int min_profitable_iters;
4557 int min_profitable_estimate;
4558 int peel_iters_prologue;
4559 int peel_iters_epilogue;
4560 unsigned vec_inside_cost = 0;
4561 int vec_outside_cost = 0;
4562 unsigned vec_prologue_cost = 0;
4563 unsigned vec_epilogue_cost = 0;
4564 int scalar_single_iter_cost = 0;
4565 int scalar_outside_cost = 0;
4566 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4567 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4568 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4570 /* Cost model disabled. */
4571 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4573 if (dump_enabled_p ())
4574 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4575 *ret_min_profitable_niters = 0;
4576 *ret_min_profitable_estimate = 0;
4577 return;
4580 /* Requires loop versioning tests to handle misalignment. */
4581 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4583 /* FIXME: Make cost depend on complexity of individual check. */
4584 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4585 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4586 if (dump_enabled_p ())
4587 dump_printf (MSG_NOTE,
4588 "cost model: Adding cost of checks for loop "
4589 "versioning to treat misalignment.\n");
4592 /* Requires loop versioning with alias checks. */
4593 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4595 /* FIXME: Make cost depend on complexity of individual check. */
4596 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4597 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4598 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4599 if (len)
4600 /* Count LEN - 1 ANDs and LEN comparisons. */
4601 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4602 scalar_stmt, vect_prologue);
4603 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4604 if (len)
4606 /* Count LEN - 1 ANDs and LEN comparisons. */
4607 unsigned int nstmts = len * 2 - 1;
4608 /* +1 for each bias that needs adding. */
4609 for (unsigned int i = 0; i < len; ++i)
4610 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4611 nstmts += 1;
4612 (void) add_stmt_cost (target_cost_data, nstmts,
4613 scalar_stmt, vect_prologue);
4615 if (dump_enabled_p ())
4616 dump_printf (MSG_NOTE,
4617 "cost model: Adding cost of checks for loop "
4618 "versioning aliasing.\n");
4621 /* Requires loop versioning with niter checks. */
4622 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4624 /* FIXME: Make cost depend on complexity of individual check. */
4625 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4626 NULL, NULL, NULL_TREE, 0, vect_prologue);
4627 if (dump_enabled_p ())
4628 dump_printf (MSG_NOTE,
4629 "cost model: Adding cost of checks for loop "
4630 "versioning niters.\n");
4633 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4634 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4635 vect_prologue);
4637 /* Count statements in scalar loop. Using this as scalar cost for a single
4638 iteration for now.
4640 TODO: Add outer loop support.
4642 TODO: Consider assigning different costs to different scalar
4643 statements. */
4645 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4647 /* Add additional cost for the peeled instructions in prologue and epilogue
4648 loop. (For fully-masked loops there will be no peeling.)
4650 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4651 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4653 TODO: Build an expression that represents peel_iters for prologue and
4654 epilogue to be used in a run-time test. */
4656 bool prologue_need_br_taken_cost = false;
4657 bool prologue_need_br_not_taken_cost = false;
4659 /* Calculate peel_iters_prologue. */
4660 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4661 peel_iters_prologue = 0;
4662 else if (npeel < 0)
4664 peel_iters_prologue = assumed_vf / 2;
4665 if (dump_enabled_p ())
4666 dump_printf (MSG_NOTE, "cost model: "
4667 "prologue peel iters set to vf/2.\n");
4669 /* If peeled iterations are unknown, count a taken branch and a not taken
4670 branch per peeled loop. Even if scalar loop iterations are known,
4671 vector iterations are not known since peeled prologue iterations are
4672 not known. Hence guards remain the same. */
4673 prologue_need_br_taken_cost = true;
4674 prologue_need_br_not_taken_cost = true;
4676 else
4678 peel_iters_prologue = npeel;
4679 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4680 /* If peeled iterations are known but the number of scalar loop
4681 iterations is unknown, count a taken branch per peeled loop. */
4682 prologue_need_br_taken_cost = true;
4685 bool epilogue_need_br_taken_cost = false;
4686 bool epilogue_need_br_not_taken_cost = false;
4688 /* Calculate peel_iters_epilogue. */
4689 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4690 /* We need to peel exactly one iteration for gaps. */
4691 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4692 else if (npeel < 0)
4694 /* If the peeling for alignment is unknown, the loop bound of the
4695 main loop becomes unknown. */
4696 peel_iters_epilogue = assumed_vf / 2;
4697 if (dump_enabled_p ())
4698 dump_printf (MSG_NOTE, "cost model: "
4699 "epilogue peel iters set to vf/2 because "
4700 "peeling for alignment is unknown.\n");
4702 /* See the same reason above in peel_iters_prologue calculation. */
4703 epilogue_need_br_taken_cost = true;
4704 epilogue_need_br_not_taken_cost = true;
4706 else
4708 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4709 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4710 /* If peeled iterations are known but the number of scalar loop
4711 iterations is unknown, count a taken branch per peeled loop. */
4712 epilogue_need_br_taken_cost = true;
4715 stmt_info_for_cost *si;
4716 int j;
4717 /* Add costs associated with peel_iters_prologue. */
4718 if (peel_iters_prologue)
4719 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4721 (void) add_stmt_cost (target_cost_data,
4722 si->count * peel_iters_prologue, si->kind,
4723 si->stmt_info, si->node, si->vectype,
4724 si->misalign, vect_prologue);
4727 /* Add costs associated with peel_iters_epilogue. */
4728 if (peel_iters_epilogue)
4729 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4731 (void) add_stmt_cost (target_cost_data,
4732 si->count * peel_iters_epilogue, si->kind,
4733 si->stmt_info, si->node, si->vectype,
4734 si->misalign, vect_epilogue);
4737 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4739 if (prologue_need_br_taken_cost)
4740 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4741 vect_prologue);
4743 if (prologue_need_br_not_taken_cost)
4744 (void) add_stmt_cost (target_cost_data, 1,
4745 cond_branch_not_taken, vect_prologue);
4747 if (epilogue_need_br_taken_cost)
4748 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4749 vect_epilogue);
4751 if (epilogue_need_br_not_taken_cost)
4752 (void) add_stmt_cost (target_cost_data, 1,
4753 cond_branch_not_taken, vect_epilogue);
4755 /* Take care of special costs for rgroup controls of partial vectors. */
4756 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4757 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4758 == vect_partial_vectors_avx512))
4760 /* Calculate how many masks we need to generate. */
4761 unsigned int num_masks = 0;
4762 bool need_saturation = false;
4763 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4764 if (rgm.type)
4766 unsigned nvectors = rgm.factor;
4767 num_masks += nvectors;
4768 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4769 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4770 need_saturation = true;
4773 /* ??? The target isn't able to identify the costs below as
4774 producing masks so it cannot penalize cases where we'd run
4775 out of mask registers for example. */
4777 /* ??? We are also failing to account for smaller vector masks
4778 we generate by splitting larger masks in vect_get_loop_mask. */
4780 /* In the worst case, we need to generate each mask in the prologue
4781 and in the loop body. We need one splat per group and one
4782 compare per mask.
4784 Sometimes the prologue mask will fold to a constant,
4785 so the actual prologue cost might be smaller. However, it's
4786 simpler and safer to use the worst-case cost; if this ends up
4787 being the tie-breaker between vectorizing or not, then it's
4788 probably better not to vectorize. */
4789 (void) add_stmt_cost (target_cost_data,
4790 num_masks
4791 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4792 vector_stmt, NULL, NULL, NULL_TREE, 0,
4793 vect_prologue);
4794 (void) add_stmt_cost (target_cost_data,
4795 num_masks
4796 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4797 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4799 /* When we need saturation we need it both in the prologue and
4800 the loop body. */
4801 if (need_saturation)
4803 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4804 NULL, NULL, NULL_TREE, 0, vect_prologue);
4805 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4806 NULL, NULL, NULL_TREE, 0, vect_body);
4809 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4810 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4811 == vect_partial_vectors_while_ult))
4813 /* Calculate how many masks we need to generate. */
4814 unsigned int num_masks = 0;
4815 rgroup_controls *rgm;
4816 unsigned int num_vectors_m1;
4817 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4818 num_vectors_m1, rgm)
4819 if (rgm->type)
4820 num_masks += num_vectors_m1 + 1;
4821 gcc_assert (num_masks > 0);
4823 /* In the worst case, we need to generate each mask in the prologue
4824 and in the loop body. One of the loop body mask instructions
4825 replaces the comparison in the scalar loop, and since we don't
4826 count the scalar comparison against the scalar body, we shouldn't
4827 count that vector instruction against the vector body either.
4829 Sometimes we can use unpacks instead of generating prologue
4830 masks and sometimes the prologue mask will fold to a constant,
4831 so the actual prologue cost might be smaller. However, it's
4832 simpler and safer to use the worst-case cost; if this ends up
4833 being the tie-breaker between vectorizing or not, then it's
4834 probably better not to vectorize. */
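/* For illustration (hypothetical rgroups): two rgroups needing one and
   two masks respectively give num_masks = 3, so we cost three mask
   statements in the prologue and num_masks - 1 = 2 in the loop body,
   the remaining body mask being the one that replaces the scalar
   loop's comparison.  */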
4835 (void) add_stmt_cost (target_cost_data, num_masks,
4836 vector_stmt, NULL, NULL, NULL_TREE, 0,
4837 vect_prologue);
4838 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4839 vector_stmt, NULL, NULL, NULL_TREE, 0,
4840 vect_body);
4842 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4844 /* Referring to the functions vect_set_loop_condition_partial_vectors
4845 and vect_set_loop_controls_directly, we need to generate each
4846 length in the prologue and in the loop body if required. Although
4847 there are some possible optimizations, we consider the worst case
4848 here. */
4850 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4851 signed char partial_load_store_bias
4852 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4853 bool need_iterate_p
4854 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4855 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4857 /* Calculate how many statements need to be added. */
4858 unsigned int prologue_stmts = 0;
4859 unsigned int body_stmts = 0;
4861 rgroup_controls *rgc;
4862 unsigned int num_vectors_m1;
4863 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4864 if (rgc->type)
4866 /* May need one SHIFT for nitems_total computation. */
4867 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4868 if (nitems != 1 && !niters_known_p)
4869 prologue_stmts += 1;
4871 /* May need one MAX and one MINUS for wrap around. */
4872 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4873 prologue_stmts += 2;
4875 /* Need one MAX and one MINUS for each batch limit except for
4876 the 1st one. */
4877 prologue_stmts += num_vectors_m1 * 2;
4879 unsigned int num_vectors = num_vectors_m1 + 1;
4881 /* Need to set up lengths in prologue, only one MIN required
4882 for each since start index is zero. */
4883 prologue_stmts += num_vectors;
4885 /* If we have a non-zero partial load bias, we need one PLUS
4886 to adjust the load length. */
4887 if (partial_load_store_bias != 0)
4888 body_stmts += 1;
4890 unsigned int length_update_cost = 0;
4891 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4892 /* For the decrement IV style, each needs only a single SELECT_VL
4893 or MIN at the beginning to calculate the number of elements
4894 to be processed in the current iteration. */
4895 length_update_cost = 1;
4896 else
4897 /* For the increment IV style, each may need two MINs and one MINUS
4898 to update the lengths in the body for the next iteration. */
4899 length_update_cost = 3;
4901 if (need_iterate_p)
4902 body_stmts += length_update_cost * num_vectors;
4905 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4906 scalar_stmt, vect_prologue);
4907 (void) add_stmt_cost (target_cost_data, body_stmts,
4908 scalar_stmt, vect_body);
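/* Illustrative (made-up) count: a single length rgroup with two vectors,
   more than one scalar item per iteration, unknown niters, a possibly
   wrapping rgroup IV and a zero load/store bias gives prologue_stmts
   = 1 (SHIFT) + 2 (MAX/MINUS for wrap) + 2 (extra batch limit)
   + 2 (initial MINs) = 7 and, if the loop needs to iterate,
   body_stmts = length_update_cost * 2.  */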
4911 /* FORNOW: The scalar outside cost is incremented in one of the
4912 following ways:
4914 1. The vectorizer checks for alignment and aliasing and generates
4915 a condition that allows dynamic vectorization. A cost model
4916 check is ANDED with the versioning condition. Hence scalar code
4917 path now has the added cost of the versioning check.
4919 if (cost > th & versioning_check)
4920 jmp to vector code
4922 Hence run-time scalar is incremented by not-taken branch cost.
4924 2. The vectorizer then checks if a prologue is required. If the
4925 cost model check was not done before during versioning, it has to
4926 be done before the prologue check.
4928 if (cost <= th)
4929 prologue = scalar_iters
4930 if (prologue == 0)
4931 jmp to vector code
4932 else
4933 execute prologue
4934 if (prologue == num_iters)
4935 go to exit
4937 Hence the run-time scalar cost is incremented by a taken branch,
4938 plus a not-taken branch, plus a taken branch cost.
4940 3. The vectorizer then checks if an epilogue is required. If the
4941 cost model check was not done before during prologue check, it
4942 has to be done with the epilogue check.
4944 if (prologue == 0)
4945 jmp to vector code
4946 else
4947 execute prologue
4948 if (prologue == num_iters)
4949 go to exit
4950 vector code:
4951 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4952 jmp to epilogue
4954 Hence the run-time scalar cost should be incremented by 2 taken
4955 branches.
4957 TODO: The back end may reorder the BBs differently and reverse
4958 conditions/branch directions. Change the estimates below to
4959 something more reasonable. */
4961 /* If the number of iterations is known and we do not do versioning, we can
4962 decide whether to vectorize at compile time. Hence the scalar version
4963 does not carry cost model guard costs. */
4964 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4965 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4967 /* Cost model check occurs at versioning. */
4968 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4969 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4970 else
4972 /* Cost model check occurs at prologue generation. */
4973 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4974 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4975 + vect_get_stmt_cost (cond_branch_not_taken);
4976 /* Cost model check occurs at epilogue generation. */
4977 else
4978 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4982 /* Complete the target-specific cost calculations. */
4983 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4984 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4985 suggested_unroll_factor);
4987 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4988 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4989 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4990 *suggested_unroll_factor,
4991 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4993 if (dump_enabled_p ())
4994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4995 "can't unroll as unrolled vectorization factor larger"
4996 " than maximum vectorization factor: "
4997 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4998 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4999 *suggested_unroll_factor = 1;
5002 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5004 if (dump_enabled_p ())
5006 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5007 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5008 vec_inside_cost);
5009 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5010 vec_prologue_cost);
5011 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5012 vec_epilogue_cost);
5013 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5014 scalar_single_iter_cost);
5015 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5016 scalar_outside_cost);
5017 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5018 vec_outside_cost);
5019 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5020 peel_iters_prologue);
5021 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5022 peel_iters_epilogue);
5025 /* Calculate number of iterations required to make the vector version
5026 profitable, relative to the loop bodies only. The following condition
5027 must hold true:
5028 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5029 where
5030 SIC = scalar iteration cost, VIC = vector iteration cost,
5031 VOC = vector outside cost, VF = vectorization factor,
5032 NPEEL = prologue iterations + epilogue iterations,
5033 SOC = scalar outside cost for run time cost model check. */
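/* Illustrative (made-up) numbers: with SIC = 1, VIC = 4, VF = 8,
   VOC = 20 and NPEEL = SOC = 0, each vector iteration covers 8 scalar
   iterations and saves SIC * VF - VIC = 4 cost units versus the scalar
   code, so the vector version needs more than VOC / 4 = 5 vector
   iterations (roughly 40 scalar iterations) to pay off the outside
   cost.  */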
5035 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5036 - vec_inside_cost);
5037 if (saving_per_viter <= 0)
5039 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5040 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5041 "vectorization did not happen for a simd loop");
5043 if (dump_enabled_p ())
5044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5045 "cost model: the vector iteration cost = %d "
5046 "divided by the scalar iteration cost = %d "
5047 "is greater or equal to the vectorization factor = %d"
5048 ".\n",
5049 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5050 *ret_min_profitable_niters = -1;
5051 *ret_min_profitable_estimate = -1;
5052 return;
5055 /* ??? The "if" arm is written to handle all cases; see below for what
5056 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5057 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5059 /* Rewriting the condition above in terms of the number of
5060 vector iterations (vniters) rather than the number of
5061 scalar iterations (niters) gives:
5063 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5065 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5067 For integer N, X and Y when X > 0:
5069 N * X > Y <==> N >= (Y /[floor] X) + 1. */
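/* E.g. with X = 4: the smallest N with N * 4 > 20 is 20 / 4 + 1 = 6,
   and the smallest N with N * 4 > 21 is 21 /[floor] 4 + 1 = 6 too.  */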
5070 int outside_overhead = (vec_outside_cost
5071 - scalar_single_iter_cost * peel_iters_prologue
5072 - scalar_single_iter_cost * peel_iters_epilogue
5073 - scalar_outside_cost);
5074 /* We're only interested in cases that require at least one
5075 vector iteration. */
5076 int min_vec_niters = 1;
5077 if (outside_overhead > 0)
5078 min_vec_niters = outside_overhead / saving_per_viter + 1;
5080 if (dump_enabled_p ())
5081 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5082 min_vec_niters);
5084 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5086 /* Now that we know the minimum number of vector iterations,
5087 find the minimum niters for which the scalar cost is larger:
5089 SIC * niters > VIC * vniters + VOC - SOC
5091 We know that the minimum niters is no more than
5092 vniters * VF + NPEEL, but it might be (and often is) less
5093 than that if a partial vector iteration is cheaper than the
5094 equivalent scalar code. */
5095 int threshold = (vec_inside_cost * min_vec_niters
5096 + vec_outside_cost
5097 - scalar_outside_cost);
5098 if (threshold <= 0)
5099 min_profitable_iters = 1;
5100 else
5101 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5103 else
5104 /* Convert the number of vector iterations into a number of
5105 scalar iterations. */
5106 min_profitable_iters = (min_vec_niters * assumed_vf
5107 + peel_iters_prologue
5108 + peel_iters_epilogue);
5110 else
5112 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5113 * assumed_vf
5114 - vec_inside_cost * peel_iters_prologue
5115 - vec_inside_cost * peel_iters_epilogue);
5116 if (min_profitable_iters <= 0)
5117 min_profitable_iters = 0;
5118 else
5120 min_profitable_iters /= saving_per_viter;
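	  /* saving_per_viter divides with truncation; bump the count if the
	     profitability condition is still not strictly satisfied.  */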
5122 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5123 <= (((int) vec_inside_cost * min_profitable_iters)
5124 + (((int) vec_outside_cost - scalar_outside_cost)
5125 * assumed_vf)))
5126 min_profitable_iters++;
5130 if (dump_enabled_p ())
5131 dump_printf (MSG_NOTE,
5132 " Calculated minimum iters for profitability: %d\n",
5133 min_profitable_iters);
5135 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5136 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5137 /* We want the vectorized loop to execute at least once. */
5138 min_profitable_iters = assumed_vf + peel_iters_prologue;
5139 else if (min_profitable_iters < peel_iters_prologue)
5140 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5141 vectorized loop executes at least once. */
5142 min_profitable_iters = peel_iters_prologue;
5144 if (dump_enabled_p ())
5145 dump_printf_loc (MSG_NOTE, vect_location,
5146 " Runtime profitability threshold = %d\n",
5147 min_profitable_iters);
5149 *ret_min_profitable_niters = min_profitable_iters;
5151 /* Calculate number of iterations required to make the vector version
5152 profitable, relative to the loop bodies only.
5154    The non-vectorized variant costs SIC * niters and must win over the vector
5155    variant for the expected loop trip count.  The following condition must hold true:
5156 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5158 if (vec_outside_cost <= 0)
5159 min_profitable_estimate = 0;
5160 /* ??? This "else if" arm is written to handle all cases; see below for
5161 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5162 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5164 /* This is a repeat of the code above, but with + SOC rather
5165 than - SOC. */
5166 int outside_overhead = (vec_outside_cost
5167 - scalar_single_iter_cost * peel_iters_prologue
5168 - scalar_single_iter_cost * peel_iters_epilogue
5169 + scalar_outside_cost);
5170 int min_vec_niters = 1;
5171 if (outside_overhead > 0)
5172 min_vec_niters = outside_overhead / saving_per_viter + 1;
5174 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5176 int threshold = (vec_inside_cost * min_vec_niters
5177 + vec_outside_cost
5178 + scalar_outside_cost);
5179 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5181 else
5182 min_profitable_estimate = (min_vec_niters * assumed_vf
5183 + peel_iters_prologue
5184 + peel_iters_epilogue);
5186 else
5188 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5189 * assumed_vf
5190 - vec_inside_cost * peel_iters_prologue
5191 - vec_inside_cost * peel_iters_epilogue)
5192 / ((scalar_single_iter_cost * assumed_vf)
5193 - vec_inside_cost);
5195 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5196 if (dump_enabled_p ())
5197 dump_printf_loc (MSG_NOTE, vect_location,
5198 " Static estimate profitability threshold = %d\n",
5199 min_profitable_estimate);
5201 *ret_min_profitable_estimate = min_profitable_estimate;
5204 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5205 vector elements (not bits) for a vector with NELT elements. */
5206 static void
5207 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5208 vec_perm_builder *sel)
5210 /* The encoding is a single stepped pattern. Any wrap-around is handled
5211 by vec_perm_indices. */
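  /* For example, OFFSET = 2 encodes the series {2, 3, 4, ...}: output
     lane I selects input lane I + 2.  */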
5212 sel->new_vector (nelt, 1, 3);
5213 for (unsigned int i = 0; i < 3; i++)
5214 sel->quick_push (i + offset);
5217 /* Checks whether the target supports whole-vector shifts for vectors of mode
5218 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5219 it supports vec_perm_const with masks for all necessary shift amounts. */
5220 static bool
5221 have_whole_vector_shift (machine_mode mode)
5223 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5224 return true;
5226 /* Variable-length vectors should be handled via the optab. */
5227 unsigned int nelt;
5228 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5229 return false;
5231 vec_perm_builder sel;
5232 vec_perm_indices indices;
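  /* Try a mask for every shift amount the reduction code may use:
     nelt / 2, nelt / 4, ..., 1 elements (e.g. 4, 2 and 1 for NELT == 8).  */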
5233 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5235 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5236 indices.new_vector (sel, 2, nelt);
5237 if (!can_vec_perm_const_p (mode, mode, indices, false))
5238 return false;
5240 return true;
5243 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5244 multiplication operands have differing signs and (b) we intend
5245 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5246 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5248 static bool
5249 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5250 stmt_vec_info stmt_info)
5252 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5253 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5254 return false;
5256 tree rhs1 = gimple_assign_rhs1 (assign);
5257 tree rhs2 = gimple_assign_rhs2 (assign);
5258 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5259 return false;
5261 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5262 gcc_assert (reduc_info->is_reduc_info);
5263 return !directly_supported_p (DOT_PROD_EXPR,
5264 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5265 optab_vector_mixed_sign);
5268 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5269 functions. Design better to avoid maintenance issues. */
5271 /* Function vect_model_reduction_cost.
5273 Models cost for a reduction operation, including the vector ops
5274 generated within the strip-mine loop in some cases, the initial
5275 definition before the loop, and the epilogue code that must be generated. */
5277 static void
5278 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5279 stmt_vec_info stmt_info, internal_fn reduc_fn,
5280 vect_reduction_type reduction_type,
5281 int ncopies, stmt_vector_for_cost *cost_vec)
5283 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5284 tree vectype;
5285 machine_mode mode;
5286 class loop *loop = NULL;
5288 if (loop_vinfo)
5289 loop = LOOP_VINFO_LOOP (loop_vinfo);
5291 /* Condition reductions generate two reductions in the loop. */
5292 if (reduction_type == COND_REDUCTION)
5293 ncopies *= 2;
5295 vectype = STMT_VINFO_VECTYPE (stmt_info);
5296 mode = TYPE_MODE (vectype);
5297 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5299 gimple_match_op op;
5300 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5301 gcc_unreachable ();
5303 bool emulated_mixed_dot_prod
5304 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5305 if (reduction_type == EXTRACT_LAST_REDUCTION)
5306 /* No extra instructions are needed in the prologue. The loop body
5307 operations are costed in vectorizable_condition. */
5308 inside_cost = 0;
5309 else if (reduction_type == FOLD_LEFT_REDUCTION)
5311 /* No extra instructions needed in the prologue. */
5312 prologue_cost = 0;
5314 if (reduc_fn != IFN_LAST)
5315 /* Count one reduction-like operation per vector. */
5316 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5317 stmt_info, 0, vect_body);
5318 else
5320 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5321 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5322 inside_cost = record_stmt_cost (cost_vec, nelements,
5323 vec_to_scalar, stmt_info, 0,
5324 vect_body);
5325 inside_cost += record_stmt_cost (cost_vec, nelements,
5326 scalar_stmt, stmt_info, 0,
5327 vect_body);
5330 else
5332 /* Add in the cost of the initial definitions. */
5333 int prologue_stmts;
5334 if (reduction_type == COND_REDUCTION)
5335 /* For cond reductions we have four vectors: initial index, step,
5336 initial result of the data reduction, initial value of the index
5337 reduction. */
5338 prologue_stmts = 4;
5339 else if (emulated_mixed_dot_prod)
5340 /* We need the initial reduction value and two invariants:
5341 one that contains the minimum signed value and one that
5342 contains half of its negative. */
5343 prologue_stmts = 3;
5344 else
5345 prologue_stmts = 1;
5346 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5347 scalar_to_vec, stmt_info, 0,
5348 vect_prologue);
5351 /* Determine cost of epilogue code.
5353 We have a reduction operator that will reduce the vector in one statement.
5354 Also requires scalar extract. */
5356 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5358 if (reduc_fn != IFN_LAST)
5360 if (reduction_type == COND_REDUCTION)
5362          /* An EQ stmt and a COND_EXPR stmt.  */
5363 epilogue_cost += record_stmt_cost (cost_vec, 2,
5364 vector_stmt, stmt_info, 0,
5365 vect_epilogue);
5366 /* Reduction of the max index and a reduction of the found
5367 values. */
5368 epilogue_cost += record_stmt_cost (cost_vec, 2,
5369 vec_to_scalar, stmt_info, 0,
5370 vect_epilogue);
5371 /* A broadcast of the max value. */
5372 epilogue_cost += record_stmt_cost (cost_vec, 1,
5373 scalar_to_vec, stmt_info, 0,
5374 vect_epilogue);
5376 else
5378 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5379 stmt_info, 0, vect_epilogue);
5380 epilogue_cost += record_stmt_cost (cost_vec, 1,
5381 vec_to_scalar, stmt_info, 0,
5382 vect_epilogue);
5385 else if (reduction_type == COND_REDUCTION)
5387 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5388 /* Extraction of scalar elements. */
5389 epilogue_cost += record_stmt_cost (cost_vec,
5390 2 * estimated_nunits,
5391 vec_to_scalar, stmt_info, 0,
5392 vect_epilogue);
5393 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5394 epilogue_cost += record_stmt_cost (cost_vec,
5395 2 * estimated_nunits - 3,
5396 scalar_stmt, stmt_info, 0,
5397 vect_epilogue);
5399 else if (reduction_type == EXTRACT_LAST_REDUCTION
5400 || reduction_type == FOLD_LEFT_REDUCTION)
5401    /* No extra instructions are needed in the epilogue.  */
5403 else
5405 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5406 tree bitsize = TYPE_SIZE (op.type);
5407 int element_bitsize = tree_to_uhwi (bitsize);
5408 int nelements = vec_size_in_bits / element_bitsize;
5410 if (op.code == COND_EXPR)
5411 op.code = MAX_EXPR;
5413 /* We have a whole vector shift available. */
5414 if (VECTOR_MODE_P (mode)
5415 && directly_supported_p (op.code, vectype)
5416 && have_whole_vector_shift (mode))
5418 /* Final reduction via vector shifts and the reduction operator.
5419 Also requires scalar extract. */
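	  /* E.g. for 8 elements this counts log2 (8) = 3 shifts plus 3
	     reduction ops (hence the factor of two), followed by a single
	     extract.  */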
5420 epilogue_cost += record_stmt_cost (cost_vec,
5421 exact_log2 (nelements) * 2,
5422 vector_stmt, stmt_info, 0,
5423 vect_epilogue);
5424 epilogue_cost += record_stmt_cost (cost_vec, 1,
5425 vec_to_scalar, stmt_info, 0,
5426 vect_epilogue);
5428 else
5429 /* Use extracts and reduction op for final reduction. For N
5430 elements, we have N extracts and N-1 reduction ops. */
5431 epilogue_cost += record_stmt_cost (cost_vec,
5432 nelements + nelements - 1,
5433 vector_stmt, stmt_info, 0,
5434 vect_epilogue);
5438 if (dump_enabled_p ())
5439 dump_printf (MSG_NOTE,
5440 "vect_model_reduction_cost: inside_cost = %d, "
5441 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5442 prologue_cost, epilogue_cost);
5445 /* SEQ is a sequence of instructions that initialize the reduction
5446 described by REDUC_INFO. Emit them in the appropriate place. */
5448 static void
5449 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5450 stmt_vec_info reduc_info, gimple *seq)
5452 if (reduc_info->reused_accumulator)
5454 /* When reusing an accumulator from the main loop, we only need
5455 initialization instructions if the main loop can be skipped.
5456 In that case, emit the initialization instructions at the end
5457 of the guard block that does the skip. */
5458 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5459 gcc_assert (skip_edge);
5460 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5461 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5463 else
5465 /* The normal case: emit the initialization instructions on the
5466 preheader edge. */
5467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5468 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5472 /* Function get_initial_def_for_reduction
5474 Input:
5475 REDUC_INFO - the info_for_reduction
5476 INIT_VAL - the initial value of the reduction variable
5477 NEUTRAL_OP - a value that has no effect on the reduction, as per
5478 neutral_op_for_reduction
5480 Output:
5481 Return a vector variable, initialized according to the operation that
5482 STMT_VINFO performs. This vector will be used as the initial value
5483 of the vector of partial results.
5485 The value we need is a vector in which element 0 has value INIT_VAL
5486 and every other element has value NEUTRAL_OP. */
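   /* For example, for an addition reduction with INIT_VAL 5 and NEUTRAL_OP 0,
      a four-element result vector would be {5, 0, 0, 0}.  */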
5488 static tree
5489 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5490 stmt_vec_info reduc_info,
5491 tree init_val, tree neutral_op)
5493 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5494 tree scalar_type = TREE_TYPE (init_val);
5495 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5496 tree init_def;
5497 gimple_seq stmts = NULL;
5499 gcc_assert (vectype);
5501 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5502 || SCALAR_FLOAT_TYPE_P (scalar_type));
5504 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5505 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5507 if (operand_equal_p (init_val, neutral_op))
5509 /* If both elements are equal then the vector described above is
5510 just a splat. */
5511 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5512 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5514 else
5516 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5517 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5518 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5520 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5521 element 0. */
5522 init_def = gimple_build_vector_from_val (&stmts, vectype,
5523 neutral_op);
5524 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5525 vectype, init_def, init_val);
5527 else
5529 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5530 tree_vector_builder elts (vectype, 1, 2);
5531 elts.quick_push (init_val);
5532 elts.quick_push (neutral_op);
5533 init_def = gimple_build_vector (&stmts, &elts);
5537 if (stmts)
5538 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5539 return init_def;
5542 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5543 which performs a reduction involving GROUP_SIZE scalar statements.
5544 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5545 is nonnull, introducing extra elements of that value will not change the
5546 result. */
5548 static void
5549 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5550 stmt_vec_info reduc_info,
5551 vec<tree> *vec_oprnds,
5552 unsigned int number_of_vectors,
5553 unsigned int group_size, tree neutral_op)
5555 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5556 unsigned HOST_WIDE_INT nunits;
5557 unsigned j, number_of_places_left_in_vector;
5558 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5559 unsigned int i;
5561 gcc_assert (group_size == initial_values.length () || neutral_op);
5563 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5564 created vectors. It is greater than 1 if unrolling is performed.
5566 For example, we have two scalar operands, s1 and s2 (e.g., group of
5567 strided accesses of size two), while NUNITS is four (i.e., four scalars
5568 of this type can be packed in a vector). The output vector will contain
5569 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5570 will be 2).
5572 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5573 vectors containing the operands.
5575 For example, NUNITS is four as before, and the group size is 8
5576 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5577 {s5, s6, s7, s8}. */
5579 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5580 nunits = group_size;
5582 number_of_places_left_in_vector = nunits;
5583 bool constant_p = true;
5584 tree_vector_builder elts (vector_type, nunits, 1);
5585 elts.quick_grow (nunits);
5586 gimple_seq ctor_seq = NULL;
5587 for (j = 0; j < nunits * number_of_vectors; ++j)
5589 tree op;
5590 i = j % group_size;
5592      /* Get the def before the loop.  In a reduction chain we have only
5593         one initial value; otherwise we have as many as there are PHIs
         in the group.  */
5594 if (i >= initial_values.length () || (j > i && neutral_op))
5595 op = neutral_op;
5596 else
5597 op = initial_values[i];
5599 /* Create 'vect_ = {op0,op1,...,opn}'. */
5600 number_of_places_left_in_vector--;
5601 elts[nunits - number_of_places_left_in_vector - 1] = op;
5602 if (!CONSTANT_CLASS_P (op))
5603 constant_p = false;
5605 if (number_of_places_left_in_vector == 0)
5607 tree init;
5608 if (constant_p && !neutral_op
5609 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5610 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5611 /* Build the vector directly from ELTS. */
5612 init = gimple_build_vector (&ctor_seq, &elts);
5613 else if (neutral_op)
5615 /* Build a vector of the neutral value and shift the
5616 other elements into place. */
5617 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5618 neutral_op);
5619 int k = nunits;
5620 while (k > 0 && elts[k - 1] == neutral_op)
5621 k -= 1;
5622 while (k > 0)
5624 k -= 1;
5625 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5626 vector_type, init, elts[k]);
5629 else
5631 /* First time round, duplicate ELTS to fill the
5632 required number of vectors. */
5633 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5634 elts, number_of_vectors, *vec_oprnds);
5635 break;
5637 vec_oprnds->quick_push (init);
5639 number_of_places_left_in_vector = nunits;
5640 elts.new_vector (vector_type, nunits, 1);
5641 elts.quick_grow (nunits);
5642 constant_p = true;
5645 if (ctor_seq != NULL)
5646 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5649 /* For a statement STMT_INFO taking part in a reduction operation, return
5650    the stmt_vec_info that the meta information is stored on.  */
5652 stmt_vec_info
5653 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5655 stmt_info = vect_orig_stmt (stmt_info);
5656 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5657 if (!is_a <gphi *> (stmt_info->stmt)
5658 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5659 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5660 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5661 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5663 if (gimple_phi_num_args (phi) == 1)
5664 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5666 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5668 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5669 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5670 stmt_info = info;
5672 return stmt_info;
5675 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5676 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5677 return false. */
5679 static bool
5680 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5681 stmt_vec_info reduc_info)
5683 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5684 if (!main_loop_vinfo)
5685 return false;
5687 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5688 return false;
5690 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5691 auto_vec<tree, 16> main_loop_results (num_phis);
5692 auto_vec<tree, 16> initial_values (num_phis);
5693 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5695 /* The epilogue loop can be entered either from the main loop or
5696 from an earlier guard block. */
5697 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5698 for (tree incoming_value : reduc_info->reduc_initial_values)
5700 /* Look for:
5702 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5703 INITIAL_VALUE(guard block)>. */
5704 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5706 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5707 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5709 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5710 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5712 main_loop_results.quick_push (from_main_loop);
5713 initial_values.quick_push (from_skip);
5716 else
5717 /* The main loop dominates the epilogue loop. */
5718 main_loop_results.splice (reduc_info->reduc_initial_values);
5720 /* See if the main loop has the kind of accumulator we need. */
5721 vect_reusable_accumulator *accumulator
5722 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5723 if (!accumulator
5724 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5725 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5726 accumulator->reduc_info->reduc_scalar_results.begin ()))
5727 return false;
5729 /* Handle the case where we can reduce wider vectors to narrower ones. */
5730 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5731 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5732 unsigned HOST_WIDE_INT m;
5733 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5734 TYPE_VECTOR_SUBPARTS (vectype), &m))
5735 return false;
5736 /* Check the intermediate vector types and operations are available. */
5737 tree prev_vectype = old_vectype;
5738 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
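  /* For example (hypothetical modes): narrowing a V8SI main-loop accumulator
     to a V4SI epilogue type requires a V8SI -> V4SI extract and a V4SI form
     of the reduction operation.  */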
5739 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5741 intermediate_nunits = exact_div (intermediate_nunits, 2);
5742 tree intermediate_vectype = get_related_vectype_for_scalar_type
5743 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5744 if (!intermediate_vectype
5745 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5746 intermediate_vectype)
5747 || !can_vec_extract (TYPE_MODE (prev_vectype),
5748 TYPE_MODE (intermediate_vectype)))
5749 return false;
5750 prev_vectype = intermediate_vectype;
5753 /* Non-SLP reductions might apply an adjustment after the reduction
5754 operation, in order to simplify the initialization of the accumulator.
5755 If the epilogue loop carries on from where the main loop left off,
5756 it should apply the same adjustment to the final reduction result.
5758 If the epilogue loop can also be entered directly (rather than via
5759 the main loop), we need to be able to handle that case in the same way,
5760 with the same adjustment. (In principle we could add a PHI node
5761 to select the correct adjustment, but in practice that shouldn't be
5762 necessary.) */
5763 tree main_adjustment
5764 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5765 if (loop_vinfo->main_loop_edge && main_adjustment)
5767 gcc_assert (num_phis == 1);
5768 tree initial_value = initial_values[0];
5769 /* Check that we can use INITIAL_VALUE as the adjustment and
5770 initialize the accumulator with a neutral value instead. */
5771 if (!operand_equal_p (initial_value, main_adjustment))
5772 return false;
5773 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5774 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5775 code, initial_value);
5777 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5778 reduc_info->reduc_initial_values.truncate (0);
5779 reduc_info->reduc_initial_values.splice (initial_values);
5780 reduc_info->reused_accumulator = accumulator;
5781 return true;
5784 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5785 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5787 static tree
5788 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5789 gimple_seq *seq)
5791 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5792 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5793 tree stype = TREE_TYPE (vectype);
5794 tree new_temp = vec_def;
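  /* Repeatedly split the accumulator in half and combine the two halves
     with CODE until the requested VECTYPE width is reached.  */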
5795 while (nunits > nunits1)
5797 nunits /= 2;
5798 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5799 stype, nunits);
5800 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5802 /* The target has to make sure we support lowpart/highpart
5803 extraction, either via direct vector extract or through
5804          integer mode punning.  */
5805 tree dst1, dst2;
5806 gimple *epilog_stmt;
5807 if (convert_optab_handler (vec_extract_optab,
5808 TYPE_MODE (TREE_TYPE (new_temp)),
5809 TYPE_MODE (vectype1))
5810 != CODE_FOR_nothing)
5812 /* Extract sub-vectors directly once vec_extract becomes
5813 a conversion optab. */
5814 dst1 = make_ssa_name (vectype1);
5815 epilog_stmt
5816 = gimple_build_assign (dst1, BIT_FIELD_REF,
5817 build3 (BIT_FIELD_REF, vectype1,
5818 new_temp, TYPE_SIZE (vectype1),
5819 bitsize_int (0)));
5820 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5821 dst2 = make_ssa_name (vectype1);
5822 epilog_stmt
5823 = gimple_build_assign (dst2, BIT_FIELD_REF,
5824 build3 (BIT_FIELD_REF, vectype1,
5825 new_temp, TYPE_SIZE (vectype1),
5826 bitsize_int (bitsize)));
5827 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5829 else
5831          /* Extract via punning to an appropriately sized integer mode
5832 vector. */
5833 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5834 tree etype = build_vector_type (eltype, 2);
5835 gcc_assert (convert_optab_handler (vec_extract_optab,
5836 TYPE_MODE (etype),
5837 TYPE_MODE (eltype))
5838 != CODE_FOR_nothing);
5839 tree tem = make_ssa_name (etype);
5840 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5841 build1 (VIEW_CONVERT_EXPR,
5842 etype, new_temp));
5843 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5844 new_temp = tem;
5845 tem = make_ssa_name (eltype);
5846 epilog_stmt
5847 = gimple_build_assign (tem, BIT_FIELD_REF,
5848 build3 (BIT_FIELD_REF, eltype,
5849 new_temp, TYPE_SIZE (eltype),
5850 bitsize_int (0)));
5851 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5852 dst1 = make_ssa_name (vectype1);
5853 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5854 build1 (VIEW_CONVERT_EXPR,
5855 vectype1, tem));
5856 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5857 tem = make_ssa_name (eltype);
5858 epilog_stmt
5859 = gimple_build_assign (tem, BIT_FIELD_REF,
5860 build3 (BIT_FIELD_REF, eltype,
5861 new_temp, TYPE_SIZE (eltype),
5862 bitsize_int (bitsize)));
5863 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5864 dst2 = make_ssa_name (vectype1);
5865 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5866 build1 (VIEW_CONVERT_EXPR,
5867 vectype1, tem));
5868 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5871 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5874 return new_temp;
5877 /* Retrieves the defining statement to be used for a reduction.
5878 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5879 the reduction definitions. */
5881 tree
5882 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5883 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5884 vec <gimple *> &vec_stmts)
5886 tree def;
5888 if (slp_node)
5890 if (!main_exit_p)
5891 slp_node = slp_node_instance->reduc_phis;
5892 def = vect_get_slp_vect_def (slp_node, i);
5894 else
5896 if (!main_exit_p)
5897 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5898 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5899 def = gimple_get_lhs (vec_stmts[0]);
5902 return def;
5905 /* Function vect_create_epilog_for_reduction
5907 Create code at the loop-epilog to finalize the result of a reduction
5908 computation.
5910 STMT_INFO is the scalar reduction stmt that is being vectorized.
5911 SLP_NODE is an SLP node containing a group of reduction statements. The
5912 first one in this group is STMT_INFO.
5913 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5914 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5915 (counting from 0)
5916 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5917 exit this edge is always the main loop exit.
5919 This function:
5920 1. Completes the reduction def-use cycles.
5921 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5922 by calling the function specified by REDUC_FN if available, or by
5923 other means (whole-vector shifts or a scalar loop).
5924 The function also creates a new phi node at the loop exit to preserve
5925 loop-closed form, as illustrated below.
5927 The flow at the entry to this function:
5929 loop:
5930 vec_def = phi <vec_init, null> # REDUCTION_PHI
5931 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5932 s_loop = scalar_stmt # (scalar) STMT_INFO
5933 loop_exit:
5934 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5935 use <s_out0>
5936 use <s_out0>
5938 The above is transformed by this function into:
5940 loop:
5941 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5942 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5943 s_loop = scalar_stmt # (scalar) STMT_INFO
5944 loop_exit:
5945 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5946 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5947 v_out2 = reduce <v_out1>
5948 s_out3 = extract_field <v_out2, 0>
5949 s_out4 = adjust_result <s_out3>
5950 use <s_out4>
5951 use <s_out4>
5954 static void
5955 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5956 stmt_vec_info stmt_info,
5957 slp_tree slp_node,
5958 slp_instance slp_node_instance,
5959 edge loop_exit)
5961 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5962 gcc_assert (reduc_info->is_reduc_info);
5963 /* For double reductions we need to get at the inner loop reduction
5964 stmt which has the meta info attached. Our stmt_info is that of the
5965 loop-closed PHI of the inner loop which we remember as
5966 def for the reduction PHI generation. */
5967 bool double_reduc = false;
5968 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5969 stmt_vec_info rdef_info = stmt_info;
5970 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5972 gcc_assert (!slp_node);
5973 double_reduc = true;
5974 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5975 (stmt_info->stmt, 0));
5976 stmt_info = vect_stmt_to_vectorize (stmt_info);
5978 gphi *reduc_def_stmt
5979 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5980 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5981 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5982 tree vectype;
5983 machine_mode mode;
5984 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5985 basic_block exit_bb;
5986 tree scalar_dest;
5987 tree scalar_type;
5988 gimple *new_phi = NULL, *phi = NULL;
5989 gimple_stmt_iterator exit_gsi;
5990 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5991 gimple *epilog_stmt = NULL;
5992 gimple *exit_phi;
5993 tree bitsize;
5994 tree def;
5995 tree orig_name, scalar_result;
5996 imm_use_iterator imm_iter, phi_imm_iter;
5997 use_operand_p use_p, phi_use_p;
5998 gimple *use_stmt;
5999 auto_vec<tree> reduc_inputs;
6000 int j, i;
6001 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6002 unsigned int group_size = 1, k;
6003 auto_vec<gimple *> phis;
6004 /* SLP reduction without reduction chain, e.g.,
6005 # a1 = phi <a2, a0>
6006 # b1 = phi <b2, b0>
6007 a2 = operation (a1)
6008 b2 = operation (b1) */
6009 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6010 bool direct_slp_reduc;
6011 tree induction_index = NULL_TREE;
6013 if (slp_node)
6014 group_size = SLP_TREE_LANES (slp_node);
6016 if (nested_in_vect_loop_p (loop, stmt_info))
6018 outer_loop = loop;
6019 loop = loop->inner;
6020 gcc_assert (!slp_node && double_reduc);
6023 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6024 gcc_assert (vectype);
6025 mode = TYPE_MODE (vectype);
6027 tree induc_val = NULL_TREE;
6028 tree adjustment_def = NULL;
6029 if (slp_node)
6031 else
6033 /* Optimize: for induction condition reduction, if we can't use zero
6034 for induc_val, use initial_def. */
6035 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6036 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6037 else if (double_reduc)
6039 else
6040 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6043 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6044 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6045 if (slp_reduc)
6046 /* All statements produce live-out values. */
6047 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6048 else if (slp_node)
6050 /* The last statement in the reduction chain produces the live-out
6051 value. Note SLP optimization can shuffle scalar stmts to
6052 optimize permutations so we have to search for the last stmt. */
6053 for (k = 0; k < group_size; ++k)
6054 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6056 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6057 break;
6061 unsigned vec_num;
6062 int ncopies;
6063 if (slp_node)
6065 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6066 ncopies = 1;
6068 else
6070 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6071 vec_num = 1;
6072 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6075 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6076 which is updated with the current index of the loop for every match of
6077 the original loop's cond_expr (VEC_STMT). This results in a vector
6078 containing the last time the condition passed for that vector lane.
6079 The first match will be a 1 to allow 0 to be used for non-matching
6080 indexes. If there are no matches at all then the vector will be all
6081 zeroes.
6083 PR92772: This algorithm is broken for architectures that support
6084 masked vectors, but do not provide fold_extract_last. */
6085 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6087 auto_vec<std::pair<tree, bool>, 2> ccompares;
6088 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6089 cond_info = vect_stmt_to_vectorize (cond_info);
6090 while (cond_info != reduc_info)
6092 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6094 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6095 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6096 ccompares.safe_push
6097 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6098 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6100 cond_info
6101 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6102 1 + STMT_VINFO_REDUC_IDX
6103 (cond_info)));
6104 cond_info = vect_stmt_to_vectorize (cond_info);
6106 gcc_assert (ccompares.length () != 0);
6108 tree indx_before_incr, indx_after_incr;
6109 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6110 int scalar_precision
6111 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6112 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6113 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6114 (TYPE_MODE (vectype), cr_index_scalar_type,
6115 TYPE_VECTOR_SUBPARTS (vectype));
6117 /* First we create a simple vector induction variable which starts
6118 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6119 vector size (STEP). */
6121 /* Create a {1,2,3,...} vector. */
6122 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6124 /* Create a vector of the step value. */
6125 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6126 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6128 /* Create an induction variable. */
6129 gimple_stmt_iterator incr_gsi;
6130 bool insert_after;
6131 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6132 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6133 insert_after, &indx_before_incr, &indx_after_incr);
6135 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6136 filled with zeros (VEC_ZERO). */
6138 /* Create a vector of 0s. */
6139 tree zero = build_zero_cst (cr_index_scalar_type);
6140 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6142 /* Create a vector phi node. */
6143 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6144 new_phi = create_phi_node (new_phi_tree, loop->header);
6145 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6146 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6148      /* Now take the condition from the loop's original cond_exprs
6149 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6150 every match uses values from the induction variable
6151 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6152 (NEW_PHI_TREE).
6153 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6154 the new cond_expr (INDEX_COND_EXPR). */
6155 gimple_seq stmts = NULL;
6156 for (int i = ccompares.length () - 1; i != -1; --i)
6158 tree ccompare = ccompares[i].first;
6159 if (ccompares[i].second)
6160 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6161 cr_index_vector_type,
6162 ccompare,
6163 indx_before_incr, new_phi_tree);
6164 else
6165 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6166 cr_index_vector_type,
6167 ccompare,
6168 new_phi_tree, indx_before_incr);
6170 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6172 /* Update the phi with the vec cond. */
6173 induction_index = new_phi_tree;
6174 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6175 loop_latch_edge (loop), UNKNOWN_LOCATION);
6178 /* 2. Create epilog code.
6179 The reduction epilog code operates across the elements of the vector
6180 of partial results computed by the vectorized loop.
6181 The reduction epilog code consists of:
6183 step 1: compute the scalar result in a vector (v_out2)
6184 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6185 step 3: adjust the scalar result (s_out3) if needed.
6187 Step 1 can be accomplished using one the following three schemes:
6188 (scheme 1) using reduc_fn, if available.
6189 (scheme 2) using whole-vector shifts, if available.
6190 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6191 combined.
6193 The overall epilog code looks like this:
6195 s_out0 = phi <s_loop> # original EXIT_PHI
6196 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6197 v_out2 = reduce <v_out1> # step 1
6198 s_out3 = extract_field <v_out2, 0> # step 2
6199 s_out4 = adjust_result <s_out3> # step 3
6201 (step 3 is optional, and steps 1 and 2 may be combined).
6202 Lastly, the uses of s_out0 are replaced by s_out4. */
6205 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6206 v_out1 = phi <VECT_DEF>
6207 Store them in NEW_PHIS. */
6208 if (double_reduc)
6209 loop = outer_loop;
6210 /* We need to reduce values in all exits. */
6211 exit_bb = loop_exit->dest;
6212 exit_gsi = gsi_after_labels (exit_bb);
6213 reduc_inputs.create (slp_node ? vec_num : ncopies);
6214 vec <gimple *> vec_stmts = vNULL;
6215 for (unsigned i = 0; i < vec_num; i++)
6217 gimple_seq stmts = NULL;
6218 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6219 main_exit_p, i, vec_stmts);
6220 for (j = 0; j < ncopies; j++)
6222 tree new_def = copy_ssa_name (def);
6223 phi = create_phi_node (new_def, exit_bb);
6224 if (j)
6225 def = gimple_get_lhs (vec_stmts[j]);
6226 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6227 new_def = gimple_convert (&stmts, vectype, new_def);
6228 reduc_inputs.quick_push (new_def);
6230 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6233 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6234 (i.e. when reduc_fn is not available) and in the final adjustment
6235 code (if needed). Also get the original scalar reduction variable as
6236 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6237 represents a reduction pattern), the tree-code and scalar-def are
6238 taken from the original stmt that the pattern-stmt (STMT) replaces.
6239 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6240 are taken from STMT. */
6242 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6243 if (orig_stmt_info != stmt_info)
6245 /* Reduction pattern */
6246 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6247 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6250 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6251 scalar_type = TREE_TYPE (scalar_dest);
6252 scalar_results.truncate (0);
6253 scalar_results.reserve_exact (group_size);
6254 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6255 bitsize = TYPE_SIZE (scalar_type);
6257 /* True if we should implement SLP_REDUC using native reduction operations
6258 instead of scalar operations. */
6259 direct_slp_reduc = (reduc_fn != IFN_LAST
6260 && slp_reduc
6261 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6263 /* In case of reduction chain, e.g.,
6264 # a1 = phi <a3, a0>
6265 a2 = operation (a1)
6266 a3 = operation (a2),
6268 we may end up with more than one vector result. Here we reduce them
6269 to one vector.
6271 The same is true for a SLP reduction, e.g.,
6272 # a1 = phi <a2, a0>
6273 # b1 = phi <b2, b0>
6274 a2 = operation (a1)
6275        b2 = operation (b1),
6277 where we can end up with more than one vector as well. We can
6278 easily accumulate vectors when the number of vector elements is
6279 a multiple of the SLP group size.
6281 The same is true if we couldn't use a single defuse cycle. */
6282 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6283 || direct_slp_reduc
6284 || (slp_reduc
6285 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6286 || ncopies > 1)
6288 gimple_seq stmts = NULL;
6289 tree single_input = reduc_inputs[0];
6290 for (k = 1; k < reduc_inputs.length (); k++)
6291 single_input = gimple_build (&stmts, code, vectype,
6292 single_input, reduc_inputs[k]);
6293 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6295 reduc_inputs.truncate (0);
6296 reduc_inputs.safe_push (single_input);
6299 tree orig_reduc_input = reduc_inputs[0];
6301 /* If this loop is an epilogue loop that can be skipped after the
6302 main loop, we can only share a reduction operation between the
6303 main loop and the epilogue if we put it at the target of the
6304 skip edge.
6306 We can still reuse accumulators if this check fails. Doing so has
6307 the minor(?) benefit of making the epilogue loop's scalar result
6308 independent of the main loop's scalar result. */
6309 bool unify_with_main_loop_p = false;
6310 if (reduc_info->reused_accumulator
6311 && loop_vinfo->skip_this_loop_edge
6312 && single_succ_p (exit_bb)
6313 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6315 unify_with_main_loop_p = true;
6317 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6318 reduc_inputs[0] = make_ssa_name (vectype);
6319 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6320 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6321 UNKNOWN_LOCATION);
6322 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6323 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6324 exit_gsi = gsi_after_labels (reduc_block);
6327 /* Shouldn't be used beyond this point. */
6328 exit_bb = nullptr;
6330 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6331 && reduc_fn != IFN_LAST)
6333 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6334 various data values where the condition matched and another vector
6335 (INDUCTION_INDEX) containing all the indexes of those matches. We
6336 need to extract the last matching index (which will be the index with
6337 highest value) and use this to index into the data vector.
6338 For the case where there were no matches, the data vector will contain
6339 all default values and the index vector will be all zeros. */
6341 /* Get various versions of the type of the vector of indexes. */
6342 tree index_vec_type = TREE_TYPE (induction_index);
6343 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6344 tree index_scalar_type = TREE_TYPE (index_vec_type);
6345 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6347 /* Get an unsigned integer version of the type of the data vector. */
6348 int scalar_precision
6349 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6350 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6351 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6352 vectype);
6354 /* First we need to create a vector (ZERO_VEC) of zeros and another
6355 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6356 can create using a MAX reduction and then expanding.
6357 In the case where the loop never made any matches, the max index will
6358 be zero. */
6360 /* Vector of {0, 0, 0,...}. */
6361 tree zero_vec = build_zero_cst (vectype);
6363 /* Find maximum value from the vector of found indexes. */
6364 tree max_index = make_ssa_name (index_scalar_type);
6365 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6366 1, induction_index);
6367 gimple_call_set_lhs (max_index_stmt, max_index);
6368 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6370 /* Vector of {max_index, max_index, max_index,...}. */
6371 tree max_index_vec = make_ssa_name (index_vec_type);
6372 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6373 max_index);
6374 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6375 max_index_vec_rhs);
6376 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6378 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6379 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6380 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6381 otherwise. Only one value should match, resulting in a vector
6382 (VEC_COND) with one data value and the rest zeros.
6383 In the case where the loop never made any matches, every index will
6384 match, resulting in a vector with all data values (which will all be
6385 the default value). */
6387 /* Compare the max index vector to the vector of found indexes to find
6388 the position of the max value. */
6389 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6390 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6391 induction_index,
6392 max_index_vec);
6393 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6395 /* Use the compare to choose either values from the data vector or
6396 zero. */
6397 tree vec_cond = make_ssa_name (vectype);
6398 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6399 vec_compare,
6400 reduc_inputs[0],
6401 zero_vec);
6402 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6404 /* Finally we need to extract the data value from the vector (VEC_COND)
6405         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6406 reduction, but because this doesn't exist, we can use a MAX reduction
6407 instead. The data value might be signed or a float so we need to cast
6408 it first.
6409 In the case where the loop never made any matches, the data values are
6410 all identical, and so will reduce down correctly. */
6412 /* Make the matched data values unsigned. */
6413 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6414 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6415 vec_cond);
6416 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6417 VIEW_CONVERT_EXPR,
6418 vec_cond_cast_rhs);
6419 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6421 /* Reduce down to a scalar value. */
6422 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6423 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6424 1, vec_cond_cast);
6425 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6426 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6428 /* Convert the reduced value back to the result type and set as the
6429 result. */
6430 gimple_seq stmts = NULL;
6431 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6432 data_reduc);
6433 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6434 scalar_results.safe_push (new_temp);
6436 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6437 && reduc_fn == IFN_LAST)
6439 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6440 idx = 0;
6441 idx_val = induction_index[0];
6442 val = data_reduc[0];
6443 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6444 if (induction_index[i] > idx_val)
6445 val = data_reduc[i], idx_val = induction_index[i];
6446 return val; */
6448 tree data_eltype = TREE_TYPE (vectype);
6449 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6450 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6451 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6452 /* Enforced by vectorizable_reduction, which ensures we have target
6453 support before allowing a conditional reduction on variable-length
6454 vectors. */
6455 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6456 tree idx_val = NULL_TREE, val = NULL_TREE;
6457 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6459 tree old_idx_val = idx_val;
6460 tree old_val = val;
6461 idx_val = make_ssa_name (idx_eltype);
6462 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6463 build3 (BIT_FIELD_REF, idx_eltype,
6464 induction_index,
6465 bitsize_int (el_size),
6466 bitsize_int (off)));
6467 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6468 val = make_ssa_name (data_eltype);
6469 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6470 build3 (BIT_FIELD_REF,
6471 data_eltype,
6472 reduc_inputs[0],
6473 bitsize_int (el_size),
6474 bitsize_int (off)));
6475 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6476 if (off != 0)
6478 tree new_idx_val = idx_val;
6479 if (off != v_size - el_size)
6481 new_idx_val = make_ssa_name (idx_eltype);
6482 epilog_stmt = gimple_build_assign (new_idx_val,
6483 MAX_EXPR, idx_val,
6484 old_idx_val);
6485 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6487 tree cond = make_ssa_name (boolean_type_node);
6488 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6489 idx_val, old_idx_val);
6490 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6491 tree new_val = make_ssa_name (data_eltype);
6492 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6493 cond, val, old_val);
6494 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6495 idx_val = new_idx_val;
6496 val = new_val;
6499 /* Convert the reduced value back to the result type and set as the
6500 result. */
6501 gimple_seq stmts = NULL;
6502 val = gimple_convert (&stmts, scalar_type, val);
6503 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6504 scalar_results.safe_push (val);
6507 /* 2.3 Create the reduction code, using one of the three schemes described
6508 above. In SLP we simply need to extract all the elements from the
6509 vector (without reducing them), so we use scalar shifts. */
6510 else if (reduc_fn != IFN_LAST && !slp_reduc)
6512 tree tmp;
6513 tree vec_elem_type;
6515 /* Case 1: Create:
6516 v_out2 = reduc_expr <v_out1> */
6518 if (dump_enabled_p ())
6519 dump_printf_loc (MSG_NOTE, vect_location,
6520 "Reduce using direct vector reduction.\n");
6522 gimple_seq stmts = NULL;
6523 vec_elem_type = TREE_TYPE (vectype);
6524 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6525 vec_elem_type, reduc_inputs[0]);
6526 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6527 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6529 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6530 && induc_val)
6532          /* Earlier we set the initial value to be a vector of induc_val
6533 values. Check the result and if it is induc_val then replace
6534 with the original initial value, unless induc_val is
6535 the same as initial_def already. */
6536 tree zcompare = make_ssa_name (boolean_type_node);
6537 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6538 new_temp, induc_val);
6539 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6540 tree initial_def = reduc_info->reduc_initial_values[0];
6541 tmp = make_ssa_name (new_scalar_dest);
6542 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6543 initial_def, new_temp);
6544 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6545 new_temp = tmp;
6548 scalar_results.safe_push (new_temp);
6550 else if (direct_slp_reduc)
6552 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6553 with the elements for other SLP statements replaced with the
6554 neutral value. We can then do a normal reduction on each vector. */
6556 /* Enforced by vectorizable_reduction. */
6557 gcc_assert (reduc_inputs.length () == 1);
6558 gcc_assert (pow2p_hwi (group_size));
6560 gimple_seq seq = NULL;
6562 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6563 and the same element size as VECTYPE. */
6564 tree index = build_index_vector (vectype, 0, 1);
6565 tree index_type = TREE_TYPE (index);
6566 tree index_elt_type = TREE_TYPE (index_type);
6567 tree mask_type = truth_type_for (index_type);
6569 /* Create a vector that, for each element, identifies which of
6570 the REDUC_GROUP_SIZE results should use it. */
6571 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6572 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6573 build_vector_from_val (index_type, index_mask));
6575 /* Get a neutral vector value. This is simply a splat of the neutral
6576 scalar value if we have one, otherwise the initial scalar value
6577 is itself a neutral value. */
6578 tree vector_identity = NULL_TREE;
6579 tree neutral_op = NULL_TREE;
6580 if (slp_node)
6582 tree initial_value = NULL_TREE;
6583 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6584 initial_value = reduc_info->reduc_initial_values[0];
6585 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6586 initial_value, false);
6588 if (neutral_op)
6589 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6590 neutral_op);
6591 for (unsigned int i = 0; i < group_size; ++i)
6593              /* If there's no universal neutral value, we can use the
6594 initial scalar value from the original PHI. This is used
6595 for MIN and MAX reduction, for example. */
6596 if (!neutral_op)
6598 tree scalar_value = reduc_info->reduc_initial_values[i];
6599 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6600 scalar_value);
6601 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6602 scalar_value);
6605 /* Calculate the equivalent of:
6607 sel[j] = (index[j] == i);
6609 which selects the elements of REDUC_INPUTS[0] that should
6610 be included in the result. */
6611 tree compare_val = build_int_cst (index_elt_type, i);
6612 compare_val = build_vector_from_val (index_type, compare_val);
6613 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6614 index, compare_val);
6616 /* Calculate the equivalent of:
6618 vec = sel ? reduc_inputs[0] : vector_identity;
6620 VEC is now suitable for a full vector reduction. */
6621 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6622 sel, reduc_inputs[0], vector_identity);
6624 /* Do the reduction and convert it to the appropriate type. */
6625 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6626 TREE_TYPE (vectype), vec);
6627 scalar = gimple_convert (&seq, scalar_type, scalar);
6628 scalar_results.safe_push (scalar);
6630 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6632 else
6634 bool reduce_with_shift;
6635 tree vec_temp;
6637 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6639 /* See if the target wants to do the final (shift) reduction
6640 in a vector mode of smaller size and first reduce upper/lower
6641 halves against each other. */
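/* E.g. (illustrative only): for a V8SI accumulator the target may
   prefer to first combine the upper and lower V4SI halves with the
   reduction operation and then finish the shift reduction in the
   narrower V4SI mode. */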
6642 enum machine_mode mode1 = mode;
6643 tree stype = TREE_TYPE (vectype);
6644 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6645 unsigned nunits1 = nunits;
6646 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6647 && reduc_inputs.length () == 1)
6649 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6650 /* For SLP reductions we have to make sure lanes match up, but
6651 since we're doing an individual-element final reduction,
6652 reducing the vector width here is even more important.
6653 ??? We could also separate lanes with permutes; for the common
6654 case of a power-of-two group size, odd/even extracts would work. */
6655 if (slp_reduc && nunits != nunits1)
6657 nunits1 = least_common_multiple (nunits1, group_size);
6658 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6661 if (!slp_reduc
6662 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6663 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6665 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6666 stype, nunits1);
6667 reduce_with_shift = have_whole_vector_shift (mode1);
6668 if (!VECTOR_MODE_P (mode1)
6669 || !directly_supported_p (code, vectype1))
6670 reduce_with_shift = false;
6672 /* First reduce the vector to the desired vector size on which we
6673 should do the shift reduction, by combining upper and lower halves. */
6674 gimple_seq stmts = NULL;
6675 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6676 code, &stmts);
6677 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6678 reduc_inputs[0] = new_temp;
6680 if (reduce_with_shift && !slp_reduc)
6682 int element_bitsize = tree_to_uhwi (bitsize);
6683 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6684 for variable-length vectors and also requires direct target support
6685 for loop reductions. */
6686 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6687 int nelements = vec_size_in_bits / element_bitsize;
6688 vec_perm_builder sel;
6689 vec_perm_indices indices;
6691 int elt_offset;
6693 tree zero_vec = build_zero_cst (vectype1);
6694 /* Case 2: Create:
6695 for (offset = nelements/2; offset >= 1; offset/=2)
6696 {
6697 Create: va' = vec_shift <va, offset>
6698 Create: va = vop <va, va'>
6699 } */
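/* A rough trace, assuming four elements and code == PLUS:
   va = { a, b, c, d }
   offset 2: va' = { c, d, 0, 0 }, va = { a+c, b+d, c, d }
   offset 1: va' = { b+d, c, d, 0 }, va = { a+b+c+d, ... }
   after which the scalar result is extracted from element 0. */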
6701 tree rhs;
6703 if (dump_enabled_p ())
6704 dump_printf_loc (MSG_NOTE, vect_location,
6705 "Reduce using vector shifts\n");
6707 gimple_seq stmts = NULL;
6708 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6709 for (elt_offset = nelements / 2;
6710 elt_offset >= 1;
6711 elt_offset /= 2)
6713 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6714 indices.new_vector (sel, 2, nelements);
6715 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6716 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6717 new_temp, zero_vec, mask);
6718 new_temp = gimple_build (&stmts, code,
6719 vectype1, new_name, new_temp);
6721 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6723 /* 2.4 Extract the final scalar result. Create:
6724 s_out3 = extract_field <v_out2, bitpos> */
6726 if (dump_enabled_p ())
6727 dump_printf_loc (MSG_NOTE, vect_location,
6728 "extract scalar result\n");
6730 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6731 bitsize, bitsize_zero_node);
6732 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6733 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6734 gimple_assign_set_lhs (epilog_stmt, new_temp);
6735 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6736 scalar_results.safe_push (new_temp);
6738 else
6740 /* Case 3: Create:
6741 s = extract_field <v_out2, 0>
6742 for (offset = element_size;
6743 offset < vector_size;
6744 offset += element_size)
6745 {
6746 Create: s' = extract_field <v_out2, offset>
6747 Create: s = op <s, s'> // For non-SLP cases
6748 } */
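/* A rough trace for the non-SLP case, four elements, op == PLUS:
   s = v_out2[0]; s = s + v_out2[1]; s = s + v_out2[2]; s = s + v_out2[3];
   i.e. each element is extracted with a BIT_FIELD_REF and accumulated
   into a single scalar. */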
6750 if (dump_enabled_p ())
6751 dump_printf_loc (MSG_NOTE, vect_location,
6752 "Reduce using scalar code.\n");
6754 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6755 int element_bitsize = tree_to_uhwi (bitsize);
6756 tree compute_type = TREE_TYPE (vectype);
6757 gimple_seq stmts = NULL;
6758 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6760 int bit_offset;
6761 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6762 vec_temp, bitsize, bitsize_zero_node);
6764 /* In SLP we don't need to apply the reduction operation, so we
6765 just collect the s' values in SCALAR_RESULTS. */
6766 if (slp_reduc)
6767 scalar_results.safe_push (new_temp);
6769 for (bit_offset = element_bitsize;
6770 bit_offset < vec_size_in_bits;
6771 bit_offset += element_bitsize)
6773 tree bitpos = bitsize_int (bit_offset);
6774 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6775 compute_type, vec_temp,
6776 bitsize, bitpos);
6777 if (slp_reduc)
6779 /* In SLP we don't need to apply the reduction operation, so
6780 we just collect the s' values in SCALAR_RESULTS. */
6781 new_temp = new_name;
6782 scalar_results.safe_push (new_name);
6784 else
6785 new_temp = gimple_build (&stmts, code, compute_type,
6786 new_name, new_temp);
6790 /* The only case where we need to reduce scalar results in SLP is
6791 unrolling. If the size of SCALAR_RESULTS is greater than
6792 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6793 REDUC_GROUP_SIZE. */
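/* E.g. (illustrative): with REDUC_GROUP_SIZE == 2 and an unrolled
   SCALAR_RESULTS of { a0, b0, a1, b1 } we combine a0 with a1 and
   b0 with b1 and truncate back to { a0 op a1, b0 op b1 }. */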
6794 if (slp_reduc)
6796 tree res, first_res, new_res;
6798 /* Reduce multiple scalar results in case of SLP unrolling. */
6799 for (j = group_size; scalar_results.iterate (j, &res);
6800 j++)
6802 first_res = scalar_results[j % group_size];
6803 new_res = gimple_build (&stmts, code, compute_type,
6804 first_res, res);
6805 scalar_results[j % group_size] = new_res;
6807 scalar_results.truncate (group_size);
6808 for (k = 0; k < group_size; k++)
6809 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6810 scalar_results[k]);
6812 else
6814 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6815 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6816 scalar_results.safe_push (new_temp);
6819 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6822 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6823 && induc_val)
6825 /* Earlier we set the initial value to be a vector of induc_val
6826 values. Check the result and if it is induc_val then replace
6827 it with the original initial value, unless induc_val is
6828 the same as initial_def already. */
6829 tree zcompare = make_ssa_name (boolean_type_node);
6830 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6831 induc_val);
6832 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6833 tree initial_def = reduc_info->reduc_initial_values[0];
6834 tree tmp = make_ssa_name (new_scalar_dest);
6835 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6836 initial_def, new_temp);
6837 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6838 scalar_results[0] = tmp;
6842 /* 2.5 Adjust the final result by the initial value of the reduction
6843 variable. (When such adjustment is not needed, then
6844 'adjustment_def' is zero). For example, if code is PLUS we create:
6845 new_temp = loop_exit_def + adjustment_def */
6847 if (adjustment_def)
6849 gcc_assert (!slp_reduc);
6850 gimple_seq stmts = NULL;
6851 if (double_reduc)
6853 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6854 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6855 new_temp = gimple_build (&stmts, code, vectype,
6856 reduc_inputs[0], adjustment_def);
6858 else
6860 new_temp = scalar_results[0];
6861 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6862 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6863 adjustment_def);
6864 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6865 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6866 new_temp, adjustment_def);
6867 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6870 epilog_stmt = gimple_seq_last_stmt (stmts);
6871 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6872 scalar_results[0] = new_temp;
6875 /* Record this operation if it could be reused by the epilogue loop. */
6876 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6877 && reduc_inputs.length () == 1)
6878 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6879 { orig_reduc_input, reduc_info });
6881 if (double_reduc)
6882 loop = outer_loop;
6884 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6885 phis with new adjusted scalar results, i.e., replace use <s_out0>
6886 with use <s_out4>.
6888 Transform:
6889 loop_exit:
6890 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6891 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6892 v_out2 = reduce <v_out1>
6893 s_out3 = extract_field <v_out2, 0>
6894 s_out4 = adjust_result <s_out3>
6895 use <s_out0>
6896 use <s_out0>
6898 into:
6900 loop_exit:
6901 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6902 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6903 v_out2 = reduce <v_out1>
6904 s_out3 = extract_field <v_out2, 0>
6905 s_out4 = adjust_result <s_out3>
6906 use <s_out4>
6907 use <s_out4> */
6909 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6910 for (k = 0; k < live_out_stmts.size (); k++)
6912 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6913 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6915 phis.create (3);
6916 /* Find the loop-closed-use at the loop exit of the original scalar
6917 result. (The reduction result is expected to have two immediate uses,
6918 one at the latch block, and one at the loop exit). For double
6919 reductions we are looking for exit phis of the outer loop. */
6920 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6922 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6924 if (!is_gimple_debug (USE_STMT (use_p)))
6925 phis.safe_push (USE_STMT (use_p));
6927 else
6929 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6931 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6933 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6935 if (!flow_bb_inside_loop_p (loop,
6936 gimple_bb (USE_STMT (phi_use_p)))
6937 && !is_gimple_debug (USE_STMT (phi_use_p)))
6938 phis.safe_push (USE_STMT (phi_use_p));
6944 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6946 /* Replace the uses: */
6947 orig_name = PHI_RESULT (exit_phi);
6949 /* Look for a single use at the target of the skip edge. */
6950 if (unify_with_main_loop_p)
6952 use_operand_p use_p;
6953 gimple *user;
6954 if (!single_imm_use (orig_name, &use_p, &user))
6955 gcc_unreachable ();
6956 orig_name = gimple_get_lhs (user);
6959 scalar_result = scalar_results[k];
6960 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6962 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6963 SET_USE (use_p, scalar_result);
6964 update_stmt (use_stmt);
6968 phis.release ();
6972 /* Return a vector of type VECTYPE that is equal to the vector select
6973 operation "MASK ? VEC : IDENTITY". Insert the select statements
6974 before GSI. */
6976 static tree
6977 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6978 tree vec, tree identity)
6980 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6981 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6982 mask, vec, identity);
6983 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6984 return cond;
6987 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6988 order, starting with LHS. Insert the extraction statements before GSI and
6989 associate the new scalar SSA names with variable SCALAR_DEST.
6990 If MASK is nonnull, mask the input and then operate on it unconditionally.
6991 Return the SSA name for the result. */
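/* A sketch for a four-element VECTOR_RHS { x0, x1, x2, x3 }:
   s0 = LHS CODE x0; s1 = s0 CODE x1; s2 = s1 CODE x2; s3 = s2 CODE x3;
   and s3 is returned, so the scalar left-to-right evaluation order
   is preserved. */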
6993 static tree
6994 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6995 tree_code code, tree lhs, tree vector_rhs,
6996 tree mask)
6998 tree vectype = TREE_TYPE (vector_rhs);
6999 tree scalar_type = TREE_TYPE (vectype);
7000 tree bitsize = TYPE_SIZE (scalar_type);
7001 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7002 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7004 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7005 to perform an unconditional element-wise reduction of it. */
7006 if (mask)
7008 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7009 "masked_vector_rhs");
7010 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7011 false);
7012 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7013 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7014 mask, vector_rhs, vector_identity);
7015 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7016 vector_rhs = masked_vector_rhs;
7019 for (unsigned HOST_WIDE_INT bit_offset = 0;
7020 bit_offset < vec_size_in_bits;
7021 bit_offset += element_bitsize)
7023 tree bitpos = bitsize_int (bit_offset);
7024 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7025 bitsize, bitpos);
7027 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7028 rhs = make_ssa_name (scalar_dest, stmt);
7029 gimple_assign_set_lhs (stmt, rhs);
7030 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7032 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7033 tree new_name = make_ssa_name (scalar_dest, stmt);
7034 gimple_assign_set_lhs (stmt, new_name);
7035 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7036 lhs = new_name;
7038 return lhs;
7041 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7042 type of the vector input. */
7044 static internal_fn
7045 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7047 internal_fn mask_reduc_fn;
7048 internal_fn mask_len_reduc_fn;
7050 switch (reduc_fn)
7052 case IFN_FOLD_LEFT_PLUS:
7053 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7054 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7055 break;
7057 default:
7058 return IFN_LAST;
7061 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7062 OPTIMIZE_FOR_SPEED))
7063 return mask_reduc_fn;
7064 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7065 OPTIMIZE_FOR_SPEED))
7066 return mask_len_reduc_fn;
7067 return IFN_LAST;
7070 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7071 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7072 statement. CODE is the operation performed by STMT_INFO and OPS are
7073 its scalar operands. REDUC_INDEX is the index of the operand in
7074 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7075 implements in-order reduction, or IFN_LAST if we should open-code it.
7076 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7077 that should be used to control the operation in a fully-masked loop. */
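/* As an illustrative example, for a float accumulation
     for (i = 0; i < n; i++) res += a[i];
   where reassociation is not permitted, the result must be computed as
   ((((res + a[0]) + a[1]) + a[2]) + ...), so each vector of A is folded
   into the scalar accumulator in order (e.g. via IFN_FOLD_LEFT_PLUS)
   instead of being reduced pairwise in the epilogue. */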
7079 static bool
7080 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7081 stmt_vec_info stmt_info,
7082 gimple_stmt_iterator *gsi,
7083 gimple **vec_stmt, slp_tree slp_node,
7084 gimple *reduc_def_stmt,
7085 code_helper code, internal_fn reduc_fn,
7086 tree *ops, int num_ops, tree vectype_in,
7087 int reduc_index, vec_loop_masks *masks,
7088 vec_loop_lens *lens)
7090 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7091 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7092 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7094 int ncopies;
7095 if (slp_node)
7096 ncopies = 1;
7097 else
7098 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7100 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7101 gcc_assert (ncopies == 1);
7103 bool is_cond_op = false;
7104 if (!code.is_tree_code ())
7106 code = conditional_internal_fn_code (internal_fn (code));
7107 gcc_assert (code != ERROR_MARK);
7108 is_cond_op = true;
7111 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7113 if (slp_node)
7115 if (is_cond_op)
7117 if (dump_enabled_p ())
7118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7119 "fold-left reduction on SLP not supported.\n");
7120 return false;
7123 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7124 TYPE_VECTOR_SUBPARTS (vectype_in)));
7127 /* The operands either come from a binary operation or an IFN_COND operation.
7128 The former is a gimple assign with binary rhs and the latter is a
7129 gimple call with four arguments. */
7130 gcc_assert (num_ops == 2 || num_ops == 4);
7131 tree op0, opmask;
7132 if (!is_cond_op)
7133 op0 = ops[1 - reduc_index];
7134 else
7136 op0 = ops[2 + (1 - reduc_index)];
7137 opmask = ops[0];
7138 gcc_assert (!slp_node);
7141 int group_size = 1;
7142 stmt_vec_info scalar_dest_def_info;
7143 auto_vec<tree> vec_oprnds0, vec_opmask;
7144 if (slp_node)
7146 auto_vec<vec<tree> > vec_defs (2);
7147 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7148 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7149 vec_defs[0].release ();
7150 vec_defs[1].release ();
7151 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7152 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7154 else
7156 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7157 op0, &vec_oprnds0);
7158 scalar_dest_def_info = stmt_info;
7160 /* For an IFN_COND_OP we also need the vector mask operand. */
7161 if (is_cond_op)
7162 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7163 opmask, &vec_opmask);
7166 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7167 tree scalar_dest = gimple_get_lhs (sdef);
7168 tree scalar_type = TREE_TYPE (scalar_dest);
7169 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7171 int vec_num = vec_oprnds0.length ();
7172 gcc_assert (vec_num == 1 || slp_node);
7173 tree vec_elem_type = TREE_TYPE (vectype_out);
7174 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7176 tree vector_identity = NULL_TREE;
7177 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7179 vector_identity = build_zero_cst (vectype_out);
7180 if (!HONOR_SIGNED_ZEROS (vectype_out))
7182 else
7184 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7185 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7186 vector_identity);
7190 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7191 int i;
7192 tree def0;
7193 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7195 gimple *new_stmt;
7196 tree mask = NULL_TREE;
7197 tree len = NULL_TREE;
7198 tree bias = NULL_TREE;
7199 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7200 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7201 else if (is_cond_op)
7202 mask = vec_opmask[0];
7203 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7205 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7206 i, 1);
7207 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7208 bias = build_int_cst (intQI_type_node, biasval);
7209 if (!is_cond_op)
7210 mask = build_minus_one_cst (truth_type_for (vectype_in));
7213 /* Handle MINUS by adding the negative. */
7214 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7216 tree negated = make_ssa_name (vectype_out);
7217 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7218 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7219 def0 = negated;
7222 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7223 && mask && mask_reduc_fn == IFN_LAST)
7224 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7225 vector_identity);
7227 /* On the first iteration the input is simply the scalar phi
7228 result, and for subsequent iterations it is the output of
7229 the preceding operation. */
7230 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7232 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7233 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7234 def0, mask, len, bias);
7235 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7236 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7237 def0, mask);
7238 else
7239 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7240 def0);
7241 /* For chained SLP reductions the output of the previous reduction
7242 operation serves as the input of the next. For the final statement
7243 the output cannot be a temporary - we reuse the original
7244 scalar destination of the last statement. */
7245 if (i != vec_num - 1)
7247 gimple_set_lhs (new_stmt, scalar_dest_var);
7248 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7249 gimple_set_lhs (new_stmt, reduc_var);
7252 else
7254 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7255 tree_code (code), reduc_var, def0,
7256 mask);
7257 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7258 /* Remove the statement, so that we can use the same code paths
7259 as for statements that we've just created. */
7260 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7261 gsi_remove (&tmp_gsi, true);
7264 if (i == vec_num - 1)
7266 gimple_set_lhs (new_stmt, scalar_dest);
7267 vect_finish_replace_stmt (loop_vinfo,
7268 scalar_dest_def_info,
7269 new_stmt);
7271 else
7272 vect_finish_stmt_generation (loop_vinfo,
7273 scalar_dest_def_info,
7274 new_stmt, gsi);
7276 if (slp_node)
7277 slp_node->push_vec_def (new_stmt);
7278 else
7280 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7281 *vec_stmt = new_stmt;
7285 return true;
7288 /* Function is_nonwrapping_integer_induction.
7290 Check if STMT_VINFO (which is part of loop LOOP) describes an integer
7291 induction that cannot wrap. */
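/* Illustrative example: for the induction { 0, +, 1 } in a loop that
   executes at most 300 times, BASE + STEP * NITERS = 300 needs 9 bits,
   so the check fails for an 8-bit unsigned induction variable (it could
   wrap) but succeeds for a 16-bit one. */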
7293 static bool
7294 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7296 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7297 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7298 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7299 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7300 widest_int ni, max_loop_value, lhs_max;
7301 wi::overflow_type overflow = wi::OVF_NONE;
7303 /* Make sure the loop is integer based. */
7304 if (TREE_CODE (base) != INTEGER_CST
7305 || TREE_CODE (step) != INTEGER_CST)
7306 return false;
7308 /* Check that the max size of the loop will not wrap. */
7310 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7311 return true;
7313 if (! max_stmt_executions (loop, &ni))
7314 return false;
7316 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7317 &overflow);
7318 if (overflow)
7319 return false;
7321 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7322 TYPE_SIGN (lhs_type), &overflow);
7323 if (overflow)
7324 return false;
7326 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7327 <= TYPE_PRECISION (lhs_type));
7330 /* Check if masking can be supported by inserting a conditional expression.
7331 CODE is the code for the operation. COND_FN is the conditional internal
7332 function, if it exists. VECTYPE_IN is the type of the vector input. */
7333 static bool
7334 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7335 tree vectype_in)
7337 if (cond_fn != IFN_LAST
7338 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7339 OPTIMIZE_FOR_SPEED))
7340 return false;
7342 if (code.is_tree_code ())
7343 switch (tree_code (code))
7345 case DOT_PROD_EXPR:
7346 case SAD_EXPR:
7347 return true;
7349 default:
7350 break;
7352 return false;
7355 /* Insert a conditional expression to enable masked vectorization. CODE is the
7356 code for the operation. VOP is the array of operands. MASK is the loop
7357 mask. GSI is a statement iterator used to place the new conditional
7358 expression. */
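/* For example (a sketch): for DOT_PROD_EXPR the second operand becomes
   MASK ? vop[1] : 0, so inactive lanes add 0 to the accumulator; for
   SAD_EXPR it becomes MASK ? vop[1] : vop[0], so inactive lanes
   contribute |vop[0] - vop[0]| == 0. */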
7359 static void
7360 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7361 gimple_stmt_iterator *gsi)
7363 switch (tree_code (code))
7365 case DOT_PROD_EXPR:
7367 tree vectype = TREE_TYPE (vop[1]);
7368 tree zero = build_zero_cst (vectype);
7369 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7370 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7371 mask, vop[1], zero);
7372 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7373 vop[1] = masked_op1;
7374 break;
7377 case SAD_EXPR:
7379 tree vectype = TREE_TYPE (vop[1]);
7380 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7381 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7382 mask, vop[1], vop[0]);
7383 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7384 vop[1] = masked_op1;
7385 break;
7388 default:
7389 gcc_unreachable ();
7393 /* Function vectorizable_reduction.
7395 Check if STMT_INFO performs a reduction operation that can be vectorized.
7396 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7397 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7398 Return true if STMT_INFO is vectorizable in this way.
7400 This function also handles reduction idioms (patterns) that have been
7401 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7402 may be of this form:
7403 X = pattern_expr (arg0, arg1, ..., X)
7404 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7405 sequence that had been detected and replaced by the pattern-stmt
7406 (STMT_INFO).
7408 This function also handles reduction of condition expressions, for example:
7409 for (int i = 0; i < N; i++)
7410 if (a[i] < value)
7411 last = a[i];
7412 This is handled by vectorizing the loop and creating an additional vector
7413 containing the loop indexes for which "a[i] < value" was true. In the
7414 function epilogue this is reduced to a single max value and then used to
7415 index into the vector of results.
7417 In some cases of reduction patterns, the type of the reduction variable X is
7418 different than the type of the other arguments of STMT_INFO.
7419 In such cases, the vectype that is used when transforming STMT_INFO into
7420 a vector stmt is different than the vectype that is used to determine the
7421 vectorization factor, because it consists of a different number of elements
7422 than the actual number of elements that are being operated upon in parallel.
7424 For example, consider an accumulation of shorts into an int accumulator.
7425 On some targets it's possible to vectorize this pattern operating on 8
7426 shorts at a time (hence, the vectype for purposes of determining the
7427 vectorization factor should be V8HI); on the other hand, the vectype that
7428 is used to create the vector form is actually V4SI (the type of the result).
7430 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7431 indicates what is the actual level of parallelism (V8HI in the example), so
7432 that the right vectorization factor would be derived. This vectype
7433 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7434 be used to create the vectorized stmt. The right vectype for the vectorized
7435 stmt is obtained from the type of the result X:
7436 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7438 This means that, contrary to "regular" reductions (or "regular" stmts in
7439 general), the following equation:
7440 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7441 does *NOT* necessarily hold for reduction patterns. */
7443 bool
7444 vectorizable_reduction (loop_vec_info loop_vinfo,
7445 stmt_vec_info stmt_info, slp_tree slp_node,
7446 slp_instance slp_node_instance,
7447 stmt_vector_for_cost *cost_vec)
7449 tree vectype_in = NULL_TREE;
7450 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7451 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7452 stmt_vec_info cond_stmt_vinfo = NULL;
7453 int i;
7454 int ncopies;
7455 bool single_defuse_cycle = false;
7456 bool nested_cycle = false;
7457 bool double_reduc = false;
7458 int vec_num;
7459 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7460 tree cond_reduc_val = NULL_TREE;
7462 /* Make sure it was already recognized as a reduction computation. */
7463 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7464 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7465 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7466 return false;
7468 /* The stmt we store reduction analysis meta on. */
7469 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7470 reduc_info->is_reduc_info = true;
7472 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7474 if (is_a <gphi *> (stmt_info->stmt))
7476 if (slp_node)
7478 /* We eventually need to set a vector type on invariant
7479 arguments. */
7480 unsigned j;
7481 slp_tree child;
7482 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7483 if (!vect_maybe_update_slp_op_vectype
7484 (child, SLP_TREE_VECTYPE (slp_node)))
7486 if (dump_enabled_p ())
7487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7488 "incompatible vector types for "
7489 "invariants\n");
7490 return false;
7493 /* Analysis for double-reduction is done on the outer
7494 loop PHI; nested cycles have no further restrictions. */
7495 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7497 else
7498 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7499 return true;
7502 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7503 stmt_vec_info phi_info = stmt_info;
7504 if (!is_a <gphi *> (stmt_info->stmt))
7506 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7507 return true;
7509 if (slp_node)
7511 slp_node_instance->reduc_phis = slp_node;
7512 /* ??? We're leaving slp_node to point to the PHIs; we only
7513 need it to get at the number of vector stmts, which wasn't
7514 yet initialized for the instance root. */
7516 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7518 use_operand_p use_p;
7519 gimple *use_stmt;
7520 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7521 &use_p, &use_stmt);
7522 gcc_assert (res);
7523 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7526 /* PHIs should not participate in patterns. */
7527 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7528 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7530 /* Verify that following REDUC_IDX from the latch def leads us back to
7531 the PHI and compute the reduction chain length. Discover the real
7532 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7533 tree reduc_def
7534 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7535 loop_latch_edge
7536 (gimple_bb (reduc_def_phi)->loop_father));
7537 unsigned reduc_chain_length = 0;
7538 bool only_slp_reduc_chain = true;
7539 stmt_info = NULL;
7540 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7541 while (reduc_def != PHI_RESULT (reduc_def_phi))
7543 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7544 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7545 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7547 if (dump_enabled_p ())
7548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7549 "reduction chain broken by patterns.\n");
7550 return false;
7552 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7553 only_slp_reduc_chain = false;
7554 /* For epilogue generation live members of the chain need
7555 to point back to the PHI via their original stmt for
7556 info_for_reduction to work. For SLP we need to look at
7557 all lanes here - even though we will only vectorize from
7558 the SLP node with live lane zero, the other live lanes also
7559 need to be identified as part of a reduction to be able
7560 to skip code generation for them. */
7561 if (slp_for_stmt_info)
7563 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7564 if (STMT_VINFO_LIVE_P (s))
7565 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7567 else if (STMT_VINFO_LIVE_P (vdef))
7568 STMT_VINFO_REDUC_DEF (def) = phi_info;
7569 gimple_match_op op;
7570 if (!gimple_extract_op (vdef->stmt, &op))
7572 if (dump_enabled_p ())
7573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7574 "reduction chain includes unsupported"
7575 " statement type.\n");
7576 return false;
7578 if (CONVERT_EXPR_CODE_P (op.code))
7580 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7582 if (dump_enabled_p ())
7583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7584 "conversion in the reduction chain.\n");
7585 return false;
7588 else if (!stmt_info)
7589 /* First non-conversion stmt. */
7590 stmt_info = vdef;
7591 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7592 reduc_chain_length++;
7593 if (!stmt_info && slp_node)
7594 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7596 /* PHIs should not participate in patterns. */
7597 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7599 if (nested_in_vect_loop_p (loop, stmt_info))
7601 loop = loop->inner;
7602 nested_cycle = true;
7605 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7606 element. */
7607 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7609 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7610 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7612 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7613 gcc_assert (slp_node
7614 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7616 /* 1. Is vectorizable reduction? */
7617 /* Not supportable if the reduction variable is used in the loop, unless
7618 it's a reduction chain. */
7619 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7620 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7621 return false;
7623 /* Reductions that are not used even in an enclosing outer-loop,
7624 are expected to be "live" (used out of the loop). */
7625 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7626 && !STMT_VINFO_LIVE_P (stmt_info))
7627 return false;
7629 /* 2. Has this been recognized as a reduction pattern?
7631 Check if STMT represents a pattern that has been recognized
7632 in earlier analysis stages. For stmts that represent a pattern,
7633 the STMT_VINFO_RELATED_STMT field records the last stmt in
7634 the original sequence that constitutes the pattern. */
7636 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7637 if (orig_stmt_info)
7639 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7640 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7643 /* 3. Check the operands of the operation. The first operands are defined
7644 inside the loop body. The last operand is the reduction variable,
7645 which is defined by the loop-header-phi. */
7647 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7648 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7649 gimple_match_op op;
7650 if (!gimple_extract_op (stmt_info->stmt, &op))
7651 gcc_unreachable ();
7652 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7653 || op.code == WIDEN_SUM_EXPR
7654 || op.code == SAD_EXPR);
7656 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7657 && !SCALAR_FLOAT_TYPE_P (op.type))
7658 return false;
7660 /* Do not try to vectorize bit-precision reductions. */
7661 if (!type_has_mode_precision_p (op.type))
7662 return false;
7664 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7665 which means the only use of the reduction PHI may be in the lane-reducing operation. */
7666 if (lane_reduc_code_p
7667 && reduc_chain_length != 1
7668 && !only_slp_reduc_chain)
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7672 "lane-reducing reduction with extra stmts.\n");
7673 return false;
7676 /* All uses but the last are expected to be defined in the loop.
7677 The last use is the reduction variable. In case of nested cycle this
7678 assumption is not true: we use reduc_index to record the index of the
7679 reduction variable. */
7680 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7681 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7682 /* We need to skip an extra operand for COND_EXPRs with embedded
7683 comparison. */
7684 unsigned opno_adjust = 0;
7685 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7686 opno_adjust = 1;
7687 for (i = 0; i < (int) op.num_ops; i++)
7689 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7690 if (i == 0 && op.code == COND_EXPR)
7691 continue;
7693 stmt_vec_info def_stmt_info;
7694 enum vect_def_type dt;
7695 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7696 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7697 &vectype_op[i], &def_stmt_info))
7699 if (dump_enabled_p ())
7700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7701 "use not simple.\n");
7702 return false;
7704 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7705 continue;
7707 /* For an IFN_COND_OP we might hit the reduction definition operand
7708 twice (once as definition, once as else). */
7709 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7710 continue;
7712 /* There should be only one cycle def in the stmt, the one
7713 leading to reduc_def. */
7714 if (VECTORIZABLE_CYCLE_DEF (dt))
7715 return false;
7717 if (!vectype_op[i])
7718 vectype_op[i]
7719 = get_vectype_for_scalar_type (loop_vinfo,
7720 TREE_TYPE (op.ops[i]), slp_op[i]);
7722 /* To properly compute ncopies we are interested in the widest
7723 non-reduction input type in case we're looking at a widening
7724 accumulation that we later handle in vect_transform_reduction. */
7725 if (lane_reduc_code_p
7726 && vectype_op[i]
7727 && (!vectype_in
7728 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7729 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7730 vectype_in = vectype_op[i];
7732 if (op.code == COND_EXPR)
7734 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7735 if (dt == vect_constant_def)
7737 cond_reduc_dt = dt;
7738 cond_reduc_val = op.ops[i];
7740 if (dt == vect_induction_def
7741 && def_stmt_info
7742 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7744 cond_reduc_dt = dt;
7745 cond_stmt_vinfo = def_stmt_info;
7749 if (!vectype_in)
7750 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7751 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7753 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7754 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7755 /* If we have a condition reduction, see if we can simplify it further. */
7756 if (v_reduc_type == COND_REDUCTION)
7758 if (slp_node)
7759 return false;
7761 /* If the condition itself uses the reduction value, fail. */
7762 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7766 "condition depends on previous iteration\n");
7767 return false;
7770 if (reduc_chain_length == 1
7771 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7772 OPTIMIZE_FOR_SPEED)
7773 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7774 vectype_in,
7775 OPTIMIZE_FOR_SPEED)))
7777 if (dump_enabled_p ())
7778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7779 "optimizing condition reduction with"
7780 " FOLD_EXTRACT_LAST.\n");
7781 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7783 else if (cond_reduc_dt == vect_induction_def)
7785 tree base
7786 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7787 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7789 gcc_assert (TREE_CODE (base) == INTEGER_CST
7790 && TREE_CODE (step) == INTEGER_CST);
7791 cond_reduc_val = NULL_TREE;
7792 enum tree_code cond_reduc_op_code = ERROR_MARK;
7793 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7794 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7796 /* Find a suitable value: below BASE for MAX_EXPR and above BASE
7797 for MIN_EXPR; for now, punt if BASE is the minimum value of the
7798 type for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7799 else if (tree_int_cst_sgn (step) == -1)
7801 cond_reduc_op_code = MIN_EXPR;
7802 if (tree_int_cst_sgn (base) == -1)
7803 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7804 else if (tree_int_cst_lt (base,
7805 TYPE_MAX_VALUE (TREE_TYPE (base))))
7806 cond_reduc_val
7807 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7809 else
7811 cond_reduc_op_code = MAX_EXPR;
7812 if (tree_int_cst_sgn (base) == 1)
7813 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7814 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7815 base))
7816 cond_reduc_val
7817 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7819 if (cond_reduc_val)
7821 if (dump_enabled_p ())
7822 dump_printf_loc (MSG_NOTE, vect_location,
7823 "condition expression based on "
7824 "integer induction.\n");
7825 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7826 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7827 = cond_reduc_val;
7828 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7831 else if (cond_reduc_dt == vect_constant_def)
7833 enum vect_def_type cond_initial_dt;
7834 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7835 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7836 if (cond_initial_dt == vect_constant_def
7837 && types_compatible_p (TREE_TYPE (cond_initial_val),
7838 TREE_TYPE (cond_reduc_val)))
7840 tree e = fold_binary (LE_EXPR, boolean_type_node,
7841 cond_initial_val, cond_reduc_val);
7842 if (e && (integer_onep (e) || integer_zerop (e)))
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_NOTE, vect_location,
7846 "condition expression based on "
7847 "compile time constant.\n");
7848 /* Record reduction code at analysis stage. */
7849 STMT_VINFO_REDUC_CODE (reduc_info)
7850 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7851 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7857 if (STMT_VINFO_LIVE_P (phi_info))
7858 return false;
7860 if (slp_node)
7861 ncopies = 1;
7862 else
7863 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7865 gcc_assert (ncopies >= 1);
7867 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7869 if (nested_cycle)
7871 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7872 == vect_double_reduction_def);
7873 double_reduc = true;
7876 /* 4.2. Check support for the epilog operation.
7878 If STMT represents a reduction pattern, then the type of the
7879 reduction variable may be different than the type of the rest
7880 of the arguments. For example, consider the case of accumulation
7881 of shorts into an int accumulator. The original code:
7882 S1: int_a = (int) short_a;
7883 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7885 was replaced with:
7886 STMT: int_acc = widen_sum <short_a, int_acc>
7888 This means that:
7889 1. The tree-code that is used to create the vector operation in the
7890 epilog code (that reduces the partial results) is not the
7891 tree-code of STMT, but is rather the tree-code of the original
7892 stmt from the pattern that STMT is replacing. I.e, in the example
7893 above we want to use 'widen_sum' in the loop, but 'plus' in the
7894 epilog.
7895 2. The type (mode) we use to check available target support
7896 for the vector operation to be created in the *epilog*, is
7897 determined by the type of the reduction variable (in the example
7898 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7899 However the type (mode) we use to check available target support
7900 for the vector operation to be created *inside the loop*, is
7901 determined by the type of the other arguments to STMT (in the
7902 example we'd check this: optab_handler (widen_sum_optab,
7903 vect_short_mode)).
7905 This is contrary to "regular" reductions, in which the types of all
7906 the arguments are the same as the type of the reduction variable.
7907 For "regular" reductions we can therefore use the same vector type
7908 (and also the same tree-code) when generating the epilog code and
7909 when generating the code inside the loop. */
7911 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7913 /* Conversion might have created a conditional operation like
7914 IFN_COND_ADD already. Use the internal code for the following checks. */
7915 if (orig_code.is_internal_fn ())
7917 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7918 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7921 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7923 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7924 if (reduction_type == TREE_CODE_REDUCTION)
7926 /* Check whether it's ok to change the order of the computation.
7927 Generally, when vectorizing a reduction we change the order of the
7928 computation. This may change the behavior of the program in some
7929 cases, so we need to check that this is ok. One exception is when
7930 vectorizing an outer-loop: the inner-loop is executed sequentially,
7931 and therefore vectorizing reductions in the inner-loop during
7932 outer-loop vectorization is safe. Likewise, when we are vectorizing
7933 a series of reductions using SLP and the VF is one, the reductions
7934 are performed in scalar order. */
7935 if (slp_node
7936 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7937 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7939 else if (needs_fold_left_reduction_p (op.type, orig_code))
7941 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7942 is not directly used in the stmt. */
7943 if (!only_slp_reduc_chain
7944 && reduc_chain_length != 1)
7946 if (dump_enabled_p ())
7947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7948 "in-order reduction chain without SLP.\n");
7949 return false;
7951 STMT_VINFO_REDUC_TYPE (reduc_info)
7952 = reduction_type = FOLD_LEFT_REDUCTION;
7954 else if (!commutative_binary_op_p (orig_code, op.type)
7955 || !associative_binary_op_p (orig_code, op.type))
7957 if (dump_enabled_p ())
7958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7959 "reduction: not commutative/associative\n");
7960 return false;
7964 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7965 && ncopies > 1)
7967 if (dump_enabled_p ())
7968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7969 "multiple types in double reduction or condition "
7970 "reduction or fold-left reduction.\n");
7971 return false;
7974 internal_fn reduc_fn = IFN_LAST;
7975 if (reduction_type == TREE_CODE_REDUCTION
7976 || reduction_type == FOLD_LEFT_REDUCTION
7977 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7978 || reduction_type == CONST_COND_REDUCTION)
7980 if (reduction_type == FOLD_LEFT_REDUCTION
7981 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7982 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7984 if (reduc_fn != IFN_LAST
7985 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7986 OPTIMIZE_FOR_SPEED))
7988 if (dump_enabled_p ())
7989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7990 "reduc op not supported by target.\n");
7992 reduc_fn = IFN_LAST;
7995 else
7997 if (!nested_cycle || double_reduc)
7999 if (dump_enabled_p ())
8000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8001 "no reduc code for scalar code.\n");
8003 return false;
8007 else if (reduction_type == COND_REDUCTION)
8009 int scalar_precision
8010 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8011 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8012 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8013 vectype_out);
8015 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8016 OPTIMIZE_FOR_SPEED))
8017 reduc_fn = IFN_REDUC_MAX;
8019 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8021 if (reduction_type != EXTRACT_LAST_REDUCTION
8022 && (!nested_cycle || double_reduc)
8023 && reduc_fn == IFN_LAST
8024 && !nunits_out.is_constant ())
8026 if (dump_enabled_p ())
8027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8028 "missing target support for reduction on"
8029 " variable-length vectors.\n");
8030 return false;
8033 /* For SLP reductions, see if there is a neutral value we can use. */
8034 tree neutral_op = NULL_TREE;
8035 if (slp_node)
8037 tree initial_value = NULL_TREE;
8038 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8039 initial_value = vect_phi_initial_value (reduc_def_phi);
8040 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8041 orig_code, initial_value);
8044 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8046 /* We can't support in-order reductions of code such as this:
8048 for (int i = 0; i < n1; ++i)
8049 for (int j = 0; j < n2; ++j)
8050 l += a[j];
8052 since GCC effectively transforms the loop when vectorizing:
8054 for (int i = 0; i < n1 / VF; ++i)
8055 for (int j = 0; j < n2; ++j)
8056 for (int k = 0; k < VF; ++k)
8057 l += a[j];
8059 which is a reassociation of the original operation. */
8060 if (dump_enabled_p ())
8061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8062 "in-order double reduction not supported.\n");
8064 return false;
8067 if (reduction_type == FOLD_LEFT_REDUCTION
8068 && slp_node
8069 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8071 /* We cannot use in-order reductions in this case because there is
8072 an implicit reassociation of the operations involved. */
8073 if (dump_enabled_p ())
8074 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8075 "in-order unchained SLP reductions not supported.\n");
8076 return false;
8079 /* For double reductions, and for SLP reductions with a neutral value,
8080 we construct a variable-length initial vector by loading a vector
8081 full of the neutral value and then shift-and-inserting the start
8082 values into the low-numbered elements. */
8083 if ((double_reduc || neutral_op)
8084 && !nunits_out.is_constant ()
8085 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8086 vectype_out, OPTIMIZE_FOR_SPEED))
8088 if (dump_enabled_p ())
8089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8090 "reduction on variable-length vectors requires"
8091 " target support for a vector-shift-and-insert"
8092 " operation.\n");
8093 return false;
8096 /* Check extra constraints for variable-length unchained SLP reductions. */
8097 if (slp_node
8098 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8099 && !nunits_out.is_constant ())
8101 /* We checked above that we could build the initial vector when
8102 there's a neutral element value. Check here for the case in
8103 which each SLP statement has its own initial value and in which
8104 that value needs to be repeated for every instance of the
8105 statement within the initial vector. */
8106 unsigned int group_size = SLP_TREE_LANES (slp_node);
8107 if (!neutral_op
8108 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8109 TREE_TYPE (vectype_out)))
8111 if (dump_enabled_p ())
8112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113 "unsupported form of SLP reduction for"
8114 " variable-length vectors: cannot build"
8115 " initial vector.\n");
8116 return false;
8118 /* The epilogue code relies on the number of elements being a multiple
8119 of the group size. The duplicate-and-interleave approach to setting
8120 up the initial vector does too. */
8121 if (!multiple_p (nunits_out, group_size))
8123 if (dump_enabled_p ())
8124 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8125 "unsupported form of SLP reduction for"
8126 " variable-length vectors: the vector size"
8127 " is not a multiple of the number of results.\n");
8128 return false;
8132 if (reduction_type == COND_REDUCTION)
8134 widest_int ni;
8136 if (! max_loop_iterations (loop, &ni))
8138 if (dump_enabled_p ())
8139 dump_printf_loc (MSG_NOTE, vect_location,
8140 "loop count not known, cannot create cond "
8141 "reduction.\n");
8142 return false;
8144 /* Convert backedges to iterations. */
8145 ni += 1;
8147 /* The additional index will be the same type as the condition. Check
8148 that the loop count fits into this type less one (because we'll use up the
8149 zero slot for when there are no matches). */
8150 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8151 if (wi::geu_p (ni, wi::to_widest (max_index)))
8153 if (dump_enabled_p ())
8154 dump_printf_loc (MSG_NOTE, vect_location,
8155 "loop size is greater than data size.\n");
8156 return false;
8160 /* In case the vectorization factor (VF) is bigger than the number
8161 of elements that we can fit in a vectype (nunits), we have to generate
8162 more than one vector stmt, i.e., we need to "unroll" the
8163 vector stmt by a factor VF/nunits. For more details see documentation
8164 in vectorizable_operation. */
8166 /* If the reduction is used in an outer loop we need to generate
8167 VF intermediate results, like so (e.g. for ncopies=2):
8168 r0 = phi (init, r0)
8169 r1 = phi (init, r1)
8170 r0 = x0 + r0;
8171 r1 = x1 + r1;
8172 (i.e. we generate VF results in 2 registers).
8173 In this case we have a separate def-use cycle for each copy, and therefore
8174 for each copy we get the vector def for the reduction variable from the
8175 respective phi node created for this copy.
8177 Otherwise (the reduction is unused in the loop nest), we can combine
8178 together intermediate results, like so (e.g. for ncopies=2):
8179 r = phi (init, r)
8180 r = x0 + r;
8181 r = x1 + r;
8182 (i.e. we generate VF/2 results in a single register).
8183 In this case for each copy we get the vector def for the reduction variable
8184 from the vectorized reduction operation generated in the previous iteration.
8186 This only works when we see both the reduction PHI and its only consumer
8187 in vectorizable_reduction and there are no intermediate stmts
8188 participating. When unrolling we want each unrolled iteration to have its
8189 own reduction accumulator since one of the main goals of unrolling a
8190 reduction is to reduce the aggregate loop-carried latency. */
8191 if (ncopies > 1
8192 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8193 && reduc_chain_length == 1
8194 && loop_vinfo->suggested_unroll_factor == 1)
8195 single_defuse_cycle = true;
8197 if (single_defuse_cycle || lane_reduc_code_p)
8199 gcc_assert (op.code != COND_EXPR);
8201 /* 4. Supportable by target? */
8202 bool ok = true;
8204 /* 4.1. check support for the operation in the loop
8206 This isn't necessary for the lane reduction codes, since they
8207 can only be produced by pattern matching, and it's up to the
8208 pattern matcher to test for support. The main reason for
8209 specifically skipping this step is to avoid rechecking whether
8210 mixed-sign dot-products can be implemented using signed
8211 dot-products. */
8212 machine_mode vec_mode = TYPE_MODE (vectype_in);
8213 if (!lane_reduc_code_p
8214 && !directly_supported_p (op.code, vectype_in, optab_vector))
8216 if (dump_enabled_p ())
8217 dump_printf (MSG_NOTE, "op not supported by target.\n");
8218 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8219 || !vect_can_vectorize_without_simd_p (op.code))
8220 ok = false;
8221 else
8222 if (dump_enabled_p ())
8223 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8226 if (vect_emulated_vector_p (vectype_in)
8227 && !vect_can_vectorize_without_simd_p (op.code))
8229 if (dump_enabled_p ())
8230 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8231 return false;
8234 /* Lane-reducing operations have to go through vect_transform_reduction.
8235 For the other cases, try without the single-cycle optimization. */
8236 if (!ok)
8238 if (lane_reduc_code_p)
8239 return false;
8240 else
8241 single_defuse_cycle = false;
8244 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8246 /* If the reduction stmt is one of the patterns that have lane
8247 reduction embedded, we cannot handle the case of !single_defuse_cycle. */
8248 if ((ncopies > 1 && ! single_defuse_cycle)
8249 && lane_reduc_code_p)
8251 if (dump_enabled_p ())
8252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8253 "multi def-use cycle not possible for lane-reducing "
8254 "reduction operation\n");
8255 return false;
8258 if (slp_node
8259 && !(!single_defuse_cycle
8260 && !lane_reduc_code_p
8261 && reduction_type != FOLD_LEFT_REDUCTION))
8262 for (i = 0; i < (int) op.num_ops; i++)
8263 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8265 if (dump_enabled_p ())
8266 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8267 "incompatible vector types for invariants\n");
8268 return false;
8271 if (slp_node)
8272 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8273 else
8274 vec_num = 1;
8276 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8277 reduction_type, ncopies, cost_vec);
8278 /* Cost the reduction op inside the loop if transformed via
8279 vect_transform_reduction. Otherwise this is costed by the
8280 separate vectorizable_* routines. */
8281 if (single_defuse_cycle || lane_reduc_code_p)
8283 int factor = 1;
8284 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8285 /* Three dot-products and a subtraction. */
8286 factor = 4;
8287 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8288 stmt_info, 0, vect_body);
8291 if (dump_enabled_p ()
8292 && reduction_type == FOLD_LEFT_REDUCTION)
8293 dump_printf_loc (MSG_NOTE, vect_location,
8294 "using an in-order (fold-left) reduction.\n");
8295 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8296 /* All reductions but single defuse-cycle optimized, lane-reducing and
8297 fold-left ones go through their own vectorizable_* routines. */
8298 if (!single_defuse_cycle
8299 && !lane_reduc_code_p
8300 && reduction_type != FOLD_LEFT_REDUCTION)
8302 stmt_vec_info tem
8303 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8304 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8306 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8307 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8309 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8310 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8312 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8314 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8315 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8316 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8318 if (reduction_type != FOLD_LEFT_REDUCTION
8319 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8320 && (cond_fn == IFN_LAST
8321 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8322 OPTIMIZE_FOR_SPEED)))
8324 if (dump_enabled_p ())
8325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8326 "can't operate on partial vectors because"
8327 " no conditional operation is available.\n");
8328 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8330 else if (reduction_type == FOLD_LEFT_REDUCTION
8331 && reduc_fn == IFN_LAST
8332 && !expand_vec_cond_expr_p (vectype_in,
8333 truth_type_for (vectype_in),
8334 SSA_NAME))
8336 if (dump_enabled_p ())
8337 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8338 "can't operate on partial vectors because"
8339 " no conditional operation is available.\n");
8340 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8342 else if (reduction_type == FOLD_LEFT_REDUCTION
8343 && internal_fn_mask_index (reduc_fn) == -1
8344 && FLOAT_TYPE_P (vectype_in)
8345 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8347 if (dump_enabled_p ())
8348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8349 "can't operate on partial vectors because"
8350 " signed zeros cannot be preserved.\n");
8351 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8353 else
8355 internal_fn mask_reduc_fn
8356 = get_masked_reduction_fn (reduc_fn, vectype_in);
8358 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8359 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8360 vectype_in, 1);
8361 else
8362 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8363 vectype_in, NULL);
8366 return true;
8369 /* STMT_INFO is a dot-product reduction whose multiplication operands
8370 have different signs. Emit a sequence to emulate the operation
8371 using a series of signed DOT_PROD_EXPRs and return the last
8372 statement generated. VEC_DEST is the result of the vector operation
8373 and VOP lists its inputs. */
8375 static gassign *
8376 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8377 gimple_stmt_iterator *gsi, tree vec_dest,
8378 tree vop[3])
8380 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8381 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8382 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8383 gimple *new_stmt;
8385 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8386 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8387 std::swap (vop[0], vop[1]);
8389 /* Convert all inputs to signed types. */
8390 for (int i = 0; i < 3; ++i)
8391 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8393 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8394 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8395 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8396 vop[i] = tmp;
8399 /* In the comments below we assume 8-bit inputs for simplicity,
8400 but the approach works for any full integer type. */
8402 /* Create a vector of -128. */
8403 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8404 tree min_narrow = build_vector_from_val (narrow_vectype,
8405 min_narrow_elttype);
8407 /* Create a vector of 64. */
8408 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8409 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8410 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8412 /* Emit: SUB_RES = VOP[0] - 128. */
8413 tree sub_res = make_ssa_name (narrow_vectype);
8414 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8415 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8417 /* Emit:
8419 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8420 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8421 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8423 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8424 Doing the two 64 * y steps first allows more time to compute x. */
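/* A quick sanity check of the identity with illustrative values: for
   unsigned x = 200 and signed y = -3 we want x * y = -600, and indeed
   (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600, while
   200 - 128 = 72 now fits in the signed 8-bit range required by the
   signed DOT_PROD_EXPRs.  */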
8425 tree stage1 = make_ssa_name (wide_vectype);
8426 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8427 vop[1], half_narrow, vop[2]);
8428 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8430 tree stage2 = make_ssa_name (wide_vectype);
8431 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8432 vop[1], half_narrow, stage1);
8433 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8435 tree stage3 = make_ssa_name (wide_vectype);
8436 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8437 sub_res, vop[1], stage2);
8438 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8440 /* Convert STAGE3 to the reduction type. */
8441 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8444 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8445 value. */
8447 bool
8448 vect_transform_reduction (loop_vec_info loop_vinfo,
8449 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8450 gimple **vec_stmt, slp_tree slp_node)
8452 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8453 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8454 int i;
8455 int ncopies;
8456 int vec_num;
8458 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8459 gcc_assert (reduc_info->is_reduc_info);
8461 if (nested_in_vect_loop_p (loop, stmt_info))
8463 loop = loop->inner;
8464 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8467 gimple_match_op op;
8468 if (!gimple_extract_op (stmt_info->stmt, &op))
8469 gcc_unreachable ();
8471 /* All uses but the last are expected to be defined in the loop.
8472 The last use is the reduction variable. In case of nested cycle this
8473 assumption is not true: we use reduc_index to record the index of the
8474 reduction variable. */
8475 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8476 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8477 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8478 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8480 if (slp_node)
8482 ncopies = 1;
8483 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8485 else
8487 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8488 vec_num = 1;
8491 code_helper code = canonicalize_code (op.code, op.type);
8492 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8494 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8495 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8496 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8498 /* Transform. */
8499 tree new_temp = NULL_TREE;
8500 auto_vec<tree> vec_oprnds0;
8501 auto_vec<tree> vec_oprnds1;
8502 auto_vec<tree> vec_oprnds2;
8503 tree def0;
8505 if (dump_enabled_p ())
8506 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8508 /* FORNOW: Multiple types are not supported for condition. */
8509 if (code == COND_EXPR)
8510 gcc_assert (ncopies == 1);
8512 /* A binary COND_OP reduction must have the same definition and else
8513 value. */
8514 bool cond_fn_p = code.is_internal_fn ()
8515 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8516 if (cond_fn_p)
8518 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8519 || code == IFN_COND_MUL || code == IFN_COND_AND
8520 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8521 gcc_assert (op.num_ops == 4
8522 && (op.ops[reduc_index]
8523 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8526 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8528 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8529 if (reduction_type == FOLD_LEFT_REDUCTION)
8531 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8532 gcc_assert (code.is_tree_code () || cond_fn_p);
8533 return vectorize_fold_left_reduction
8534 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8535 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8536 reduc_index, masks, lens);
8539 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8540 gcc_assert (single_defuse_cycle
8541 || code == DOT_PROD_EXPR
8542 || code == WIDEN_SUM_EXPR
8543 || code == SAD_EXPR);
8545 /* Create the destination vector */
8546 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8547 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8549 /* Get NCOPIES vector definitions for all operands except the reduction
8550 definition. */
8551 if (!cond_fn_p)
8553 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8554 single_defuse_cycle && reduc_index == 0
8555 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8556 single_defuse_cycle && reduc_index == 1
8557 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8558 op.num_ops == 3
8559 && !(single_defuse_cycle && reduc_index == 2)
8560 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8562 else
8564 /* For a conditional operation pass the truth type as mask
8565 vectype. */
8566 gcc_assert (single_defuse_cycle
8567 && (reduc_index == 1 || reduc_index == 2));
8568 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8569 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8570 reduc_index == 1 ? NULL_TREE : op.ops[1],
8571 NULL_TREE, &vec_oprnds1,
8572 reduc_index == 2 ? NULL_TREE : op.ops[2],
8573 NULL_TREE, &vec_oprnds2);
8576 /* For single def-use cycles get one copy of the vectorized reduction
8577 definition. */
8578 if (single_defuse_cycle)
8580 gcc_assert (!slp_node);
8581 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8582 op.ops[reduc_index],
8583 reduc_index == 0 ? &vec_oprnds0
8584 : (reduc_index == 1 ? &vec_oprnds1
8585 : &vec_oprnds2));
8588 bool emulated_mixed_dot_prod
8589 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8590 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8592 gimple *new_stmt;
8593 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8594 if (masked_loop_p && !mask_by_cond_expr)
8596 /* No conditional ifns have been defined for dot-product yet. */
8597 gcc_assert (code != DOT_PROD_EXPR);
8599 /* Make sure that the reduction accumulator is vop[0]. */
8600 if (reduc_index == 1)
8602 gcc_assert (commutative_binary_op_p (code, op.type));
8603 std::swap (vop[0], vop[1]);
8605 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8606 vec_num * ncopies, vectype_in, i);
8607 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8608 vop[0], vop[1], vop[0]);
8609 new_temp = make_ssa_name (vec_dest, call);
8610 gimple_call_set_lhs (call, new_temp);
8611 gimple_call_set_nothrow (call, true);
8612 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8613 new_stmt = call;
8615 else
8617 if (op.num_ops >= 3)
8618 vop[2] = vec_oprnds2[i];
8620 if (masked_loop_p && mask_by_cond_expr)
8622 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8623 vec_num * ncopies, vectype_in, i);
8624 build_vect_cond_expr (code, vop, mask, gsi);
8627 if (emulated_mixed_dot_prod)
8628 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8629 vec_dest, vop);
8631 else if (code.is_internal_fn () && !cond_fn_p)
8632 new_stmt = gimple_build_call_internal (internal_fn (code),
8633 op.num_ops,
8634 vop[0], vop[1], vop[2]);
8635 else if (code.is_internal_fn () && cond_fn_p)
8636 new_stmt = gimple_build_call_internal (internal_fn (code),
8637 op.num_ops,
8638 vop[0], vop[1], vop[2],
8639 vop[1]);
8640 else
8641 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8642 vop[0], vop[1], vop[2]);
8643 new_temp = make_ssa_name (vec_dest, new_stmt);
8644 gimple_set_lhs (new_stmt, new_temp);
8645 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8648 if (slp_node)
8649 slp_node->push_vec_def (new_stmt);
8650 else if (single_defuse_cycle
8651 && i < ncopies - 1)
8653 if (reduc_index == 0)
8654 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8655 else if (reduc_index == 1)
8656 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8657 else if (reduc_index == 2)
8658 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8660 else
8661 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8664 if (!slp_node)
8665 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8667 return true;
8670 /* Transform phase of a cycle PHI. */
8672 bool
8673 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8674 stmt_vec_info stmt_info, gimple **vec_stmt,
8675 slp_tree slp_node, slp_instance slp_node_instance)
8677 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8678 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8679 int i;
8680 int ncopies;
8681 int j;
8682 bool nested_cycle = false;
8683 int vec_num;
8685 if (nested_in_vect_loop_p (loop, stmt_info))
8687 loop = loop->inner;
8688 nested_cycle = true;
8691 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8692 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8693 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8694 gcc_assert (reduc_info->is_reduc_info);
8696 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8697 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8698 /* Leave the scalar phi in place. */
8699 return true;
8701 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8702 /* For a nested cycle we do not fill the above. */
8703 if (!vectype_in)
8704 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8705 gcc_assert (vectype_in);
8707 if (slp_node)
8709 /* The size vect_schedule_slp_instance computes is off for us. */
8710 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8711 * SLP_TREE_LANES (slp_node), vectype_in);
8712 ncopies = 1;
8714 else
8716 vec_num = 1;
8717 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8720 /* Check whether we should use a single PHI node and accumulate
8721 vectors to one before the backedge. */
8722 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8723 ncopies = 1;
8725 /* Create the destination vector */
8726 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8727 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8728 vectype_out);
8730 /* Get the loop-entry arguments. */
8731 tree vec_initial_def = NULL_TREE;
8732 auto_vec<tree> vec_initial_defs;
8733 if (slp_node)
8735 vec_initial_defs.reserve (vec_num);
8736 if (nested_cycle)
8738 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8739 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8740 &vec_initial_defs);
8742 else
8744 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8745 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8746 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8748 unsigned int num_phis = stmts.length ();
8749 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8750 num_phis = 1;
8751 initial_values.reserve (num_phis);
8752 for (unsigned int i = 0; i < num_phis; ++i)
8754 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8755 initial_values.quick_push (vect_phi_initial_value (this_phi));
8757 if (vec_num == 1)
8758 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8759 if (!initial_values.is_empty ())
8761 tree initial_value
8762 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8763 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8764 tree neutral_op
8765 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8766 code, initial_value);
8767 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8768 &vec_initial_defs, vec_num,
8769 stmts.length (), neutral_op);
8773 else
8775 /* Get at the scalar def before the loop, that defines the initial
8776 value of the reduction variable. */
8777 tree initial_def = vect_phi_initial_value (phi);
8778 reduc_info->reduc_initial_values.safe_push (initial_def);
8779 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8780 and we can't use zero for induc_val, use initial_def. Similarly
8781 for REDUC_MIN and initial_def larger than the base. */
8782 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8784 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8785 if (TREE_CODE (initial_def) == INTEGER_CST
8786 && !integer_zerop (induc_val)
8787 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8788 && tree_int_cst_lt (initial_def, induc_val))
8789 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8790 && tree_int_cst_lt (induc_val, initial_def))))
8792 induc_val = initial_def;
8793 /* Communicate to epilogue generation that we used
8794 the initial_def. */
8795 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8797 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8799 else if (nested_cycle)
8801 /* Do not use an adjustment def as that case is not supported
8802 correctly if ncopies is not one. */
8803 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8804 ncopies, initial_def,
8805 &vec_initial_defs);
8807 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8808 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8809 /* Fill the initial vector with the initial scalar value. */
8810 vec_initial_def
8811 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8812 initial_def, initial_def);
8813 else
8815 if (ncopies == 1)
8816 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8817 if (!reduc_info->reduc_initial_values.is_empty ())
8819 initial_def = reduc_info->reduc_initial_values[0];
8820 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8821 tree neutral_op
8822 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8823 code, initial_def);
8824 gcc_assert (neutral_op);
8825 /* Try to simplify the vector initialization by applying an
8826 adjustment after the reduction has been performed. */
8827 if (!reduc_info->reused_accumulator
8828 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8829 && !operand_equal_p (neutral_op, initial_def))
8831 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8832 = initial_def;
8833 initial_def = neutral_op;
8835 vec_initial_def
8836 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8837 initial_def, neutral_op);
8842 if (vec_initial_def)
8844 vec_initial_defs.create (ncopies);
8845 for (i = 0; i < ncopies; ++i)
8846 vec_initial_defs.quick_push (vec_initial_def);
8849 if (auto *accumulator = reduc_info->reused_accumulator)
8851 tree def = accumulator->reduc_input;
8852 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8854 unsigned int nreduc;
8855 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8856 (TREE_TYPE (def)),
8857 TYPE_VECTOR_SUBPARTS (vectype_out),
8858 &nreduc);
8859 gcc_assert (res);
8860 gimple_seq stmts = NULL;
8861 /* Reduce the single vector to a smaller one. */
8862 if (nreduc != 1)
8864 /* Perform the reduction in the appropriate type. */
8865 tree rvectype = vectype_out;
8866 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8867 TREE_TYPE (TREE_TYPE (def))))
8868 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8869 TYPE_VECTOR_SUBPARTS
8870 (vectype_out));
8871 def = vect_create_partial_epilog (def, rvectype,
8872 STMT_VINFO_REDUC_CODE
8873 (reduc_info),
8874 &stmts);
8876 /* The epilogue loop might use a different vector mode, like
8877 VNx2DI vs. V2DI. */
8878 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8880 tree reduc_type = build_vector_type_for_mode
8881 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8882 def = gimple_convert (&stmts, reduc_type, def);
8884 /* Adjust the input so we pick up the partially reduced value
8885 for the skip edge in vect_create_epilog_for_reduction. */
8886 accumulator->reduc_input = def;
8887 /* And the reduction could be carried out using a different sign. */
8888 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8889 def = gimple_convert (&stmts, vectype_out, def);
8890 if (loop_vinfo->main_loop_edge)
8892 /* While we'd like to insert on the edge, this would split
8893 blocks and disturb bookkeeping, and we will eventually
8894 need this on the skip edge as well. Rely on sinking to
8895 fix up optimal placement and insert in the pred. */
8896 gimple_stmt_iterator gsi
8897 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8898 /* Insert before a cond that eventually skips the
8899 epilogue. */
8900 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8901 gsi_prev (&gsi);
8902 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8904 else
8905 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8906 stmts);
8908 if (loop_vinfo->main_loop_edge)
8909 vec_initial_defs[0]
8910 = vect_get_main_loop_result (loop_vinfo, def,
8911 vec_initial_defs[0]);
8912 else
8913 vec_initial_defs.safe_push (def);
8916 /* Generate the reduction PHIs upfront. */
8917 for (i = 0; i < vec_num; i++)
8919 tree vec_init_def = vec_initial_defs[i];
8920 for (j = 0; j < ncopies; j++)
8922 /* Create the reduction-phi that defines the reduction
8923 operand. */
8924 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8926 /* Set the loop-entry arg of the reduction-phi. */
8927 if (j != 0 && nested_cycle)
8928 vec_init_def = vec_initial_defs[j];
8929 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8930 UNKNOWN_LOCATION);
8932 /* The loop-latch arg is set in epilogue processing. */
8934 if (slp_node)
8935 slp_node->push_vec_def (new_phi);
8936 else
8938 if (j == 0)
8939 *vec_stmt = new_phi;
8940 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8945 return true;
8948 /* Vectorizes LC PHIs. */
8950 bool
8951 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8952 stmt_vec_info stmt_info, gimple **vec_stmt,
8953 slp_tree slp_node)
8955 if (!loop_vinfo
8956 || !is_a <gphi *> (stmt_info->stmt)
8957 || gimple_phi_num_args (stmt_info->stmt) != 1)
8958 return false;
8960 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8961 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8962 return false;
8964 if (!vec_stmt) /* transformation not required. */
8966 /* Deal with copies from externs or constants that are disguised as
8967 loop-closed PHI nodes (PR97886). */
8968 if (slp_node
8969 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8970 SLP_TREE_VECTYPE (slp_node)))
8972 if (dump_enabled_p ())
8973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8974 "incompatible vector types for invariants\n");
8975 return false;
8977 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8978 return true;
8981 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8982 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8983 basic_block bb = gimple_bb (stmt_info->stmt);
8984 edge e = single_pred_edge (bb);
8985 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8986 auto_vec<tree> vec_oprnds;
8987 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8988 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8989 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8990 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8992 /* Create the vectorized LC PHI node. */
8993 gphi *new_phi = create_phi_node (vec_dest, bb);
8994 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8995 if (slp_node)
8996 slp_node->push_vec_def (new_phi);
8997 else
8998 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9000 if (!slp_node)
9001 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9003 return true;
9006 /* Vectorizes PHIs. */
9008 bool
9009 vectorizable_phi (vec_info *,
9010 stmt_vec_info stmt_info, gimple **vec_stmt,
9011 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9013 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9014 return false;
9016 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9017 return false;
9019 tree vectype = SLP_TREE_VECTYPE (slp_node);
9021 if (!vec_stmt) /* transformation not required. */
9023 slp_tree child;
9024 unsigned i;
9025 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9026 if (!child)
9028 if (dump_enabled_p ())
9029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9030 "PHI node with unvectorized backedge def\n");
9031 return false;
9033 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9035 if (dump_enabled_p ())
9036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9037 "incompatible vector types for invariants\n");
9038 return false;
9040 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9041 && !useless_type_conversion_p (vectype,
9042 SLP_TREE_VECTYPE (child)))
9044 /* With bools we can have mask and non-mask precision vectors
9045 or different non-mask precisions. While pattern recog is
9046 supposed to guarantee consistency here, bugs in it can cause
9047 mismatches (PR103489 and PR103800 for example).
9048 Deal with them here instead of ICEing later. */
9049 if (dump_enabled_p ())
9050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9051 "incompatible vector type setup from "
9052 "bool pattern detection\n");
9053 return false;
9056 /* For single-argument PHIs assume coalescing which means zero cost
9057 for the scalar and the vector PHIs. This avoids artificially
9058 favoring the vector path (but may pessimize it in some cases). */
9059 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9060 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9061 vector_stmt, stmt_info, vectype, 0, vect_body);
9062 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9063 return true;
9066 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9067 basic_block bb = gimple_bb (stmt_info->stmt);
9068 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9069 auto_vec<gphi *> new_phis;
9070 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9072 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9074 /* Skip not yet vectorized defs. */
9075 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9076 && SLP_TREE_VEC_DEFS (child).is_empty ())
9077 continue;
9079 auto_vec<tree> vec_oprnds;
9080 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9081 if (!new_phis.exists ())
9083 new_phis.create (vec_oprnds.length ());
9084 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9086 /* Create the vectorized LC PHI node. */
9087 new_phis.quick_push (create_phi_node (vec_dest, bb));
9088 slp_node->push_vec_def (new_phis[j]);
9091 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9092 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9093 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9095 /* We should have at least one already vectorized child. */
9096 gcc_assert (new_phis.exists ());
9098 return true;
9101 /* Vectorizes first order recurrences. An overview of the transformation
9102 is described below. Suppose we have the following loop.
9104 int t = 0;
9105 for (int i = 0; i < n; ++i)
9107 b[i] = a[i] - t;
9108 t = a[i];
9111 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9112 looks (simplified) like:
9114 scalar.preheader:
9115 init = 0;
9117 scalar.body:
9118 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9119 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9120 _1 = a[i]
9121 b[i] = _1 - _2
9122 if (i < n) goto scalar.body
9124 In this example, _2 is a recurrence because its value depends on the
9125 previous iteration. We vectorize this as (VF = 4)
9127 vector.preheader:
9128 vect_init = vect_cst(..., ..., ..., 0)
9130 vector.body
9131 i = PHI <0(vector.preheader), i+4(vector.body)>
9132 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9133 vect_2 = a[i, i+1, i+2, i+3];
9134 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9135 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9136 if (..) goto vector.body
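Here the permute indices { 3, 4, 5, 6 } select the last lane of vect_1
followed by the first three lanes of vect_2, so vect_3 holds
{ t, a[i], a[i+1], a[i+2] }, i.e. exactly the values the scalar 't'
takes in the four corresponding scalar iterations.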
9138 In this function, vectorizable_recurr, we code generate both the
9139 vector PHI node and the permute since those together compute the
9140 vectorized value of the scalar PHI. We do not yet have the
9141 backedge value to fill in there nor into the vec_perm. Those
9142 are filled in maybe_set_vectorized_backedge_value and
9143 vect_schedule_scc.
9145 TODO: Since the scalar loop does not have a use of the recurrence
9146 outside of the loop, the natural way to implement peeling via
9147 vectorizing the live value doesn't work. For now peeling of loops
9148 with a recurrence is not implemented. For SLP the supported cases
9149 are restricted to those requiring a single vector recurrence PHI. */
9151 bool
9152 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9153 gimple **vec_stmt, slp_tree slp_node,
9154 stmt_vector_for_cost *cost_vec)
9156 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9157 return false;
9159 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9161 /* So far we only support first-order recurrence auto-vectorization. */
9162 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9163 return false;
9165 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9166 unsigned ncopies;
9167 if (slp_node)
9168 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9169 else
9170 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9171 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9172 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9173 /* We need to be able to make progress with a single vector. */
9174 if (maybe_gt (dist * 2, nunits))
9176 if (dump_enabled_p ())
9177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9178 "first order recurrence exceeds half of "
9179 "a vector\n");
9180 return false;
9183 /* First-order recurrence autovectorization needs to handle permutation
9184 with indices = [nunits-1, nunits, nunits+1, ...]. */
9185 vec_perm_builder sel (nunits, 1, 3);
9186 for (int i = 0; i < 3; ++i)
9187 sel.quick_push (nunits - dist + i);
9188 vec_perm_indices indices (sel, 2, nunits);
9190 if (!vec_stmt) /* transformation not required. */
9192 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9193 indices))
9194 return false;
9196 if (slp_node)
9198 /* We eventually need to set a vector type on invariant
9199 arguments. */
9200 unsigned j;
9201 slp_tree child;
9202 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9203 if (!vect_maybe_update_slp_op_vectype
9204 (child, SLP_TREE_VECTYPE (slp_node)))
9206 if (dump_enabled_p ())
9207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9208 "incompatible vector types for "
9209 "invariants\n");
9210 return false;
9213 /* The recurrence costs the initialization vector and one permute
9214 for each copy. */
9215 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9216 stmt_info, 0, vect_prologue);
9217 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9218 stmt_info, 0, vect_body);
9219 if (dump_enabled_p ())
9220 dump_printf_loc (MSG_NOTE, vect_location,
9221 "vectorizable_recurr: inside_cost = %d, "
9222 "prologue_cost = %d .\n", inside_cost,
9223 prologue_cost);
9225 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9226 return true;
9229 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9230 basic_block bb = gimple_bb (phi);
9231 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9232 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9234 gimple_seq stmts = NULL;
9235 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9236 gsi_insert_seq_on_edge_immediate (pe, stmts);
9238 tree vec_init = build_vector_from_val (vectype, preheader);
9239 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9241 /* Create the vectorized first-order PHI node. */
9242 tree vec_dest = vect_get_new_vect_var (vectype,
9243 vect_simple_var, "vec_recur_");
9244 gphi *new_phi = create_phi_node (vec_dest, bb);
9245 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9247 /* Insert the shuffles for the first-order recurrence autovectorization:
9248 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9249 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9251 /* Insert the required permute after the latch definition. The
9252 second and later operands are tentative and will be updated when we have
9253 vectorized the latch definition. */
9254 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9255 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9256 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9257 gsi_next (&gsi2);
9259 for (unsigned i = 0; i < ncopies; ++i)
9261 vec_dest = make_ssa_name (vectype);
9262 gassign *vperm
9263 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9264 i == 0 ? gimple_phi_result (new_phi) : NULL,
9265 NULL, perm);
9266 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9268 if (slp_node)
9269 slp_node->push_vec_def (vperm);
9270 else
9271 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9274 if (!slp_node)
9275 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9276 return true;
9279 /* Return true if VECTYPE represents a vector that requires lowering
9280 by the vector lowering pass. */
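/* For instance, a vector type whose TYPE_MODE is not a vector mode
   (because the target has no suitable vector mode and an integer mode
   is used instead) counts as "emulated" and must be lowered, while
   single-bit boolean (mask) vectors are excluded.  */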
9282 bool
9283 vect_emulated_vector_p (tree vectype)
9285 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9286 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9287 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9290 /* Return true if we can emulate CODE on an integer mode representation
9291 of a vector. */
9293 bool
9294 vect_can_vectorize_without_simd_p (tree_code code)
9296 switch (code)
9298 case PLUS_EXPR:
9299 case MINUS_EXPR:
9300 case NEGATE_EXPR:
9301 case BIT_AND_EXPR:
9302 case BIT_IOR_EXPR:
9303 case BIT_XOR_EXPR:
9304 case BIT_NOT_EXPR:
9305 return true;
9307 default:
9308 return false;
9312 /* Likewise, but taking a code_helper. */
9314 bool
9315 vect_can_vectorize_without_simd_p (code_helper code)
9317 return (code.is_tree_code ()
9318 && vect_can_vectorize_without_simd_p (tree_code (code)));
9321 /* Create vector init for vectorized iv. */
9322 static tree
9323 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9324 tree step_expr, poly_uint64 nunits,
9325 tree vectype,
9326 enum vect_induction_op_type induction_type)
9328 unsigned HOST_WIDE_INT const_nunits;
9329 tree vec_shift, vec_init, new_name;
9330 unsigned i;
9331 tree itype = TREE_TYPE (vectype);
9333 /* iv_loop is the loop to be vectorized. Create the first VF values of the
9334 IV, e.g. [X, X>>S, X>>2*S, X>>3*S] for shr (S = step_expr, X = init_expr). */
9335 new_name = gimple_convert (stmts, itype, init_expr);
9336 switch (induction_type)
9338 case vect_step_op_shr:
9339 case vect_step_op_shl:
9340 /* Build the initial value from per-lane shifts of init_expr. */
9341 vec_init = gimple_build_vector_from_val (stmts,
9342 vectype,
9343 new_name);
9344 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9345 build_zero_cst (itype), step_expr);
9346 vec_init = gimple_build (stmts,
9347 (induction_type == vect_step_op_shr
9348 ? RSHIFT_EXPR : LSHIFT_EXPR),
9349 vectype, vec_init, vec_shift);
9350 break;
9352 case vect_step_op_neg:
9354 vec_init = gimple_build_vector_from_val (stmts,
9355 vectype,
9356 new_name);
9357 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9358 vectype, vec_init);
9359 /* The encoding has 2 interleaved stepped patterns. */
9360 vec_perm_builder sel (nunits, 2, 3);
9361 sel.quick_grow (6);
9362 for (i = 0; i < 3; i++)
9364 sel[2 * i] = i;
9365 sel[2 * i + 1] = i + nunits;
9367 vec_perm_indices indices (sel, 2, nunits);
9368 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9369 fail when vec_init is a const vector. In that situation vec_perm is not
9370 really needed. */
9371 tree perm_mask_even
9372 = vect_gen_perm_mask_any (vectype, indices);
9373 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9374 vectype,
9375 vec_init, vec_neg,
9376 perm_mask_even);
9378 break;
9380 case vect_step_op_mul:
9382 /* Use an unsigned mult to avoid undefined behavior on integer overflow. */
9383 gcc_assert (nunits.is_constant (&const_nunits));
9384 tree utype = unsigned_type_for (itype);
9385 tree uvectype = build_vector_type (utype,
9386 TYPE_VECTOR_SUBPARTS (vectype));
9387 new_name = gimple_convert (stmts, utype, new_name);
9388 vec_init = gimple_build_vector_from_val (stmts,
9389 uvectype,
9390 new_name);
9391 tree_vector_builder elts (uvectype, const_nunits, 1);
9392 tree elt_step = build_one_cst (utype);
9394 elts.quick_push (elt_step);
9395 for (i = 1; i < const_nunits; i++)
9397 /* Create: new_name_i = pow (step_expr, i). */
9398 elt_step = gimple_build (stmts, MULT_EXPR,
9399 utype, elt_step, step_expr);
9400 elts.quick_push (elt_step);
9402 /* Create a vector from [new_name_0, new_name_1, ...,
9403 new_name_nunits-1]. */
9404 tree vec_mul = gimple_build_vector (stmts, &elts);
9405 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9406 vec_init, vec_mul);
9407 vec_init = gimple_convert (stmts, vectype, vec_init);
9409 break;
9411 default:
9412 gcc_unreachable ();
9415 return vec_init;
9418 /* Peel init_expr by skip_niter for induction_type. */
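/* For example (illustrative numbers only): peeling SKIP_NITERS = 4
   iterations of a multiplicative IV x *= 3 with initial value 5 yields
   the new initial value 5 * 3^4 = 405, computed modulo 2^precision
   below via mpz_powm; for shift IVs the accumulated shift amount is
   step * SKIP_NITERS, and for a negated IV only the parity of
   SKIP_NITERS matters.  */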
9419 tree
9420 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9421 tree skip_niters, tree step_expr,
9422 enum vect_induction_op_type induction_type)
9424 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9425 tree type = TREE_TYPE (init_expr);
9426 unsigned prec = TYPE_PRECISION (type);
9427 switch (induction_type)
9429 case vect_step_op_neg:
9430 if (TREE_INT_CST_LOW (skip_niters) % 2)
9431 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9432 /* else no change. */
9433 break;
9435 case vect_step_op_shr:
9436 case vect_step_op_shl:
9437 skip_niters = gimple_convert (stmts, type, skip_niters);
9438 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9439 /* When the shift amount >= precision, we need to avoid undefined behavior.
9440 In the original loop there is no such undefined behavior, and semantically
9441 init_expr becomes 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9442 if (!tree_fits_uhwi_p (step_expr)
9443 || tree_to_uhwi (step_expr) >= prec)
9445 if (induction_type == vect_step_op_shl
9446 || TYPE_UNSIGNED (type))
9447 init_expr = build_zero_cst (type);
9448 else
9449 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9450 init_expr,
9451 wide_int_to_tree (type, prec - 1));
9453 else
9454 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9455 ? RSHIFT_EXPR : LSHIFT_EXPR),
9456 type, init_expr, step_expr);
9457 break;
9459 case vect_step_op_mul:
9461 tree utype = unsigned_type_for (type);
9462 init_expr = gimple_convert (stmts, utype, init_expr);
9463 wide_int skipn = wi::to_wide (skip_niters);
9464 wide_int begin = wi::to_wide (step_expr);
9465 auto_mpz base, exp, mod, res;
9466 wi::to_mpz (begin, base, TYPE_SIGN (type));
9467 wi::to_mpz (skipn, exp, UNSIGNED);
9468 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9469 mpz_powm (res, base, exp, mod);
9470 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9471 tree mult_expr = wide_int_to_tree (utype, begin);
9472 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9473 init_expr, mult_expr);
9474 init_expr = gimple_convert (stmts, type, init_expr);
9476 break;
9478 default:
9479 gcc_unreachable ();
9482 return init_expr;
9485 /* Create vector step for vectorized iv. */
9486 static tree
9487 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9488 poly_uint64 vf,
9489 enum vect_induction_op_type induction_type)
9491 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9492 tree new_name = NULL;
9493 /* Step should be pow (step, vf) for mult induction. */
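/* E.g. (illustrative): for x *= 3 and VF = 4 each vector iteration
   multiplies every lane by 3^4 = 81; for shift IVs the per-iteration
   step is step * VF instead.  */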
9494 if (induction_type == vect_step_op_mul)
9496 gcc_assert (vf.is_constant ());
9497 wide_int begin = wi::to_wide (step_expr);
9499 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9500 begin = wi::mul (begin, wi::to_wide (step_expr));
9502 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9504 else if (induction_type == vect_step_op_neg)
9505 /* Do nothing. */
9507 else
9508 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9509 expr, step_expr);
9510 return new_name;
9513 static tree
9514 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9515 stmt_vec_info stmt_info,
9516 tree new_name, tree vectype,
9517 enum vect_induction_op_type induction_type)
9519 /* No step is needed for neg induction. */
9520 if (induction_type == vect_step_op_neg)
9521 return NULL;
9523 tree t = unshare_expr (new_name);
9524 gcc_assert (CONSTANT_CLASS_P (new_name)
9525 || TREE_CODE (new_name) == SSA_NAME);
9526 tree new_vec = build_vector_from_val (vectype, t);
9527 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9528 new_vec, vectype, NULL);
9529 return vec_step;
9532 /* Update vectorized iv with vect_step, induc_def is init. */
9533 static tree
9534 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9535 tree induc_def, tree vec_step,
9536 enum vect_induction_op_type induction_type)
9538 tree vec_def = induc_def;
9539 switch (induction_type)
9541 case vect_step_op_mul:
9543 /* Use an unsigned mult to avoid undefined behavior on integer overflow. */
9544 tree uvectype
9545 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9546 TYPE_VECTOR_SUBPARTS (vectype));
9547 vec_def = gimple_convert (stmts, uvectype, vec_def);
9548 vec_step = gimple_convert (stmts, uvectype, vec_step);
9549 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9550 vec_def, vec_step);
9551 vec_def = gimple_convert (stmts, vectype, vec_def);
9553 break;
9555 case vect_step_op_shr:
9556 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9557 vec_def, vec_step);
9558 break;
9560 case vect_step_op_shl:
9561 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9562 vec_def, vec_step);
9563 break;
9564 case vect_step_op_neg:
9565 vec_def = induc_def;
9566 /* Do nothing. */
9567 break;
9568 default:
9569 gcc_unreachable ();
9572 return vec_def;
9576 /* Function vectorizable_induction
9578 Check if STMT_INFO performs a nonlinear induction computation that can be
9579 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9580 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9581 basic block.
9582 Return true if STMT_INFO is vectorizable in this way. */
9584 static bool
9585 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9586 stmt_vec_info stmt_info,
9587 gimple **vec_stmt, slp_tree slp_node,
9588 stmt_vector_for_cost *cost_vec)
9590 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9591 unsigned ncopies;
9592 bool nested_in_vect_loop = false;
9593 class loop *iv_loop;
9594 tree vec_def;
9595 edge pe = loop_preheader_edge (loop);
9596 basic_block new_bb;
9597 tree vec_init, vec_step;
9598 tree new_name;
9599 gimple *new_stmt;
9600 gphi *induction_phi;
9601 tree induc_def, vec_dest;
9602 tree init_expr, step_expr;
9603 tree niters_skip;
9604 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9605 unsigned i;
9606 gimple_stmt_iterator si;
9608 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9610 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9611 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9612 enum vect_induction_op_type induction_type
9613 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9615 gcc_assert (induction_type > vect_step_op_add);
9617 if (slp_node)
9618 ncopies = 1;
9619 else
9620 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9621 gcc_assert (ncopies >= 1);
9623 /* FORNOW. Only handle nonlinear induction in the same loop. */
9624 if (nested_in_vect_loop_p (loop, stmt_info))
9626 if (dump_enabled_p ())
9627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9628 "nonlinear induction in nested loop.\n");
9629 return false;
9632 iv_loop = loop;
9633 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9635 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9636 update for each iv and a permutation to generate the wanted vector iv. */
9637 if (slp_node)
9639 if (dump_enabled_p ())
9640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9641 "SLP induction not supported for nonlinear"
9642 " induction.\n");
9643 return false;
9646 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9648 if (dump_enabled_p ())
9649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9650 "floating point nonlinear induction vectorization"
9651 " not supported.\n");
9652 return false;
9655 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9656 init_expr = vect_phi_initial_value (phi);
9657 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9658 && TREE_CODE (step_expr) == INTEGER_CST);
9659 /* step_expr should be aligned with init_expr,
9660 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9661 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9663 if (TREE_CODE (init_expr) == INTEGER_CST)
9664 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9665 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9667 /* INIT_EXPR could be a bit_field, bail out for such case. */
9668 if (dump_enabled_p ())
9669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670 "nonlinear induction vectorization failed:"
9671 " component type of vectype is not a nop conversion"
9672 " from type of init_expr.\n");
9673 return false;
9676 switch (induction_type)
9678 case vect_step_op_neg:
9679 if (TREE_CODE (init_expr) != INTEGER_CST
9680 && TREE_CODE (init_expr) != REAL_CST)
9682 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9683 if (!directly_supported_p (NEGATE_EXPR, vectype))
9684 return false;
9686 /* The encoding has 2 interleaved stepped patterns. */
9687 vec_perm_builder sel (nunits, 2, 3);
9688 machine_mode mode = TYPE_MODE (vectype);
9689 sel.quick_grow (6);
9690 for (i = 0; i < 3; i++)
9692 sel[i * 2] = i;
9693 sel[i * 2 + 1] = i + nunits;
9695 vec_perm_indices indices (sel, 2, nunits);
9696 if (!can_vec_perm_const_p (mode, mode, indices))
9697 return false;
9699 break;
9701 case vect_step_op_mul:
9703 /* Check for backend support of MULT_EXPR. */
9704 if (!directly_supported_p (MULT_EXPR, vectype))
9705 return false;
9707 /* ??? How to construct the vector step for variable-length vectors:
9708 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9709 if (!vf.is_constant ())
9710 return false;
9712 break;
9714 case vect_step_op_shr:
9715 /* Check for backend support of RSHIFT_EXPR. */
9716 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9717 return false;
9719 /* Don't shift more than the type precision, to avoid undefined behavior. */
9720 if (!tree_fits_uhwi_p (step_expr)
9721 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9722 TYPE_PRECISION (TREE_TYPE (init_expr))))
9723 return false;
9724 break;
9726 case vect_step_op_shl:
9727 /* Check for backend support of LSHIFT_EXPR. */
9728 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9729 return false;
9731 /* Don't shift more than the type precision, to avoid undefined behavior. */
9732 if (!tree_fits_uhwi_p (step_expr)
9733 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9734 TYPE_PRECISION (TREE_TYPE (init_expr))))
9735 return false;
9737 break;
9739 default:
9740 gcc_unreachable ();
9743 if (!vec_stmt) /* transformation not required. */
9745 unsigned inside_cost = 0, prologue_cost = 0;
9746 /* loop cost for vec_loop. */
9748 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9749 stmt_info, 0, vect_body);
9751 /* Neg induction doesn't have any
9752 inside_cost. */
9753 if (induction_type == vect_step_op_neg)
9754 inside_cost = 0;
9756 /* prologue cost for vec_init and vec_step. */
9757 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9758 stmt_info, 0, vect_prologue);
9760 if (dump_enabled_p ())
9761 dump_printf_loc (MSG_NOTE, vect_location,
9762 "vect_model_induction_cost: inside_cost = %d, "
9763 "prologue_cost = %d. \n", inside_cost,
9764 prologue_cost);
9766 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9767 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9768 return true;
9771 /* Transform. */
9773 /* Compute a vector variable, initialized with the first VF values of
9774 the induction variable. E.g., for an iv with IV_PHI='X' and
9775 evolution S, for a vector of 4 units, we want to compute the
9776 nonlinear analogue of [X, X + S, X + 2*S, X + 3*S]. */
9778 if (dump_enabled_p ())
9779 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9781 pe = loop_preheader_edge (iv_loop);
9782 /* Find the first insertion point in the BB. */
9783 basic_block bb = gimple_bb (phi);
9784 si = gsi_after_labels (bb);
9786 gimple_seq stmts = NULL;
9788 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9789 /* If we are using the loop mask to "peel" for alignment then we need
9790 to adjust the start value here. */
9791 if (niters_skip != NULL_TREE)
9792 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9793 step_expr, induction_type);
9795 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9796 step_expr, nunits, vectype,
9797 induction_type);
9798 if (stmts)
9800 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9801 gcc_assert (!new_bb);
9804 stmts = NULL;
9805 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9806 vf, induction_type);
9807 if (stmts)
9809 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9810 gcc_assert (!new_bb);
9813 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9814 new_name, vectype,
9815 induction_type);
9816 /* Create the following def-use cycle:
9817 loop prolog:
9818 vec_init = ...
9819 vec_step = ...
9820 loop:
9821 vec_iv = PHI <vec_init, vec_loop>
9823 STMT
9825 vec_loop = vec_iv OP vec_step; where OP is the nonlinear update (*, >> or <<). */
9827 /* Create the induction-phi that defines the induction-operand. */
9828 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9829 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9830 induc_def = PHI_RESULT (induction_phi);
9832 /* Create the iv update inside the loop. */
9833 stmts = NULL;
9834 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9835 induc_def, vec_step,
9836 induction_type);
9838 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9839 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9841 /* Set the arguments of the phi node: */
9842 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9843 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9844 UNKNOWN_LOCATION);
9846 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9847 *vec_stmt = induction_phi;
9849 /* In case the vectorization factor (VF) is bigger than the number
9850 of elements that we can fit in a vectype (nunits), we have to generate
9851 more than one vector stmt, i.e. we need to "unroll" the
9852 vector stmt by a factor VF/nunits. For more details see documentation
9853 in vectorizable_operation. */
9855 if (ncopies > 1)
9857 stmts = NULL;
9858 /* FORNOW. This restriction should be relaxed. */
9859 gcc_assert (!nested_in_vect_loop);
9861 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9862 nunits, induction_type);
9864 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9865 new_name, vectype,
9866 induction_type);
9867 vec_def = induc_def;
9868 for (i = 1; i < ncopies; i++)
9870 /* vec_i = vec_prev + vec_step. */
9871 stmts = NULL;
9872 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9873 vec_def, vec_step,
9874 induction_type);
9875 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9876 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9877 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9881 if (dump_enabled_p ())
9882 dump_printf_loc (MSG_NOTE, vect_location,
9883 "transform induction: created def-use cycle: %G%G",
9884 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9886 return true;
9889 /* Function vectorizable_induction
9891 Check if STMT_INFO performs an induction computation that can be vectorized.
9892 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9893 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9894 Return true if STMT_INFO is vectorizable in this way. */
9896 bool
9897 vectorizable_induction (loop_vec_info loop_vinfo,
9898 stmt_vec_info stmt_info,
9899 gimple **vec_stmt, slp_tree slp_node,
9900 stmt_vector_for_cost *cost_vec)
9902 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9903 unsigned ncopies;
9904 bool nested_in_vect_loop = false;
9905 class loop *iv_loop;
9906 tree vec_def;
9907 edge pe = loop_preheader_edge (loop);
9908 basic_block new_bb;
9909 tree new_vec, vec_init, vec_step, t;
9910 tree new_name;
9911 gimple *new_stmt;
9912 gphi *induction_phi;
9913 tree induc_def, vec_dest;
9914 tree init_expr, step_expr;
9915 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9916 unsigned i;
9917 tree expr;
9918 gimple_stmt_iterator si;
9919 enum vect_induction_op_type induction_type
9920 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9922 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9923 if (!phi)
9924 return false;
9926 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9927 return false;
9929 /* Make sure it was recognized as induction computation. */
9930 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9931 return false;
9933 /* Handle nonlinear induction in a separate place. */
9934 if (induction_type != vect_step_op_add)
9935 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9936 vec_stmt, slp_node, cost_vec);
9938 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9939 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9941 if (slp_node)
9942 ncopies = 1;
9943 else
9944 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9945 gcc_assert (ncopies >= 1);
9947 /* FORNOW. These restrictions should be relaxed. */
9948 if (nested_in_vect_loop_p (loop, stmt_info))
9950 imm_use_iterator imm_iter;
9951 use_operand_p use_p;
9952 gimple *exit_phi;
9953 edge latch_e;
9954 tree loop_arg;
9956 if (ncopies > 1)
9958 if (dump_enabled_p ())
9959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9960 "multiple types in nested loop.\n");
9961 return false;
9964 exit_phi = NULL;
9965 latch_e = loop_latch_edge (loop->inner);
9966 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9967 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9969 gimple *use_stmt = USE_STMT (use_p);
9970 if (is_gimple_debug (use_stmt))
9971 continue;
9973 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9975 exit_phi = use_stmt;
9976 break;
9979 if (exit_phi)
9981 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9982 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9983 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9985 if (dump_enabled_p ())
9986 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9987 "inner-loop induction only used outside "
9988 "of the outer vectorized loop.\n");
9989 return false;
9993 nested_in_vect_loop = true;
9994 iv_loop = loop->inner;
9996 else
9997 iv_loop = loop;
9998 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10000 if (slp_node && !nunits.is_constant ())
10002 /* The current SLP code creates the step value element-by-element. */
10003 if (dump_enabled_p ())
10004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10005 "SLP induction not supported for variable-length"
10006 " vectors.\n");
10007 return false;
10010 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10012 if (dump_enabled_p ())
10013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10014 "floating point induction vectorization disabled\n");
10015 return false;
10018 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10019 gcc_assert (step_expr != NULL_TREE);
10020 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10022 /* Check for backend support of PLUS/MINUS_EXPR. */
10023 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10024 || !directly_supported_p (MINUS_EXPR, step_vectype))
10025 return false;
10027 if (!vec_stmt) /* transformation not required. */
10029 unsigned inside_cost = 0, prologue_cost = 0;
10030 if (slp_node)
10032 /* We eventually need to set a vector type on invariant
10033 arguments. */
10034 unsigned j;
10035 slp_tree child;
10036 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10037 if (!vect_maybe_update_slp_op_vectype
10038 (child, SLP_TREE_VECTYPE (slp_node)))
10040 if (dump_enabled_p ())
10041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10042 "incompatible vector types for "
10043 "invariants\n");
10044 return false;
10046 /* loop cost for vec_loop. */
10047 inside_cost
10048 = record_stmt_cost (cost_vec,
10049 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10050 vector_stmt, stmt_info, 0, vect_body);
10051 /* prologue cost for vec_init (if not nested) and step. */
10052 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10053 scalar_to_vec,
10054 stmt_info, 0, vect_prologue);
10056 else /* if (!slp_node) */
10058 /* loop cost for vec_loop. */
10059 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10060 stmt_info, 0, vect_body);
10061 /* prologue cost for vec_init and vec_step. */
10062 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10063 stmt_info, 0, vect_prologue);
10065 if (dump_enabled_p ())
10066 dump_printf_loc (MSG_NOTE, vect_location,
10067 "vect_model_induction_cost: inside_cost = %d, "
10068 "prologue_cost = %d .\n", inside_cost,
10069 prologue_cost);
10071 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10072 DUMP_VECT_SCOPE ("vectorizable_induction");
10073 return true;
10076 /* Transform. */
10078 /* Compute a vector variable, initialized with the first VF values of
10079 the induction variable. E.g., for an iv with IV_PHI='X' and
10080 evolution S, for a vector of 4 units, we want to compute:
10081 [X, X + S, X + 2*S, X + 3*S]. */
10083 if (dump_enabled_p ())
10084 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10086 pe = loop_preheader_edge (iv_loop);
10087 /* Find the first insertion point in the BB. */
10088 basic_block bb = gimple_bb (phi);
10089 si = gsi_after_labels (bb);
10091 /* For SLP induction we have to generate several IVs; for example,
10092 with group size 3 we need
10093 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10094 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10095 if (slp_node)
10097 /* Enforced above. */
10098 unsigned int const_nunits = nunits.to_constant ();
10100 /* The initial values are vectorized, but any lanes > group_size
10101 need adjustment. */
10102 slp_tree init_node
10103 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10105 /* Gather steps. Since we do not vectorize inductions as
10106 cycles we have to reconstruct the step from SCEV data. */
10107 unsigned group_size = SLP_TREE_LANES (slp_node);
10108 tree *steps = XALLOCAVEC (tree, group_size);
10109 tree *inits = XALLOCAVEC (tree, group_size);
10110 stmt_vec_info phi_info;
10111 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10113 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10114 if (!init_node)
10115 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10116 pe->dest_idx);
10119 /* Now generate the IVs. */
10120 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10121 gcc_assert ((const_nunits * nvects) % group_size == 0);
10122 unsigned nivs;
10123 if (nested_in_vect_loop)
10124 nivs = nvects;
10125 else
10127 /* Compute the number of distinct IVs we need. First reduce
10128 group_size if it is a multiple of const_nunits so we get
10129 one IV for a group_size of 4 but const_nunits 2. */
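/* For example, group_size 8 with const_nunits 4 reduces to
   group_sizep 2 and thus a single IV, while group_size 6 with
   const_nunits 4 is not reduced and needs
   lcm (6, 4) / 4 == 3 IVs.  */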
10130 unsigned group_sizep = group_size;
10131 if (group_sizep % const_nunits == 0)
10132 group_sizep = group_sizep / const_nunits;
10133 nivs = least_common_multiple (group_sizep,
10134 const_nunits) / const_nunits;
10136 tree stept = TREE_TYPE (step_vectype);
10137 tree lupdate_mul = NULL_TREE;
10138 if (!nested_in_vect_loop)
10140 /* The number of iterations covered in one vector iteration. */
10141 unsigned lup_mul = (nvects * const_nunits) / group_size;
10142 lupdate_mul
10143 = build_vector_from_val (step_vectype,
10144 SCALAR_FLOAT_TYPE_P (stept)
10145 ? build_real_from_wide (stept, lup_mul,
10146 UNSIGNED)
10147 : build_int_cstu (stept, lup_mul));
10149 tree peel_mul = NULL_TREE;
10150 gimple_seq init_stmts = NULL;
10151 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10153 if (SCALAR_FLOAT_TYPE_P (stept))
10154 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10155 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10156 else
10157 peel_mul = gimple_convert (&init_stmts, stept,
10158 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10159 peel_mul = gimple_build_vector_from_val (&init_stmts,
10160 step_vectype, peel_mul);
10162 unsigned ivn;
10163 auto_vec<tree> vec_steps;
10164 for (ivn = 0; ivn < nivs; ++ivn)
10166 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10167 tree_vector_builder init_elts (vectype, const_nunits, 1);
10168 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10169 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10171 /* The scalar steps of the IVs. */
10172 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10173 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10174 step_elts.quick_push (elt);
10175 if (!init_node)
10177 /* The scalar inits of the IVs if not vectorized. */
10178 elt = inits[(ivn*const_nunits + eltn) % group_size];
10179 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10180 TREE_TYPE (elt)))
10181 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10182 TREE_TYPE (vectype), elt);
10183 init_elts.quick_push (elt);
10185 /* The number of steps to add to the initial values. */
10186 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10187 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10188 ? build_real_from_wide (stept,
10189 mul_elt, UNSIGNED)
10190 : build_int_cstu (stept, mul_elt));
10192 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10193 vec_steps.safe_push (vec_step);
10194 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10195 if (peel_mul)
10196 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10197 step_mul, peel_mul);
10198 if (!init_node)
10199 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10201 /* Create the induction-phi that defines the induction-operand. */
10202 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10203 "vec_iv_");
10204 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10205 induc_def = PHI_RESULT (induction_phi);
10207 /* Create the iv update inside the loop */
10208 tree up = vec_step;
10209 if (lupdate_mul)
10210 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10211 vec_step, lupdate_mul);
10212 gimple_seq stmts = NULL;
10213 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10214 vec_def = gimple_build (&stmts,
10215 PLUS_EXPR, step_vectype, vec_def, up);
10216 vec_def = gimple_convert (&stmts, vectype, vec_def);
10217 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10218 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10219 UNKNOWN_LOCATION);
10221 if (init_node)
10222 vec_init = vect_get_slp_vect_def (init_node, ivn);
10223 if (!nested_in_vect_loop
10224 && !integer_zerop (step_mul))
10226 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10227 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10228 vec_step, step_mul);
10229 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10230 vec_def, up);
10231 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10234 /* Set the arguments of the phi node: */
10235 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10237 slp_node->push_vec_def (induction_phi);
10239 if (!nested_in_vect_loop)
10241 /* Fill up to the number of vectors we need for the whole group. */
10242 nivs = least_common_multiple (group_size,
10243 const_nunits) / const_nunits;
10244 vec_steps.reserve (nivs-ivn);
10245 for (; ivn < nivs; ++ivn)
10247 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10248 vec_steps.quick_push (vec_steps[0]);
10252 /* Re-use IVs when we can. We are generating further vector
10253 stmts by adding VF' * stride to the IVs generated above. */
10254 if (ivn < nvects)
10256 unsigned vfp
10257 = least_common_multiple (group_size, const_nunits) / group_size;
10258 tree lupdate_mul
10259 = build_vector_from_val (step_vectype,
10260 SCALAR_FLOAT_TYPE_P (stept)
10261 ? build_real_from_wide (stept,
10262 vfp, UNSIGNED)
10263 : build_int_cstu (stept, vfp));
10264 for (; ivn < nvects; ++ivn)
10266 gimple *iv
10267 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10268 tree def = gimple_get_lhs (iv);
10269 if (ivn < 2*nivs)
10270 vec_steps[ivn - nivs]
10271 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10272 vec_steps[ivn - nivs], lupdate_mul);
10273 gimple_seq stmts = NULL;
10274 def = gimple_convert (&stmts, step_vectype, def);
10275 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10276 def, vec_steps[ivn % nivs]);
10277 def = gimple_convert (&stmts, vectype, def);
10278 if (gimple_code (iv) == GIMPLE_PHI)
10279 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10280 else
10282 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10283 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10285 slp_node->push_vec_def (def);
10289 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10290 gcc_assert (!new_bb);
10292 return true;
10295 init_expr = vect_phi_initial_value (phi);
10297 gimple_seq stmts = NULL;
10298 if (!nested_in_vect_loop)
10300 /* Convert the initial value to the IV update type. */
10301 tree new_type = TREE_TYPE (step_expr);
10302 init_expr = gimple_convert (&stmts, new_type, init_expr);
10304 /* If we are using the loop mask to "peel" for alignment then we need
10305 to adjust the start value here. */
10306 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10307 if (skip_niters != NULL_TREE)
10309 if (FLOAT_TYPE_P (vectype))
10310 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10311 skip_niters);
10312 else
10313 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10314 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10315 skip_niters, step_expr);
10316 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10317 init_expr, skip_step);
10321 if (stmts)
10323 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10324 gcc_assert (!new_bb);
10327 /* Create the vector that holds the initial_value of the induction. */
10328 if (nested_in_vect_loop)
10330 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10331 been created during vectorization of previous stmts. We obtain it
10332 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10333 auto_vec<tree> vec_inits;
10334 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10335 init_expr, &vec_inits);
10336 vec_init = vec_inits[0];
10337 /* If the initial value is not of proper type, convert it. */
10338 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10340 new_stmt
10341 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10342 vect_simple_var,
10343 "vec_iv_"),
10344 VIEW_CONVERT_EXPR,
10345 build1 (VIEW_CONVERT_EXPR, vectype,
10346 vec_init));
10347 vec_init = gimple_assign_lhs (new_stmt);
10348 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10349 new_stmt);
10350 gcc_assert (!new_bb);
10353 else
10355 /* iv_loop is the loop to be vectorized. Create:
10356 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10357 stmts = NULL;
10358 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10360 unsigned HOST_WIDE_INT const_nunits;
10361 if (nunits.is_constant (&const_nunits))
10363 tree_vector_builder elts (step_vectype, const_nunits, 1);
10364 elts.quick_push (new_name);
10365 for (i = 1; i < const_nunits; i++)
10367 /* Create: new_name_i = new_name + step_expr */
10368 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10369 new_name, step_expr);
10370 elts.quick_push (new_name);
10372 /* Create a vector from [new_name_0, new_name_1, ...,
10373 new_name_nunits-1] */
10374 vec_init = gimple_build_vector (&stmts, &elts);
10376 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10377 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10378 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10379 new_name, step_expr);
10380 else
10382 /* Build:
10383 [base, base, base, ...]
10384 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10385 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10386 gcc_assert (flag_associative_math);
10387 tree index = build_index_vector (step_vectype, 0, 1);
10388 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10389 new_name);
10390 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10391 step_expr);
10392 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10393 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10394 vec_init, step_vec);
10395 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10396 vec_init, base_vec);
10398 vec_init = gimple_convert (&stmts, vectype, vec_init);
10400 if (stmts)
10402 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10403 gcc_assert (!new_bb);
10408 /* Create the vector that holds the step of the induction. */
10409 gimple_stmt_iterator *step_iv_si = NULL;
10410 if (nested_in_vect_loop)
10411 /* iv_loop is nested in the loop to be vectorized. Generate:
10412 vec_step = [S, S, S, S] */
10413 new_name = step_expr;
10414 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10416 /* When we're using the loop_len produced by SELECT_VL, the non-final
10417 iterations are not always processing VF elements. So vectorize the
10418 induction variable instead of
10420 _21 = vect_vec_iv_.6_22 + { VF, ... };
10422 We should generate:
10424 _35 = .SELECT_VL (ivtmp_33, VF);
10425 vect_cst__22 = [vec_duplicate_expr] _35;
10426 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10427 gcc_assert (!slp_node);
10428 gimple_seq seq = NULL;
10429 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10430 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10431 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10432 unshare_expr (len)),
10433 &seq, true, NULL_TREE);
10434 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10435 step_expr);
10436 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10437 step_iv_si = &si;
10439 else
10441 /* iv_loop is the loop to be vectorized. Generate:
10442 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10443 gimple_seq seq = NULL;
10444 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10446 expr = build_int_cst (integer_type_node, vf);
10447 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10449 else
10450 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10451 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10452 expr, step_expr);
10453 if (seq)
10455 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10456 gcc_assert (!new_bb);
10460 t = unshare_expr (new_name);
10461 gcc_assert (CONSTANT_CLASS_P (new_name)
10462 || TREE_CODE (new_name) == SSA_NAME);
10463 new_vec = build_vector_from_val (step_vectype, t);
10464 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10465 new_vec, step_vectype, step_iv_si);
10468 /* Create the following def-use cycle:
10469 loop prolog:
10470 vec_init = ...
10471 vec_step = ...
10472 loop:
10473 vec_iv = PHI <vec_init, vec_loop>
10475 STMT
10477 vec_loop = vec_iv + vec_step; */
10479 /* Create the induction-phi that defines the induction-operand. */
10480 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10481 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10482 induc_def = PHI_RESULT (induction_phi);
10484 /* Create the iv update inside the loop */
10485 stmts = NULL;
10486 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10487 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10488 vec_def = gimple_convert (&stmts, vectype, vec_def);
10489 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10490 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10492 /* Set the arguments of the phi node: */
10493 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10494 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10495 UNKNOWN_LOCATION);
10497 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10498 *vec_stmt = induction_phi;
10500 /* In case the vectorization factor (VF) is bigger than the number
10501 of elements that we can fit in a vectype (nunits), we have to generate
10502 more than one vector stmt, i.e. we need to "unroll" the
10503 vector stmt by a factor of VF/nunits. For more details see the
10504 documentation in vectorizable_operation. */
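/* For example, with VF 8 and nunits 4 we have ncopies 2: besides the
   induction PHI itself we emit vec_1 = vec_iv + 4*S as the second copy
   and vec_2 = vec_1 + 4*S, which becomes the latch value of the PHI.  */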
10506 if (ncopies > 1)
10508 gimple_seq seq = NULL;
10509 /* FORNOW. This restriction should be relaxed. */
10510 gcc_assert (!nested_in_vect_loop);
10511 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10512 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10514 /* Create the vector that holds the step of the induction. */
10515 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10517 expr = build_int_cst (integer_type_node, nunits);
10518 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10520 else
10521 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10522 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10523 expr, step_expr);
10524 if (seq)
10526 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10527 gcc_assert (!new_bb);
10530 t = unshare_expr (new_name);
10531 gcc_assert (CONSTANT_CLASS_P (new_name)
10532 || TREE_CODE (new_name) == SSA_NAME);
10533 new_vec = build_vector_from_val (step_vectype, t);
10534 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10535 new_vec, step_vectype, NULL);
10537 vec_def = induc_def;
10538 for (i = 1; i < ncopies + 1; i++)
10540 /* vec_i = vec_prev + vec_step */
10541 gimple_seq stmts = NULL;
10542 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10543 vec_def = gimple_build (&stmts,
10544 PLUS_EXPR, step_vectype, vec_def, vec_step);
10545 vec_def = gimple_convert (&stmts, vectype, vec_def);
10547 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10548 if (i < ncopies)
10550 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10551 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10553 else
10555 /* vec_1 = vec_iv + (VF/n * S)
10556 vec_2 = vec_1 + (VF/n * S)
10558 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10560 vec_n is used as vec_loop to save the large step register and
10561 related operations. */
10562 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10563 UNKNOWN_LOCATION);
10568 if (dump_enabled_p ())
10569 dump_printf_loc (MSG_NOTE, vect_location,
10570 "transform induction: created def-use cycle: %G%G",
10571 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10573 return true;
10576 /* Function vectorizable_live_operation_1.
10578 Helper function for vectorizable_live_operation. */
10580 tree
10581 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10582 stmt_vec_info stmt_info, basic_block exit_bb,
10583 tree vectype, int ncopies, slp_tree slp_node,
10584 tree bitsize, tree bitstart, tree vec_lhs,
10585 tree lhs_type, bool restart_loop,
10586 gimple_stmt_iterator *exit_gsi)
10588 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10590 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10591 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10592 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10593 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10595 gimple_seq stmts = NULL;
10596 tree new_tree;
10597 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10599 /* Emit:
10601 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10603 where VEC_LHS is the vectorized live-out result and LEN is
10604 the loop length for the final iteration. */
10605 gcc_assert (ncopies == 1 && !slp_node);
10606 gimple_seq tem = NULL;
10607 gimple_stmt_iterator gsi = gsi_last (tem);
10608 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10609 &LOOP_VINFO_LENS (loop_vinfo),
10610 1, vectype, 0, 0);
10612 /* BIAS - 1. */
10613 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10614 tree bias_minus_one
10615 = int_const_binop (MINUS_EXPR,
10616 build_int_cst (TREE_TYPE (len), biasval),
10617 build_one_cst (TREE_TYPE (len)));
10619 /* LAST_INDEX = LEN + (BIAS - 1). */
10620 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10621 len, bias_minus_one);
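/* For example, with a zero bias the last active element of the final
   iteration sits at index LEN - 1.  */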
10623 /* This would need to implement extraction of the first index, but it
10624 is not clear how that interacts with the LEN handling. At the moment
10625 we shouldn't get here since there's no LEN support for early breaks.
10626 But guard this so there's no incorrect codegen. */
10627 gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10629 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10630 tree scalar_res
10631 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10632 vec_lhs_phi, last_index);
10634 /* Convert the extracted vector element to the scalar type. */
10635 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10637 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10639 /* Emit:
10641 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10643 where VEC_LHS is the vectorized live-out result and MASK is
10644 the loop mask for the final iteration. */
10645 gcc_assert (!slp_node);
10646 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10647 gimple_seq tem = NULL;
10648 gimple_stmt_iterator gsi = gsi_last (tem);
10649 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10650 &LOOP_VINFO_MASKS (loop_vinfo),
10651 1, vectype, 0);
10652 tree scalar_res;
10654 /* For an inverted control flow with early breaks we want EXTRACT_FIRST
10655 instead of EXTRACT_LAST. Emulate by reversing the vector and mask. */
10656 if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10658 /* First create the permuted mask. */
10659 tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
10660 tree perm_dest = copy_ssa_name (mask);
10661 gimple *perm_stmt
10662 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
10663 mask, perm_mask);
10664 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10665 &gsi);
10666 mask = perm_dest;
10668 /* Then permute the vector contents. */
10669 tree perm_elem = perm_mask_for_reverse (vectype);
10670 perm_dest = copy_ssa_name (vec_lhs_phi);
10671 perm_stmt
10672 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
10673 vec_lhs_phi, perm_elem);
10674 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10675 &gsi);
10676 vec_lhs_phi = perm_dest;
10679 gimple_seq_add_seq (&stmts, tem);
10681 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10682 mask, vec_lhs_phi);
10684 /* Convert the extracted vector element to the scalar type. */
10685 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10687 else
10689 tree bftype = TREE_TYPE (vectype);
10690 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10691 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10692 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10693 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10694 &stmts, true, NULL_TREE);
10697 *exit_gsi = gsi_after_labels (exit_bb);
10698 if (stmts)
10699 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10701 return new_tree;
10704 /* Find the edge that's the final one in the path from SRC to DEST and
10705 return it; at most one forwarder block may lie between them. */
10707 static edge
10708 find_connected_edge (edge src, basic_block dest)
10710 if (src->dest == dest)
10711 return src;
10713 return find_edge (src->dest, dest);
10716 /* Function vectorizable_live_operation.
10718 STMT_INFO computes a value that is used outside the loop. Check if
10719 it can be supported. */
10721 bool
10722 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10723 slp_tree slp_node, slp_instance slp_node_instance,
10724 int slp_index, bool vec_stmt_p,
10725 stmt_vector_for_cost *cost_vec)
10727 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10728 imm_use_iterator imm_iter;
10729 tree lhs, lhs_type, bitsize;
10730 tree vectype = (slp_node
10731 ? SLP_TREE_VECTYPE (slp_node)
10732 : STMT_VINFO_VECTYPE (stmt_info));
10733 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10734 int ncopies;
10735 gimple *use_stmt;
10736 use_operand_p use_p;
10737 auto_vec<tree> vec_oprnds;
10738 int vec_entry = 0;
10739 poly_uint64 vec_index = 0;
10741 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10742 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10744 /* If a stmt of a reduction is live, vectorize it via
10745 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10746 validity so just trigger the transform here. */
10747 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10749 if (!vec_stmt_p)
10750 return true;
10751 if (slp_node)
10753 /* For reduction chains the meta-info is attached to
10754 the group leader. */
10755 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10756 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10757 /* For SLP reductions we vectorize the epilogue for
10758 all involved stmts together. */
10759 else if (slp_index != 0)
10760 return true;
10762 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10763 gcc_assert (reduc_info->is_reduc_info);
10764 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10765 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10766 return true;
10768 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10769 slp_node_instance,
10770 LOOP_VINFO_IV_EXIT (loop_vinfo));
10772 /* For an early break we only have to materialize the reduction on the merge
10773 block, but we have to find an alternate exit first. */
10774 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10776 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10777 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10779 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10780 slp_node, slp_node_instance,
10781 exit);
10782 break;
10786 return true;
10789 /* If STMT is not relevant and it is a simple assignment and its inputs are
10790 invariant then it can remain in place, unvectorized. The original last
10791 scalar value that it computes will be used. */
10792 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10794 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10795 if (dump_enabled_p ())
10796 dump_printf_loc (MSG_NOTE, vect_location,
10797 "statement is simple and uses invariant. Leaving in "
10798 "place.\n");
10799 return true;
10802 if (slp_node)
10803 ncopies = 1;
10804 else
10805 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10807 if (slp_node)
10809 gcc_assert (slp_index >= 0);
10811 /* Get the last occurrence of the scalar index from the concatenation of
10812 all the slp vectors. Calculate which slp vector it is and the index
10813 within. */
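/* For example, with 3 lanes and 2 vectors of 4 units the last
   occurrence of lane 1 is at position 2*4 - 3 + 1 == 6, i.e.
   lane 2 of vector 1.  */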
10814 int num_scalar = SLP_TREE_LANES (slp_node);
10815 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10816 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10818 /* Calculate which vector contains the result, and which lane of
10819 that vector we need. */
10820 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10822 if (dump_enabled_p ())
10823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10824 "Cannot determine which vector holds the"
10825 " final result.\n");
10826 return false;
10830 if (!vec_stmt_p)
10832 /* No transformation required. */
10833 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10835 if (slp_node)
10837 if (dump_enabled_p ())
10838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10839 "can't operate on partial vectors "
10840 "because an SLP statement is live after "
10841 "the loop.\n");
10842 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10844 else if (ncopies > 1)
10846 if (dump_enabled_p ())
10847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10848 "can't operate on partial vectors "
10849 "because ncopies is greater than 1.\n");
10850 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10852 else
10854 gcc_assert (ncopies == 1 && !slp_node);
10855 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10856 OPTIMIZE_FOR_SPEED))
10857 vect_record_loop_mask (loop_vinfo,
10858 &LOOP_VINFO_MASKS (loop_vinfo),
10859 1, vectype, NULL);
10860 else if (can_vec_extract_var_idx_p (
10861 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10862 vect_record_loop_len (loop_vinfo,
10863 &LOOP_VINFO_LENS (loop_vinfo),
10864 1, vectype, 1);
10865 else
10867 if (dump_enabled_p ())
10868 dump_printf_loc (
10869 MSG_MISSED_OPTIMIZATION, vect_location,
10870 "can't operate on partial vectors "
10871 "because the target doesn't support extract "
10872 "last reduction.\n");
10873 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10877 /* ??? Enable for loop costing as well. */
10878 if (!loop_vinfo)
10879 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10880 0, vect_epilogue);
10881 return true;
10884 /* Use the lhs of the original scalar statement. */
10885 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10886 if (dump_enabled_p ())
10887 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10888 "stmt %G", stmt);
10890 lhs = gimple_get_lhs (stmt);
10891 lhs_type = TREE_TYPE (lhs);
10893 bitsize = vector_element_bits_tree (vectype);
10895 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10896 tree vec_lhs, vec_lhs0, bitstart;
10897 gimple *vec_stmt, *vec_stmt0;
10898 if (slp_node)
10900 gcc_assert (!loop_vinfo
10901 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10902 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10904 /* Get the correct slp vectorized stmt. */
10905 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10906 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10908 /* In case we need to vectorize an early break, also get the first stmt. */
10909 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10910 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10912 /* Get entry to use. */
10913 bitstart = bitsize_int (vec_index);
10914 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10916 else
10918 /* For multiple copies, get the last copy. */
10919 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10920 vec_lhs = gimple_get_lhs (vec_stmt);
10922 /* In case we need to vectorize an early break, also get the first stmt. */
10923 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10924 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10926 /* Get the last lane in the vector. */
10927 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10930 if (loop_vinfo)
10932 /* To ensure the VEC_LHS for lane extraction stmts satisfies the
10933 loop-closed PHI requirement, insert one PHI node for it. It looks like:
10934 loop;
10936 # lhs' = PHI <lhs>
10938 loop;
10940 # vec_lhs' = PHI <vec_lhs>
10941 new_tree = lane_extract <vec_lhs', ...>;
10942 lhs' = new_tree; */
10944 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10945 /* Check if we have a loop where the chosen exit is not the main exit;
10946 in these cases, for an early break, we restart the iteration the vector
10947 code was executing. For the live values we want the value at the start
10948 of that iteration rather than at the end. */
10949 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10950 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10951 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10952 if (!is_gimple_debug (use_stmt)
10953 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10954 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10956 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10957 phi_arg_index_from_use (use_p));
10958 bool main_exit_edge = e == main_e
10959 || find_connected_edge (main_e, e->src);
10961 /* Early exits have a merge block; we want the merge block itself,
10962 so use ->src. For the main exit the merge block is the
10963 destination. */
10964 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10965 tree tmp_vec_lhs = vec_lhs;
10966 tree tmp_bitstart = bitstart;
10968 /* For an early exit where the exit is not in the BB that leads
10969 to the latch we're restarting the iteration in the
10970 scalar loop, so get the first live value. */
10971 restart_loop = restart_loop || !main_exit_edge;
10972 if (restart_loop
10973 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10975 tmp_vec_lhs = vec_lhs0;
10976 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10979 gimple_stmt_iterator exit_gsi;
10980 tree new_tree
10981 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10982 dest, vectype, ncopies,
10983 slp_node, bitsize,
10984 tmp_bitstart, tmp_vec_lhs,
10985 lhs_type, restart_loop,
10986 &exit_gsi);
10988 if (gimple_phi_num_args (use_stmt) == 1)
10990 auto gsi = gsi_for_stmt (use_stmt);
10991 remove_phi_node (&gsi, false);
10992 tree lhs_phi = gimple_phi_result (use_stmt);
10993 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10994 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10996 else
10997 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
11000 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11001 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11002 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11004 else
11006 /* For basic-block vectorization simply insert the lane-extraction. */
11007 tree bftype = TREE_TYPE (vectype);
11008 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11009 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11010 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11011 vec_lhs, bitsize, bitstart);
11012 gimple_seq stmts = NULL;
11013 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11014 &stmts, true, NULL_TREE);
11015 if (TREE_CODE (new_tree) == SSA_NAME
11016 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11017 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11018 if (is_a <gphi *> (vec_stmt))
11020 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11021 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11023 else
11025 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11026 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11029 /* Replace the use of lhs with the newly computed result. If the use stmt
11030 is a single-argument PHI, just replace all uses of the PHI result. This is
11031 necessary because the LC-SSA PHI defining lhs may be before the newly inserted stmt. */
11032 use_operand_p use_p;
11033 stmt_vec_info use_stmt_info;
11034 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11035 if (!is_gimple_debug (use_stmt)
11036 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11037 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11039 /* ??? This can happen when the live lane ends up being
11040 rooted in a vector construction code-generated by an
11041 external SLP node (and code-generation for that already
11042 happened). See gcc.dg/vect/bb-slp-47.c.
11043 Doing this is what would happen if that vector CTOR
11044 were not code-generated yet so it is not too bad.
11045 ??? In fact we'd likely want to avoid this situation
11046 in the first place. */
11047 if (TREE_CODE (new_tree) == SSA_NAME
11048 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11049 && gimple_code (use_stmt) != GIMPLE_PHI
11050 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11051 use_stmt))
11053 if (dump_enabled_p ())
11054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11055 "Using original scalar computation for "
11056 "live lane because use preceeds vector "
11057 "def\n");
11058 continue;
11060 /* ??? It can also happen that we end up pulling a def into
11061 a loop where replacing out-of-loop uses would require
11062 a new LC SSA PHI node. Retain the original scalar in
11063 those cases as well. PR98064. */
11064 if (TREE_CODE (new_tree) == SSA_NAME
11065 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11066 && (gimple_bb (use_stmt)->loop_father
11067 != gimple_bb (vec_stmt)->loop_father)
11068 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11069 gimple_bb (use_stmt)->loop_father))
11071 if (dump_enabled_p ())
11072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11073 "Using original scalar computation for "
11074 "live lane because there is an out-of-loop "
11075 "definition for it\n");
11076 continue;
11078 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11079 SET_USE (use_p, new_tree);
11080 update_stmt (use_stmt);
11084 return true;
11087 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11089 static void
11090 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11092 ssa_op_iter op_iter;
11093 imm_use_iterator imm_iter;
11094 def_operand_p def_p;
11095 gimple *ustmt;
11097 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11099 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11101 basic_block bb;
11103 if (!is_gimple_debug (ustmt))
11104 continue;
11106 bb = gimple_bb (ustmt);
11108 if (!flow_bb_inside_loop_p (loop, bb))
11110 if (gimple_debug_bind_p (ustmt))
11112 if (dump_enabled_p ())
11113 dump_printf_loc (MSG_NOTE, vect_location,
11114 "killing debug use\n");
11116 gimple_debug_bind_reset_value (ustmt);
11117 update_stmt (ustmt);
11119 else
11120 gcc_unreachable ();
11126 /* Given the loop represented by LOOP_VINFO, return true if the computation of
11127 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11128 otherwise. */
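/* For example, if the niters type is unsigned char and the latch is
   known to execute at most 254 times, then NITERSM1 + 1 cannot wrap
   around to zero; with an upper bound of 255 it could.  */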
11130 static bool
11131 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11133 /* Constant case. */
11134 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11136 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11137 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11139 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11140 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11141 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11142 return true;
11145 widest_int max;
11146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11147 /* Check the upper bound of loop niters. */
11148 if (get_max_loop_iterations (loop, &max))
11150 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11151 signop sgn = TYPE_SIGN (type);
11152 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11153 if (max < type_max)
11154 return true;
11156 return false;
11159 /* Return a mask type with half the number of elements as OLD_TYPE,
11160 given that it should have mode NEW_MODE. */
11162 tree
11163 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11165 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11166 return build_truth_vector_type_for_mode (nunits, new_mode);
11169 /* Return a mask type with twice as many elements as OLD_TYPE,
11170 given that it should have mode NEW_MODE. */
11172 tree
11173 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11175 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11176 return build_truth_vector_type_for_mode (nunits, new_mode);
11179 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11180 contain a sequence of NVECTORS masks that each control a vector of type
11181 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11182 these vector masks with the vector version of SCALAR_MASK. */
11184 void
11185 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11186 unsigned int nvectors, tree vectype, tree scalar_mask)
11188 gcc_assert (nvectors != 0);
11190 if (scalar_mask)
11192 scalar_cond_masked_key cond (scalar_mask, nvectors);
11193 loop_vinfo->scalar_cond_masked_set.add (cond);
11196 masks->mask_set.add (std::make_pair (vectype, nvectors));
11199 /* Given a complete set of masks MASKS, extract mask number INDEX
11200 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11201 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11203 See the comment above vec_loop_masks for more details about the mask
11204 arrangement. */
11206 tree
11207 vect_get_loop_mask (loop_vec_info loop_vinfo,
11208 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11209 unsigned int nvectors, tree vectype, unsigned int index)
11211 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11212 == vect_partial_vectors_while_ult)
11214 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11215 tree mask_type = rgm->type;
11217 /* Populate the rgroup's mask array, if this is the first time we've
11218 used it. */
11219 if (rgm->controls.is_empty ())
11221 rgm->controls.safe_grow_cleared (nvectors, true);
11222 for (unsigned int i = 0; i < nvectors; ++i)
11224 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11225 /* Provide a dummy definition until the real one is available. */
11226 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11227 rgm->controls[i] = mask;
11231 tree mask = rgm->controls[index];
11232 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11233 TYPE_VECTOR_SUBPARTS (vectype)))
11235 /* A loop mask for data type X can be reused for data type Y
11236 if X has N times more elements than Y and if Y's elements
11237 are N times bigger than X's. In this case each sequence
11238 of N elements in the loop mask will be all-zero or all-one.
11239 We can then view-convert the mask so that each sequence of
11240 N elements is replaced by a single element. */
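/* For example, a mask with sixteen byte-sized elements can control a
   vector of eight halfword elements: each pair of adjacent mask
   elements is all-zero or all-one, so the VIEW_CONVERT_EXPR collapses
   every such pair into a single element of the halfword mask.  */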
11241 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11242 TYPE_VECTOR_SUBPARTS (vectype)));
11243 gimple_seq seq = NULL;
11244 mask_type = truth_type_for (vectype);
11245 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11246 if (seq)
11247 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11249 return mask;
11251 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11252 == vect_partial_vectors_avx512)
11254 /* The number of scalars per iteration and the number of vectors are
11255 both compile-time constants. */
11256 unsigned int nscalars_per_iter
11257 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11258 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11260 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11262 /* The stored nV is dependent on the mask type produced. */
11263 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11264 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11265 == rgm->factor);
11266 nvectors = rgm->factor;
11268 /* Populate the rgroup's mask array, if this is the first time we've
11269 used it. */
11270 if (rgm->controls.is_empty ())
11272 rgm->controls.safe_grow_cleared (nvectors, true);
11273 for (unsigned int i = 0; i < nvectors; ++i)
11275 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11276 /* Provide a dummy definition until the real one is available. */
11277 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11278 rgm->controls[i] = mask;
11281 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11282 TYPE_VECTOR_SUBPARTS (vectype)))
11283 return rgm->controls[index];
11285 /* Split the vector if needed. Since we are dealing with integer mode
11286 masks with AVX512 we can operate on the integer representation,
11287 performing the shift on the whole vector. */
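/* For example, if the rgroup mask type has sixteen elements while
   VECTYPE has eight, FACTOR is 2 and mask number 3 is the upper half
   of rgm->controls[1]: shift its integer representation right by
   eight bits and truncate it to an eight-bit mask.  */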
11288 unsigned HOST_WIDE_INT factor;
11289 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11290 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11291 gcc_assert (ok);
11292 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11293 tree mask_type = truth_type_for (vectype);
11294 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11295 unsigned vi = index / factor;
11296 unsigned vpart = index % factor;
11297 tree vec = rgm->controls[vi];
11298 gimple_seq seq = NULL;
11299 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11300 lang_hooks.types.type_for_mode
11301 (TYPE_MODE (rgm->type), 1), vec);
11302 /* For integer mode masks simply shift the right bits into position. */
11303 if (vpart != 0)
11304 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11305 build_int_cst (integer_type_node,
11306 (TYPE_VECTOR_SUBPARTS (vectype)
11307 * vpart)));
11308 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11309 (TYPE_MODE (mask_type), 1), vec);
11310 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11311 if (seq)
11312 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11313 return vec;
11315 else
11316 gcc_unreachable ();
11319 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11320 lengths for controlling an operation on VECTYPE. The operation splits
11321 each element of VECTYPE into FACTOR separate subelements, measuring the
11322 length as a number of these subelements. */
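/* For example, an operation on V4SI elements whose length is measured
   in bytes (the VnQI fallback mentioned below) uses FACTOR 4, so a
   full vector corresponds to a length of 16 subelements.  */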
11324 void
11325 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11326 unsigned int nvectors, tree vectype, unsigned int factor)
11328 gcc_assert (nvectors != 0);
11329 if (lens->length () < nvectors)
11330 lens->safe_grow_cleared (nvectors, true);
11331 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11333 /* The number of scalars per iteration, the bytes occupied per scalar and
11334 the number of vectors are all compile-time constants. */
11335 unsigned int nscalars_per_iter
11336 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11337 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11339 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11341 /* For now, we only support cases in which all loads and stores fall back
11342 to VnQI or none do. */
11343 gcc_assert (!rgl->max_nscalars_per_iter
11344 || (rgl->factor == 1 && factor == 1)
11345 || (rgl->max_nscalars_per_iter * rgl->factor
11346 == nscalars_per_iter * factor));
11347 rgl->max_nscalars_per_iter = nscalars_per_iter;
11348 rgl->type = vectype;
11349 rgl->factor = factor;
11353 /* Given a complete set of lengths LENS, extract length number INDEX
11354 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11355 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11356 multiplied by the number of elements that should be processed.
11357 Insert any set-up statements before GSI. */
11359 tree
11360 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11361 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11362 unsigned int index, unsigned int factor)
11364 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11365 bool use_bias_adjusted_len =
11366 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11368 /* Populate the rgroup's len array, if this is the first time we've
11369 used it. */
11370 if (rgl->controls.is_empty ())
11372 rgl->controls.safe_grow_cleared (nvectors, true);
11373 for (unsigned int i = 0; i < nvectors; ++i)
11375 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11376 gcc_assert (len_type != NULL_TREE);
11378 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11380 /* Provide a dummy definition until the real one is available. */
11381 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11382 rgl->controls[i] = len;
11384 if (use_bias_adjusted_len)
11386 gcc_assert (i == 0);
11387 tree adjusted_len =
11388 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11389 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11390 rgl->bias_adjusted_ctrl = adjusted_len;
11395 if (use_bias_adjusted_len)
11396 return rgl->bias_adjusted_ctrl;
11398 tree loop_len = rgl->controls[index];
11399 if (rgl->factor == 1 && factor == 1)
11401 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11402 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11403 if (maybe_ne (nunits1, nunits2))
11405 /* A loop len for data type X can be reused for data type Y
11406 if X has N times more elements than Y and if Y's elements
11407 are N times bigger than X's. */
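/* For example, a length counted in the sixteen byte-sized elements of
   the rgroup type is divided by two to control a vector of eight
   halfword elements.  */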
11408 gcc_assert (multiple_p (nunits1, nunits2));
11409 factor = exact_div (nunits1, nunits2).to_constant ();
11410 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11411 gimple_seq seq = NULL;
11412 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11413 build_int_cst (iv_type, factor));
11414 if (seq)
11415 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11418 return loop_len;
11421 /* Scale profiling counters by estimation for LOOP which is vectorized
11422 by factor VF.
11423 If FLAT is true, the loop we started with had unrealistically flat
11424 profile. */
11426 static void
11427 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11429 /* For flat profiles do not scale down proportionally by VF but only
11430 cap by the known iteration count bounds. */
11431 if (flat)
11433 if (dump_file && (dump_flags & TDF_DETAILS))
11434 fprintf (dump_file,
11435 "Vectorized loop profile seems flat; not scaling iteration "
11436 "count down by the vectorization factor %i\n", vf);
11437 scale_loop_profile (loop, profile_probability::always (),
11438 get_likely_max_loop_iterations_int (loop));
11439 return;
11441 /* The loop body executes VF times fewer iterations and the exit is taken VF times more often. */
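/* For example, with VF 4 a header count of 800 per 100 entries is
   scaled down to roughly 200 and the exit probability rises from
   1/8 to 1/2.  */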
11442 profile_count entry_count = loop_preheader_edge (loop)->count ();
11444 /* If we have an unreliable loop profile avoid dropping the entry
11445 count below the header count. This can happen when loops
11446 have unrealistically low trip counts. */
11447 while (vf > 1
11448 && loop->header->count > entry_count
11449 && loop->header->count < entry_count * vf)
11451 if (dump_file && (dump_flags & TDF_DETAILS))
11452 fprintf (dump_file,
11453 "Vectorization factor %i seems too large for profile "
11454 "prevoiusly believed to be consistent; reducing.\n", vf);
11455 vf /= 2;
11458 if (entry_count.nonzero_p ())
11459 set_edge_probability_and_rescale_others
11460 (exit_e,
11461 entry_count.probability_in (loop->header->count / vf));
11462 /* Avoid producing a very large exit probability when we do not have
11463 a sensible profile. */
11464 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11465 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11466 loop->latch->count = single_pred_edge (loop->latch)->count ();
11468 scale_loop_profile (loop, profile_probability::always () / vf,
11469 get_likely_max_loop_iterations_int (loop));
11472 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11473 latch edge values originally defined by it. */
11475 static void
11476 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11477 stmt_vec_info def_stmt_info)
11479 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11480 if (!def || TREE_CODE (def) != SSA_NAME)
11481 return;
11482 stmt_vec_info phi_info;
11483 imm_use_iterator iter;
11484 use_operand_p use_p;
11485 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11487 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11488 if (!phi)
11489 continue;
11490 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11491 && (phi_info = loop_vinfo->lookup_stmt (phi))
11492 && STMT_VINFO_RELEVANT_P (phi_info)))
11493 continue;
11494 loop_p loop = gimple_bb (phi)->loop_father;
11495 edge e = loop_latch_edge (loop);
11496 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11497 continue;
11499 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11500 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11501 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11503 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11504 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11505 gcc_assert (phi_defs.length () == latch_defs.length ());
11506 for (unsigned i = 0; i < phi_defs.length (); ++i)
11507 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11508 gimple_get_lhs (latch_defs[i]), e,
11509 gimple_phi_arg_location (phi, e->dest_idx));
11511 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11513 /* For first order recurrences we have to update both uses of
11514 the latch definition, the one in the PHI node and the one
11515 in the generated VEC_PERM_EXPR. */
11516 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11517 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11518 gcc_assert (phi_defs.length () == latch_defs.length ());
11519 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11520 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11521 for (unsigned i = 0; i < phi_defs.length (); ++i)
11523 gassign *perm = as_a <gassign *> (phi_defs[i]);
11524 if (i > 0)
11525 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11526 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11527 update_stmt (perm);
11529 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11530 gimple_phi_arg_location (phi, e->dest_idx));
11535 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11536 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11537 stmt_vec_info. */
11539 static bool
11540 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11541 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11543 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11544 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11546 if (dump_enabled_p ())
11547 dump_printf_loc (MSG_NOTE, vect_location,
11548 "------>vectorizing statement: %G", stmt_info->stmt);
11550 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11551 vect_loop_kill_debug_uses (loop, stmt_info);
11553 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11554 && !STMT_VINFO_LIVE_P (stmt_info))
11556 if (is_gimple_call (stmt_info->stmt)
11557 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11559 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11560 *seen_store = stmt_info;
11561 return false;
11563 return false;
11566 if (STMT_VINFO_VECTYPE (stmt_info))
11568 poly_uint64 nunits
11569 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11570 if (!STMT_SLP_TYPE (stmt_info)
11571 && maybe_ne (nunits, vf)
11572 && dump_enabled_p ())
11573 /* For SLP the VF is set according to the unrolling factor, and not
11574 to the vector size, hence for SLP this print is not valid. */
11575 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11578 /* Pure SLP statements have already been vectorized. We still need
11579 to apply loop vectorization to hybrid SLP statements. */
11580 if (PURE_SLP_STMT (stmt_info))
11581 return false;
11583 if (dump_enabled_p ())
11584 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11586 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11587 *seen_store = stmt_info;
11589 return true;
11592 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11593 in the hash_map with their corresponding values. */
11595 static tree
11596 find_in_mapping (tree t, void *context)
11598 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11600 tree *value = mapping->get (t);
11601 return value ? *value : t;
11604 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11605 original loop that has now been vectorized.
11607 The inits of the data_references need to be advanced with the number of
11608 iterations of the main loop. This has been computed in vect_do_peeling and
11609 is stored in parameter ADVANCE. We first restore the data_references'
11610 initial offsets with the values recorded in ORIG_DRS_INIT.
11612 Since the loop_vec_info of this EPILOGUE was constructed for the original
11613 loop, its stmt_vec_infos all point to the original statements. These need
11614 to be updated to point to their corresponding copies as well as the SSA_NAMES
11615 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11617 The data_reference's connections also need to be updated. Their
11618 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11619 stmt_vec_infos, their statements need to point to their corresponding copy,
11620 if they are gather loads or scatter stores then their reference needs to be
11621 updated to point to its corresponding copy and finally we set
11622 'base_misaligned' to false as we have already peeled for alignment in the
11623 prologue of the main loop. */
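/* A sketch for illustration (the concrete numbers are invented, not taken
   from a testcase): if the prologue plus the main vector loop together cover
   ADVANCE scalar iterations, then a unit-stride reference such as a[i] with
   4-byte elements has its init advanced by ADVANCE * 4 bytes, so the
   epilogue's first access continues exactly where the main loop stopped.  */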
11625 static void
11626 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11628 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11629 auto_vec<gimple *> stmt_worklist;
11630 hash_map<tree,tree> mapping;
11631 gimple *orig_stmt, *new_stmt;
11632 gimple_stmt_iterator epilogue_gsi;
11633 gphi_iterator epilogue_phi_gsi;
11634 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11635 basic_block *epilogue_bbs = get_loop_body (epilogue);
11636 unsigned i;
11638 free (LOOP_VINFO_BBS (epilogue_vinfo));
11639 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11641 /* Advance the data_references with the number of iterations of the previous
11642 loop and its prologue. */
11643 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11646 /* The EPILOGUE loop is a copy of the original loop so they share the same
11647 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11648 point to the copied statements. We also create a mapping of all LHS' in
11649 the original loop and all the LHS' in the EPILOGUE and create worklists to
11650 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
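  /* As an illustration (a sketch with invented SSA names): if the original
     loop contained  _23 = *p_10;  and its copy in the EPILOGUE is
     _57 = *p_44;  the two share a gimple UID, so the stmt_vec_info is
     redirected to the copy and MAPPING records _23 -> _57.  Pattern
     statements that still mention _23 are queued in STMT_WORKLIST and
     rewritten to use _57 further below.  */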
11651 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11653 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11654 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11656 new_stmt = epilogue_phi_gsi.phi ();
11658 gcc_assert (gimple_uid (new_stmt) > 0);
11659 stmt_vinfo
11660 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11662 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11663 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11665 mapping.put (gimple_phi_result (orig_stmt),
11666 gimple_phi_result (new_stmt));
11667 /* PHI nodes can not have patterns or related statements. */
11668 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11669 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11672 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11673 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11675 new_stmt = gsi_stmt (epilogue_gsi);
11676 if (is_gimple_debug (new_stmt))
11677 continue;
11679 gcc_assert (gimple_uid (new_stmt) > 0);
11680 stmt_vinfo
11681 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11683 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11684 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11686 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11687 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11689 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11691 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11692 for (gimple_stmt_iterator gsi = gsi_start (seq);
11693 !gsi_end_p (gsi); gsi_next (&gsi))
11694 stmt_worklist.safe_push (gsi_stmt (gsi));
11697 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11698 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11700 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11701 stmt_worklist.safe_push (stmt);
11702 /* Set BB such that the assert in
11703 'get_initial_def_for_reduction' is able to determine that
11704 the BB of the related stmt is inside this loop. */
11705 gimple_set_bb (stmt,
11706 gimple_bb (new_stmt));
11707 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11708 gcc_assert (related_vinfo == NULL
11709 || related_vinfo == stmt_vinfo);
11714 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11715 using the original main loop and thus need to be updated to refer to the
11716 cloned variables used in the epilogue. */
11717 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11719 gimple *stmt = stmt_worklist[i];
11720 tree *new_op;
11722 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11724 tree op = gimple_op (stmt, j);
11725 if ((new_op = mapping.get(op)))
11726 gimple_set_op (stmt, j, *new_op);
11727 else
11729 /* PR92429: The last argument of simplify_replace_tree disables
11730 folding when replacing arguments. This is required as
11731 otherwise you might end up with different statements than the
11732 ones analyzed in vect_analyze_loop, leading to different
11733 vectorization. */
11734 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11735 &find_in_mapping, &mapping, false);
11736 gimple_set_op (stmt, j, op);
11741 struct data_reference *dr;
11742 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11743 FOR_EACH_VEC_ELT (datarefs, i, dr)
11745 orig_stmt = DR_STMT (dr);
11746 gcc_assert (gimple_uid (orig_stmt) > 0);
11747 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11748 /* Data references for gather loads and scatter stores do not use the
11749 updated offset we set using ADVANCE. Instead we have to make sure the
11750 reference in the data reference points to the corresponding copy of
11751 the original in the epilogue. Make sure to update both
11752 gather/scatters recognized by dataref analysis and other
11753 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11754 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11755 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11756 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11758 DR_REF (dr)
11759 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11760 &find_in_mapping, &mapping);
11761 DR_BASE_ADDRESS (dr)
11762 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11763 &find_in_mapping, &mapping);
11765 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11766 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11767 /* The vector size of the epilogue is smaller than that of the main loop,
11768 so the required alignment is the same or lower. This means the dr
11769 is by definition still aligned here. */
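    /* For example (a sketch): if the main loop used 64-byte vectors and its
       prologue peeled until the access was 64-byte aligned, then the
       epilogue's 32-byte or narrower vector accesses to the same base are
       necessarily aligned as well.  */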
11770 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11773 epilogue_vinfo->shared->datarefs_copy.release ();
11774 epilogue_vinfo->shared->save_datarefs ();
11777 /* When vectorizing early break statements, instructions that happen before
11778 the early break in the current BB need to be moved to after the early
11779 break. This function deals with that and assumes that any validity
11780 checks have already been performed.
11782 While moving the instructions, if it encounters a VUSE or VDEF it
11783 corrects the VUSEs as it moves the statements along. The moved statements
11784 are inserted at the start of LOOP_VINFO_EARLY_BRK_DEST_BB. */
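/* For illustration (a sketch, not from a particular testcase): in

     for (i = 0; i < N; i++)
       {
         c[i] = x;
         if (b[i] > limit)
           break;
       }

   the store to c[i] appears before the early exit in the loop body.  The
   stores recorded in LOOP_VINFO_EARLY_BRK_STORES are moved below the exit
   test into LOOP_VINFO_EARLY_BRK_DEST_BB, and the statements recorded in
   LOOP_VINFO_EARLY_BRK_VUSES are updated to their new reaching virtual
   use.  */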
11786 static void
11787 move_early_exit_stmts (loop_vec_info loop_vinfo)
11789 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11791 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11792 return;
11794 /* Move all stmts that need moving. */
11795 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11796 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11798 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11800 /* Check to see if statement is still required for vect or has been
11801 elided. */
11802 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11803 if (!stmt_info)
11804 continue;
11806 if (dump_enabled_p ())
11807 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11809 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11810 gsi_move_before (&stmt_gsi, &dest_gsi);
11811 gsi_prev (&dest_gsi);
11814 /* Update all the stmts with their new reaching VUSES. */
11815 tree vuse
11816 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11817 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11819 if (dump_enabled_p ())
11820 dump_printf_loc (MSG_NOTE, vect_location,
11821 "updating vuse to %T for load %G", vuse, p);
11822 gimple_set_vuse (p, vuse);
11823 update_stmt (p);
11827 /* Function vect_transform_loop.
11829 The analysis phase has determined that the loop is vectorizable.
11830 Vectorize the loop - create vectorized stmts to replace the scalar
11831 stmts in the loop, and update the loop exit condition.
11832 Returns scalar epilogue loop if any. */
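/* In outline (a summary of the steps performed below):
     1) compute the cost-model threshold and version the loop if required;
     2) peel a prologue/epilogue via vect_do_peeling and compute the number
        of iterations of the vector loop (NITERS_VECTOR);
     3) vectorize the PHIs and statements of every basic block, scheduling
        any SLP instances first;
     4) set the new exit condition and update the iteration-count bounds,
        estimates, profile and, if present, the epilogue's loop_vec_info.  */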
11834 class loop *
11835 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11838 class loop *epilogue = NULL;
11839 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11840 int nbbs = loop->num_nodes;
11841 int i;
11842 tree niters_vector = NULL_TREE;
11843 tree step_vector = NULL_TREE;
11844 tree niters_vector_mult_vf = NULL_TREE;
11845 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11846 unsigned int lowest_vf = constant_lower_bound (vf);
11847 gimple *stmt;
11848 bool check_profitability = false;
11849 unsigned int th;
11850 bool flat = maybe_flat_loop_profile (loop);
11852 DUMP_VECT_SCOPE ("vec_transform_loop");
11854 loop_vinfo->shared->check_datarefs ();
11856 /* Use the more conservative vectorization threshold. If the number
11857 of iterations is constant assume the cost check has been performed
11858 by our caller. If the threshold makes all loops profitable that
11859 run at least the (estimated) vectorization factor number of times
11860 checking is pointless, too. */
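  /* A sketch of what CHECK_PROFITABILITY requests (the guard itself is
     emitted by the versioning/peeling code, not here):

       if (niters >= th)
         ... vectorized loop ...
       else
         ... scalar loop ...

     so that loops running too few iterations do not pay the vectorization
     overhead.  */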
11861 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11862 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11864 if (dump_enabled_p ())
11865 dump_printf_loc (MSG_NOTE, vect_location,
11866 "Profitability threshold is %d loop iterations.\n",
11867 th);
11868 check_profitability = true;
11871 /* Make sure there exists a single-predecessor exit bb. Do this before
11872 versioning. */
11873 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11874 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11876 split_loop_exit_edge (e, true);
11877 if (dump_enabled_p ())
11878 dump_printf (MSG_NOTE, "split exit edge\n");
11881 /* Version the loop first, if required, so the profitability check
11882 comes first. */
11884 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11886 class loop *sloop
11887 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11888 sloop->force_vectorize = false;
11889 check_profitability = false;
11892 /* Make sure there exists a single-predecessor exit bb also on the
11893 scalar loop copy. Do this after versioning but before peeling
11894 so the CFG structure is fine for both the scalar and the if-converted
11895 loop and slpeel_duplicate_current_defs_from_edges faces matched
11896 loop-closed PHI nodes on the exit. */
11897 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11899 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11900 if (! single_pred_p (e->dest))
11902 split_loop_exit_edge (e, true);
11903 if (dump_enabled_p ())
11904 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11908 tree niters = vect_build_loop_niters (loop_vinfo);
11909 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11910 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11911 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11912 tree advance;
11913 drs_init_vec orig_drs_init;
11915 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11916 &step_vector, &niters_vector_mult_vf, th,
11917 check_profitability, niters_no_overflow,
11918 &advance);
11919 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11920 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11922 /* Ifcvt duplicates loop preheader, loop body and produces a basic
11923 block after loop exit. We need to scale all that. */
11924 basic_block preheader
11925 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11926 preheader->count
11927 = preheader->count.apply_probability
11928 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11929 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11930 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11931 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11932 = preheader->count;
11935 if (niters_vector == NULL_TREE)
11937 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11938 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11939 && known_eq (lowest_vf, vf))
11941 niters_vector
11942 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11943 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11944 step_vector = build_one_cst (TREE_TYPE (niters));
11946 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11947 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11948 &step_vector, niters_no_overflow);
11949 else
11950 /* vect_do_peeling subtracted the number of peeled prologue
11951 iterations from LOOP_VINFO_NITERS. */
11952 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11953 &niters_vector, &step_vector,
11954 niters_no_overflow);
11957 /* 1) Make sure the loop header has exactly two entries
11958 2) Make sure we have a preheader basic block. */
11960 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11962 split_edge (loop_preheader_edge (loop));
11964 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11965 /* This will deal with any possible peeling. */
11966 vect_prepare_for_masked_peels (loop_vinfo);
11968 /* Handle any code motion that we need to for early-break vectorization after
11969 we've done peeling but just before we start vectorizing. */
11970 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11971 move_early_exit_stmts (loop_vinfo);
11973 /* Schedule the SLP instances first, then handle loop vectorization
11974 below. */
11975 if (!loop_vinfo->slp_instances.is_empty ())
11977 DUMP_VECT_SCOPE ("scheduling SLP instances");
11978 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11981 /* FORNOW: the vectorizer supports only loops whose body consists
11982 of one basic block (header + empty latch). When the vectorizer
11983 supports more involved loop forms, the order in which the BBs are
11984 traversed needs to be reconsidered. */
11986 for (i = 0; i < nbbs; i++)
11988 basic_block bb = bbs[i];
11989 stmt_vec_info stmt_info;
11991 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11992 gsi_next (&si))
11994 gphi *phi = si.phi ();
11995 if (dump_enabled_p ())
11996 dump_printf_loc (MSG_NOTE, vect_location,
11997 "------>vectorizing phi: %G", (gimple *) phi);
11998 stmt_info = loop_vinfo->lookup_stmt (phi);
11999 if (!stmt_info)
12000 continue;
12002 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12003 vect_loop_kill_debug_uses (loop, stmt_info);
12005 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12006 && !STMT_VINFO_LIVE_P (stmt_info))
12007 continue;
12009 if (STMT_VINFO_VECTYPE (stmt_info)
12010 && (maybe_ne
12011 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12012 && dump_enabled_p ())
12013 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12015 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12017 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12018 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12019 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12020 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12021 && ! PURE_SLP_STMT (stmt_info))
12023 if (dump_enabled_p ())
12024 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12025 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12029 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12030 gsi_next (&si))
12032 gphi *phi = si.phi ();
12033 stmt_info = loop_vinfo->lookup_stmt (phi);
12034 if (!stmt_info)
12035 continue;
12037 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12038 && !STMT_VINFO_LIVE_P (stmt_info))
12039 continue;
12041 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12042 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12043 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12044 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12045 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12046 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12047 && ! PURE_SLP_STMT (stmt_info))
12048 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12051 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12052 !gsi_end_p (si);)
12054 stmt = gsi_stmt (si);
12055 /* During vectorization remove existing clobber stmts. */
12056 if (gimple_clobber_p (stmt))
12058 unlink_stmt_vdef (stmt);
12059 gsi_remove (&si, true);
12060 release_defs (stmt);
12062 else
12064 /* Ignore vector stmts created in the outer loop. */
12065 stmt_info = loop_vinfo->lookup_stmt (stmt);
12067 /* vector stmts created in the outer-loop during vectorization of
12068 stmts in an inner-loop may not have a stmt_info, and do not
12069 need to be vectorized. */
12070 stmt_vec_info seen_store = NULL;
12071 if (stmt_info)
12073 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12075 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12076 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12077 !gsi_end_p (subsi); gsi_next (&subsi))
12079 stmt_vec_info pat_stmt_info
12080 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12081 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12082 &si, &seen_store);
12084 stmt_vec_info pat_stmt_info
12085 = STMT_VINFO_RELATED_STMT (stmt_info);
12086 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12087 &si, &seen_store))
12088 maybe_set_vectorized_backedge_value (loop_vinfo,
12089 pat_stmt_info);
12091 else
12093 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12094 &seen_store))
12095 maybe_set_vectorized_backedge_value (loop_vinfo,
12096 stmt_info);
12099 gsi_next (&si);
12100 if (seen_store)
12102 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12103 /* Interleaving. The vectorization of the
12104 interleaving chain was completed - free
12105 all the stores in the chain. */
12106 vect_remove_stores (loop_vinfo,
12107 DR_GROUP_FIRST_ELEMENT (seen_store));
12108 else
12109 /* Free the attached stmt_vec_info and remove the stmt. */
12110 loop_vinfo->remove_stmt (stmt_info);
12115 /* Stub out scalar statements that must not survive vectorization.
12116 Doing this here helps with grouped statements, or statements that
12117 are involved in patterns. */
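      /* For example (a sketch): a leftover scalar statement such as
           _5 = .MASK_LOAD (ptr_4, 32B, mask_7);
         whose LHS is not a vector is replaced by  _5 = 0;  and a scalar
           _9 = .COND_ADD (mask_7, _1, _2, _3);
         is replaced by  _9 = _3;  i.e. by its "else" argument.  */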
12118 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12119 !gsi_end_p (gsi); gsi_next (&gsi))
12121 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12122 if (!call || !gimple_call_internal_p (call))
12123 continue;
12124 internal_fn ifn = gimple_call_internal_fn (call);
12125 if (ifn == IFN_MASK_LOAD)
12127 tree lhs = gimple_get_lhs (call);
12128 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12130 tree zero = build_zero_cst (TREE_TYPE (lhs));
12131 gimple *new_stmt = gimple_build_assign (lhs, zero);
12132 gsi_replace (&gsi, new_stmt, true);
12135 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12137 tree lhs = gimple_get_lhs (call);
12138 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12140 tree else_arg
12141 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12142 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12143 gsi_replace (&gsi, new_stmt, true);
12147 } /* BBs in loop */
12149 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12150 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12151 if (integer_onep (step_vector))
12152 niters_no_overflow = true;
12153 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12154 niters_vector, step_vector, niters_vector_mult_vf,
12155 !niters_no_overflow);
12157 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12159 /* True if the final iteration might not handle a full vector's
12160 worth of scalar iterations. */
12161 bool final_iter_may_be_partial
12162 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12163 /* The minimum number of iterations performed by the epilogue. This
12164 is 1 when peeling for gaps because we always need a final scalar
12165 iteration. */
12166 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12167 /* +1 to convert latch counts to loop iteration counts,
12168 -min_epilogue_iters to remove iterations that cannot be performed
12169 by the vector code. */
12170 int bias_for_lowest = 1 - min_epilogue_iters;
12171 int bias_for_assumed = bias_for_lowest;
12172 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12173 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12175 /* When the amount of peeling is known at compile time, the first
12176 iteration will have exactly alignment_npeels active elements.
12177 In the worst case it will have at least one. */
12178 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12179 bias_for_lowest += lowest_vf - min_first_active;
12180 bias_for_assumed += assumed_vf - min_first_active;
12182 /* In these calculations the "- 1" converts loop iteration counts
12183 back to latch counts. */
12184 if (loop->any_upper_bound)
12186 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12187 loop->nb_iterations_upper_bound
12188 = (final_iter_may_be_partial
12189 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12190 lowest_vf) - 1
12191 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12192 lowest_vf) - 1);
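      /* Worked example (numbers invented): with a constant VF of 4, no
         peeling for gaps and no partial vectors, BIAS_FOR_LOWEST is 1.  If
         the scalar latch ran at most 100 times (at most 101 iterations),
         the vector loop runs at most floor ((100 + 1) / 4) = 25 iterations,
         i.e. a latch bound of 24.  With partial vectors the last iteration
         may be partial, giving ceil ((100 + 1) / 4) - 1 = 25 instead.  */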
12193 if (main_vinfo
12194 /* Both peeling for alignment and peeling for gaps can end up
12195 with the scalar epilogue running for more than VF-1 iterations. */
12196 && !main_vinfo->peeling_for_alignment
12197 && !main_vinfo->peeling_for_gaps)
12199 unsigned int bound;
12200 poly_uint64 main_iters
12201 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12202 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12203 main_iters
12204 = upper_bound (main_iters,
12205 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12206 if (can_div_away_from_zero_p (main_iters,
12207 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12208 &bound))
12209 loop->nb_iterations_upper_bound
12210 = wi::umin ((bound_wide_int) (bound - 1),
12211 loop->nb_iterations_upper_bound);
12214 if (loop->any_likely_upper_bound)
12215 loop->nb_iterations_likely_upper_bound
12216 = (final_iter_may_be_partial
12217 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12218 + bias_for_lowest, lowest_vf) - 1
12219 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12220 + bias_for_lowest, lowest_vf) - 1);
12221 if (loop->any_estimate)
12222 loop->nb_iterations_estimate
12223 = (final_iter_may_be_partial
12224 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12225 assumed_vf) - 1
12226 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12227 assumed_vf) - 1);
12228 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12229 assumed_vf, flat);
12231 if (dump_enabled_p ())
12233 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12235 dump_printf_loc (MSG_NOTE, vect_location,
12236 "LOOP VECTORIZED\n");
12237 if (loop->inner)
12238 dump_printf_loc (MSG_NOTE, vect_location,
12239 "OUTER LOOP VECTORIZED\n");
12240 dump_printf (MSG_NOTE, "\n");
12242 else
12243 dump_printf_loc (MSG_NOTE, vect_location,
12244 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12245 GET_MODE_NAME (loop_vinfo->vector_mode));
12248 /* Loops vectorized with a variable factor won't benefit from
12249 unrolling/peeling. */
12250 if (!vf.is_constant ())
12252 loop->unroll = 1;
12253 if (dump_enabled_p ())
12254 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12255 " variable-length vectorization factor\n");
12257 /* Free SLP instances here because otherwise stmt reference counting
12258 won't work. */
12259 slp_instance instance;
12260 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12261 vect_free_slp_instance (instance);
12262 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12263 /* Clear the safelen field since its value is invalid after vectorization:
12264 the vectorized loop can have loop-carried dependencies. */
12265 loop->safelen = 0;
12267 if (epilogue)
12269 update_epilogue_loop_vinfo (epilogue, advance);
12271 epilogue->simduid = loop->simduid;
12272 epilogue->force_vectorize = loop->force_vectorize;
12273 epilogue->dont_vectorize = false;
12276 return epilogue;
12279 /* The code below is trying to perform a simple optimization - revert
12280 if-conversion for masked stores, i.e. if the mask of a store is zero
12281 do not perform it, and if possible skip the stored-value producers too.
12282 For example,
12283 for (i=0; i<n; i++)
12284 if (c[i])
12286 p1[i] += 1;
12287 p2[i] = p3[i] +2;
12289 this transformation will produce the following semi-hammock:
12291 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12293 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12294 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12295 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12296 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12297 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12298 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12302 void
12303 optimize_mask_stores (class loop *loop)
12305 basic_block *bbs = get_loop_body (loop);
12306 unsigned nbbs = loop->num_nodes;
12307 unsigned i;
12308 basic_block bb;
12309 class loop *bb_loop;
12310 gimple_stmt_iterator gsi;
12311 gimple *stmt;
12312 auto_vec<gimple *> worklist;
12313 auto_purge_vect_location sentinel;
12315 vect_location = find_loop_location (loop);
12316 /* Pick up all masked stores in loop if any. */
12317 for (i = 0; i < nbbs; i++)
12319 bb = bbs[i];
12320 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12321 gsi_next (&gsi))
12323 stmt = gsi_stmt (gsi);
12324 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12325 worklist.safe_push (stmt);
12329 free (bbs);
12330 if (worklist.is_empty ())
12331 return;
12333 /* Loop has masked stores. */
12334 while (!worklist.is_empty ())
12336 gimple *last, *last_store;
12337 edge e, efalse;
12338 tree mask;
12339 basic_block store_bb, join_bb;
12340 gimple_stmt_iterator gsi_to;
12341 tree vdef, new_vdef;
12342 gphi *phi;
12343 tree vectype;
12344 tree zero;
12346 last = worklist.pop ();
12347 mask = gimple_call_arg (last, 2);
12348 bb = gimple_bb (last);
12349 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
12350 to the same loop as if_bb. It can be different from LOOP when a
12351 two-level loop-nest is vectorized and the mask_store belongs to the
12352 inner one. */
12353 e = split_block (bb, last);
12354 bb_loop = bb->loop_father;
12355 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12356 join_bb = e->dest;
12357 store_bb = create_empty_bb (bb);
12358 add_bb_to_loop (store_bb, bb_loop);
12359 e->flags = EDGE_TRUE_VALUE;
12360 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12361 /* Put STORE_BB on the likely path. */
12362 efalse->probability = profile_probability::likely ();
12363 e->probability = efalse->probability.invert ();
12364 store_bb->count = efalse->count ();
12365 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12366 if (dom_info_available_p (CDI_DOMINATORS))
12367 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12368 if (dump_enabled_p ())
12369 dump_printf_loc (MSG_NOTE, vect_location,
12370 "Create new block %d to sink mask stores.",
12371 store_bb->index);
12372 /* Create vector comparison with boolean result. */
12373 vectype = TREE_TYPE (mask);
12374 zero = build_zero_cst (vectype);
12375 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12376 gsi = gsi_last_bb (bb);
12377 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12378 /* Create new PHI node for vdef of the last masked store:
12379 .MEM_2 = VDEF <.MEM_1>
12380 will be converted to
12381 .MEM.3 = VDEF <.MEM_1>
12382 and new PHI node will be created in join bb
12383 .MEM_2 = PHI <.MEM_1, .MEM_3>
12385 vdef = gimple_vdef (last);
12386 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12387 gimple_set_vdef (last, new_vdef);
12388 phi = create_phi_node (vdef, join_bb);
12389 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12391 /* Put all masked stores with the same mask to STORE_BB if possible. */
12392 while (true)
12394 gimple_stmt_iterator gsi_from;
12395 gimple *stmt1 = NULL;
12397 /* Move masked store to STORE_BB. */
12398 last_store = last;
12399 gsi = gsi_for_stmt (last);
12400 gsi_from = gsi;
12401 /* Shift GSI to the previous stmt for further traversal. */
12402 gsi_prev (&gsi);
12403 gsi_to = gsi_start_bb (store_bb);
12404 gsi_move_before (&gsi_from, &gsi_to);
12405 /* Setup GSI_TO to the non-empty block start. */
12406 gsi_to = gsi_start_bb (store_bb);
12407 if (dump_enabled_p ())
12408 dump_printf_loc (MSG_NOTE, vect_location,
12409 "Move stmt to created bb\n%G", last);
12410 /* Move all stored value producers if possible. */
12411 while (!gsi_end_p (gsi))
12413 tree lhs;
12414 imm_use_iterator imm_iter;
12415 use_operand_p use_p;
12416 bool res;
12418 /* Skip debug statements. */
12419 if (is_gimple_debug (gsi_stmt (gsi)))
12421 gsi_prev (&gsi);
12422 continue;
12424 stmt1 = gsi_stmt (gsi);
12425 /* Do not consider statements writing to memory or having
12426 a volatile operand. */
12427 if (gimple_vdef (stmt1)
12428 || gimple_has_volatile_ops (stmt1))
12429 break;
12430 gsi_from = gsi;
12431 gsi_prev (&gsi);
12432 lhs = gimple_get_lhs (stmt1);
12433 if (!lhs)
12434 break;
12436 /* LHS of vectorized stmt must be SSA_NAME. */
12437 if (TREE_CODE (lhs) != SSA_NAME)
12438 break;
12440 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12442 /* Remove dead scalar statement. */
12443 if (has_zero_uses (lhs))
12445 gsi_remove (&gsi_from, true);
12446 continue;
12450 /* Check that LHS does not have uses outside of STORE_BB. */
12451 res = true;
12452 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12454 gimple *use_stmt;
12455 use_stmt = USE_STMT (use_p);
12456 if (is_gimple_debug (use_stmt))
12457 continue;
12458 if (gimple_bb (use_stmt) != store_bb)
12460 res = false;
12461 break;
12464 if (!res)
12465 break;
12467 if (gimple_vuse (stmt1)
12468 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12469 break;
12471 /* Can move STMT1 to STORE_BB. */
12472 if (dump_enabled_p ())
12473 dump_printf_loc (MSG_NOTE, vect_location,
12474 "Move stmt to created bb\n%G", stmt1);
12475 gsi_move_before (&gsi_from, &gsi_to);
12476 /* Shift GSI_TO for further insertion. */
12477 gsi_prev (&gsi_to);
12479 /* Put other masked stores with the same mask to STORE_BB. */
12480 if (worklist.is_empty ()
12481 || gimple_call_arg (worklist.last (), 2) != mask
12482 || worklist.last () != stmt1)
12483 break;
12484 last = worklist.pop ();
12486 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12490 /* Decide whether it is possible to use a zero-based induction variable
12491 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12492 the value that the induction variable must be able to hold in order
12493 to ensure that the rgroups eventually have no active vector elements.
12494 Return -1 otherwise. */
12496 widest_int
12497 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12499 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12500 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12501 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12503 /* Calculate the value that the induction variable must be able
12504 to hit in order to ensure that we end the loop with an all-false mask.
12505 This involves adding the maximum number of inactive trailing scalar
12506 iterations. */
12507 widest_int iv_limit = -1;
12508 if (max_loop_iterations (loop, &iv_limit))
12510 if (niters_skip)
12512 /* Add the maximum number of skipped iterations to the
12513 maximum iteration count. */
12514 if (TREE_CODE (niters_skip) == INTEGER_CST)
12515 iv_limit += wi::to_widest (niters_skip);
12516 else
12517 iv_limit += max_vf - 1;
12519 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12520 /* Make a conservatively-correct assumption. */
12521 iv_limit += max_vf - 1;
12523 /* IV_LIMIT is the maximum number of latch iterations, which is also
12524 the maximum in-range IV value. Round this value down to the previous
12525 vector alignment boundary and then add an extra full iteration. */
12526 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12527 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
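      /* For example (a sketch): with a constant VF of 8, no skipped or
         peeled iterations and at most 37 latch iterations, this computes
         (37 & -8) + 8 = 32 + 8 = 40, so the IV must be able to hold 40.  */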
12529 return iv_limit;
12532 /* For the given rgroup_controls RGC, check whether an induction variable
12533 would ever hit a value that produces a set of all-false masks or zero
12534 lengths before wrapping around. Return true if it's possible to wrap
12535 around before hitting the desirable value, otherwise return false. */
12537 bool
12538 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12540 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12542 if (iv_limit == -1)
12543 return true;
12545 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12546 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12547 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
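  /* For example (a sketch): with IV_LIMIT = 40, max_nscalars_per_iter = 2
     and factor = 1 we get NITEMS = 2 and IV_LIMIT * NITEMS = 80, which needs
     7 bits; any compare type with at least 7 bits of precision therefore
     cannot wrap before the all-false / zero-length state is reached.  */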
12549 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12550 return true;
12552 return false;