gcc/tree-vect-loop.cc
/* Loop Vectorization
   Copyright (C) 2003-2023 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define INCLUDE_ALGORITHM
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "memmodel.h"
#include "optabs.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"
#include "case-cfn-macros.h"
#include "langhooks.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it was manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];      STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];      STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;         STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
						unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *, bool);
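/* Illustrative sketch (added for clarity; not part of the original source):
   the kind of optab query described in the "Target modeling" section above.
   The helper name and the stand-alone framing are assumptions made purely
   for illustration.

     static bool
     example_target_supports_v8hi_add_p (void)
     {
       // CODE_FOR_nothing means the target has no instruction pattern for
       // an 8 x 16-bit integer vector addition, so such a stmt could not
       // be vectorized with V8HImode.
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }
*/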
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte
   elements, on a target with a vector size (VS) of 16 bytes, the VF is set
   to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/
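/* Illustrative note (added for clarity; not part of the original source):
   for the short-typed example in the file header comment the elements are
   2 bytes wide, so with 16-byte vectors the VF would be 16 / 2 = 8, i.e.
   eight shorts are processed per vector iteration, matching the i < N/8
   bound of the rewritten loop shown there.  */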
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
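/* Illustrative examples (added for clarity; not part of the original source),
   using the usual scalar-evolution notation {init, +, step}:

     for (i = 0; i < n; i++)
       s += 4;          // s has the simple evolution {s_0, +, 4}

     for (i = 0; i < n; i++)
       j = j * 2;       // not simple: no constant additive step, so this is
                        // left to vect_is_nonlinear_iv_evolution below.
*/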
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer type
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
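/* Illustrative examples of the nonlinear inductions handled above (added for
   clarity; not part of the original source).  X must have integer type and
   the multiplier or shift count must be a constant:

     for (i = 0; i < n; i++)  x = -x;       // vect_step_op_neg, fake step -1
     for (i = 0; i < n; i++)  x = x * 3;    // vect_step_op_mul, step 3
     for (i = 0; i < n; i++)  x = x << 1;   // vect_step_op_shl, step 1
     for (i = 0; i < n; i++)  x = x >> 2;   // vect_step_op_shr, step 2
*/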
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
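/* Illustrative source-level example of a double reduction (added for
   clarity; not part of the original source):

     int sum = 0;
     for (int i = 0; i < n; i++)     // outer loop: sum is the double
       for (int j = 0; j < m; j++)   // reduction; the inner-loop PHI for
         sum += a[i][j];             // sum is what this predicate detects.
*/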
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
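/* Illustrative source-level example of a first-order recurrence (added for
   clarity; not part of the original source); it mirrors the shape accepted
   by the checks above:

     int t = *c;
     for (int i = 0; i < n; i++)
       {
         b[i] = a[i] - t;   // uses the value carried over from the
         t = a[i];          // previous iteration; t is redefined every
       }                    // iteration but never accumulated.

   Once the two a[i] loads are combined, the latch value of T is defined by
   that load, which dominates the use of the PHI result, as required above.  */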
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  SLP indicates whether there will be some subsequent
   SLP analyses or not.  */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
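/* Illustrative note (added for clarity; not part of the original source):
   for a simple counted loop

     for (i = 0; i < n; i++)
       ...

   with n known to be 5, the latch runs 4 times, so NUMBER_OF_ITERATIONSM1
   is 4 while NUMBER_OF_ITERATIONS, the number of header executions, is 5.  */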
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
  /* Before we begin we must first determine which exit is the main one and
     which are auxiliary exits.  */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
  /* If we have multiple exits we only support a counting IV at the moment.
     Analyze all exits and return one.  */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
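/* Illustrative example (added for clarity; not part of the original source):
   in an early-break loop such as

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   there are two exits.  The exit guarded by the counting IV (i < n) has a
   computable niter expression and is the candidate preferred above, while
   the data-dependent break exit is one of the auxiliary exits.  */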
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as reversed postorder traversal, so we are safe.  */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition, when 0,
1082 loop shouldn't be vectorized, when non-zero constant, it should
1083 be vectorized normally, otherwise versioned with vectorized loop
1084 done if the condition is non-zero at runtime. */
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
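/* Illustrative note on the IFN_GOMP_SIMD_LANE handling above (added for
   clarity; not part of the original source).  For a loop written as

     #pragma omp simd if (x)
     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   the third argument of the .GOMP_SIMD_LANE call carries X: a constant 0
   disables vectorization, a non-zero constant vectorizes the loop normally,
   and an SSA_NAME results in a loop versioned on the runtime value of X.  */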
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
  /* When we release an epilogue vinfo that we do not intend to use,
     avoid clearing AUX of the main loop, which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
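/* Illustrative arithmetic (added for clarity; not part of the original
   source): if the refined bound on the number of header iterations is 1000
   and FACTOR is 4, the product 4000 fits in 12 bits but not in 11, so the
   function returns 12.  */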
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
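/* Illustrative example (added for clarity; not part of the original source):
   with a known iteration count of 100, VF = 8, 3 iterations peeled for
   alignment and 1 more peeled for gaps, the test above asks whether
   100 - 4 = 96 is a multiple of 8; it is, so no epilogue peeling or partial
   vectors are needed on that account.  */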
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce differently organized rgc_vec and differently check
1389 we can produce masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 /* For now vect_get_loop_mask only supports integer mode masks
1466 when we need to split it. */
1467 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1468 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1470 ok = false;
1471 break;
1474 /* If iv_type is usable as compare type use that - we can elide the
1475 saturation in that case. */
1476 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1478 tree cmp_vectype
1479 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1480 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1481 rgc.compare_type = cmp_vectype;
1483 if (!rgc.compare_type)
1484 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1486 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1487 if (cmp_bits >= min_ni_width
1488 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1490 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1491 if (!cmp_type)
1492 continue;
1494 /* Check whether we can produce the mask with cmp_type. */
1495 tree cmp_vectype
1496 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1497 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1499 rgc.compare_type = cmp_vectype;
1500 break;
1504 if (!rgc.compare_type)
1506 ok = false;
1507 break;
1510 if (!ok)
1512 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513 return false;
1516 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1517 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1518 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1519 return true;
/* Check whether we can use vector accesses with length, based on a precision
   comparison.  So far, to keep it simple, we only allow the case in which the
   precision of the target-supported length is larger than the precision
   required by the loop niters.  */
1527 static bool
1528 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1530 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1531 return false;
1533 machine_mode len_load_mode, len_store_mode;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1535 .exists (&len_load_mode))
1536 return false;
1537 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1538 .exists (&len_store_mode))
1539 return false;
1541 signed char partial_load_bias = internal_len_load_store_bias
1542 (IFN_LEN_LOAD, len_load_mode);
1544 signed char partial_store_bias = internal_len_load_store_bias
1545 (IFN_LEN_STORE, len_store_mode);
1547 gcc_assert (partial_load_bias == partial_store_bias);
1549 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1550 return false;
1552 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1553 len_loads with a length of zero. In order to avoid that we prohibit
1554 more than one loop length here. */
1555 if (partial_load_bias == -1
1556 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1557 return false;
1559 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1561 unsigned int max_nitems_per_iter = 1;
1562 unsigned int i;
1563 rgroup_controls *rgl;
1564 /* Find the maximum number of items per iteration for every rgroup. */
1565 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1567 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1568 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1571 /* Work out how many bits we need to represent the length limit. */
1572 unsigned int min_ni_prec
1573 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1575 /* Now use the maximum of below precisions for one suitable IV type:
1576 - the IV's natural precision
1577 - the precision needed to hold: the maximum number of scalar
1578 iterations multiplied by the scale factor (min_ni_prec above)
1579 - the Pmode precision
     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     a wider IV to avoid narrow conversions.  */
1585 unsigned int ni_prec
1586 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1587 min_ni_prec = MAX (min_ni_prec, ni_prec);
1588 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1590 tree iv_type = NULL_TREE;
1591 opt_scalar_int_mode tmode_iter;
1592 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1594 scalar_mode tmode = tmode_iter.require ();
1595 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1597 /* ??? Do we really want to construct one IV whose precision exceeds
1598 BITS_PER_WORD? */
1599 if (tbits > BITS_PER_WORD)
1600 break;
1602 /* Find the first available standard integral type. */
1603 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1605 iv_type = build_nonstandard_integer_type (tbits, true);
1606 break;
1610 if (!iv_type)
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614 "can't vectorize with length-based partial vectors"
1615 " because there is no suitable iv type.\n");
1616 return false;
1619 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1620 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1621 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1623 return true;
1626 /* Calculate the cost of one scalar iteration of the loop. */
1627 static void
1628 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1630 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes, factor;
1633 int innerloop_iters, i;
1635 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1637 /* Gather costs for statements in the scalar loop. */
1639 /* FORNOW. */
1640 innerloop_iters = 1;
1641 if (loop->inner)
1642 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1644 for (i = 0; i < nbbs; i++)
1646 gimple_stmt_iterator si;
1647 basic_block bb = bbs[i];
1649 if (bb->loop_father == loop->inner)
1650 factor = innerloop_iters;
1651 else
1652 factor = 1;
1654 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1656 gimple *stmt = gsi_stmt (si);
1657 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1659 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1660 continue;
1662 /* Skip stmts that are not vectorized inside the loop. */
1663 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1664 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1665 && (!STMT_VINFO_LIVE_P (vstmt_info)
1666 || !VECTORIZABLE_CYCLE_DEF
1667 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1668 continue;
1670 vect_cost_for_stmt kind;
1671 if (STMT_VINFO_DATA_REF (stmt_info))
1673 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1674 kind = scalar_load;
1675 else
1676 kind = scalar_store;
1678 else if (vect_nop_conversion_p (stmt_info))
1679 continue;
1680 else
1681 kind = scalar_stmt;
1683 /* We are using vect_prologue here to avoid scaling twice
1684 by the inner loop factor. */
1685 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1686 factor, kind, stmt_info, 0, vect_prologue);
1690 /* Now accumulate cost. */
1691 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1692 add_stmt_costs (loop_vinfo->scalar_costs,
1693 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1694 loop_vinfo->scalar_costs->finish_cost (nullptr);
1698 /* Function vect_analyze_loop_form.
1700 Verify that certain CFG restrictions hold, including:
1701 - the loop has a pre-header
1702 - the loop has a single entry and exit
1703 - the loop exit condition is simple enough
1704 - the number of iterations can be analyzed, i.e, a countable loop. The
1705 niter could be analyzed under some assumptions. */
1707 opt_result
1708 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1710 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1712 edge exit_e = vec_init_loop_exit_info (loop);
1713 if (!exit_e)
1714 return opt_result::failure_at (vect_location,
1715 "not vectorized:"
1716 " could not determine main exit from"
1717 " loop with multiple exits.\n");
1718 info->loop_exit = exit_e;
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "using as main loop exit: %d -> %d [AUX: %p]\n",
1722 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1724 /* Different restrictions apply when we are considering an inner-most loop,
1725 vs. an outer (nested) loop.
1726 (FORNOW. May want to relax some of these restrictions in the future). */
1728 info->inner_loop_cond = NULL;
1729 if (!loop->inner)
1731 /* Inner-most loop. We currently require that the number of BBs is
1732 exactly 2 (the header and latch). Vectorizable inner-most loops
1733 look like this:
1735 (pre-header)
1737 header <--------+
1738 | | |
1739 | +--> latch --+
1741 (exit-bb) */
1743 if (loop->num_nodes != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " control flow in loop.\n");
1748 if (empty_block_p (loop->header))
1749 return opt_result::failure_at (vect_location,
1750 "not vectorized: empty loop.\n");
1752 else
1754 class loop *innerloop = loop->inner;
1755 edge entryedge;
1757 /* Nested loop. We currently require that the loop is doubly-nested,
1758 contains a single inner loop, and the number of BBs is exactly 5.
1759 Vectorizable outer-loops look like this:
1761 (pre-header)
1763 header <---+
1765 inner-loop |
1767 tail ------+
1769 (exit-bb)
1771 The inner-loop has the properties expected of inner-most loops
1772 as described above. */
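      /* For illustration only (example loop invented for this sketch), a
	 source-level loop nest of the supported shape might look like:

	   for (i = 0; i < N; i++)      <-- outer loop being analyzed here
	     {
	       s = 0;
	       for (j = 0; j < M; j++)  <-- single inner-most loop
		 s += a[i][j];
	       b[i] = s;
	     }
       */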
1774 if ((loop->inner)->inner || (loop->inner)->next)
1775 return opt_result::failure_at (vect_location,
1776 "not vectorized:"
1777 " multiple nested loops.\n");
1779 if (loop->num_nodes != 5)
1780 return opt_result::failure_at (vect_location,
1781 "not vectorized:"
1782 " control flow in loop.\n");
1784 entryedge = loop_preheader_edge (innerloop);
1785 if (entryedge->src != loop->header
1786 || !single_exit (innerloop)
1787 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1788 return opt_result::failure_at (vect_location,
1789 "not vectorized:"
1790 " unsupported outerloop form.\n");
1792 /* Analyze the inner-loop. */
1793 vect_loop_form_info inner;
1794 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795 if (!res)
1797 if (dump_enabled_p ())
1798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799 "not vectorized: Bad inner loop.\n");
1800 return res;
1803 /* Don't support analyzing niter under assumptions for inner
1804 loop. */
1805 if (!integer_onep (inner.assumptions))
1806 return opt_result::failure_at (vect_location,
1807 "not vectorized: Bad inner loop.\n");
1809 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1810 return opt_result::failure_at (vect_location,
1811 "not vectorized: inner-loop count not"
1812 " invariant.\n");
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location,
1816 "Considering outer-loop vectorization.\n");
1817 info->inner_loop_cond = inner.conds[0];
1820 if (!single_exit (loop))
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized: multiple exits.\n");
1823 if (EDGE_COUNT (loop->header->preds) != 2)
1824 return opt_result::failure_at (vect_location,
1825 "not vectorized:"
1826 " too many incoming edges.\n");
1828 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1829 that the loop is represented as a do-while (with a proper if-guard
1830 before the loop if needed), where the loop header contains all the
1831 executable statements, and the latch is empty. */
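  /* Illustrative sketch only (not taken from the original sources): the
     expected do-while shape is roughly

       header:
	 ...all executable statements...
	 if (i < n) goto latch; else goto exit_bb;
       latch:                       <-- empty, no PHIs
	 goto header;
   */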
1832 if (!empty_block_p (loop->latch)
1833 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1834 return opt_result::failure_at (vect_location,
1835 "not vectorized: latch block not empty.\n");
1837 /* Make sure the exit is not abnormal. */
1838 if (exit_e->flags & EDGE_ABNORMAL)
1839 return opt_result::failure_at (vect_location,
1840 "not vectorized:"
1841 " abnormal loop exit edge.\n");
1843 info->conds
1844 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1845 &info->number_of_iterations,
1846 &info->number_of_iterationsm1);
1848 if (info->conds.is_empty ())
1849 return opt_result::failure_at
1850 (vect_location,
1851 "not vectorized: complicated exit condition.\n");
1853 /* Determine what the primary and alternate exit conds are. */
1854 for (unsigned i = 0; i < info->conds.length (); i++)
1856 gcond *cond = info->conds[i];
1857 if (exit_e->src == gimple_bb (cond))
1858 std::swap (info->conds[0], info->conds[i]);
1861 if (integer_zerop (info->assumptions)
1862 || !info->number_of_iterations
1863 || chrec_contains_undetermined (info->number_of_iterations))
1864 return opt_result::failure_at
1865 (info->conds[0],
1866 "not vectorized: number of iterations cannot be computed.\n");
1868 if (integer_zerop (info->number_of_iterations))
1869 return opt_result::failure_at
1870 (info->conds[0],
1871 "not vectorized: number of iterations = 0.\n");
1873 if (!(tree_fits_shwi_p (info->number_of_iterations)
1874 && tree_to_shwi (info->number_of_iterations) > 0))
1876 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "Symbolic number of iterations is ");
1880 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1881 dump_printf (MSG_NOTE, "\n");
1885 return opt_result::success ();
1888 /* Create a loop_vec_info for LOOP with SHARED and the
1889 vect_analyze_loop_form result. */
1891 loop_vec_info
1892 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1893 const vect_loop_form_info *info,
1894 loop_vec_info main_loop_info)
1896 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1897 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1898 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1899 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1900 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1901 /* Also record the assumptions for versioning. */
1902 if (!integer_onep (info->assumptions) && !main_loop_info)
1903 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1905 for (gcond *cond : info->conds)
1907 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1908 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1911 for (unsigned i = 1; i < info->conds.length (); i ++)
1912 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1915 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1917 if (info->inner_loop_cond)
1919 stmt_vec_info inner_loop_cond_info
1920 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1921 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1922 /* If we have an estimate on the number of iterations of the inner
1923 loop, use that to limit the scale for costing; otherwise use
1924 --param vect-inner-loop-cost-factor literally. */
1925 widest_int nit;
1926 if (estimated_stmt_executions (loop->inner, &nit))
1927 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1928 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
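      /* Illustrative numbers only: with an estimated four executions of the
	 inner loop per outer iteration and --param vect-inner-loop-cost-factor
	 at its usual default of 50, the factor used here is MIN (4, 50) == 4.  */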
1931 return loop_vinfo;
1936 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1937 statements update the vectorization factor. */
1939 static void
1940 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1942 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1944 int nbbs = loop->num_nodes;
1945 poly_uint64 vectorization_factor;
1946 int i;
1948 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1950 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1951 gcc_assert (known_ne (vectorization_factor, 0U));
1953 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1954 the vectorization factor of the loop is the unrolling factor required by
1955 the SLP instances. If that unrolling factor is 1, we say that we
1956 perform pure SLP on the loop: cross-iteration parallelism is not
1957 exploited. */
1958 bool only_slp_in_loop = true;
1959 for (i = 0; i < nbbs; i++)
1961 basic_block bb = bbs[i];
1962 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1963 gsi_next (&si))
1965 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1966 if (!stmt_info)
1967 continue;
1968 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1969 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1970 && !PURE_SLP_STMT (stmt_info))
1971 /* STMT needs both SLP and loop-based vectorization. */
1972 only_slp_in_loop = false;
1974 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1975 gsi_next (&si))
1977 if (is_gimple_debug (gsi_stmt (si)))
1978 continue;
1979 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980 stmt_info = vect_stmt_to_vectorize (stmt_info);
1981 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983 && !PURE_SLP_STMT (stmt_info))
1984 /* STMT needs both SLP and loop-based vectorization. */
1985 only_slp_in_loop = false;
1989 if (only_slp_in_loop)
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_NOTE, vect_location,
1993 "Loop contains only SLP stmts\n");
1994 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1996 else
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "Loop contains SLP and non-SLP stmts\n");
2001 /* Both the vectorization factor and unroll factor have the form
2002 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2003 so they must have a common multiple. */
2004 vectorization_factor
2005 = force_common_multiple (vectorization_factor,
2006 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
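      /* Illustrative numbers only: a loop vectorization factor of 4 combined
	 with an SLP unrolling factor of 6 yields their least common
	 multiple, 12.  */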
2009 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2010 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Updating vectorization factor to ");
2014 dump_dec (MSG_NOTE, vectorization_factor);
2015 dump_printf (MSG_NOTE, ".\n");
2019 /* Return true if STMT_INFO describes a double reduction phi and if
2020 the other phi in the reduction is also relevant for vectorization.
2021 This rejects cases such as:
2023 outer1:
2024 x_1 = PHI <x_3(outer2), ...>;
2027 inner:
2028 x_2 = ...;
2031 outer2:
2032 x_3 = PHI <x_2(inner)>;
2034 if nothing in x_2 or elsewhere makes x_1 relevant. */
2036 static bool
2037 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2039 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2040 return false;
2042 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2045 /* Function vect_analyze_loop_operations.
2047 Scan the loop stmts and make sure they are all vectorizable. */
2049 static opt_result
2050 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2052 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2053 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054 int nbbs = loop->num_nodes;
2055 int i;
2056 stmt_vec_info stmt_info;
2057 bool need_to_vectorize = false;
2058 bool ok;
2060 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2062 auto_vec<stmt_info_for_cost> cost_vec;
2064 for (i = 0; i < nbbs; i++)
2066 basic_block bb = bbs[i];
2068 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2069 gsi_next (&si))
2071 gphi *phi = si.phi ();
2072 ok = true;
2074 stmt_info = loop_vinfo->lookup_stmt (phi);
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2077 (gimple *) phi);
2078 if (virtual_operand_p (gimple_phi_result (phi)))
2079 continue;
2081 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2082 (i.e., a phi in the tail of the outer-loop). */
2083 if (! is_loop_header_bb_p (bb))
2085 /* FORNOW: we currently don't support the case where these phis
2086 are not used in the outer loop (unless it is a double reduction,
2087 i.e., this phi is vect_reduction_def), because that case
2088 requires us to actually do something here. */
2089 if (STMT_VINFO_LIVE_P (stmt_info)
2090 && !vect_active_double_reduction_p (stmt_info))
2091 return opt_result::failure_at (phi,
2092 "Unsupported loop-closed phi"
2093 " in outer-loop.\n");
2095 /* If PHI is used in the outer loop, we check that its operand
2096 is defined in the inner loop. */
2097 if (STMT_VINFO_RELEVANT_P (stmt_info))
2099 tree phi_op;
2101 if (gimple_phi_num_args (phi) != 1)
2102 return opt_result::failure_at (phi, "unsupported phi");
2104 phi_op = PHI_ARG_DEF (phi, 0);
2105 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2106 if (!op_def_info)
2107 return opt_result::failure_at (phi, "unsupported phi\n");
2109 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2110 && (STMT_VINFO_RELEVANT (op_def_info)
2111 != vect_used_in_outer_by_reduction))
2112 return opt_result::failure_at (phi, "unsupported phi\n");
2114 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2115 || (STMT_VINFO_DEF_TYPE (stmt_info)
2116 == vect_double_reduction_def))
2117 && !vectorizable_lc_phi (loop_vinfo,
2118 stmt_info, NULL, NULL))
2119 return opt_result::failure_at (phi, "unsupported phi\n");
2122 continue;
2125 gcc_assert (stmt_info);
2127 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2128 || STMT_VINFO_LIVE_P (stmt_info))
2129 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2130 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2131 /* A scalar-dependence cycle that we don't support. */
2132 return opt_result::failure_at (phi,
2133 "not vectorized:"
2134 " scalar dependence cycle.\n");
2136 if (STMT_VINFO_RELEVANT_P (stmt_info))
2138 need_to_vectorize = true;
2139 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2140 && ! PURE_SLP_STMT (stmt_info))
2141 ok = vectorizable_induction (loop_vinfo,
2142 stmt_info, NULL, NULL,
2143 &cost_vec);
2144 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2145 || (STMT_VINFO_DEF_TYPE (stmt_info)
2146 == vect_double_reduction_def)
2147 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2148 && ! PURE_SLP_STMT (stmt_info))
2149 ok = vectorizable_reduction (loop_vinfo,
2150 stmt_info, NULL, NULL, &cost_vec);
2151 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2152 == vect_first_order_recurrence)
2153 && ! PURE_SLP_STMT (stmt_info))
2154 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2155 &cost_vec);
2158 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2159 if (ok
2160 && STMT_VINFO_LIVE_P (stmt_info)
2161 && !PURE_SLP_STMT (stmt_info))
2162 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2163 -1, false, &cost_vec);
2165 if (!ok)
2166 return opt_result::failure_at (phi,
2167 "not vectorized: relevant phi not "
2168 "supported: %G",
2169 static_cast <gimple *> (phi));
2172 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2173 gsi_next (&si))
2175 gimple *stmt = gsi_stmt (si);
2176 if (!gimple_clobber_p (stmt)
2177 && !is_gimple_debug (stmt))
2179 opt_result res
2180 = vect_analyze_stmt (loop_vinfo,
2181 loop_vinfo->lookup_stmt (stmt),
2182 &need_to_vectorize,
2183 NULL, NULL, &cost_vec);
2184 if (!res)
2185 return res;
2188 } /* bbs */
2190 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2192 /* All operations in the loop are either irrelevant (they deal with loop
2193 control, or are dead), or only used outside the loop and can be moved
2194 out of the loop (e.g. invariants, inductions). The loop can be
2195 optimized away by scalar optimizations. We're better off not
2196 touching this loop. */
2197 if (!need_to_vectorize)
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "All the computation can be taken out of the loop.\n");
2202 return opt_result::failure_at
2203 (vect_location,
2204 "not vectorized: redundant loop. no profit to vectorize.\n");
2207 return opt_result::success ();
2210 /* Return true if we know that the iteration count is smaller than the
2211 vectorization factor. Return false if it isn't, or if we can't be sure
2212 either way. */
2214 static bool
2215 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2217 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2219 HOST_WIDE_INT max_niter;
2220 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2221 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2222 else
2223 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2225 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2226 return true;
2228 return false;
2231 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2232 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2233 definitely no, or -1 if it's worth retrying. */
2235 static int
2236 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2237 unsigned *suggested_unroll_factor)
2239 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2242 /* Only loops that can handle partially-populated vectors can have iteration
2243 counts less than the vectorization factor. */
2244 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2245 && vect_known_niters_smaller_than_vf (loop_vinfo))
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: iteration count smaller than "
2250 "vectorization factor.\n");
2251 return 0;
2254 /* If we know the number of iterations we can do better: for the
2255 epilogue we can also decide whether the main loop leaves us
2256 with enough iterations, preferring a smaller vector epilogue that
2257 is then also possibly used for the case where we skip the vector loop. */
2258 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2260 widest_int scalar_niters
2261 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2262 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2264 loop_vec_info orig_loop_vinfo
2265 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2266 unsigned lowest_vf
2267 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2268 int prolog_peeling = 0;
2269 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2270 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2271 if (prolog_peeling >= 0
2272 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2273 lowest_vf))
2275 unsigned gap
2276 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2277 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2278 % lowest_vf + gap);
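	      /* Illustrative numbers only: with 11 scalar iterations, a main
		 loop VF of 4, prolog peeling of 1 and no gap, the epilogue
		 above is left with (11 - 0 - 1) % 4 + 0 == 2 scalar
		 iterations.  */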
2281 /* Reject vectorizing for a single scalar iteration, even if
2282 we could in principle implement that using partial vectors. */
2283 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2284 if (scalar_niters <= peeling_gap + 1)
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 "not vectorized: loop only has a single "
2289 "scalar iteration.\n");
2290 return 0;
2293 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2295 /* Check that the loop processes at least one full vector. */
2296 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2297 if (known_lt (scalar_niters, vf))
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "loop does not have enough iterations "
2302 "to support vectorization.\n");
2303 return 0;
2306 /* If we need to peel an extra epilogue iteration to handle data
2307 accesses with gaps, check that there are enough scalar iterations
2308 available.
2310 The check above is redundant with this one when peeling for gaps,
2311 but the distinction is useful for diagnostics. */
2312 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2313 && known_le (scalar_niters, vf))
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "loop does not have enough iterations "
2318 "to support peeling for gaps.\n");
2319 return 0;
2324 /* If using the "very cheap" model, reject cases in which we'd keep
2325 a copy of the scalar code (even if we might be able to vectorize it). */
2326 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2327 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2328 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2329 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "some scalar iterations would need to be peeled\n");
2334 return 0;
2337 int min_profitable_iters, min_profitable_estimate;
2338 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2339 &min_profitable_estimate,
2340 suggested_unroll_factor);
2342 if (min_profitable_iters < 0)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vectorization not profitable.\n");
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "not vectorized: vector version will never be "
2350 "profitable.\n");
2351 return -1;
2354 int min_scalar_loop_bound = (param_min_vect_loop_bound
2355 * assumed_vf);
2357 /* Use the cost model only if it is more conservative than user specified
2358 threshold. */
2359 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2360 min_profitable_iters);
2362 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2364 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2365 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "not vectorized: vectorization not profitable.\n");
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_NOTE, vect_location,
2372 "not vectorized: iteration count smaller than user "
2373 "specified loop bound parameter or minimum profitable "
2374 "iterations (whichever is more conservative).\n");
2375 return 0;
2378 /* The static profitability threshold min_profitable_estimate includes
2379 the cost of having to check at runtime whether the scalar loop
2380 should be used instead. If it turns out that we don't need or want
2381 such a check, the threshold we should use for the static estimate
2382 is simply the point at which the vector loop becomes more profitable
2383 than the scalar loop. */
2384 if (min_profitable_estimate > min_profitable_iters
2385 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2386 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2387 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2392 " choice between the scalar and vector loops\n");
2393 min_profitable_estimate = min_profitable_iters;
2396 /* If the vector loop needs multiple iterations to be beneficial then
2397 things are probably too close to call, and the conservative thing
2398 would be to stick with the scalar code. */
2399 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2400 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404 "one iteration of the vector loop would be"
2405 " more expensive than the equivalent number of"
2406 " iterations of the scalar loop\n");
2407 return 0;
2410 HOST_WIDE_INT estimated_niter;
2412 /* If we are vectorizing an epilogue then we know the maximum number of
2413 scalar iterations it will cover is at least one lower than the
2414 vectorization factor of the main loop. */
2415 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2416 estimated_niter
2417 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2418 else
2420 estimated_niter = estimated_stmt_executions_int (loop);
2421 if (estimated_niter == -1)
2422 estimated_niter = likely_max_stmt_executions_int (loop);
2424 if (estimated_niter != -1
2425 && ((unsigned HOST_WIDE_INT) estimated_niter
2426 < MAX (th, (unsigned) min_profitable_estimate)))
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "not vectorized: estimated iteration count too "
2431 "small.\n");
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_NOTE, vect_location,
2434 "not vectorized: estimated iteration count smaller "
2435 "than specified loop bound parameter or minimum "
2436 "profitable iterations (whichever is more "
2437 "conservative).\n");
2438 return -1;
2441 return 1;
2444 static opt_result
2445 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2446 vec<data_reference_p> *datarefs,
2447 unsigned int *n_stmts)
2449 *n_stmts = 0;
2450 for (unsigned i = 0; i < loop->num_nodes; i++)
2451 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2452 !gsi_end_p (gsi); gsi_next (&gsi))
2454 gimple *stmt = gsi_stmt (gsi);
2455 if (is_gimple_debug (stmt))
2456 continue;
2457 ++(*n_stmts);
2458 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2459 NULL, 0);
2460 if (!res)
2462 if (is_gimple_call (stmt) && loop->safelen)
2464 tree fndecl = gimple_call_fndecl (stmt), op;
2465 if (fndecl == NULL_TREE
2466 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2468 fndecl = gimple_call_arg (stmt, 0);
2469 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2470 fndecl = TREE_OPERAND (fndecl, 0);
2471 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2473 if (fndecl != NULL_TREE)
2475 cgraph_node *node = cgraph_node::get (fndecl);
2476 if (node != NULL && node->simd_clones != NULL)
2478 unsigned int j, n = gimple_call_num_args (stmt);
2479 for (j = 0; j < n; j++)
2481 op = gimple_call_arg (stmt, j);
2482 if (DECL_P (op)
2483 || (REFERENCE_CLASS_P (op)
2484 && get_base_address (op)))
2485 break;
2487 op = gimple_call_lhs (stmt);
2488 /* Ignore #pragma omp declare simd functions
2489 if they don't have data references in the
2490 call stmt itself. */
2491 if (j == n
2492 && !(op
2493 && (DECL_P (op)
2494 || (REFERENCE_CLASS_P (op)
2495 && get_base_address (op)))))
2496 continue;
2500 return res;
2502 /* If dependence analysis will give up due to the limit on the
2503 number of datarefs, stop here and fail fatally. */
2504 if (datarefs->length ()
2505 > (unsigned)param_loop_max_datarefs_for_datadeps)
2506 return opt_result::failure_at (stmt, "exceeded param "
2507 "loop-max-datarefs-for-datadeps\n");
2509 return opt_result::success ();
2512 /* Look for SLP-only access groups and turn each individual access into its own
2513 group. */
2514 static void
2515 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2517 unsigned int i;
2518 struct data_reference *dr;
2520 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2522 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2523 FOR_EACH_VEC_ELT (datarefs, i, dr)
2525 gcc_assert (DR_REF (dr));
2526 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2528 /* Check if the load is a part of an interleaving chain. */
2529 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2531 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2532 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2533 unsigned int group_size = DR_GROUP_SIZE (first_element);
2535 /* Check whether this is an SLP-only group. */
2536 if (!STMT_SLP_TYPE (stmt_info)
2537 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2539 /* Dissolve the group. */
2540 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2542 stmt_vec_info vinfo = first_element;
2543 while (vinfo)
2545 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2546 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2547 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2548 DR_GROUP_SIZE (vinfo) = 1;
2549 if (STMT_VINFO_STRIDED_P (first_element)
2550 /* We cannot handle stores with gaps. */
2551 || DR_IS_WRITE (dr_info->dr))
2553 STMT_VINFO_STRIDED_P (vinfo) = true;
2554 DR_GROUP_GAP (vinfo) = 0;
2556 else
2557 DR_GROUP_GAP (vinfo) = group_size - 1;
2558 /* Duplicate and adjust alignment info, it needs to
2559 be present on each group leader, see dr_misalignment. */
2560 if (vinfo != first_element)
2562 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2563 dr_info2->target_alignment = dr_info->target_alignment;
2564 int misalignment = dr_info->misalignment;
2565 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2567 HOST_WIDE_INT diff
2568 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2569 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2570 unsigned HOST_WIDE_INT align_c
2571 = dr_info->target_alignment.to_constant ();
2572 misalignment = (misalignment + diff) % align_c;
2574 dr_info2->misalignment = misalignment;
2576 vinfo = next;
2583 /* Determine if operating on full vectors for LOOP_VINFO might leave
2584 some scalar iterations still to do. If so, decide how we should
2585 handle those scalar iterations. The possibilities are:
2587 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2588 In this case:
2590 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2591 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2592 LOOP_VINFO_PEELING_FOR_NITER == false
2594 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2595 to handle the remaining scalar iterations. In this case:
2597 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2598 LOOP_VINFO_PEELING_FOR_NITER == true
2600 There are two choices:
2602 (2a) Consider vectorizing the epilogue loop at the same VF as the
2603 main loop, but using partial vectors instead of full vectors.
2604 In this case:
2606 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2608 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2609 In this case:
2611 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
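   For example (illustrative numbers only): a loop of 10 scalar iterations
   with a VF of 4 runs 3 partially-populated vector iterations in case (1),
   or 2 full vector iterations plus an epilogue covering the remaining 2
   scalar iterations in case (2).  */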
2614 opt_result
2615 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2617 /* Determine whether there would be any scalar iterations left over. */
2618 bool need_peeling_or_partial_vectors_p
2619 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2621 /* Decide whether to vectorize the loop with partial vectors. */
2622 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2623 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2624 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625 && need_peeling_or_partial_vectors_p)
2627 /* For partial-vector-usage=1, try to push the handling of partial
2628 vectors to the epilogue, with the main loop continuing to operate
2629 on full vectors.
2631 If we are unrolling we also do not want to use partial vectors. This
2632 is to avoid the overhead of generating multiple masks and also to
2633 avoid having to execute entire iterations of FALSE masked instructions
2634 when dealing with one or fewer full iterations.
2636 ??? We could then end up failing to use partial vectors if we
2637 decide to peel iterations into a prologue, and if the main loop
2638 then ends up processing fewer than VF iterations. */
2639 if ((param_vect_partial_vector_usage == 1
2640 || loop_vinfo->suggested_unroll_factor > 1)
2641 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2642 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2643 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2644 else
2645 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "operating on %s vectors%s.\n",
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2652 ? "partial" : "full",
2653 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2654 ? " for epilogue loop" : "");
2656 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2657 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2658 && need_peeling_or_partial_vectors_p);
2660 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before the loop vectorization
2661 analysis, when we don't yet know whether the loop will be vectorized with
2662 partial vectors (see tree-vect-loop-manip.cc for more details).
2664 However, the SELECT_VL vectorization style should only be applied to
2665 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2666 the number of elements to be processed in each iteration.
2668 After the loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2669 if the loop is not vectorized with partial vectors. */
2670 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2671 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2673 return opt_result::success ();
2676 /* Function vect_analyze_loop_2.
2678 Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2679 analyses will record information in some members of LOOP_VINFO. FATAL
2680 indicates whether some analysis hits a fatal error. If a non-NULL pointer
2681 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2682 worked-out suggested unroll factor, while a NULL pointer means we are
2683 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2684 holds the SLP decision made when the suggested unroll factor was worked
2685 out. */
2686 static opt_result
2687 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2688 unsigned *suggested_unroll_factor,
2689 bool& slp_done_for_suggested_uf)
2691 opt_result ok = opt_result::success ();
2692 int res;
2693 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2694 poly_uint64 min_vf = 2;
2695 loop_vec_info orig_loop_vinfo = NULL;
2697 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2698 loop_vec_info of the first vectorized loop. */
2699 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2700 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2701 else
2702 orig_loop_vinfo = loop_vinfo;
2703 gcc_assert (orig_loop_vinfo);
2705 /* The first group of checks is independent of the vector size. */
2706 fatal = true;
2708 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2709 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2710 return opt_result::failure_at (vect_location,
2711 "not vectorized: simd if(0)\n");
2713 /* Find all data references in the loop (which correspond to vdefs/vuses)
2714 and analyze their evolution in the loop. */
2716 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2718 /* Gather the data references and count stmts in the loop. */
2719 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2721 opt_result res
2722 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2723 &LOOP_VINFO_DATAREFS (loop_vinfo),
2724 &LOOP_VINFO_N_STMTS (loop_vinfo));
2725 if (!res)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729 "not vectorized: loop contains function "
2730 "calls or data references that cannot "
2731 "be analyzed\n");
2732 return res;
2734 loop_vinfo->shared->save_datarefs ();
2736 else
2737 loop_vinfo->shared->check_datarefs ();
2739 /* Analyze the data references and also adjust the minimal
2740 vectorization factor according to the loads and stores. */
2742 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2743 if (!ok)
2745 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2747 "bad data references.\n");
2748 return ok;
2751 /* Check if we are applying unroll factor now. */
2752 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2753 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2755 /* If the SLP decision was false when the suggested unroll factor was
2756 worked out, and we are now applying that unroll factor, we can simply
2757 skip all SLP-related analyses this time. */
2758 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2760 /* Classify all cross-iteration scalar data-flow cycles.
2761 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2762 vect_analyze_scalar_cycles (loop_vinfo, slp);
2764 vect_pattern_recog (loop_vinfo);
2766 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2768 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2769 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2771 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2772 if (!ok)
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "bad data access.\n");
2777 return ok;
2780 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2782 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2783 if (!ok)
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2787 "unexpected pattern.\n");
2788 return ok;
2791 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer treated as fatal. */
2792 fatal = false;
2794 /* Analyze data dependences between the data-refs in the loop
2795 and adjust the maximum vectorization factor according to
2796 the dependences.
2797 FORNOW: fail at the first data dependence that we encounter. */
2799 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2800 if (!ok)
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804 "bad data dependence.\n");
2805 return ok;
2807 if (max_vf != MAX_VECTORIZATION_FACTOR
2808 && maybe_lt (max_vf, min_vf))
2809 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2810 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2812 ok = vect_determine_vectorization_factor (loop_vinfo);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "can't determine vectorization factor.\n");
2818 return ok;
2821 /* Compute the scalar iteration cost. */
2822 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2824 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2826 if (slp)
2828 /* Check the SLP opportunities in the loop, analyze and build
2829 SLP trees. */
2830 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2831 if (!ok)
2832 return ok;
2834 /* If there are any SLP instances mark them as pure_slp. */
2835 slp = vect_make_slp_decision (loop_vinfo);
2836 if (slp)
2838 /* Find stmts that need to be both vectorized and SLPed. */
2839 vect_detect_hybrid_slp (loop_vinfo);
2841 /* Update the vectorization factor based on the SLP decision. */
2842 vect_update_vf_for_slp (loop_vinfo);
2844 /* Optimize the SLP graph with the vectorization factor fixed. */
2845 vect_optimize_slp (loop_vinfo);
2847 /* Gather the loads reachable from the SLP graph entries. */
2848 vect_gather_slp_loads (loop_vinfo);
2852 bool saved_can_use_partial_vectors_p
2853 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2855 /* We don't expect to have to roll back to anything other than an empty
2856 set of rgroups. */
2857 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2859 /* This is the point where we can re-start analysis with SLP forced off. */
2860 start_over:
2862 /* Apply the suggested unrolling factor; this was determined by the backend
2863 during finish_cost the first time we ran the analysis for this
2864 vector mode. */
2865 if (applying_suggested_uf)
2866 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2868 /* Now the vectorization factor is final. */
2869 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2870 gcc_assert (known_ne (vectorization_factor, 0U));
2872 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2874 dump_printf_loc (MSG_NOTE, vect_location,
2875 "vectorization_factor = ");
2876 dump_dec (MSG_NOTE, vectorization_factor);
2877 dump_printf (MSG_NOTE, ", niters = %wd\n",
2878 LOOP_VINFO_INT_NITERS (loop_vinfo));
2881 if (max_vf != MAX_VECTORIZATION_FACTOR
2882 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2883 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2885 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2887 /* Analyze the alignment of the data-refs in the loop.
2888 Fail if a data reference is found that cannot be vectorized. */
2890 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2891 if (!ok)
2893 if (dump_enabled_p ())
2894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2895 "bad data alignment.\n");
2896 return ok;
2899 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2900 It is important to call pruning after vect_analyze_data_ref_accesses,
2901 since we use grouping information gathered by interleaving analysis. */
2902 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2903 if (!ok)
2904 return ok;
2906 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2907 vectorization, since we do not want to add extra peeling or
2908 add versioning for alignment. */
2909 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2910 /* This pass will decide on using loop versioning and/or loop peeling in
2911 order to enhance the alignment of data references in the loop. */
2912 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2913 if (!ok)
2914 return ok;
2916 if (slp)
2918 /* Analyze operations in the SLP instances. Note this may
2919 remove unsupported SLP instances which makes the above
2920 SLP kind detection invalid. */
2921 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2922 vect_slp_analyze_operations (loop_vinfo);
2923 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2925 ok = opt_result::failure_at (vect_location,
2926 "unsupported SLP instances\n");
2927 goto again;
2930 /* Check whether any load in ALL SLP instances is possibly permuted. */
2931 slp_tree load_node, slp_root;
2932 unsigned i, x;
2933 slp_instance instance;
2934 bool can_use_lanes = true;
2935 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2937 slp_root = SLP_INSTANCE_TREE (instance);
2938 int group_size = SLP_TREE_LANES (slp_root);
2939 tree vectype = SLP_TREE_VECTYPE (slp_root);
2940 bool loads_permuted = false;
2941 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2943 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2944 continue;
2945 unsigned j;
2946 stmt_vec_info load_info;
2947 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2948 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2950 loads_permuted = true;
2951 break;
2955 /* If the loads and stores can be handled with load/store-lane
2956 instructions, record it and move on to the next instance. */
2957 if (loads_permuted
2958 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2959 && vect_store_lanes_supported (vectype, group_size, false)
2960 != IFN_LAST)
2962 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2963 if (STMT_VINFO_GROUPED_ACCESS
2964 (SLP_TREE_REPRESENTATIVE (load_node)))
2966 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2967 (SLP_TREE_REPRESENTATIVE (load_node));
2968 /* Use SLP for strided accesses (or if we can't
2969 use load-lanes). */
2970 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2971 || vect_load_lanes_supported
2972 (STMT_VINFO_VECTYPE (stmt_vinfo),
2973 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2974 break;
2977 can_use_lanes
2978 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2980 if (can_use_lanes && dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "SLP instance %p can use load/store-lanes\n",
2983 (void *) instance);
2985 else
2987 can_use_lanes = false;
2988 break;
2992 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2993 with SLP disabled. */
2994 if (can_use_lanes)
2996 ok = opt_result::failure_at (vect_location,
2997 "Built SLP cancelled: can use "
2998 "load/store-lanes\n");
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "Built SLP cancelled: all SLP instances support "
3002 "load/store-lanes\n");
3003 goto again;
3007 /* Dissolve SLP-only groups. */
3008 vect_dissolve_slp_only_groups (loop_vinfo);
3010 /* Scan all the remaining operations in the loop that are not subject
3011 to SLP and make sure they are vectorizable. */
3012 ok = vect_analyze_loop_operations (loop_vinfo);
3013 if (!ok)
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "bad operation or unsupported loop bound.\n");
3018 return ok;
3021 /* For now, we don't expect to mix both masking and length approaches for one
3022 loop; disable the use of partial vectors if both are recorded. */
3023 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3024 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3025 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "can't vectorize a loop with partial vectors"
3030 " because we don't expect to mix different"
3031 " approaches with partial vectors for the"
3032 " same loop.\n");
3033 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3036 /* If we still have the option of using partial vectors,
3037 check whether we can generate the necessary loop controls. */
3038 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3040 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3042 if (!vect_verify_full_masking (loop_vinfo)
3043 && !vect_verify_full_masking_avx512 (loop_vinfo))
3044 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3046 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3047 if (!vect_verify_loop_lens (loop_vinfo))
3048 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3051 /* If we're vectorizing a loop that uses length "controls" and
3052 can iterate more than once, we apply decrementing IV approach
3053 in loop control. */
3054 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3055 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3056 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3057 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3058 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3059 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3060 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3062 /* If a loop uses length controls and has a decrementing loop control IV,
3063 we will normally pass that IV through a MIN_EXPR to calculate the
3064 basis for the length controls. E.g. in a loop that processes one
3065 element per scalar iteration, the number of elements would be
3066 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3068 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3069 step, since only the final iteration of the vector loop can have
3070 inactive lanes.
3072 However, some targets have a dedicated instruction for calculating the
3073 preferred length, given the total number of elements that still need to
3074 be processed. This is encapsulated in the SELECT_VL internal function.
3076 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3077 to determine the basis for the length controls. However, unlike the
3078 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3079 lanes inactive in any iteration of the vector loop, not just the last
3080 iteration. This SELECT_VL approach therefore requires us to use pointer
3081 IVs with variable steps.
3083 Once we've decided how many elements should be processed by one
3084 iteration of the vector loop, we need to populate the rgroup controls.
3085 If a loop has multiple rgroups, we need to make sure that those rgroups
3086 "line up" (that is, they must be consistent about which elements are
3087 active and which aren't). This is done by vect_adjust_loop_lens_control.
3089 In principle, it would be possible to use vect_adjust_loop_lens_control
3090 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3091 However:
3093 (1) In practice, it only makes sense to use SELECT_VL when a vector
3094 operation will be controlled directly by the result. It is not
3095 worth using SELECT_VL if it would only be the input to other
3096 calculations.
3098 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3099 pointer IV will need N updates by a variable amount (N-1 updates
3100 within the iteration and 1 update to move to the next iteration).
3102 Because of this, we prefer to use the MIN_EXPR approach whenever there
3103 is more than one length control.
3105 In addition, SELECT_VL always operates to a granularity of 1 unit.
3106 If we wanted to use it to control an SLP operation on N consecutive
3107 elements, we would need to make the SELECT_VL inputs measure scalar
3108 iterations (rather than elements) and then multiply the SELECT_VL
3109 result by N. But using SELECT_VL this way is inefficient because
3110 of (1) above.
3112 Furthermore, we don't apply SELECT_VL to a single-rgroup loop when both
3113 of the following are satisfied:
3115 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3116 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3118 Since SELECT_VL (with its variable step) makes SCEV analysis fail, we
3119 would lose the benefit of subsequent unrolling optimizations, so we
3120 prefer the MIN_EXPR approach in this situation. */
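  /* Illustrative sketch only (schematic GIMPLE, operand lists simplified and
     names invented): with a single length control the two styles differ
     roughly as

	 MIN_EXPR style                      SELECT_VL style
	   len = MIN_EXPR <remain, VF>;        len = .SELECT_VL (remain, VF);
	   ... .MASK_LEN_LOAD (ptr, len) ...   ... .MASK_LEN_LOAD (ptr, len) ...
	   ptr = ptr + VF * step;              ptr = ptr + len * step;
	   remain = remain - len;              remain = remain - len;

     i.e. the MIN_EXPR style keeps pointer IVs on an invariant step, while
     the SELECT_VL style needs variable-step pointer IVs.  */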
3121 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3123 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3124 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3125 OPTIMIZE_FOR_SPEED)
3126 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3127 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3128 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3129 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3130 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3133 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3134 assuming that the loop will be used as a main loop. We will redo
3135 this analysis later if we instead decide to use the loop as an
3136 epilogue loop. */
3137 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3138 if (!ok)
3139 return ok;
3141 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3142 to be able to handle fewer than VF scalars, or needs to have a lower VF
3143 than the main loop. */
3144 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3145 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3147 poly_uint64 unscaled_vf
3148 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3149 orig_loop_vinfo->suggested_unroll_factor);
3150 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3151 return opt_result::failure_at (vect_location,
3152 "Vectorization factor too high for"
3153 " epilogue loop.\n");
3156 /* Check the costings of the loop make vectorizing worthwhile. */
3157 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3158 if (res < 0)
3160 ok = opt_result::failure_at (vect_location,
3161 "Loop costings may not be worthwhile.\n");
3162 goto again;
3164 if (!res)
3165 return opt_result::failure_at (vect_location,
3166 "Loop costings not worthwhile.\n");
3168 /* If an epilogue loop is required make sure we can create one. */
3169 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3170 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3174 if (!vect_can_advance_ivs_p (loop_vinfo)
3175 || !slpeel_can_duplicate_loop_p (loop,
3176 LOOP_VINFO_IV_EXIT (loop_vinfo),
3177 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3179 ok = opt_result::failure_at (vect_location,
3180 "not vectorized: can't create required "
3181 "epilog loop\n");
3182 goto again;
3186 /* During peeling, we need to check if the number of loop iterations is
3187 enough for both the peeled prolog loop and the vector loop. This check
3188 can be merged along with threshold check of loop versioning, so
3189 increase threshold for this case if necessary.
3191 If we are analyzing an epilogue we still want to check what its
3192 versioning threshold would be. If we decide to vectorize the epilogues we
3193 will want to use the lowest versioning threshold of all epilogues and the
3194 main loop. This will enable us to enter a vectorized epilogue even when
3195 versioning the loop. We can't simply check whether the epilogue requires
3196 versioning though since we may have skipped some versioning checks when
3197 analyzing the epilogue. For instance, checks for alias versioning will be
3198 skipped when dealing with epilogues as we assume we already checked them
3199 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3200 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3202 poly_uint64 niters_th = 0;
3203 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3205 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3207 /* Niters for peeled prolog loop. */
3208 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3210 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3211 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3212 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3214 else
3215 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3218 /* Niters for at least one iteration of vectorized loop. */
3219 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3220 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3221 /* One additional iteration because of peeling for gap. */
3222 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3223 niters_th += 1;
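      /* Illustrative numbers only: with prolog peeling of 3, a VF of 8 and
	 peeling for gaps, the computation above gives niters_th = 3 + 8 + 1
	 == 12 before the cost-model threshold is folded in below.  */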
3225 /* Use the same condition as vect_transform_loop to decide when to use
3226 the cost to determine a versioning threshold. */
3227 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3228 && ordered_p (th, niters_th))
3229 niters_th = ordered_max (poly_uint64 (th), niters_th);
3231 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3234 gcc_assert (known_eq (vectorization_factor,
3235 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3237 slp_done_for_suggested_uf = slp;
3239 /* Ok to vectorize! */
3240 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3241 return opt_result::success ();
3243 again:
3244 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3245 gcc_assert (!ok);
3247 /* Try again with SLP forced off, but if we didn't do any SLP there is
3248 no point in re-trying. */
3249 if (!slp)
3250 return ok;
3252 /* If the SLP decision was true when the suggested unroll factor was
3253 worked out, and we are now applying that unroll factor, we don't need
3254 to re-try any more. */
3255 if (applying_suggested_uf && slp_done_for_suggested_uf)
3256 return ok;
3258 /* If there are reduction chains, re-trying will fail anyway. */
3259 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3260 return ok;
3262 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3263 via interleaving or lane instructions. */
3264 slp_instance instance;
3265 slp_tree node;
3266 unsigned i, j;
3267 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3269 stmt_vec_info vinfo;
3270 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3271 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3272 continue;
3273 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3274 unsigned int size = DR_GROUP_SIZE (vinfo);
3275 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3276 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3277 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3278 && ! vect_grouped_store_supported (vectype, size))
3279 return opt_result::failure_at (vinfo->stmt,
3280 "unsupported grouped store\n");
3281 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3283 vinfo = SLP_TREE_REPRESENTATIVE (node);
3284 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3286 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3287 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3288 size = DR_GROUP_SIZE (vinfo);
3289 vectype = STMT_VINFO_VECTYPE (vinfo);
3290 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3291 && ! vect_grouped_load_supported (vectype, single_element_p,
3292 size))
3293 return opt_result::failure_at (vinfo->stmt,
3294 "unsupported grouped load\n");
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "re-trying with SLP disabled\n");
3303 /* Roll back state appropriately. No SLP this time. */
3304 slp = false;
3306 /* Restore the vectorization factor to what it was without SLP. */
3306 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3307 /* Free the SLP instances. */
3308 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3309 vect_free_slp_instance (instance);
3310 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3311 /* Reset SLP type to loop_vect on all stmts. */
3312 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3314 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3315 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3316 !gsi_end_p (si); gsi_next (&si))
3318 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3319 STMT_SLP_TYPE (stmt_info) = loop_vect;
3320 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3321 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3323 /* vectorizable_reduction adjusts reduction stmt def-types,
3324 restore them to that of the PHI. */
3325 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3326 = STMT_VINFO_DEF_TYPE (stmt_info);
3327 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3328 (STMT_VINFO_REDUC_DEF (stmt_info)))
3329 = STMT_VINFO_DEF_TYPE (stmt_info);
3332 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3333 !gsi_end_p (si); gsi_next (&si))
3335 if (is_gimple_debug (gsi_stmt (si)))
3336 continue;
3337 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3338 STMT_SLP_TYPE (stmt_info) = loop_vect;
3339 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3341 stmt_vec_info pattern_stmt_info
3342 = STMT_VINFO_RELATED_STMT (stmt_info);
3343 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3344 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3346 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3347 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3348 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3349 !gsi_end_p (pi); gsi_next (&pi))
3350 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3351 = loop_vect;
3355 /* Free optimized alias test DDRS. */
3356 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3357 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3358 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3359 /* Reset target cost data. */
3360 delete loop_vinfo->vector_costs;
3361 loop_vinfo->vector_costs = nullptr;
3362 /* Reset accumulated rgroup information. */
3363 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3364 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3365 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3366 /* Reset assorted flags. */
3367 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3368 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3369 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3370 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3371 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3372 = saved_can_use_partial_vectors_p;
3374 goto start_over;
3377 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3378 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3379 OLD_LOOP_VINFO is better unless something specifically indicates
3380 otherwise.
3382 Note that this deliberately isn't a partial order. */
3384 static bool
3385 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3386 loop_vec_info old_loop_vinfo)
3388 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3389 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3391 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3392 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3394 /* Always prefer a VF of loop->simdlen over any other VF. */
3395 if (loop->simdlen)
3397 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3398 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3399 if (new_simdlen_p != old_simdlen_p)
3400 return new_simdlen_p;
3403 const auto *old_costs = old_loop_vinfo->vector_costs;
3404 const auto *new_costs = new_loop_vinfo->vector_costs;
3405 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3406 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3408 return new_costs->better_main_loop_than_p (old_costs);
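/* As a hedged illustration of the simdlen preference in
   vect_better_loop_vinfo_p above (example values, not taken from any
   particular target): for a loop annotated with

     #pragma omp simd simdlen(8)
     for (int i = 0; i < n; i++)
       a[i] += b[i];

   a candidate whose vectorization factor is 8 is preferred over one whose
   factor is 16 even if the latter has the lower cost, because only the
   former matches loop->simdlen; the cost comparison is consulted only when
   neither or both candidates match.  */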
3411 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3412 true if we should. */
3414 static bool
3415 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3416 loop_vec_info old_loop_vinfo)
3418 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3419 return false;
3421 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_NOTE, vect_location,
3423 "***** Preferring vector mode %s to vector mode %s\n",
3424 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3425 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3426 return true;
3429 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3430 not NULL. Set AUTODETECTED_VECTOR_MODE if it is still VOIDmode and
3431 advance MODE_I to the next mode useful to analyze.
3432 Return the loop_vinfo on success and wrapped null on failure. */
3434 static opt_loop_vec_info
3435 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3436 const vect_loop_form_info *loop_form_info,
3437 loop_vec_info main_loop_vinfo,
3438 const vector_modes &vector_modes, unsigned &mode_i,
3439 machine_mode &autodetected_vector_mode,
3440 bool &fatal)
3442 loop_vec_info loop_vinfo
3443 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3445 machine_mode vector_mode = vector_modes[mode_i];
3446 loop_vinfo->vector_mode = vector_mode;
3447 unsigned int suggested_unroll_factor = 1;
3448 bool slp_done_for_suggested_uf = false;
3450 /* Run the main analysis. */
3451 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3452 &suggested_unroll_factor,
3453 slp_done_for_suggested_uf);
3454 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_NOTE, vect_location,
3456 "***** Analysis %s with vector mode %s\n",
3457 res ? "succeeded" : "failed",
3458 GET_MODE_NAME (loop_vinfo->vector_mode));
3460 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3462 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_NOTE, vect_location,
3464 "***** Re-trying analysis for unrolling"
3465 " with unroll factor %d and slp %s.\n",
3466 suggested_unroll_factor,
3467 slp_done_for_suggested_uf ? "on" : "off");
3468 loop_vec_info unroll_vinfo
3469 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3470 unroll_vinfo->vector_mode = vector_mode;
3471 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3472 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3473 slp_done_for_suggested_uf);
3474 if (new_res)
3476 delete loop_vinfo;
3477 loop_vinfo = unroll_vinfo;
3479 else
3480 delete unroll_vinfo;
3483 /* Remember the autodetected vector mode. */
3484 if (vector_mode == VOIDmode)
3485 autodetected_vector_mode = loop_vinfo->vector_mode;
3487 /* Advance mode_i, first skipping modes that would yield the
3488 same analysis result. */
3489 while (mode_i + 1 < vector_modes.length ()
3490 && vect_chooses_same_modes_p (loop_vinfo,
3491 vector_modes[mode_i + 1]))
3493 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_NOTE, vect_location,
3495 "***** The result for vector mode %s would"
3496 " be the same\n",
3497 GET_MODE_NAME (vector_modes[mode_i + 1]));
3498 mode_i += 1;
3500 if (mode_i + 1 < vector_modes.length ()
3501 && VECTOR_MODE_P (autodetected_vector_mode)
3502 && (related_vector_mode (vector_modes[mode_i + 1],
3503 GET_MODE_INNER (autodetected_vector_mode))
3504 == autodetected_vector_mode)
3505 && (related_vector_mode (autodetected_vector_mode,
3506 GET_MODE_INNER (vector_modes[mode_i + 1]))
3507 == vector_modes[mode_i + 1]))
3509 if (dump_enabled_p ())
3510 dump_printf_loc (MSG_NOTE, vect_location,
3511 "***** Skipping vector mode %s, which would"
3512 " repeat the analysis for %s\n",
3513 GET_MODE_NAME (vector_modes[mode_i + 1]),
3514 GET_MODE_NAME (autodetected_vector_mode));
3515 mode_i += 1;
3517 mode_i++;
3519 if (!res)
3521 delete loop_vinfo;
3522 if (fatal)
3523 gcc_checking_assert (main_loop_vinfo == NULL);
3524 return opt_loop_vec_info::propagate_failure (res);
3527 return opt_loop_vec_info::success (loop_vinfo);
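/* For example (illustrative and target-dependent): if the autodetected mode
   is V16QImode and the next candidate V8HImode has the same vector size,
   the two related_vector_mode queries in vect_analyze_loop_1 above map each
   mode onto the other, so analyzing V8HImode would merely repeat the
   V16QImode analysis and the candidate is skipped.  */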
3530 /* Function vect_analyze_loop.
3532 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3533 for it. The different analyses will record information in the
3534 loop_vec_info struct. */
3535 opt_loop_vec_info
3536 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3538 DUMP_VECT_SCOPE ("analyze_loop_nest");
3540 if (loop_outer (loop)
3541 && loop_vec_info_for_loop (loop_outer (loop))
3542 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3543 return opt_loop_vec_info::failure_at (vect_location,
3544 "outer-loop already vectorized.\n");
3546 if (!find_loop_nest (loop, &shared->loop_nest))
3547 return opt_loop_vec_info::failure_at
3548 (vect_location,
3549 "not vectorized: loop nest containing two or more consecutive inner"
3550 " loops cannot be vectorized\n");
3552 /* Analyze the loop form. */
3553 vect_loop_form_info loop_form_info;
3554 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3555 if (!res)
3557 if (dump_enabled_p ())
3558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3559 "bad loop form.\n");
3560 return opt_loop_vec_info::propagate_failure (res);
3562 if (!integer_onep (loop_form_info.assumptions))
3564 /* We consider vectorizing this loop by versioning it under
3565 some assumptions. In order to do this, we need to clear
3566 existing information computed by scev and niter analyzer. */
3567 scev_reset_htab ();
3568 free_numbers_of_iterations_estimates (loop);
3569 /* Also set a flag for this loop so that the following scev and niter
3570 analyses are done under the assumptions.
3571 loop_constraint_set (loop, LOOP_C_FINITE);
3574 auto_vector_modes vector_modes;
3575 /* Autodetect first vector size we try. */
3576 vector_modes.safe_push (VOIDmode);
3577 unsigned int autovec_flags
3578 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3579 loop->simdlen != 0);
3580 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3581 && !unlimited_cost_model (loop));
3582 machine_mode autodetected_vector_mode = VOIDmode;
3583 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3584 unsigned int mode_i = 0;
3585 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3587 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3588 a mode has not been analyzed. */
3589 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3590 for (unsigned i = 0; i < vector_modes.length (); ++i)
3591 cached_vf_per_mode.safe_push (0);
3593 /* First determine the main loop vectorization mode, either the first
3594 one that works, starting with auto-detecting the vector mode and then
3595 following the target's order of preference, or the one with the
3596 lowest cost if pick_lowest_cost_p. */
3597 while (1)
3599 bool fatal;
3600 unsigned int last_mode_i = mode_i;
3601 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3602 failed. */
3603 cached_vf_per_mode[last_mode_i] = -1;
3604 opt_loop_vec_info loop_vinfo
3605 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3606 NULL, vector_modes, mode_i,
3607 autodetected_vector_mode, fatal);
3608 if (fatal)
3609 break;
3611 if (loop_vinfo)
3613 /* Analysis has been successful so update the VF value. The
3614 VF should always be a multiple of unroll_factor and we want to
3615 capture the original VF here. */
3616 cached_vf_per_mode[last_mode_i]
3617 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3618 loop_vinfo->suggested_unroll_factor);
3619 /* Once we hit the desired simdlen for the first time,
3620 discard any previous attempts. */
3621 if (simdlen
3622 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3624 delete first_loop_vinfo;
3625 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3626 simdlen = 0;
3628 else if (pick_lowest_cost_p
3629 && first_loop_vinfo
3630 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3632 /* Pick loop_vinfo over first_loop_vinfo. */
3633 delete first_loop_vinfo;
3634 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3636 if (first_loop_vinfo == NULL)
3637 first_loop_vinfo = loop_vinfo;
3638 else
3640 delete loop_vinfo;
3641 loop_vinfo = opt_loop_vec_info::success (NULL);
3644 /* Commit to first_loop_vinfo if we have no reason to try
3645 alternatives. */
3646 if (!simdlen && !pick_lowest_cost_p)
3647 break;
3649 if (mode_i == vector_modes.length ()
3650 || autodetected_vector_mode == VOIDmode)
3651 break;
3653 /* Try the next biggest vector size. */
3654 if (dump_enabled_p ())
3655 dump_printf_loc (MSG_NOTE, vect_location,
3656 "***** Re-trying analysis with vector mode %s\n",
3657 GET_MODE_NAME (vector_modes[mode_i]));
3659 if (!first_loop_vinfo)
3660 return opt_loop_vec_info::propagate_failure (res);
3662 if (dump_enabled_p ())
3663 dump_printf_loc (MSG_NOTE, vect_location,
3664 "***** Choosing vector mode %s\n",
3665 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3667 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3668 enabled, SIMDUID is not set, it is the innermost loop and we have
3669 either already found the loop's SIMDLEN or there was no SIMDLEN to
3670 begin with.
3671 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3672 bool vect_epilogues = (!simdlen
3673 && loop->inner == NULL
3674 && param_vect_epilogues_nomask
3675 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3676 && !loop->simduid);
3677 if (!vect_epilogues)
3678 return first_loop_vinfo;
3680 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3681 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3683 /* For epilogues start the analysis from the first mode. The motivation
3684 behind starting from the beginning comes from cases where the VECTOR_MODES
3685 array may contain length-agnostic and length-specific modes. Their
3686 ordering is not guaranteed, so we could end up picking a mode for the main
3687 loop that is after the epilogue's optimal mode. */
3688 vector_modes[0] = autodetected_vector_mode;
3689 mode_i = 0;
3691 bool supports_partial_vectors =
3692 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3693 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3695 while (1)
3697 /* If the target does not support partial vectors we can shorten the
3698 number of modes to analyze for the epilogue as we know we can't pick a
3699 mode that would lead to a VF at least as big as the
3700 FIRST_VINFO_VF. */
3701 if (!supports_partial_vectors
3702 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3704 mode_i++;
3705 if (mode_i == vector_modes.length ())
3706 break;
3707 continue;
3710 if (dump_enabled_p ())
3711 dump_printf_loc (MSG_NOTE, vect_location,
3712 "***** Re-trying epilogue analysis with vector "
3713 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3715 bool fatal;
3716 opt_loop_vec_info loop_vinfo
3717 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3718 first_loop_vinfo,
3719 vector_modes, mode_i,
3720 autodetected_vector_mode, fatal);
3721 if (fatal)
3722 break;
3724 if (loop_vinfo)
3726 if (pick_lowest_cost_p)
3728 /* Keep trying to roll back vectorization attempts while the
3729 loop_vec_infos they produced were worse than this one. */
3730 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3731 while (!vinfos.is_empty ()
3732 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3734 gcc_assert (vect_epilogues);
3735 delete vinfos.pop ();
3738 /* For now only allow one epilogue loop. */
3739 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3741 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3742 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3743 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3744 || maybe_ne (lowest_th, 0U));
3745 /* Keep track of the known smallest versioning
3746 threshold. */
3747 if (ordered_p (lowest_th, th))
3748 lowest_th = ordered_min (lowest_th, th);
3750 else
3752 delete loop_vinfo;
3753 loop_vinfo = opt_loop_vec_info::success (NULL);
3756 /* For now only allow one epilogue loop, but allow
3757 pick_lowest_cost_p to replace it, so commit to the
3758 first epilogue if we have no reason to try alternatives. */
3759 if (!pick_lowest_cost_p)
3760 break;
3763 if (mode_i == vector_modes.length ())
3764 break;
3768 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3770 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3771 if (dump_enabled_p ())
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 "***** Choosing epilogue vector mode %s\n",
3774 GET_MODE_NAME
3775 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3778 return first_loop_vinfo;
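/* A hedged example of the overall outcome (numbers purely illustrative):
   for a loop with 1003 iterations vectorized with a main-loop VF of 16,
   62 vector iterations cover 992 scalar iterations and leave 11 behind; an
   epilogue loop_vinfo pushed onto epilogue_vinfos above (for instance one
   with VF 8, or a partial-vector one) lets most of that remainder be
   handled with vector code instead of a scalar epilogue.  */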
3781 /* Return true if there is an in-order reduction function for CODE, storing
3782 it in *REDUC_FN if so. */
3784 static bool
3785 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3787 /* We support MINUS_EXPR by negating the operand. This also preserves an
3788 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3789 (-0.0) = -0.0. */
3790 if (code == PLUS_EXPR || code == MINUS_EXPR)
3792 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3793 return true;
3795 return false;
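/* For illustration: an in-order float reduction written as

     for (int i = 0; i < n; i++)
       s -= a[i];

   is handled through IFN_FOLD_LEFT_PLUS by negating the addend, i.e. it is
   treated as s += -a[i] each iteration, which is why MINUS_EXPR is accepted
   above alongside PLUS_EXPR.  */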
3798 /* Function reduction_fn_for_scalar_code
3800 Input:
3801 CODE - tree_code of a reduction operation.
3803 Output:
3804 REDUC_FN - the corresponding internal function to be used to reduce the
3805 vector of partial results into a single scalar result, or IFN_LAST
3806 if the operation is a supported reduction operation, but does not have
3807 such an internal function.
3809 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3811 bool
3812 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3814 if (code.is_tree_code ())
3815 switch (tree_code (code))
3817 case MAX_EXPR:
3818 *reduc_fn = IFN_REDUC_MAX;
3819 return true;
3821 case MIN_EXPR:
3822 *reduc_fn = IFN_REDUC_MIN;
3823 return true;
3825 case PLUS_EXPR:
3826 *reduc_fn = IFN_REDUC_PLUS;
3827 return true;
3829 case BIT_AND_EXPR:
3830 *reduc_fn = IFN_REDUC_AND;
3831 return true;
3833 case BIT_IOR_EXPR:
3834 *reduc_fn = IFN_REDUC_IOR;
3835 return true;
3837 case BIT_XOR_EXPR:
3838 *reduc_fn = IFN_REDUC_XOR;
3839 return true;
3841 case MULT_EXPR:
3842 case MINUS_EXPR:
3843 *reduc_fn = IFN_LAST;
3844 return true;
3846 default:
3847 return false;
3849 else
3850 switch (combined_fn (code))
3852 CASE_CFN_FMAX:
3853 *reduc_fn = IFN_REDUC_FMAX;
3854 return true;
3856 CASE_CFN_FMIN:
3857 *reduc_fn = IFN_REDUC_FMIN;
3858 return true;
3860 default:
3861 return false;
3865 /* If there is a neutral value X such that a reduction would not be affected
3866 by the introduction of additional X elements, return that X, otherwise
3867 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3868 of the scalar elements. If the reduction has just a single initial value
3869 then INITIAL_VALUE is that value, otherwise it is null.
3870 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3871 In that case no signed zero is returned. */
3873 tree
3874 neutral_op_for_reduction (tree scalar_type, code_helper code,
3875 tree initial_value, bool as_initial)
3877 if (code.is_tree_code ())
3878 switch (tree_code (code))
3880 case DOT_PROD_EXPR:
3881 case SAD_EXPR:
3882 case MINUS_EXPR:
3883 case BIT_IOR_EXPR:
3884 case BIT_XOR_EXPR:
3885 return build_zero_cst (scalar_type);
3886 case WIDEN_SUM_EXPR:
3887 case PLUS_EXPR:
3888 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3889 return build_real (scalar_type, dconstm0);
3890 else
3891 return build_zero_cst (scalar_type);
3893 case MULT_EXPR:
3894 return build_one_cst (scalar_type);
3896 case BIT_AND_EXPR:
3897 return build_all_ones_cst (scalar_type);
3899 case MAX_EXPR:
3900 case MIN_EXPR:
3901 return initial_value;
3903 default:
3904 return NULL_TREE;
3906 else
3907 switch (combined_fn (code))
3909 CASE_CFN_FMIN:
3910 CASE_CFN_FMAX:
3911 return initial_value;
3913 default:
3914 return NULL_TREE;
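/* For example: padding a PLUS_EXPR reduction over doubles with extra lanes
   uses -0.0 rather than 0.0 when signed zeros are honored and the value is
   not the initial one, because x + (-0.0) == x for every x (including
   x == -0.0), whereas x + 0.0 would turn a -0.0 result into +0.0.  */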
3918 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3919 STMT is printed with a message MSG. */
3921 static void
3922 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3924 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3927 /* Return true if we need an in-order reduction for operation CODE
3928 on type TYPE. */
3931 bool
3932 needs_fold_left_reduction_p (tree type, code_helper code)
3934 /* CHECKME: check for !flag_finite_math_only too? */
3935 if (SCALAR_FLOAT_TYPE_P (type))
3937 if (code.is_tree_code ())
3938 switch (tree_code (code))
3940 case MIN_EXPR:
3941 case MAX_EXPR:
3942 return false;
3944 default:
3945 return !flag_associative_math;
3947 else
3948 switch (combined_fn (code))
3950 CASE_CFN_FMIN:
3951 CASE_CFN_FMAX:
3952 return false;
3954 default:
3955 return !flag_associative_math;
3959 if (INTEGRAL_TYPE_P (type))
3960 return (!code.is_tree_code ()
3961 || !operation_no_trapping_overflow (type, tree_code (code)));
3963 if (SAT_FIXED_POINT_TYPE_P (type))
3964 return true;
3966 return false;
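/* A worked instance of why floating-point sums are kept in order unless
   -fassociative-math is in effect: with doubles, (-1e16 + 1e16) + 1.0 == 1.0
   but -1e16 + (1e16 + 1.0) == 0.0, because 1e16 + 1.0 rounds back to 1e16.
   Reordering the additions, as a vectorized reduction would, can therefore
   change the result.  */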
3969 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3970 has a handled computation expression. Store the main reduction
3971 operation in *CODE. */
3973 static bool
3974 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3975 tree loop_arg, code_helper *code,
3976 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3978 auto_bitmap visited;
3979 tree lookfor = PHI_RESULT (phi);
3980 ssa_op_iter curri;
3981 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3982 while (USE_FROM_PTR (curr) != loop_arg)
3983 curr = op_iter_next_use (&curri);
3984 curri.i = curri.numops;
3987 path.safe_push (std::make_pair (curri, curr));
3988 tree use = USE_FROM_PTR (curr);
3989 if (use == lookfor)
3990 break;
3991 gimple *def = SSA_NAME_DEF_STMT (use);
3992 if (gimple_nop_p (def)
3993 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3995 pop:
3998 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3999 curri = x.first;
4000 curr = x.second;
4002 curr = op_iter_next_use (&curri);
4003 /* Skip already visited or non-SSA operands (from iterating
4004 over PHI args). */
4005 while (curr != NULL_USE_OPERAND_P
4006 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4007 || ! bitmap_set_bit (visited,
4008 SSA_NAME_VERSION
4009 (USE_FROM_PTR (curr)))));
4011 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4012 if (curr == NULL_USE_OPERAND_P)
4013 break;
4015 else
4017 if (gimple_code (def) == GIMPLE_PHI)
4018 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4019 else
4020 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4021 while (curr != NULL_USE_OPERAND_P
4022 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4023 || ! bitmap_set_bit (visited,
4024 SSA_NAME_VERSION
4025 (USE_FROM_PTR (curr)))))
4026 curr = op_iter_next_use (&curri);
4027 if (curr == NULL_USE_OPERAND_P)
4028 goto pop;
4031 while (1);
4032 if (dump_file && (dump_flags & TDF_DETAILS))
4034 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4035 unsigned i;
4036 std::pair<ssa_op_iter, use_operand_p> *x;
4037 FOR_EACH_VEC_ELT (path, i, x)
4038 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4039 dump_printf (MSG_NOTE, "\n");
4042 /* Check whether the reduction path detected is valid. */
4043 bool fail = path.length () == 0;
4044 bool neg = false;
4045 int sign = -1;
4046 *code = ERROR_MARK;
4047 for (unsigned i = 1; i < path.length (); ++i)
4049 gimple *use_stmt = USE_STMT (path[i].second);
4050 gimple_match_op op;
4051 if (!gimple_extract_op (use_stmt, &op))
4053 fail = true;
4054 break;
4056 unsigned int opi = op.num_ops;
4057 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4059 /* The following makes sure we can compute the operand index
4060 easily plus it mostly disallows chaining via COND_EXPR condition
4061 operands. */
4062 for (opi = 0; opi < op.num_ops; ++opi)
4063 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4064 break;
4066 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4068 for (opi = 0; opi < op.num_ops; ++opi)
4069 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4070 break;
4072 if (opi == op.num_ops)
4074 fail = true;
4075 break;
4077 op.code = canonicalize_code (op.code, op.type);
4078 if (op.code == MINUS_EXPR)
4080 op.code = PLUS_EXPR;
4081 /* Track whether we negate the reduction value each iteration. */
4082 if (op.ops[1] == op.ops[opi])
4083 neg = ! neg;
4085 if (CONVERT_EXPR_CODE_P (op.code)
4086 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4088 else if (*code == ERROR_MARK)
4090 *code = op.code;
4091 sign = TYPE_SIGN (op.type);
4093 else if (op.code != *code)
4095 fail = true;
4096 break;
4098 else if ((op.code == MIN_EXPR
4099 || op.code == MAX_EXPR)
4100 && sign != TYPE_SIGN (op.type))
4102 fail = true;
4103 break;
4105 /* Check that the op is used in only a single stmt. For the
4106 non-value-changing tail and the last stmt allow out-of-loop uses.
4107 ??? We could relax this and handle arbitrary live stmts by
4108 forcing a scalar epilogue for example. */
4109 imm_use_iterator imm_iter;
4110 use_operand_p use_p;
4111 gimple *op_use_stmt;
4112 unsigned cnt = 0;
4113 bool cond_fn_p = op.code.is_internal_fn ()
4114 && (conditional_internal_fn_code (internal_fn (op.code))
4115 != ERROR_MARK);
4117 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4119 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4120 op1 twice (once as definition, once as else) in the same operation.
4121 Allow this. */
4122 if (cond_fn_p && op_use_stmt == use_stmt)
4124 gcall *call = as_a<gcall *> (use_stmt);
4125 unsigned else_pos
4126 = internal_fn_else_index (internal_fn (op.code));
4128 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4130 if (j == else_pos)
4131 continue;
4132 if (gimple_call_arg (call, j) == op.ops[opi])
4133 cnt++;
4136 else if (!is_gimple_debug (op_use_stmt)
4137 && (*code != ERROR_MARK
4138 || flow_bb_inside_loop_p (loop,
4139 gimple_bb (op_use_stmt))))
4140 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4141 cnt++;
4144 if (cnt != 1)
4146 fail = true;
4147 break;
4150 return ! fail && ! neg && *code != ERROR_MARK;
4153 bool
4154 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4155 tree loop_arg, enum tree_code code)
4157 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4158 code_helper code_;
4159 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4160 && code_ == code);
4165 /* Function vect_is_simple_reduction
4167 (1) Detect a cross-iteration def-use cycle that represents a simple
4168 reduction computation. We look for the following pattern:
4170 loop_header:
4171 a1 = phi < a0, a2 >
4172 a3 = ...
4173 a2 = operation (a3, a1)
4177 a3 = ...
4178 loop_header:
4179 a1 = phi < a0, a2 >
4180 a2 = operation (a3, a1)
4182 such that:
4183 1. operation is commutative and associative and it is safe to
4184 change the order of the computation
4185 2. no uses for a2 in the loop (a2 is used out of the loop)
4186 3. no uses of a1 in the loop besides the reduction operation
4187 4. no uses of a1 outside the loop.
4189 Conditions 1,4 are tested here.
4190 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4192 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4193 nested cycles.
4195 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4196 reductions:
4198 a1 = phi < a0, a2 >
4199 inner loop (def of a3)
4200 a2 = phi < a3 >
4202 (4) Detect condition expressions, i.e.:
4203 for (int i = 0; i < N; i++)
4204 if (a[i] < val)
4205 ret_val = a[i];
4209 static stmt_vec_info
4210 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4211 bool *double_reduc, bool *reduc_chain_p, bool slp)
4213 gphi *phi = as_a <gphi *> (phi_info->stmt);
4214 gimple *phi_use_stmt = NULL;
4215 imm_use_iterator imm_iter;
4216 use_operand_p use_p;
4218 *double_reduc = false;
4219 *reduc_chain_p = false;
4220 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4222 tree phi_name = PHI_RESULT (phi);
4223 /* ??? If there are no uses of the PHI result the inner loop reduction
4224 won't be detected as possibly double-reduction by vectorizable_reduction
4225 because that tries to walk the PHI arg from the preheader edge which
4226 can be constant. See PR60382. */
4227 if (has_zero_uses (phi_name))
4228 return NULL;
4229 class loop *loop = (gimple_bb (phi))->loop_father;
4230 unsigned nphi_def_loop_uses = 0;
4231 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4233 gimple *use_stmt = USE_STMT (use_p);
4234 if (is_gimple_debug (use_stmt))
4235 continue;
4237 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4239 if (dump_enabled_p ())
4240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4241 "intermediate value used outside loop.\n");
4243 return NULL;
4246 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4247 op1 twice (once as definition, once as else) in the same operation.
4248 Only count it as one. */
4249 if (use_stmt != phi_use_stmt)
4251 nphi_def_loop_uses++;
4252 phi_use_stmt = use_stmt;
4256 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4257 if (TREE_CODE (latch_def) != SSA_NAME)
4259 if (dump_enabled_p ())
4260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4261 "reduction: not ssa_name: %T\n", latch_def);
4262 return NULL;
4265 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4266 if (!def_stmt_info
4267 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4268 return NULL;
4270 bool nested_in_vect_loop
4271 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4272 unsigned nlatch_def_loop_uses = 0;
4273 auto_vec<gphi *, 3> lcphis;
4274 bool inner_loop_of_double_reduc = false;
4275 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4277 gimple *use_stmt = USE_STMT (use_p);
4278 if (is_gimple_debug (use_stmt))
4279 continue;
4280 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4281 nlatch_def_loop_uses++;
4282 else
4284 /* We can have more than one loop-closed PHI. */
4285 lcphis.safe_push (as_a <gphi *> (use_stmt));
4286 if (nested_in_vect_loop
4287 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4288 == vect_double_reduction_def))
4289 inner_loop_of_double_reduc = true;
4293 /* If we are vectorizing an inner reduction, we execute it in the
4294 original order only when we are not dealing with a
4295 double reduction. */
4296 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4298 if (dump_enabled_p ())
4299 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4300 "detected nested cycle: ");
4301 return def_stmt_info;
4304 /* When the inner loop of a double reduction ends up with more than
4305 one loop-closed PHI we have failed to classify alternate such
4306 PHIs as double reduction, leading to wrong code. See PR103237. */
4307 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4309 if (dump_enabled_p ())
4310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4311 "unhandle double reduction\n");
4312 return NULL;
4315 /* If this isn't a nested cycle or if the nested cycle reduction value
4316 is used outside of the inner loop we cannot handle uses of the reduction
4317 value. */
4318 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4320 if (dump_enabled_p ())
4321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4322 "reduction used in loop.\n");
4323 return NULL;
4326 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4327 defined in the inner loop. */
4328 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4330 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4331 if (gimple_phi_num_args (def_stmt) != 1
4332 || TREE_CODE (op1) != SSA_NAME)
4334 if (dump_enabled_p ())
4335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4336 "unsupported phi node definition.\n");
4338 return NULL;
4341 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4342 and the latch definition op1. */
4343 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4344 if (gimple_bb (def1)
4345 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4346 && loop->inner
4347 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4348 && (is_gimple_assign (def1) || is_gimple_call (def1))
4349 && is_a <gphi *> (phi_use_stmt)
4350 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4351 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4352 loop_latch_edge (loop->inner))))
4354 if (dump_enabled_p ())
4355 report_vect_op (MSG_NOTE, def_stmt,
4356 "detected double reduction: ");
4358 *double_reduc = true;
4359 return def_stmt_info;
4362 return NULL;
4365 /* Look for the expression computing latch_def from the loop PHI result. */
4366 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4367 code_helper code;
4368 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4369 path))
4371 STMT_VINFO_REDUC_CODE (phi_info) = code;
4372 if (code == COND_EXPR && !nested_in_vect_loop)
4373 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4375 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4376 reduction chain for which the additional restriction is that
4377 all operations in the chain are the same. */
4378 auto_vec<stmt_vec_info, 8> reduc_chain;
4379 unsigned i;
4380 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4381 for (i = path.length () - 1; i >= 1; --i)
4383 gimple *stmt = USE_STMT (path[i].second);
4384 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4385 gimple_match_op op;
4386 if (!gimple_extract_op (stmt, &op))
4387 gcc_unreachable ();
4388 if (gassign *assign = dyn_cast<gassign *> (stmt))
4389 STMT_VINFO_REDUC_IDX (stmt_info)
4390 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4391 else
4393 gcall *call = as_a<gcall *> (stmt);
4394 STMT_VINFO_REDUC_IDX (stmt_info)
4395 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4397 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4398 && (i == 1 || i == path.length () - 1));
4399 if ((op.code != code && !leading_conversion)
4400 /* We can only handle the final value in epilogue
4401 generation for reduction chains. */
4402 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4403 is_slp_reduc = false;
4404 /* For reduction chains we support trailing/leading
4405 conversions. We do not store those in the actual chain. */
4406 if (leading_conversion)
4407 continue;
4408 reduc_chain.safe_push (stmt_info);
4410 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4412 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4414 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4415 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4417 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4418 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4420 /* Save the chain for further analysis in SLP detection. */
4421 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4422 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4424 *reduc_chain_p = true;
4425 if (dump_enabled_p ())
4426 dump_printf_loc (MSG_NOTE, vect_location,
4427 "reduction: detected reduction chain\n");
4429 else if (dump_enabled_p ())
4430 dump_printf_loc (MSG_NOTE, vect_location,
4431 "reduction: detected reduction\n");
4433 return def_stmt_info;
4436 if (dump_enabled_p ())
4437 dump_printf_loc (MSG_NOTE, vect_location,
4438 "reduction: unknown pattern\n");
4440 return NULL;
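/* Roughly, the "reduction chain" case recorded above corresponds to source
   like (illustrative only):

     for (int i = 0; i < n; i++)
       {
         s += a[2*i];
         s += a[2*i + 1];
       }

   where every statement in the chain uses the previous value of s exactly
   once and all of them apply the same operation, whereas a plain detected
   reduction has a single such statement per iteration.  */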
4443 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4444 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4445 or -1 if not known. */
4447 static int
4448 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4450 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4451 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4453 if (dump_enabled_p ())
4454 dump_printf_loc (MSG_NOTE, vect_location,
4455 "cost model: epilogue peel iters set to vf/2 "
4456 "because loop iterations are unknown .\n");
4457 return assumed_vf / 2;
4459 else
4461 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4462 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4463 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4464 /* If we need to peel for gaps but no epilogue peeling would otherwise
4465 be required, we have to peel VF iterations. */
4466 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4467 peel_iters_epilogue = assumed_vf;
4468 return peel_iters_epilogue;
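/* Worked example (hypothetical numbers): with 100 known iterations, an
   assumed VF of 8 and 3 prologue iterations peeled for alignment, the
   epilogue gets (100 - 3) % 8 == 1 iteration; if peeling for gaps were
   required and that remainder had been 0, a full 8 iterations would be
   peeled instead.  */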
4472 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4473 int
4474 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4475 int *peel_iters_epilogue,
4476 stmt_vector_for_cost *scalar_cost_vec,
4477 stmt_vector_for_cost *prologue_cost_vec,
4478 stmt_vector_for_cost *epilogue_cost_vec)
4480 int retval = 0;
4482 *peel_iters_epilogue
4483 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4485 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4487 /* If peeled iterations are known but the number of scalar loop
4488 iterations is unknown, count a taken branch per peeled loop. */
4489 if (peel_iters_prologue > 0)
4490 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4491 vect_prologue);
4492 if (*peel_iters_epilogue > 0)
4493 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4494 vect_epilogue);
4497 stmt_info_for_cost *si;
4498 int j;
4499 if (peel_iters_prologue)
4500 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4501 retval += record_stmt_cost (prologue_cost_vec,
4502 si->count * peel_iters_prologue,
4503 si->kind, si->stmt_info, si->misalign,
4504 vect_prologue);
4505 if (*peel_iters_epilogue)
4506 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4507 retval += record_stmt_cost (epilogue_cost_vec,
4508 si->count * *peel_iters_epilogue,
4509 si->kind, si->stmt_info, si->misalign,
4510 vect_epilogue);
4512 return retval;
4515 /* Function vect_estimate_min_profitable_iters
4517 Return the number of iterations required for the vector version of the
4518 loop to be profitable relative to the cost of the scalar version of the
4519 loop.
4521 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4522 of iterations for vectorization. A value of -1 means loop vectorization
4523 is not profitable. This returned value may be used for a dynamic
4524 profitability check.
4526 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4527 for static check against estimated number of iterations. */
4529 static void
4530 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4531 int *ret_min_profitable_niters,
4532 int *ret_min_profitable_estimate,
4533 unsigned *suggested_unroll_factor)
4535 int min_profitable_iters;
4536 int min_profitable_estimate;
4537 int peel_iters_prologue;
4538 int peel_iters_epilogue;
4539 unsigned vec_inside_cost = 0;
4540 int vec_outside_cost = 0;
4541 unsigned vec_prologue_cost = 0;
4542 unsigned vec_epilogue_cost = 0;
4543 int scalar_single_iter_cost = 0;
4544 int scalar_outside_cost = 0;
4545 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4546 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4547 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4549 /* Cost model disabled. */
4550 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4552 if (dump_enabled_p ())
4553 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4554 *ret_min_profitable_niters = 0;
4555 *ret_min_profitable_estimate = 0;
4556 return;
4559 /* Requires loop versioning tests to handle misalignment. */
4560 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4562 /* FIXME: Make cost depend on complexity of individual check. */
4563 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4564 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4565 if (dump_enabled_p ())
4566 dump_printf (MSG_NOTE,
4567 "cost model: Adding cost of checks for loop "
4568 "versioning to treat misalignment.\n");
4571 /* Requires loop versioning with alias checks. */
4572 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4574 /* FIXME: Make cost depend on complexity of individual check. */
4575 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4576 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4577 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4578 if (len)
4579 /* Count LEN - 1 ANDs and LEN comparisons. */
4580 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4581 scalar_stmt, vect_prologue);
4582 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4583 if (len)
4585 /* Count LEN - 1 ANDs and LEN comparisons. */
4586 unsigned int nstmts = len * 2 - 1;
4587 /* +1 for each bias that needs adding. */
4588 for (unsigned int i = 0; i < len; ++i)
4589 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4590 nstmts += 1;
4591 (void) add_stmt_cost (target_cost_data, nstmts,
4592 scalar_stmt, vect_prologue);
4594 if (dump_enabled_p ())
4595 dump_printf (MSG_NOTE,
4596 "cost model: Adding cost of checks for loop "
4597 "versioning aliasing.\n");
4600 /* Requires loop versioning with niter checks. */
4601 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4603 /* FIXME: Make cost depend on complexity of individual check. */
4604 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4605 NULL, NULL, NULL_TREE, 0, vect_prologue);
4606 if (dump_enabled_p ())
4607 dump_printf (MSG_NOTE,
4608 "cost model: Adding cost of checks for loop "
4609 "versioning niters.\n");
4612 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4613 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4614 vect_prologue);
4616 /* Count statements in scalar loop. Using this as scalar cost for a single
4617 iteration for now.
4619 TODO: Add outer loop support.
4621 TODO: Consider assigning different costs to different scalar
4622 statements. */
4624 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4626 /* Add additional cost for the peeled instructions in prologue and epilogue
4627 loop. (For fully-masked loops there will be no peeling.)
4629 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4630 at compile time, we assume it's vf/2 (the worst would be vf-1).
4632 TODO: Build an expression that represents peel_iters for prologue and
4633 epilogue to be used in a run-time test. */
4635 bool prologue_need_br_taken_cost = false;
4636 bool prologue_need_br_not_taken_cost = false;
4638 /* Calculate peel_iters_prologue. */
4639 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4640 peel_iters_prologue = 0;
4641 else if (npeel < 0)
4643 peel_iters_prologue = assumed_vf / 2;
4644 if (dump_enabled_p ())
4645 dump_printf (MSG_NOTE, "cost model: "
4646 "prologue peel iters set to vf/2.\n");
4648 /* If peeled iterations are unknown, count a taken branch and a not taken
4649 branch per peeled loop. Even if scalar loop iterations are known,
4650 vector iterations are not known since peeled prologue iterations are
4651 not known. Hence guards remain the same. */
4652 prologue_need_br_taken_cost = true;
4653 prologue_need_br_not_taken_cost = true;
4655 else
4657 peel_iters_prologue = npeel;
4658 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4659 /* If peeled iterations are known but the number of scalar loop
4660 iterations is unknown, count a taken branch per peeled loop. */
4661 prologue_need_br_taken_cost = true;
4664 bool epilogue_need_br_taken_cost = false;
4665 bool epilogue_need_br_not_taken_cost = false;
4667 /* Calculate peel_iters_epilogue. */
4668 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4669 /* We need to peel exactly one iteration for gaps. */
4670 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4671 else if (npeel < 0)
4673 /* If the amount of peeling for alignment is unknown, the loop bound
4674 of the main loop becomes unknown. */
4675 peel_iters_epilogue = assumed_vf / 2;
4676 if (dump_enabled_p ())
4677 dump_printf (MSG_NOTE, "cost model: "
4678 "epilogue peel iters set to vf/2 because "
4679 "peeling for alignment is unknown.\n");
4681 /* See the same reason above in peel_iters_prologue calculation. */
4682 epilogue_need_br_taken_cost = true;
4683 epilogue_need_br_not_taken_cost = true;
4685 else
4687 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4688 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4689 /* If peeled iterations are known but the number of scalar loop
4690 iterations is unknown, count a taken branch per peeled loop. */
4691 epilogue_need_br_taken_cost = true;
4694 stmt_info_for_cost *si;
4695 int j;
4696 /* Add costs associated with peel_iters_prologue. */
4697 if (peel_iters_prologue)
4698 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4700 (void) add_stmt_cost (target_cost_data,
4701 si->count * peel_iters_prologue, si->kind,
4702 si->stmt_info, si->node, si->vectype,
4703 si->misalign, vect_prologue);
4706 /* Add costs associated with peel_iters_epilogue. */
4707 if (peel_iters_epilogue)
4708 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4710 (void) add_stmt_cost (target_cost_data,
4711 si->count * peel_iters_epilogue, si->kind,
4712 si->stmt_info, si->node, si->vectype,
4713 si->misalign, vect_epilogue);
4716 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4718 if (prologue_need_br_taken_cost)
4719 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4720 vect_prologue);
4722 if (prologue_need_br_not_taken_cost)
4723 (void) add_stmt_cost (target_cost_data, 1,
4724 cond_branch_not_taken, vect_prologue);
4726 if (epilogue_need_br_taken_cost)
4727 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4728 vect_epilogue);
4730 if (epilogue_need_br_not_taken_cost)
4731 (void) add_stmt_cost (target_cost_data, 1,
4732 cond_branch_not_taken, vect_epilogue);
4734 /* Take care of special costs for rgroup controls of partial vectors. */
4735 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4736 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4737 == vect_partial_vectors_avx512))
4739 /* Calculate how many masks we need to generate. */
4740 unsigned int num_masks = 0;
4741 bool need_saturation = false;
4742 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4743 if (rgm.type)
4745 unsigned nvectors = rgm.factor;
4746 num_masks += nvectors;
4747 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4748 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4749 need_saturation = true;
4752 /* ??? The target isn't able to identify the costs below as
4753 producing masks so it cannot penalize cases where we'd run
4754 out of mask registers for example. */
4756 /* ??? We are also failing to account for smaller vector masks
4757 we generate by splitting larger masks in vect_get_loop_mask. */
4759 /* In the worst case, we need to generate each mask in the prologue
4760 and in the loop body. We need one splat per group and one
4761 compare per mask.
4763 Sometimes the prologue mask will fold to a constant,
4764 so the actual prologue cost might be smaller. However, it's
4765 simpler and safer to use the worst-case cost; if this ends up
4766 being the tie-breaker between vectorizing or not, then it's
4767 probably better not to vectorize. */
4768 (void) add_stmt_cost (target_cost_data,
4769 num_masks
4770 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4771 vector_stmt, NULL, NULL, NULL_TREE, 0,
4772 vect_prologue);
4773 (void) add_stmt_cost (target_cost_data,
4774 num_masks
4775 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4776 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4778 /* When we need saturation we need it both in the prologue and
4779 the epilogue. */
4780 if (need_saturation)
4782 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4783 NULL, NULL, NULL_TREE, 0, vect_prologue);
4784 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4785 NULL, NULL, NULL_TREE, 0, vect_body);
4788 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4789 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4790 == vect_partial_vectors_while_ult))
4792 /* Calculate how many masks we need to generate. */
4793 unsigned int num_masks = 0;
4794 rgroup_controls *rgm;
4795 unsigned int num_vectors_m1;
4796 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4797 num_vectors_m1, rgm)
4798 if (rgm->type)
4799 num_masks += num_vectors_m1 + 1;
4800 gcc_assert (num_masks > 0);
4802 /* In the worst case, we need to generate each mask in the prologue
4803 and in the loop body. One of the loop body mask instructions
4804 replaces the comparison in the scalar loop, and since we don't
4805 count the scalar comparison against the scalar body, we shouldn't
4806 count that vector instruction against the vector body either.
4808 Sometimes we can use unpacks instead of generating prologue
4809 masks and sometimes the prologue mask will fold to a constant,
4810 so the actual prologue cost might be smaller. However, it's
4811 simpler and safer to use the worst-case cost; if this ends up
4812 being the tie-breaker between vectorizing or not, then it's
4813 probably better not to vectorize. */
4814 (void) add_stmt_cost (target_cost_data, num_masks,
4815 vector_stmt, NULL, NULL, NULL_TREE, 0,
4816 vect_prologue);
4817 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4818 vector_stmt, NULL, NULL, NULL_TREE, 0,
4819 vect_body);
4821 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4823 /* Referring to the functions vect_set_loop_condition_partial_vectors
4824 and vect_set_loop_controls_directly, we need to generate each
4825 length in the prologue and in the loop body if required. Although
4826 there are some possible optimizations, we consider the worst case
4827 here. */
4829 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4830 signed char partial_load_store_bias
4831 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4832 bool need_iterate_p
4833 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4834 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4836 /* Calculate how many statements need to be added. */
4837 unsigned int prologue_stmts = 0;
4838 unsigned int body_stmts = 0;
4840 rgroup_controls *rgc;
4841 unsigned int num_vectors_m1;
4842 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4843 if (rgc->type)
4845 /* May need one SHIFT for nitems_total computation. */
4846 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4847 if (nitems != 1 && !niters_known_p)
4848 prologue_stmts += 1;
4850 /* May need one MAX and one MINUS for wrap around. */
4851 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4852 prologue_stmts += 2;
4854 /* Need one MAX and one MINUS for each batch limit except for
4855 the first one. */
4856 prologue_stmts += num_vectors_m1 * 2;
4858 unsigned int num_vectors = num_vectors_m1 + 1;
4860 /* Need to set up lengths in prologue, only one MIN required
4861 for each since start index is zero. */
4862 prologue_stmts += num_vectors;
4864 /* If we have a non-zero partial load bias, we need one PLUS
4865 to adjust the load length. */
4866 if (partial_load_store_bias != 0)
4867 body_stmts += 1;
4869 /* Each may need two MINs and one MINUS to update lengths in body
4870 for next iteration. */
4871 if (need_iterate_p)
4872 body_stmts += 3 * num_vectors;
4875 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4876 scalar_stmt, vect_prologue);
4877 (void) add_stmt_cost (target_cost_data, body_stmts,
4878 scalar_stmt, vect_body);
4881 /* FORNOW: The scalar outside cost is incremented in one of the
4882 following ways:
4884 1. The vectorizer checks for alignment and aliasing and generates
4885 a condition that allows dynamic vectorization. A cost model
4886 check is ANDed with the versioning condition. Hence the scalar code
4887 path now has the added cost of the versioning check.
4889 if (cost > th & versioning_check)
4890 jmp to vector code
4892 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4894 2. The vectorizer then checks if a prologue is required. If the
4895 cost model check was not done before during versioning, it has to
4896 be done before the prologue check.
4898 if (cost <= th)
4899 prologue = scalar_iters
4900 if (prologue == 0)
4901 jmp to vector code
4902 else
4903 execute prologue
4904 if (prologue == num_iters)
4905 go to exit
4907 Hence the run-time scalar cost is incremented by a taken branch,
4908 plus a not-taken branch, plus a taken branch cost.
4910 3. The vectorizer then checks if an epilogue is required. If the
4911 cost model check was not done before during prologue check, it
4912 has to be done with the epilogue check.
4914 if (prologue == 0)
4915 jmp to vector code
4916 else
4917 execute prologue
4918 if (prologue == num_iters)
4919 go to exit
4920 vector code:
4921 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4922 jmp to epilogue
4924 Hence the run-time scalar cost should be incremented by 2 taken
4925 branches.
4927 TODO: The back end may reorder the BBs differently and reverse
4928 conditions/branch directions. Change the estimates below to
4929 something more reasonable. */
4931 /* If the number of iterations is known and we do not do versioning, we can
4932 decide whether to vectorize at compile time. Hence the scalar version
4933 does not carry cost model guard costs. */
4934 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4935 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4937 /* Cost model check occurs at versioning. */
4938 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4939 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4940 else
4942 /* Cost model check occurs at prologue generation. */
4943 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4944 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4945 + vect_get_stmt_cost (cond_branch_not_taken);
4946 /* Cost model check occurs at epilogue generation. */
4947 else
4948 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4952 /* Complete the target-specific cost calculations. */
4953 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4954 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4955 suggested_unroll_factor);
4957 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4958 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4959 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4960 *suggested_unroll_factor,
4961 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4963 if (dump_enabled_p ())
4964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4965 "can't unroll as unrolled vectorization factor larger"
4966 " than maximum vectorization factor: "
4967 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4968 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4969 *suggested_unroll_factor = 1;
4972 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4974 if (dump_enabled_p ())
4976 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4977 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4978 vec_inside_cost);
4979 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4980 vec_prologue_cost);
4981 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4982 vec_epilogue_cost);
4983 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4984 scalar_single_iter_cost);
4985 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4986 scalar_outside_cost);
4987 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4988 vec_outside_cost);
4989 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4990 peel_iters_prologue);
4991 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4992 peel_iters_epilogue);
4995 /* Calculate number of iterations required to make the vector version
4996 profitable, relative to the loop bodies only. The following condition
4997 must hold true:
4998 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4999 where
5000 SIC = scalar iteration cost, VIC = vector iteration cost,
5001 VOC = vector outside cost, VF = vectorization factor,
5002 NPEEL = prologue iterations + epilogue iterations,
5003 SOC = scalar outside cost for run time cost model check. */
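/* A hedged numeric illustration (costs invented for the example): with
   SIC = 4, VIC = 6, VF = 8, VOC = 50 and NPEEL = SOC = 0, the condition
   4 * niters > 6 * niters / 8 + 50 first holds at niters = 16, which is
   the kind of threshold computed below (here without partial vectors,
   peeling or versioning).  */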
5005 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5006 - vec_inside_cost);
5007 if (saving_per_viter <= 0)
5009 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5010 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5011 "vectorization did not happen for a simd loop");
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015 "cost model: the vector iteration cost = %d "
5016 "divided by the scalar iteration cost = %d "
5017 "is greater or equal to the vectorization factor = %d"
5018 ".\n",
5019 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5020 *ret_min_profitable_niters = -1;
5021 *ret_min_profitable_estimate = -1;
5022 return;
5025 /* ??? The "if" arm is written to handle all cases; see below for what
5026 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5027 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5029 /* Rewriting the condition above in terms of the number of
5030 vector iterations (vniters) rather than the number of
5031 scalar iterations (niters) gives:
5033 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5035 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5037 For integer N, X and Y when X > 0:
5039 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5040 int outside_overhead = (vec_outside_cost
5041 - scalar_single_iter_cost * peel_iters_prologue
5042 - scalar_single_iter_cost * peel_iters_epilogue
5043 - scalar_outside_cost);
5044 /* We're only interested in cases that require at least one
5045 vector iteration. */
5046 int min_vec_niters = 1;
5047 if (outside_overhead > 0)
5048 min_vec_niters = outside_overhead / saving_per_viter + 1;
5050 if (dump_enabled_p ())
5051 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5052 min_vec_niters);
5054 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5056 /* Now that we know the minimum number of vector iterations,
5057 find the minimum niters for which the scalar cost is larger:
5059 SIC * niters > VIC * vniters + VOC - SOC
5061 We know that the minimum niters is no more than
5062 vniters * VF + NPEEL, but it might be (and often is) less
5063 than that if a partial vector iteration is cheaper than the
5064 equivalent scalar code. */
5065 int threshold = (vec_inside_cost * min_vec_niters
5066 + vec_outside_cost
5067 - scalar_outside_cost);
5068 if (threshold <= 0)
5069 min_profitable_iters = 1;
5070 else
5071 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5073 else
5074 /* Convert the number of vector iterations into a number of
5075 scalar iterations. */
5076 min_profitable_iters = (min_vec_niters * assumed_vf
5077 + peel_iters_prologue
5078 + peel_iters_epilogue);
5080 else
5082 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5083 * assumed_vf
5084 - vec_inside_cost * peel_iters_prologue
5085 - vec_inside_cost * peel_iters_epilogue);
5086 if (min_profitable_iters <= 0)
5087 min_profitable_iters = 0;
5088 else
5090 min_profitable_iters /= saving_per_viter;
5092 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5093 <= (((int) vec_inside_cost * min_profitable_iters)
5094 + (((int) vec_outside_cost - scalar_outside_cost)
5095 * assumed_vf)))
5096 min_profitable_iters++;
5100 if (dump_enabled_p ())
5101 dump_printf (MSG_NOTE,
5102 " Calculated minimum iters for profitability: %d\n",
5103 min_profitable_iters);
5105 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5106 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5107 /* We want the vectorized loop to execute at least once. */
5108 min_profitable_iters = assumed_vf + peel_iters_prologue;
5109 else if (min_profitable_iters < peel_iters_prologue)
5110 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5111 vectorized loop executes at least once. */
5112 min_profitable_iters = peel_iters_prologue;
5114 if (dump_enabled_p ())
5115 dump_printf_loc (MSG_NOTE, vect_location,
5116 " Runtime profitability threshold = %d\n",
5117 min_profitable_iters);
5119 *ret_min_profitable_niters = min_profitable_iters;
5121 /* Calculate number of iterations required to make the vector version
5122 profitable, relative to the loop bodies only.
5124 The non-vectorized variant costs SIC * niters and must win over the vector
5125 variant for the expected loop trip count. The following condition must hold true:
5126 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5128 if (vec_outside_cost <= 0)
5129 min_profitable_estimate = 0;
5130 /* ??? This "else if" arm is written to handle all cases; see below for
5131 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5132 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5134 /* This is a repeat of the code above, but with + SOC rather
5135 than - SOC. */
5136 int outside_overhead = (vec_outside_cost
5137 - scalar_single_iter_cost * peel_iters_prologue
5138 - scalar_single_iter_cost * peel_iters_epilogue
5139 + scalar_outside_cost);
5140 int min_vec_niters = 1;
5141 if (outside_overhead > 0)
5142 min_vec_niters = outside_overhead / saving_per_viter + 1;
5144 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5146 int threshold = (vec_inside_cost * min_vec_niters
5147 + vec_outside_cost
5148 + scalar_outside_cost);
5149 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5151 else
5152 min_profitable_estimate = (min_vec_niters * assumed_vf
5153 + peel_iters_prologue
5154 + peel_iters_epilogue);
5156 else
5158 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5159 * assumed_vf
5160 - vec_inside_cost * peel_iters_prologue
5161 - vec_inside_cost * peel_iters_epilogue)
5162 / ((scalar_single_iter_cost * assumed_vf)
5163 - vec_inside_cost);
5165 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_NOTE, vect_location,
5168 " Static estimate profitability threshold = %d\n",
5169 min_profitable_estimate);
5171 *ret_min_profitable_estimate = min_profitable_estimate;
5174 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5175 vector elements (not bits) for a vector with NELT elements. */
5176 static void
5177 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5178 vec_perm_builder *sel)
5180 /* The encoding is a single stepped pattern. Any wrap-around is handled
5181 by vec_perm_indices. */
5182 sel->new_vector (nelt, 1, 3);
5183 for (unsigned int i = 0; i < 3; i++)
5184 sel->quick_push (i + offset);
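/* Editorial example (not part of the original source): for OFFSET = 2 and
   NELT = 8 the three encoded elements are {2, 3, 4}, which vec_perm_indices
   extends to the stepped series {2, 3, 4, 5, 6, 7, 8, 9}.  Indices 8 and 9
   select from the second VEC_PERM_EXPR operand, so with a zero vector there
   (as in the reduction epilogue below) every element moves two positions
   towards element 0 and the top two elements become zero, matching the
   vec_shr behaviour described above.  */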
5187 /* Checks whether the target supports whole-vector shifts for vectors of mode
5188 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5189 it supports vec_perm_const with masks for all necessary shift amounts. */
5190 static bool
5191 have_whole_vector_shift (machine_mode mode)
5193 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5194 return true;
5196 /* Variable-length vectors should be handled via the optab. */
5197 unsigned int nelt;
5198 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5199 return false;
5201 vec_perm_builder sel;
5202 vec_perm_indices indices;
5203 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5205 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5206 indices.new_vector (sel, 2, nelt);
5207 if (!can_vec_perm_const_p (mode, mode, indices, false))
5208 return false;
5210 return true;
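/* Editorial note: for a fixed-width mode with NELT = 8 the loop above probes
   shift amounts 4, 2 and 1 (exactly the offsets the shift-based reduction
   epilogue will request) and rejects the whole-vector-shift strategy if any
   of those permutations is unsupported.  */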
5213 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5214 multiplication operands have differing signs and (b) we intend
5215 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5216 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5218 static bool
5219 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5220 stmt_vec_info stmt_info)
5222 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5223 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5224 return false;
5226 tree rhs1 = gimple_assign_rhs1 (assign);
5227 tree rhs2 = gimple_assign_rhs2 (assign);
5228 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5229 return false;
5231 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5232 gcc_assert (reduc_info->is_reduc_info);
5233 return !directly_supported_p (DOT_PROD_EXPR,
5234 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5235 optab_vector_mixed_sign);
5238 /* TODO: There is a close dependency between the vect_model_*_cost and
5239 vectorizable_* functions. Design this better to avoid maintenance issues. */
5241 /* Function vect_model_reduction_cost.
5243 Models cost for a reduction operation, including the vector ops
5244 generated within the strip-mine loop in some cases, the initial
5245 definition before the loop, and the epilogue code that must be generated. */
5247 static void
5248 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5249 stmt_vec_info stmt_info, internal_fn reduc_fn,
5250 vect_reduction_type reduction_type,
5251 int ncopies, stmt_vector_for_cost *cost_vec)
5253 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5254 tree vectype;
5255 machine_mode mode;
5256 class loop *loop = NULL;
5258 if (loop_vinfo)
5259 loop = LOOP_VINFO_LOOP (loop_vinfo);
5261 /* Condition reductions generate two reductions in the loop. */
5262 if (reduction_type == COND_REDUCTION)
5263 ncopies *= 2;
5265 vectype = STMT_VINFO_VECTYPE (stmt_info);
5266 mode = TYPE_MODE (vectype);
5267 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5269 gimple_match_op op;
5270 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5271 gcc_unreachable ();
5273 bool emulated_mixed_dot_prod
5274 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5275 if (reduction_type == EXTRACT_LAST_REDUCTION)
5276 /* No extra instructions are needed in the prologue. The loop body
5277 operations are costed in vectorizable_condition. */
5278 inside_cost = 0;
5279 else if (reduction_type == FOLD_LEFT_REDUCTION)
5281 /* No extra instructions needed in the prologue. */
5282 prologue_cost = 0;
5284 if (reduc_fn != IFN_LAST)
5285 /* Count one reduction-like operation per vector. */
5286 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5287 stmt_info, 0, vect_body);
5288 else
5290 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5291 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5292 inside_cost = record_stmt_cost (cost_vec, nelements,
5293 vec_to_scalar, stmt_info, 0,
5294 vect_body);
5295 inside_cost += record_stmt_cost (cost_vec, nelements,
5296 scalar_stmt, stmt_info, 0,
5297 vect_body);
5300 else
5302 /* Add in the cost of the initial definitions. */
5303 int prologue_stmts;
5304 if (reduction_type == COND_REDUCTION)
5305 /* For cond reductions we have four vectors: initial index, step,
5306 initial result of the data reduction, initial value of the index
5307 reduction. */
5308 prologue_stmts = 4;
5309 else if (emulated_mixed_dot_prod)
5310 /* We need the initial reduction value and two invariants:
5311 one that contains the minimum signed value and one that
5312 contains half of its negative. */
5313 prologue_stmts = 3;
5314 else
5315 prologue_stmts = 1;
5316 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5317 scalar_to_vec, stmt_info, 0,
5318 vect_prologue);
5321 /* Determine cost of epilogue code.
5323 We have a reduction operator that will reduce the vector in one statement.
5324 Also requires scalar extract. */
5326 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5328 if (reduc_fn != IFN_LAST)
5330 if (reduction_type == COND_REDUCTION)
5332 /* An EQ stmt and a COND_EXPR stmt. */
5333 epilogue_cost += record_stmt_cost (cost_vec, 2,
5334 vector_stmt, stmt_info, 0,
5335 vect_epilogue);
5336 /* Reduction of the max index and a reduction of the found
5337 values. */
5338 epilogue_cost += record_stmt_cost (cost_vec, 2,
5339 vec_to_scalar, stmt_info, 0,
5340 vect_epilogue);
5341 /* A broadcast of the max value. */
5342 epilogue_cost += record_stmt_cost (cost_vec, 1,
5343 scalar_to_vec, stmt_info, 0,
5344 vect_epilogue);
5346 else
5348 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5349 stmt_info, 0, vect_epilogue);
5350 epilogue_cost += record_stmt_cost (cost_vec, 1,
5351 vec_to_scalar, stmt_info, 0,
5352 vect_epilogue);
5355 else if (reduction_type == COND_REDUCTION)
5357 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5358 /* Extraction of scalar elements. */
5359 epilogue_cost += record_stmt_cost (cost_vec,
5360 2 * estimated_nunits,
5361 vec_to_scalar, stmt_info, 0,
5362 vect_epilogue);
5363 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5364 epilogue_cost += record_stmt_cost (cost_vec,
5365 2 * estimated_nunits - 3,
5366 scalar_stmt, stmt_info, 0,
5367 vect_epilogue);
5369 else if (reduction_type == EXTRACT_LAST_REDUCTION
5370 || reduction_type == FOLD_LEFT_REDUCTION)
5371 /* No extra instructions needed in the epilogue. */
5373 else
5375 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5376 tree bitsize = TYPE_SIZE (op.type);
5377 int element_bitsize = tree_to_uhwi (bitsize);
5378 int nelements = vec_size_in_bits / element_bitsize;
5380 if (op.code == COND_EXPR)
5381 op.code = MAX_EXPR;
5383 /* We have a whole vector shift available. */
5384 if (VECTOR_MODE_P (mode)
5385 && directly_supported_p (op.code, vectype)
5386 && have_whole_vector_shift (mode))
5388 /* Final reduction via vector shifts and the reduction operator.
5389 Also requires scalar extract. */
5390 epilogue_cost += record_stmt_cost (cost_vec,
5391 exact_log2 (nelements) * 2,
5392 vector_stmt, stmt_info, 0,
5393 vect_epilogue);
5394 epilogue_cost += record_stmt_cost (cost_vec, 1,
5395 vec_to_scalar, stmt_info, 0,
5396 vect_epilogue);
5398 else
5399 /* Use extracts and reduction op for final reduction. For N
5400 elements, we have N extracts and N-1 reduction ops. */
5401 epilogue_cost += record_stmt_cost (cost_vec,
5402 nelements + nelements - 1,
5403 vector_stmt, stmt_info, 0,
5404 vect_epilogue);
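   /* Editorial example with hypothetical numbers: for an 8-element vector
      the shift-based scheme above costs exact_log2 (8) * 2 = 6 vector
      statements plus one extract, whereas this extract-based fallback costs
      8 + 8 - 1 = 15 statements.  */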
5408 if (dump_enabled_p ())
5409 dump_printf (MSG_NOTE,
5410 "vect_model_reduction_cost: inside_cost = %d, "
5411 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5412 prologue_cost, epilogue_cost);
5415 /* SEQ is a sequence of instructions that initialize the reduction
5416 described by REDUC_INFO. Emit them in the appropriate place. */
5418 static void
5419 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5420 stmt_vec_info reduc_info, gimple *seq)
5422 if (reduc_info->reused_accumulator)
5424 /* When reusing an accumulator from the main loop, we only need
5425 initialization instructions if the main loop can be skipped.
5426 In that case, emit the initialization instructions at the end
5427 of the guard block that does the skip. */
5428 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5429 gcc_assert (skip_edge);
5430 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5431 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5433 else
5435 /* The normal case: emit the initialization instructions on the
5436 preheader edge. */
5437 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5438 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5442 /* Function get_initial_def_for_reduction
5444 Input:
5445 REDUC_INFO - the info_for_reduction
5446 INIT_VAL - the initial value of the reduction variable
5447 NEUTRAL_OP - a value that has no effect on the reduction, as per
5448 neutral_op_for_reduction
5450 Output:
5451 Return a vector variable, initialized according to the operation that
5452 STMT_VINFO performs. This vector will be used as the initial value
5453 of the vector of partial results.
5455 The value we need is a vector in which element 0 has value INIT_VAL
5456 and every other element has value NEUTRAL_OP. */
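/* Editorial example: for a sum reduction with INIT_VAL 5 and NEUTRAL_OP 0 on
   a four-element integer vector the routine below builds {5, 0, 0, 0}; when
   INIT_VAL equals NEUTRAL_OP (e.g. a MIN/MAX reduction seeded with the
   initial value itself) the vector degenerates to a simple splat.  */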
5458 static tree
5459 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5460 stmt_vec_info reduc_info,
5461 tree init_val, tree neutral_op)
5463 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5464 tree scalar_type = TREE_TYPE (init_val);
5465 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5466 tree init_def;
5467 gimple_seq stmts = NULL;
5469 gcc_assert (vectype);
5471 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5472 || SCALAR_FLOAT_TYPE_P (scalar_type));
5474 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5475 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5477 if (operand_equal_p (init_val, neutral_op))
5479 /* If both elements are equal then the vector described above is
5480 just a splat. */
5481 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5482 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5484 else
5486 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5487 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5488 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5490 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5491 element 0. */
5492 init_def = gimple_build_vector_from_val (&stmts, vectype,
5493 neutral_op);
5494 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5495 vectype, init_def, init_val);
5497 else
5499 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5500 tree_vector_builder elts (vectype, 1, 2);
5501 elts.quick_push (init_val);
5502 elts.quick_push (neutral_op);
5503 init_def = gimple_build_vector (&stmts, &elts);
5507 if (stmts)
5508 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5509 return init_def;
5512 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5513 which performs a reduction involving GROUP_SIZE scalar statements.
5514 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5515 is nonnull, introducing extra elements of that value will not change the
5516 result. */
5518 static void
5519 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5520 stmt_vec_info reduc_info,
5521 vec<tree> *vec_oprnds,
5522 unsigned int number_of_vectors,
5523 unsigned int group_size, tree neutral_op)
5525 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5526 unsigned HOST_WIDE_INT nunits;
5527 unsigned j, number_of_places_left_in_vector;
5528 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5529 unsigned int i;
5531 gcc_assert (group_size == initial_values.length () || neutral_op);
5533 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5534 created vectors. It is greater than 1 if unrolling is performed.
5536 For example, we have two scalar operands, s1 and s2 (e.g., group of
5537 strided accesses of size two), while NUNITS is four (i.e., four scalars
5538 of this type can be packed in a vector). The output vector will contain
5539 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5540 will be 2).
5542 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5543 vectors containing the operands.
5545 For example, NUNITS is four as before, and the group size is 8
5546 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5547 {s5, s6, s7, s8}. */
5549 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5550 nunits = group_size;
5552 number_of_places_left_in_vector = nunits;
5553 bool constant_p = true;
5554 tree_vector_builder elts (vector_type, nunits, 1);
5555 elts.quick_grow (nunits);
5556 gimple_seq ctor_seq = NULL;
5557 for (j = 0; j < nunits * number_of_vectors; ++j)
5559 tree op;
5560 i = j % group_size;
5562 /* Get the def before the loop. In a reduction chain we have only
5563 one initial value; otherwise we have as many as there are PHIs in the group. */
5564 if (i >= initial_values.length () || (j > i && neutral_op))
5565 op = neutral_op;
5566 else
5567 op = initial_values[i];
5569 /* Create 'vect_ = {op0,op1,...,opn}'. */
5570 number_of_places_left_in_vector--;
5571 elts[nunits - number_of_places_left_in_vector - 1] = op;
5572 if (!CONSTANT_CLASS_P (op))
5573 constant_p = false;
5575 if (number_of_places_left_in_vector == 0)
5577 tree init;
5578 if (constant_p && !neutral_op
5579 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5580 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5581 /* Build the vector directly from ELTS. */
5582 init = gimple_build_vector (&ctor_seq, &elts);
5583 else if (neutral_op)
5585 /* Build a vector of the neutral value and shift the
5586 other elements into place. */
5587 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5588 neutral_op);
5589 int k = nunits;
5590 while (k > 0 && elts[k - 1] == neutral_op)
5591 k -= 1;
5592 while (k > 0)
5594 k -= 1;
5595 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5596 vector_type, init, elts[k]);
5599 else
5601 /* First time round, duplicate ELTS to fill the
5602 required number of vectors. */
5603 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5604 elts, number_of_vectors, *vec_oprnds);
5605 break;
5607 vec_oprnds->quick_push (init);
5609 number_of_places_left_in_vector = nunits;
5610 elts.new_vector (vector_type, nunits, 1);
5611 elts.quick_grow (nunits);
5612 constant_p = true;
5615 if (ctor_seq != NULL)
5616 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5619 /* For a statement STMT_INFO taking part in a reduction operation return
5620 the stmt_vec_info the meta information is stored on. */
5622 stmt_vec_info
5623 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5625 stmt_info = vect_orig_stmt (stmt_info);
5626 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5627 if (!is_a <gphi *> (stmt_info->stmt)
5628 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5629 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5630 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5631 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5633 if (gimple_phi_num_args (phi) == 1)
5634 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5636 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5638 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5639 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5640 stmt_info = info;
5642 return stmt_info;
5645 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5646 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5647 return false. */
5649 static bool
5650 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5651 stmt_vec_info reduc_info)
5653 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5654 if (!main_loop_vinfo)
5655 return false;
5657 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5658 return false;
5660 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5661 auto_vec<tree, 16> main_loop_results (num_phis);
5662 auto_vec<tree, 16> initial_values (num_phis);
5663 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5665 /* The epilogue loop can be entered either from the main loop or
5666 from an earlier guard block. */
5667 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5668 for (tree incoming_value : reduc_info->reduc_initial_values)
5670 /* Look for:
5672 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5673 INITIAL_VALUE(guard block)>. */
5674 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5676 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5677 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5679 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5680 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5682 main_loop_results.quick_push (from_main_loop);
5683 initial_values.quick_push (from_skip);
5686 else
5687 /* The main loop dominates the epilogue loop. */
5688 main_loop_results.splice (reduc_info->reduc_initial_values);
5690 /* See if the main loop has the kind of accumulator we need. */
5691 vect_reusable_accumulator *accumulator
5692 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5693 if (!accumulator
5694 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5695 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5696 accumulator->reduc_info->reduc_scalar_results.begin ()))
5697 return false;
5699 /* Handle the case where we can reduce wider vectors to narrower ones. */
5700 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5701 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5702 unsigned HOST_WIDE_INT m;
5703 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5704 TYPE_VECTOR_SUBPARTS (vectype), &m))
5705 return false;
5706 /* Check the intermediate vector types and operations are available. */
5707 tree prev_vectype = old_vectype;
5708 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5709 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5711 intermediate_nunits = exact_div (intermediate_nunits, 2);
5712 tree intermediate_vectype = get_related_vectype_for_scalar_type
5713 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5714 if (!intermediate_vectype
5715 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5716 intermediate_vectype)
5717 || !can_vec_extract (TYPE_MODE (prev_vectype),
5718 TYPE_MODE (intermediate_vectype)))
5719 return false;
5720 prev_vectype = intermediate_vectype;
5723 /* Non-SLP reductions might apply an adjustment after the reduction
5724 operation, in order to simplify the initialization of the accumulator.
5725 If the epilogue loop carries on from where the main loop left off,
5726 it should apply the same adjustment to the final reduction result.
5728 If the epilogue loop can also be entered directly (rather than via
5729 the main loop), we need to be able to handle that case in the same way,
5730 with the same adjustment. (In principle we could add a PHI node
5731 to select the correct adjustment, but in practice that shouldn't be
5732 necessary.) */
5733 tree main_adjustment
5734 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5735 if (loop_vinfo->main_loop_edge && main_adjustment)
5737 gcc_assert (num_phis == 1);
5738 tree initial_value = initial_values[0];
5739 /* Check that we can use INITIAL_VALUE as the adjustment and
5740 initialize the accumulator with a neutral value instead. */
5741 if (!operand_equal_p (initial_value, main_adjustment))
5742 return false;
5743 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5744 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5745 code, initial_value);
5747 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5748 reduc_info->reduc_initial_values.truncate (0);
5749 reduc_info->reduc_initial_values.splice (initial_values);
5750 reduc_info->reused_accumulator = accumulator;
5751 return true;
5754 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5755 CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
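/* Editorial example: reducing a V8SI partial result down to V4SI with
   PLUS_EXPR takes one halving step below: the low and high V4SI halves are
   extracted (directly via vec_extract, or by punning through a two-element
   integer vector) and added, and the resulting V4SI value is returned for
   the final reduction.  */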
5757 static tree
5758 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5759 gimple_seq *seq)
5761 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5762 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5763 tree stype = TREE_TYPE (vectype);
5764 tree new_temp = vec_def;
5765 while (nunits > nunits1)
5767 nunits /= 2;
5768 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5769 stype, nunits);
5770 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5772 /* The target has to make sure we support lowpart/highpart
5773 extraction, either via direct vector extract or through
5774 integer mode punning. */
5775 tree dst1, dst2;
5776 gimple *epilog_stmt;
5777 if (convert_optab_handler (vec_extract_optab,
5778 TYPE_MODE (TREE_TYPE (new_temp)),
5779 TYPE_MODE (vectype1))
5780 != CODE_FOR_nothing)
5782 /* Extract sub-vectors directly once vec_extract becomes
5783 a conversion optab. */
5784 dst1 = make_ssa_name (vectype1);
5785 epilog_stmt
5786 = gimple_build_assign (dst1, BIT_FIELD_REF,
5787 build3 (BIT_FIELD_REF, vectype1,
5788 new_temp, TYPE_SIZE (vectype1),
5789 bitsize_int (0)));
5790 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5791 dst2 = make_ssa_name (vectype1);
5792 epilog_stmt
5793 = gimple_build_assign (dst2, BIT_FIELD_REF,
5794 build3 (BIT_FIELD_REF, vectype1,
5795 new_temp, TYPE_SIZE (vectype1),
5796 bitsize_int (bitsize)));
5797 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5799 else
5801 /* Extract via punning to appropriately sized integer mode
5802 vector. */
5803 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5804 tree etype = build_vector_type (eltype, 2);
5805 gcc_assert (convert_optab_handler (vec_extract_optab,
5806 TYPE_MODE (etype),
5807 TYPE_MODE (eltype))
5808 != CODE_FOR_nothing);
5809 tree tem = make_ssa_name (etype);
5810 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5811 build1 (VIEW_CONVERT_EXPR,
5812 etype, new_temp));
5813 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5814 new_temp = tem;
5815 tem = make_ssa_name (eltype);
5816 epilog_stmt
5817 = gimple_build_assign (tem, BIT_FIELD_REF,
5818 build3 (BIT_FIELD_REF, eltype,
5819 new_temp, TYPE_SIZE (eltype),
5820 bitsize_int (0)));
5821 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5822 dst1 = make_ssa_name (vectype1);
5823 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5824 build1 (VIEW_CONVERT_EXPR,
5825 vectype1, tem));
5826 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5827 tem = make_ssa_name (eltype);
5828 epilog_stmt
5829 = gimple_build_assign (tem, BIT_FIELD_REF,
5830 build3 (BIT_FIELD_REF, eltype,
5831 new_temp, TYPE_SIZE (eltype),
5832 bitsize_int (bitsize)));
5833 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5834 dst2 = make_ssa_name (vectype1);
5835 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5836 build1 (VIEW_CONVERT_EXPR,
5837 vectype1, tem));
5838 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5841 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5844 return new_temp;
5847 /* Function vect_create_epilog_for_reduction
5849 Create code at the loop-epilog to finalize the result of a reduction
5850 computation.
5852 STMT_INFO is the scalar reduction stmt that is being vectorized.
5853 SLP_NODE is an SLP node containing a group of reduction statements. The
5854 first one in this group is STMT_INFO.
5855 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5856 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5857 (counting from 0)
5859 This function:
5860 1. Completes the reduction def-use cycles.
5861 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5862 by calling the function specified by REDUC_FN if available, or by
5863 other means (whole-vector shifts or a scalar loop).
5864 The function also creates a new phi node at the loop exit to preserve
5865 loop-closed form, as illustrated below.
5867 The flow at the entry to this function:
5869 loop:
5870 vec_def = phi <vec_init, null> # REDUCTION_PHI
5871 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5872 s_loop = scalar_stmt # (scalar) STMT_INFO
5873 loop_exit:
5874 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5875 use <s_out0>
5876 use <s_out0>
5878 The above is transformed by this function into:
5880 loop:
5881 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5882 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5883 s_loop = scalar_stmt # (scalar) STMT_INFO
5884 loop_exit:
5885 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5886 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5887 v_out2 = reduce <v_out1>
5888 s_out3 = extract_field <v_out2, 0>
5889 s_out4 = adjust_result <s_out3>
5890 use <s_out4>
5891 use <s_out4>
5894 static void
5895 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5896 stmt_vec_info stmt_info,
5897 slp_tree slp_node,
5898 slp_instance slp_node_instance)
5900 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5901 gcc_assert (reduc_info->is_reduc_info);
5902 /* For double reductions we need to get at the inner loop reduction
5903 stmt which has the meta info attached. Our stmt_info is that of the
5904 loop-closed PHI of the inner loop which we remember as
5905 def for the reduction PHI generation. */
5906 bool double_reduc = false;
5907 stmt_vec_info rdef_info = stmt_info;
5908 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5910 gcc_assert (!slp_node);
5911 double_reduc = true;
5912 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5913 (stmt_info->stmt, 0));
5914 stmt_info = vect_stmt_to_vectorize (stmt_info);
5916 gphi *reduc_def_stmt
5917 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5918 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5919 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5920 tree vectype;
5921 machine_mode mode;
5922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5923 basic_block exit_bb;
5924 tree scalar_dest;
5925 tree scalar_type;
5926 gimple *new_phi = NULL, *phi = NULL;
5927 gimple_stmt_iterator exit_gsi;
5928 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5929 gimple *epilog_stmt = NULL;
5930 gimple *exit_phi;
5931 tree bitsize;
5932 tree def;
5933 tree orig_name, scalar_result;
5934 imm_use_iterator imm_iter, phi_imm_iter;
5935 use_operand_p use_p, phi_use_p;
5936 gimple *use_stmt;
5937 auto_vec<tree> reduc_inputs;
5938 int j, i;
5939 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5940 unsigned int group_size = 1, k;
5941 auto_vec<gimple *> phis;
5942 /* SLP reduction without reduction chain, e.g.,
5943 # a1 = phi <a2, a0>
5944 # b1 = phi <b2, b0>
5945 a2 = operation (a1)
5946 b2 = operation (b1) */
5947 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5948 bool direct_slp_reduc;
5949 tree induction_index = NULL_TREE;
5951 if (slp_node)
5952 group_size = SLP_TREE_LANES (slp_node);
5954 if (nested_in_vect_loop_p (loop, stmt_info))
5956 outer_loop = loop;
5957 loop = loop->inner;
5958 gcc_assert (!slp_node && double_reduc);
5961 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5962 gcc_assert (vectype);
5963 mode = TYPE_MODE (vectype);
5965 tree induc_val = NULL_TREE;
5966 tree adjustment_def = NULL;
5967 if (slp_node)
5969 else
5971 /* Optimize: for induction condition reduction, if we can't use zero
5972 for induc_val, use initial_def. */
5973 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5974 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5975 else if (double_reduc)
5977 else
5978 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5981 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5982 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5983 if (slp_reduc)
5984 /* All statements produce live-out values. */
5985 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5986 else if (slp_node)
5988 /* The last statement in the reduction chain produces the live-out
5989 value. Note that SLP optimization can shuffle scalar stmts to
5990 optimize permutations, so we have to search for the last stmt.
5991 for (k = 0; k < group_size; ++k)
5992 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5994 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5995 break;
5999 unsigned vec_num;
6000 int ncopies;
6001 if (slp_node)
6003 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6004 ncopies = 1;
6006 else
6008 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6009 vec_num = 1;
6010 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6013 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6014 which is updated with the current index of the loop for every match of
6015 the original loop's cond_expr (VEC_STMT). This results in a vector
6016 containing the last time the condition passed for that vector lane.
6017 The first match will be a 1 to allow 0 to be used for non-matching
6018 indexes. If there are no matches at all then the vector will be all
6019 zeroes.
6021 PR92772: This algorithm is broken for architectures that support
6022 masked vectors, but do not provide fold_extract_last. */
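  /* Editorial example: with VF = 4 the index IV starts at {1, 2, 3, 4} and
     steps by 4 per vector iteration, so a lane whose condition last matched
     in the second vector iteration records a value in {5, 6, 7, 8}, while a
     lane that never matched keeps 0 (which is why the series starts at 1
     rather than 0).  */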
6023 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6025 auto_vec<std::pair<tree, bool>, 2> ccompares;
6026 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6027 cond_info = vect_stmt_to_vectorize (cond_info);
6028 while (cond_info != reduc_info)
6030 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6032 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6033 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6034 ccompares.safe_push
6035 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6036 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6038 cond_info
6039 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6040 1 + STMT_VINFO_REDUC_IDX
6041 (cond_info)));
6042 cond_info = vect_stmt_to_vectorize (cond_info);
6044 gcc_assert (ccompares.length () != 0);
6046 tree indx_before_incr, indx_after_incr;
6047 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6048 int scalar_precision
6049 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6050 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6051 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6052 (TYPE_MODE (vectype), cr_index_scalar_type,
6053 TYPE_VECTOR_SUBPARTS (vectype));
6055 /* First we create a simple vector induction variable which starts
6056 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6057 vector size (STEP). */
6059 /* Create a {1,2,3,...} vector. */
6060 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6062 /* Create a vector of the step value. */
6063 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6064 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6066 /* Create an induction variable. */
6067 gimple_stmt_iterator incr_gsi;
6068 bool insert_after;
6069 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6070 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6071 insert_after, &indx_before_incr, &indx_after_incr);
6073 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6074 filled with zeros (VEC_ZERO). */
6076 /* Create a vector of 0s. */
6077 tree zero = build_zero_cst (cr_index_scalar_type);
6078 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6080 /* Create a vector phi node. */
6081 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6082 new_phi = create_phi_node (new_phi_tree, loop->header);
6083 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6084 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6086 /* Now take the condition from the loop's original cond_exprs
6087 and produce a new cond_expr (INDEX_COND_EXPR) which for
6088 every match uses values from the induction variable
6089 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6090 (NEW_PHI_TREE).
6091 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6092 the new cond_expr (INDEX_COND_EXPR). */
6093 gimple_seq stmts = NULL;
6094 for (int i = ccompares.length () - 1; i != -1; --i)
6096 tree ccompare = ccompares[i].first;
6097 if (ccompares[i].second)
6098 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6099 cr_index_vector_type,
6100 ccompare,
6101 indx_before_incr, new_phi_tree);
6102 else
6103 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6104 cr_index_vector_type,
6105 ccompare,
6106 new_phi_tree, indx_before_incr);
6108 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6110 /* Update the phi with the vec cond. */
6111 induction_index = new_phi_tree;
6112 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6113 loop_latch_edge (loop), UNKNOWN_LOCATION);
6116 /* 2. Create epilog code.
6117 The reduction epilog code operates across the elements of the vector
6118 of partial results computed by the vectorized loop.
6119 The reduction epilog code consists of:
6121 step 1: compute the scalar result in a vector (v_out2)
6122 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6123 step 3: adjust the scalar result (s_out3) if needed.
6125 Step 1 can be accomplished using one of the following three schemes:
6126 (scheme 1) using reduc_fn, if available.
6127 (scheme 2) using whole-vector shifts, if available.
6128 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6129 combined.
6131 The overall epilog code looks like this:
6133 s_out0 = phi <s_loop> # original EXIT_PHI
6134 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6135 v_out2 = reduce <v_out1> # step 1
6136 s_out3 = extract_field <v_out2, 0> # step 2
6137 s_out4 = adjust_result <s_out3> # step 3
6139 (step 3 is optional, and steps 1 and 2 may be combined).
6140 Lastly, the uses of s_out0 are replaced by s_out4. */
6143 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6144 v_out1 = phi <VECT_DEF>
6145 Store them in NEW_PHIS. */
6146 if (double_reduc)
6147 loop = outer_loop;
6148 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6149 exit_gsi = gsi_after_labels (exit_bb);
6150 reduc_inputs.create (slp_node ? vec_num : ncopies);
6151 for (unsigned i = 0; i < vec_num; i++)
6153 gimple_seq stmts = NULL;
6154 if (slp_node)
6155 def = vect_get_slp_vect_def (slp_node, i);
6156 else
6157 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6158 for (j = 0; j < ncopies; j++)
6160 tree new_def = copy_ssa_name (def);
6161 phi = create_phi_node (new_def, exit_bb);
6162 if (j)
6163 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6164 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6165 new_def = gimple_convert (&stmts, vectype, new_def);
6166 reduc_inputs.quick_push (new_def);
6168 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6171 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6172 (i.e. when reduc_fn is not available) and in the final adjustment
6173 code (if needed). Also get the original scalar reduction variable as
6174 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6175 represents a reduction pattern), the tree-code and scalar-def are
6176 taken from the original stmt that the pattern-stmt (STMT) replaces.
6177 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6178 are taken from STMT. */
6180 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6181 if (orig_stmt_info != stmt_info)
6183 /* Reduction pattern */
6184 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6185 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6188 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6189 scalar_type = TREE_TYPE (scalar_dest);
6190 scalar_results.truncate (0);
6191 scalar_results.reserve_exact (group_size);
6192 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6193 bitsize = TYPE_SIZE (scalar_type);
6195 /* True if we should implement SLP_REDUC using native reduction operations
6196 instead of scalar operations. */
6197 direct_slp_reduc = (reduc_fn != IFN_LAST
6198 && slp_reduc
6199 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6201 /* In case of reduction chain, e.g.,
6202 # a1 = phi <a3, a0>
6203 a2 = operation (a1)
6204 a3 = operation (a2),
6206 we may end up with more than one vector result. Here we reduce them
6207 to one vector.
6209 The same is true for a SLP reduction, e.g.,
6210 # a1 = phi <a2, a0>
6211 # b1 = phi <b2, b0>
6212 a2 = operation (a1)
6213 b2 = operation (b1),
6215 where we can end up with more than one vector as well. We can
6216 easily accumulate vectors when the number of vector elements is
6217 a multiple of the SLP group size.
6219 The same is true if we couldn't use a single defuse cycle. */
6220 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6221 || direct_slp_reduc
6222 || (slp_reduc
6223 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6224 || ncopies > 1)
6226 gimple_seq stmts = NULL;
6227 tree single_input = reduc_inputs[0];
6228 for (k = 1; k < reduc_inputs.length (); k++)
6229 single_input = gimple_build (&stmts, code, vectype,
6230 single_input, reduc_inputs[k]);
6231 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6233 reduc_inputs.truncate (0);
6234 reduc_inputs.safe_push (single_input);
6237 tree orig_reduc_input = reduc_inputs[0];
6239 /* If this loop is an epilogue loop that can be skipped after the
6240 main loop, we can only share a reduction operation between the
6241 main loop and the epilogue if we put it at the target of the
6242 skip edge.
6244 We can still reuse accumulators if this check fails. Doing so has
6245 the minor(?) benefit of making the epilogue loop's scalar result
6246 independent of the main loop's scalar result. */
6247 bool unify_with_main_loop_p = false;
6248 if (reduc_info->reused_accumulator
6249 && loop_vinfo->skip_this_loop_edge
6250 && single_succ_p (exit_bb)
6251 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6253 unify_with_main_loop_p = true;
6255 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6256 reduc_inputs[0] = make_ssa_name (vectype);
6257 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6258 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6259 UNKNOWN_LOCATION);
6260 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6261 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6262 exit_gsi = gsi_after_labels (reduc_block);
6265 /* Shouldn't be used beyond this point. */
6266 exit_bb = nullptr;
6268 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6269 && reduc_fn != IFN_LAST)
6271 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6272 various data values where the condition matched and another vector
6273 (INDUCTION_INDEX) containing all the indexes of those matches. We
6274 need to extract the last matching index (which will be the index with
6275 highest value) and use this to index into the data vector.
6276 For the case where there were no matches, the data vector will contain
6277 all default values and the index vector will be all zeros. */
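   /* Editorial example with placeholder lane values: if INDUCTION_INDEX is
      {0, 6, 0, 4} and REDUC_INPUTS[0] is {d0, d1, d2, d3}, the max index is
      6, the comparison selects lane 1 only, the VEC_COND keeps d1 and zeroes
      the other lanes, and the final unsigned MAX reduction extracts d1.  */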
6279 /* Get various versions of the type of the vector of indexes. */
6280 tree index_vec_type = TREE_TYPE (induction_index);
6281 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6282 tree index_scalar_type = TREE_TYPE (index_vec_type);
6283 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6285 /* Get an unsigned integer version of the type of the data vector. */
6286 int scalar_precision
6287 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6288 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6289 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6290 vectype);
6292 /* First we need to create a vector (ZERO_VEC) of zeros and another
6293 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6294 can create using a MAX reduction and then expanding.
6295 In the case where the loop never made any matches, the max index will
6296 be zero. */
6298 /* Vector of {0, 0, 0,...}. */
6299 tree zero_vec = build_zero_cst (vectype);
6301 /* Find maximum value from the vector of found indexes. */
6302 tree max_index = make_ssa_name (index_scalar_type);
6303 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6304 1, induction_index);
6305 gimple_call_set_lhs (max_index_stmt, max_index);
6306 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6308 /* Vector of {max_index, max_index, max_index,...}. */
6309 tree max_index_vec = make_ssa_name (index_vec_type);
6310 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6311 max_index);
6312 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6313 max_index_vec_rhs);
6314 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6316 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6317 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6318 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6319 otherwise. Only one value should match, resulting in a vector
6320 (VEC_COND) with one data value and the rest zeros.
6321 In the case where the loop never made any matches, every index will
6322 match, resulting in a vector with all data values (which will all be
6323 the default value). */
6325 /* Compare the max index vector to the vector of found indexes to find
6326 the position of the max value. */
6327 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6328 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6329 induction_index,
6330 max_index_vec);
6331 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6333 /* Use the compare to choose either values from the data vector or
6334 zero. */
6335 tree vec_cond = make_ssa_name (vectype);
6336 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6337 vec_compare,
6338 reduc_inputs[0],
6339 zero_vec);
6340 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6342 /* Finally we need to extract the data value from the vector (VEC_COND)
6343 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6344 reduction, but because this doesn't exist, we can use a MAX reduction
6345 instead. The data value might be signed or a float so we need to cast
6346 it first.
6347 In the case where the loop never made any matches, the data values are
6348 all identical, and so will reduce down correctly. */
6350 /* Make the matched data values unsigned. */
6351 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6352 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6353 vec_cond);
6354 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6355 VIEW_CONVERT_EXPR,
6356 vec_cond_cast_rhs);
6357 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6359 /* Reduce down to a scalar value. */
6360 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6361 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6362 1, vec_cond_cast);
6363 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6364 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6366 /* Convert the reduced value back to the result type and set as the
6367 result. */
6368 gimple_seq stmts = NULL;
6369 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6370 data_reduc);
6371 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6372 scalar_results.safe_push (new_temp);
6374 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6375 && reduc_fn == IFN_LAST)
6377 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6378 idx = 0;
6379 idx_val = induction_index[0];
6380 val = data_reduc[0];
6381 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6382 if (induction_index[i] > idx_val)
6383 val = data_reduc[i], idx_val = induction_index[i];
6384 return val; */
6386 tree data_eltype = TREE_TYPE (vectype);
6387 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6388 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6389 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6390 /* Enforced by vectorizable_reduction, which ensures we have target
6391 support before allowing a conditional reduction on variable-length
6392 vectors. */
6393 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6394 tree idx_val = NULL_TREE, val = NULL_TREE;
6395 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6397 tree old_idx_val = idx_val;
6398 tree old_val = val;
6399 idx_val = make_ssa_name (idx_eltype);
6400 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6401 build3 (BIT_FIELD_REF, idx_eltype,
6402 induction_index,
6403 bitsize_int (el_size),
6404 bitsize_int (off)));
6405 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6406 val = make_ssa_name (data_eltype);
6407 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6408 build3 (BIT_FIELD_REF,
6409 data_eltype,
6410 reduc_inputs[0],
6411 bitsize_int (el_size),
6412 bitsize_int (off)));
6413 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6414 if (off != 0)
6416 tree new_idx_val = idx_val;
6417 if (off != v_size - el_size)
6419 new_idx_val = make_ssa_name (idx_eltype);
6420 epilog_stmt = gimple_build_assign (new_idx_val,
6421 MAX_EXPR, idx_val,
6422 old_idx_val);
6423 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6425 tree cond = make_ssa_name (boolean_type_node);
6426 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6427 idx_val, old_idx_val);
6428 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6429 tree new_val = make_ssa_name (data_eltype);
6430 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6431 cond, val, old_val);
6432 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6433 idx_val = new_idx_val;
6434 val = new_val;
6437 /* Convert the reduced value back to the result type and set as the
6438 result. */
6439 gimple_seq stmts = NULL;
6440 val = gimple_convert (&stmts, scalar_type, val);
6441 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6442 scalar_results.safe_push (val);
6445 /* 2.3 Create the reduction code, using one of the three schemes described
6446 above. In SLP we simply need to extract all the elements from the
6447 vector (without reducing them), so we use scalar shifts. */
6448 else if (reduc_fn != IFN_LAST && !slp_reduc)
6450 tree tmp;
6451 tree vec_elem_type;
6453 /* Case 1: Create:
6454 v_out2 = reduc_expr <v_out1> */
6456 if (dump_enabled_p ())
6457 dump_printf_loc (MSG_NOTE, vect_location,
6458 "Reduce using direct vector reduction.\n");
6460 gimple_seq stmts = NULL;
6461 vec_elem_type = TREE_TYPE (vectype);
6462 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6463 vec_elem_type, reduc_inputs[0]);
6464 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6465 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6467 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6468 && induc_val)
6470 /* Earlier we set the initial value to be a vector of induc_val
6471 values. Check the result and if it is induc_val then replace
6472 with the original initial value, unless induc_val is
6473 the same as initial_def already. */
6474 tree zcompare = make_ssa_name (boolean_type_node);
6475 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6476 new_temp, induc_val);
6477 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6478 tree initial_def = reduc_info->reduc_initial_values[0];
6479 tmp = make_ssa_name (new_scalar_dest);
6480 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6481 initial_def, new_temp);
6482 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6483 new_temp = tmp;
6486 scalar_results.safe_push (new_temp);
6488 else if (direct_slp_reduc)
6490 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6491 with the elements for other SLP statements replaced with the
6492 neutral value. We can then do a normal reduction on each vector. */
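   /* Editorial example: for GROUP_SIZE = 2 and a partial-result vector
      {a0, b0, a1, b1, ...}, the loop below builds {a0, id, a1, id, ...}
      for the first reduction and {id, b0, id, b1, ...} for the second,
      where "id" is the neutral value, and then reduces each of those
      vectors with REDUC_FN.  */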
6494 /* Enforced by vectorizable_reduction. */
6495 gcc_assert (reduc_inputs.length () == 1);
6496 gcc_assert (pow2p_hwi (group_size));
6498 gimple_seq seq = NULL;
6500 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6501 and the same element size as VECTYPE. */
6502 tree index = build_index_vector (vectype, 0, 1);
6503 tree index_type = TREE_TYPE (index);
6504 tree index_elt_type = TREE_TYPE (index_type);
6505 tree mask_type = truth_type_for (index_type);
6507 /* Create a vector that, for each element, identifies which of
6508 the REDUC_GROUP_SIZE results should use it. */
6509 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6510 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6511 build_vector_from_val (index_type, index_mask));
6513 /* Get a neutral vector value. This is simply a splat of the neutral
6514 scalar value if we have one, otherwise the initial scalar value
6515 is itself a neutral value. */
6516 tree vector_identity = NULL_TREE;
6517 tree neutral_op = NULL_TREE;
6518 if (slp_node)
6520 tree initial_value = NULL_TREE;
6521 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6522 initial_value = reduc_info->reduc_initial_values[0];
6523 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6524 initial_value, false);
6526 if (neutral_op)
6527 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6528 neutral_op);
6529 for (unsigned int i = 0; i < group_size; ++i)
6531 /* If there's no universal neutral value, we can use the
6532 initial scalar value from the original PHI. This is used
6533 for MIN and MAX reduction, for example. */
6534 if (!neutral_op)
6536 tree scalar_value = reduc_info->reduc_initial_values[i];
6537 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6538 scalar_value);
6539 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6540 scalar_value);
6543 /* Calculate the equivalent of:
6545 sel[j] = (index[j] == i);
6547 which selects the elements of REDUC_INPUTS[0] that should
6548 be included in the result. */
6549 tree compare_val = build_int_cst (index_elt_type, i);
6550 compare_val = build_vector_from_val (index_type, compare_val);
6551 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6552 index, compare_val);
6554 /* Calculate the equivalent of:
6556 vec = sel ? reduc_inputs[0] : vector_identity;
6558 VEC is now suitable for a full vector reduction. */
6559 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6560 sel, reduc_inputs[0], vector_identity);
6562 /* Do the reduction and convert it to the appropriate type. */
6563 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6564 TREE_TYPE (vectype), vec);
6565 scalar = gimple_convert (&seq, scalar_type, scalar);
6566 scalar_results.safe_push (scalar);
6568 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6570 else
6572 bool reduce_with_shift;
6573 tree vec_temp;
6575 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6577 /* See if the target wants to do the final (shift) reduction
6578 in a vector mode of smaller size and first reduce upper/lower
6579 halves against each other. */
6580 enum machine_mode mode1 = mode;
6581 tree stype = TREE_TYPE (vectype);
6582 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6583 unsigned nunits1 = nunits;
6584 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6585 && reduc_inputs.length () == 1)
6587 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6588 /* For SLP reductions we have to make sure lanes match up, but
6589 since we're doing an individual-element final reduction, reducing
6590 the vector width here is even more important.
6591 ??? We can also separate lanes with permutes; for the common
6592 case of a power-of-two group size, odd/even extracts would work. */
6593 if (slp_reduc && nunits != nunits1)
6595 nunits1 = least_common_multiple (nunits1, group_size);
6596 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6599 if (!slp_reduc
6600 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6601 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6603 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6604 stype, nunits1);
6605 reduce_with_shift = have_whole_vector_shift (mode1);
6606 if (!VECTOR_MODE_P (mode1)
6607 || !directly_supported_p (code, vectype1))
6608 reduce_with_shift = false;
6610 /* First reduce the vector to the desired vector size we should
6611 do shift reduction on by combining upper and lower halves. */
6612 gimple_seq stmts = NULL;
6613 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6614 code, &stmts);
6615 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6616 reduc_inputs[0] = new_temp;
6618 if (reduce_with_shift && !slp_reduc)
6620 int element_bitsize = tree_to_uhwi (bitsize);
6621 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6622 for variable-length vectors and also requires direct target support
6623 for loop reductions. */
6624 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6625 int nelements = vec_size_in_bits / element_bitsize;
6626 vec_perm_builder sel;
6627 vec_perm_indices indices;
6629 int elt_offset;
6631 tree zero_vec = build_zero_cst (vectype1);
6632 /* Case 2: Create:
6633 for (offset = nelements/2; offset >= 1; offset/=2)
6635 Create: va' = vec_shift <va, offset>
6636 Create: va = vop <va, va'>
6637 } */
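/* A hypothetical walk-through for a 4-element PLUS reduction (values
   written left to right, "_" marking lanes that are never extracted):
     va                = { a0, a1, a2, a3 }
     offset == 2:  va' = { a2, a3, 0, 0 }
                   va  = { a0+a2, a1+a3, _, _ }
     offset == 1:  va' = { a1+a3, _, _, 0 }
                   va  = { a0+a2+a1+a3, _, _, _ }
   so the reduced value ends up in element 0 of VA.  */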
6639 tree rhs;
6641 if (dump_enabled_p ())
6642 dump_printf_loc (MSG_NOTE, vect_location,
6643 "Reduce using vector shifts\n");
6645 gimple_seq stmts = NULL;
6646 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6647 for (elt_offset = nelements / 2;
6648 elt_offset >= 1;
6649 elt_offset /= 2)
6651 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6652 indices.new_vector (sel, 2, nelements);
6653 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6654 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6655 new_temp, zero_vec, mask);
6656 new_temp = gimple_build (&stmts, code,
6657 vectype1, new_name, new_temp);
6659 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6661 /* 2.4 Extract the final scalar result. Create:
6662 s_out3 = extract_field <v_out2, bitpos> */
6664 if (dump_enabled_p ())
6665 dump_printf_loc (MSG_NOTE, vect_location,
6666 "extract scalar result\n");
6668 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6669 bitsize, bitsize_zero_node);
6670 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6671 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6672 gimple_assign_set_lhs (epilog_stmt, new_temp);
6673 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6674 scalar_results.safe_push (new_temp);
6676 else
6678 /* Case 3: Create:
6679 s = extract_field <v_out2, 0>
6680 for (offset = element_size;
6681 offset < vector_size;
6682 offset += element_size;)
6684 Create: s' = extract_field <v_out2, offset>
6685 Create: s = op <s, s'> // For non SLP cases
6686 } */
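/* For instance (purely illustrative), reducing a 4-element vector
   { a0, a1, a2, a3 } with PLUS yields
     s = a0;  s = s + a1;  s = s + a2;  s = s + a3;
   i.e. one BIT_FIELD_REF per element and nelements - 1 scalar operations;
   for SLP the extracted values are collected without combining.  */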
6688 if (dump_enabled_p ())
6689 dump_printf_loc (MSG_NOTE, vect_location,
6690 "Reduce using scalar code.\n");
6692 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6693 int element_bitsize = tree_to_uhwi (bitsize);
6694 tree compute_type = TREE_TYPE (vectype);
6695 gimple_seq stmts = NULL;
6696 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6698 int bit_offset;
6699 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6700 vec_temp, bitsize, bitsize_zero_node);
6702 /* In SLP we don't need to apply reduction operation, so we just
6703 collect s' values in SCALAR_RESULTS. */
6704 if (slp_reduc)
6705 scalar_results.safe_push (new_temp);
6707 for (bit_offset = element_bitsize;
6708 bit_offset < vec_size_in_bits;
6709 bit_offset += element_bitsize)
6711 tree bitpos = bitsize_int (bit_offset);
6712 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6713 compute_type, vec_temp,
6714 bitsize, bitpos);
6715 if (slp_reduc)
6717 /* In SLP we don't need to apply reduction operation, so
6718 we just collect s' values in SCALAR_RESULTS. */
6719 new_temp = new_name;
6720 scalar_results.safe_push (new_name);
6722 else
6723 new_temp = gimple_build (&stmts, code, compute_type,
6724 new_name, new_temp);
6728 /* The only case where we need to reduce scalar results in SLP is
6729    unrolling. If the size of SCALAR_RESULTS is greater than
6730    REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6731    REDUC_GROUP_SIZE. */
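/* E.g. (illustrative): with REDUC_GROUP_SIZE == 2 and the SLP node
   unrolled twice, SCALAR_RESULTS is { s0, s1, s2, s3 } and the loop
   below folds it to { s0 op s2, s1 op s3 } before truncating it to
   the group size.  */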
6732 if (slp_reduc)
6734 tree res, first_res, new_res;
6736 /* Reduce multiple scalar results in case of SLP unrolling. */
6737 for (j = group_size; scalar_results.iterate (j, &res);
6738 j++)
6740 first_res = scalar_results[j % group_size];
6741 new_res = gimple_build (&stmts, code, compute_type,
6742 first_res, res);
6743 scalar_results[j % group_size] = new_res;
6745 scalar_results.truncate (group_size);
6746 for (k = 0; k < group_size; k++)
6747 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6748 scalar_results[k]);
6750 else
6752 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6753 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6754 scalar_results.safe_push (new_temp);
6757 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6760 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6761 && induc_val)
6763 /* Earlier we set the initial value to be a vector of induc_val
6764    values. Check the result and if it is induc_val then replace
6765    it with the original initial value, unless induc_val is
6766    the same as initial_def already. */
6767 tree zcompare = make_ssa_name (boolean_type_node);
6768 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6769 induc_val);
6770 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6771 tree initial_def = reduc_info->reduc_initial_values[0];
6772 tree tmp = make_ssa_name (new_scalar_dest);
6773 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6774 initial_def, new_temp);
6775 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6776 scalar_results[0] = tmp;
6780 /* 2.5 Adjust the final result by the initial value of the reduction
6781 variable. (When such adjustment is not needed, then
6782 'adjustment_def' is zero). For example, if code is PLUS we create:
6783 new_temp = loop_exit_def + adjustment_def */
6785 if (adjustment_def)
6787 gcc_assert (!slp_reduc);
6788 gimple_seq stmts = NULL;
6789 if (double_reduc)
6791 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6792 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6793 new_temp = gimple_build (&stmts, code, vectype,
6794 reduc_inputs[0], adjustment_def);
6796 else
6798 new_temp = scalar_results[0];
6799 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6800 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6801 adjustment_def);
6802 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6803 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6804 new_temp, adjustment_def);
6805 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6808 epilog_stmt = gimple_seq_last_stmt (stmts);
6809 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6810 scalar_results[0] = new_temp;
6813 /* Record this operation if it could be reused by the epilogue loop. */
6814 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6815 && reduc_inputs.length () == 1)
6816 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6817 { orig_reduc_input, reduc_info });
6819 if (double_reduc)
6820 loop = outer_loop;
6822 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6823 phis with new adjusted scalar results, i.e., replace use <s_out0>
6824 with use <s_out4>.
6826 Transform:
6827 loop_exit:
6828 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6829 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6830 v_out2 = reduce <v_out1>
6831 s_out3 = extract_field <v_out2, 0>
6832 s_out4 = adjust_result <s_out3>
6833 use <s_out0>
6834 use <s_out0>
6836 into:
6838 loop_exit:
6839 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6840 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6841 v_out2 = reduce <v_out1>
6842 s_out3 = extract_field <v_out2, 0>
6843 s_out4 = adjust_result <s_out3>
6844 use <s_out4>
6845 use <s_out4> */
6847 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6848 for (k = 0; k < live_out_stmts.size (); k++)
6850 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6851 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6853 phis.create (3);
6854 /* Find the loop-closed-use at the loop exit of the original scalar
6855 result. (The reduction result is expected to have two immediate uses,
6856 one at the latch block, and one at the loop exit). For double
6857 reductions we are looking for exit phis of the outer loop. */
6858 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6860 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6862 if (!is_gimple_debug (USE_STMT (use_p)))
6863 phis.safe_push (USE_STMT (use_p));
6865 else
6867 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6869 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6871 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6873 if (!flow_bb_inside_loop_p (loop,
6874 gimple_bb (USE_STMT (phi_use_p)))
6875 && !is_gimple_debug (USE_STMT (phi_use_p)))
6876 phis.safe_push (USE_STMT (phi_use_p));
6882 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6884 /* Replace the uses: */
6885 orig_name = PHI_RESULT (exit_phi);
6887 /* Look for a single use at the target of the skip edge. */
6888 if (unify_with_main_loop_p)
6890 use_operand_p use_p;
6891 gimple *user;
6892 if (!single_imm_use (orig_name, &use_p, &user))
6893 gcc_unreachable ();
6894 orig_name = gimple_get_lhs (user);
6897 scalar_result = scalar_results[k];
6898 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6900 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6901 SET_USE (use_p, scalar_result);
6902 update_stmt (use_stmt);
6906 phis.release ();
6910 /* Return a vector of type VECTYPE that is equal to the vector select
6911 operation "MASK ? VEC : IDENTITY". Insert the select statements
6912 before GSI. */
6914 static tree
6915 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6916 tree vec, tree identity)
6918 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6919 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6920 mask, vec, identity);
6921 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6922 return cond;
6925 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6926 order, starting with LHS. Insert the extraction statements before GSI and
6927 associate the new scalar SSA names with variable SCALAR_DEST.
6928 If MASK is nonzero, mask the input and then operate on it unconditionally.
6929 Return the SSA name for the result. */
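/* For example (illustrative only), for a 4-element VECTOR_RHS v and
   PLUS_EXPR code this expands to the strictly ordered sequence

     t0 = LHS + v[0];  t1 = t0 + v[1];  t2 = t1 + v[2];  t3 = t2 + v[3];

   and t3 is returned, preserving the scalar (in-order) association.  */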
6931 static tree
6932 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6933 tree_code code, tree lhs, tree vector_rhs,
6934 tree mask)
6936 tree vectype = TREE_TYPE (vector_rhs);
6937 tree scalar_type = TREE_TYPE (vectype);
6938 tree bitsize = TYPE_SIZE (scalar_type);
6939 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6940 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6942 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6943 to perform an unconditional element-wise reduction of it. */
6944 if (mask)
6946 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6947 "masked_vector_rhs");
6948 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6949 false);
6950 tree vector_identity = build_vector_from_val (vectype, neutral_op);
6951 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6952 mask, vector_rhs, vector_identity);
6953 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6954 vector_rhs = masked_vector_rhs;
6957 for (unsigned HOST_WIDE_INT bit_offset = 0;
6958 bit_offset < vec_size_in_bits;
6959 bit_offset += element_bitsize)
6961 tree bitpos = bitsize_int (bit_offset);
6962 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6963 bitsize, bitpos);
6965 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6966 rhs = make_ssa_name (scalar_dest, stmt);
6967 gimple_assign_set_lhs (stmt, rhs);
6968 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6970 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6971 tree new_name = make_ssa_name (scalar_dest, stmt);
6972 gimple_assign_set_lhs (stmt, new_name);
6973 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6974 lhs = new_name;
6976 return lhs;
6979 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6980 type of the vector input. */
6982 static internal_fn
6983 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6985 internal_fn mask_reduc_fn;
6986 internal_fn mask_len_reduc_fn;
6988 switch (reduc_fn)
6990 case IFN_FOLD_LEFT_PLUS:
6991 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6992 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6993 break;
6995 default:
6996 return IFN_LAST;
6999 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7000 OPTIMIZE_FOR_SPEED))
7001 return mask_reduc_fn;
7002 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7003 OPTIMIZE_FOR_SPEED))
7004 return mask_len_reduc_fn;
7005 return IFN_LAST;
7008 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7009 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7010 statement. CODE is the operation performed by STMT_INFO and OPS are
7011 its scalar operands. REDUC_INDEX is the index of the operand in
7012 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7013 implements in-order reduction, or IFN_LAST if we should open-code it.
7014 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7015 that should be used to control the operation in a fully-masked loop. */
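/* As an informal sketch (not the exact IL we emit), an in-order float
   accumulation

     float res = init;
     for (int i = 0; i < n; ++i)
       res += a[i];

   keeps a single scalar accumulator and issues one

     res = FOLD_LEFT_PLUS (res, vec_a);

   per vector, which adds the elements of VEC_A to RES starting from the
   lowest-numbered lane, so the rounding behaviour of the original scalar
   evaluation order is preserved.  */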
7017 static bool
7018 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7019 stmt_vec_info stmt_info,
7020 gimple_stmt_iterator *gsi,
7021 gimple **vec_stmt, slp_tree slp_node,
7022 gimple *reduc_def_stmt,
7023 code_helper code, internal_fn reduc_fn,
7024 tree *ops, int num_ops, tree vectype_in,
7025 int reduc_index, vec_loop_masks *masks,
7026 vec_loop_lens *lens)
7028 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7029 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7030 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7032 int ncopies;
7033 if (slp_node)
7034 ncopies = 1;
7035 else
7036 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7038 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7039 gcc_assert (ncopies == 1);
7041 bool is_cond_op = false;
7042 if (!code.is_tree_code ())
7044 code = conditional_internal_fn_code (internal_fn (code));
7045 gcc_assert (code != ERROR_MARK);
7046 is_cond_op = true;
7049 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7051 if (slp_node)
7053 if (is_cond_op)
7055 if (dump_enabled_p ())
7056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7057 "fold-left reduction on SLP not supported.\n");
7058 return false;
7061 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7062 TYPE_VECTOR_SUBPARTS (vectype_in)));
7065 /* The operands either come from a binary operation or an IFN_COND operation.
7066 The former is a gimple assign with binary rhs and the latter is a
7067 gimple call with four arguments. */
7068 gcc_assert (num_ops == 2 || num_ops == 4);
7069 tree op0, opmask;
7070 if (!is_cond_op)
7071 op0 = ops[1 - reduc_index];
7072 else
7074 op0 = ops[2 + (1 - reduc_index)];
7075 opmask = ops[0];
7076 gcc_assert (!slp_node);
7079 int group_size = 1;
7080 stmt_vec_info scalar_dest_def_info;
7081 auto_vec<tree> vec_oprnds0, vec_opmask;
7082 if (slp_node)
7084 auto_vec<vec<tree> > vec_defs (2);
7085 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7086 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7087 vec_defs[0].release ();
7088 vec_defs[1].release ();
7089 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7090 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7092 else
7094 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7095 op0, &vec_oprnds0);
7096 scalar_dest_def_info = stmt_info;
7098 /* For an IFN_COND_OP we also need the vector mask operand. */
7099 if (is_cond_op)
7100 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7101 opmask, &vec_opmask);
7104 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7105 tree scalar_dest = gimple_get_lhs (sdef);
7106 tree scalar_type = TREE_TYPE (scalar_dest);
7107 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7109 int vec_num = vec_oprnds0.length ();
7110 gcc_assert (vec_num == 1 || slp_node);
7111 tree vec_elem_type = TREE_TYPE (vectype_out);
7112 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7114 tree vector_identity = NULL_TREE;
7115 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7117 vector_identity = build_zero_cst (vectype_out);
7118 if (!HONOR_SIGNED_ZEROS (vectype_out))
7120 else
7122 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7123 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7124 vector_identity);
7128 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7129 int i;
7130 tree def0;
7131 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7133 gimple *new_stmt;
7134 tree mask = NULL_TREE;
7135 tree len = NULL_TREE;
7136 tree bias = NULL_TREE;
7137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7138 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7139 else if (is_cond_op)
7140 mask = vec_opmask[0];
7141 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7143 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7144 i, 1);
7145 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7146 bias = build_int_cst (intQI_type_node, biasval);
7147 if (!is_cond_op)
7148 mask = build_minus_one_cst (truth_type_for (vectype_in));
7151 /* Handle MINUS by adding the negative. */
7152 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7154 tree negated = make_ssa_name (vectype_out);
7155 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7156 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7157 def0 = negated;
7160 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7161 && mask && mask_reduc_fn == IFN_LAST)
7162 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7163 vector_identity);
7165 /* On the first iteration the input is simply the scalar phi
7166 result, and for subsequent iterations it is the output of
7167 the preceding operation. */
7168 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7170 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7171 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7172 def0, mask, len, bias);
7173 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7174 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7175 def0, mask);
7176 else
7177 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7178 def0);
7179 /* For chained SLP reductions the output of the previous reduction
7180 operation serves as the input of the next. For the final statement
7181 the output cannot be a temporary - we reuse the original
7182 scalar destination of the last statement. */
7183 if (i != vec_num - 1)
7185 gimple_set_lhs (new_stmt, scalar_dest_var);
7186 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7187 gimple_set_lhs (new_stmt, reduc_var);
7190 else
7192 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7193 tree_code (code), reduc_var, def0,
7194 mask);
7195 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7196 /* Remove the statement, so that we can use the same code paths
7197 as for statements that we've just created. */
7198 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7199 gsi_remove (&tmp_gsi, true);
7202 if (i == vec_num - 1)
7204 gimple_set_lhs (new_stmt, scalar_dest);
7205 vect_finish_replace_stmt (loop_vinfo,
7206 scalar_dest_def_info,
7207 new_stmt);
7209 else
7210 vect_finish_stmt_generation (loop_vinfo,
7211 scalar_dest_def_info,
7212 new_stmt, gsi);
7214 if (slp_node)
7215 slp_node->push_vec_def (new_stmt);
7216 else
7218 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7219 *vec_stmt = new_stmt;
7223 return true;
7226 /* Function is_nonwrapping_integer_induction.
7228 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7229 does not cause overflow. */
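/* A worked example with invented numbers: for an unsigned short IV with
   base 10 and step 3 in a loop executing at most 1000 times, the largest
   value reached is 10 + 3 * 1000 = 3010, which needs only 12 bits and so
   fits the 16-bit type; the induction therefore cannot wrap.  */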
7231 static bool
7232 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7234 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7235 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7236 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7237 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7238 widest_int ni, max_loop_value, lhs_max;
7239 wi::overflow_type overflow = wi::OVF_NONE;
7241 /* Make sure the loop is integer based. */
7242 if (TREE_CODE (base) != INTEGER_CST
7243 || TREE_CODE (step) != INTEGER_CST)
7244 return false;
7246 /* Check that the max size of the loop will not wrap. */
7248 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7249 return true;
7251 if (! max_stmt_executions (loop, &ni))
7252 return false;
7254 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7255 &overflow);
7256 if (overflow)
7257 return false;
7259 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7260 TYPE_SIGN (lhs_type), &overflow);
7261 if (overflow)
7262 return false;
7264 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7265 <= TYPE_PRECISION (lhs_type));
7268 /* Check if masking can be supported by inserting a conditional expression.
7269 CODE is the code for the operation. COND_FN is the conditional internal
7270 function, if it exists. VECTYPE_IN is the type of the vector input. */
7271 static bool
7272 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7273 tree vectype_in)
7275 if (cond_fn != IFN_LAST
7276 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7277 OPTIMIZE_FOR_SPEED))
7278 return false;
7280 if (code.is_tree_code ())
7281 switch (tree_code (code))
7283 case DOT_PROD_EXPR:
7284 case SAD_EXPR:
7285 return true;
7287 default:
7288 break;
7290 return false;
7293 /* Insert a conditional expression to enable masked vectorization. CODE is the
7294 code for the operation. VOP is the array of operands. MASK is the loop
7295 mask. GSI is a statement iterator used to place the new conditional
7296 expression. */
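/* For example (informal): DOT_PROD_EXPR <a, b, acc> accumulates a[i] * b[i],
   so rewriting B as (MASK ? B : 0) makes inactive lanes contribute
   a[i] * 0 == 0; SAD_EXPR <a, b, acc> accumulates |a[i] - b[i]|, so there
   the inactive lanes instead use (MASK ? B : A), contributing
   |a[i] - a[i]| == 0.  */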
7297 static void
7298 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7299 gimple_stmt_iterator *gsi)
7301 switch (tree_code (code))
7303 case DOT_PROD_EXPR:
7305 tree vectype = TREE_TYPE (vop[1]);
7306 tree zero = build_zero_cst (vectype);
7307 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7308 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7309 mask, vop[1], zero);
7310 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7311 vop[1] = masked_op1;
7312 break;
7315 case SAD_EXPR:
7317 tree vectype = TREE_TYPE (vop[1]);
7318 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7319 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7320 mask, vop[1], vop[0]);
7321 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7322 vop[1] = masked_op1;
7323 break;
7326 default:
7327 gcc_unreachable ();
7331 /* Function vectorizable_reduction.
7333 Check if STMT_INFO performs a reduction operation that can be vectorized.
7334 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7335 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7336 Return true if STMT_INFO is vectorizable in this way.
7338 This function also handles reduction idioms (patterns) that have been
7339 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7340 may be of this form:
7341 X = pattern_expr (arg0, arg1, ..., X)
7342 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7343 sequence that had been detected and replaced by the pattern-stmt
7344 (STMT_INFO).
7346 This function also handles reduction of condition expressions, for example:
7347 for (int i = 0; i < N; i++)
7348 if (a[i] < value)
7349 last = a[i];
7350 This is handled by vectorising the loop and creating an additional vector
7351 containing the loop indexes for which "a[i] < value" was true. In the
7352 function epilogue this is reduced to a single max value and then used to
7353 index into the vector of results.
7355 In some cases of reduction patterns, the type of the reduction variable X is
7356 different than the type of the other arguments of STMT_INFO.
7357 In such cases, the vectype that is used when transforming STMT_INFO into
7358 a vector stmt is different than the vectype that is used to determine the
7359 vectorization factor, because it consists of a different number of elements
7360 than the actual number of elements that are being operated upon in parallel.
7362 For example, consider an accumulation of shorts into an int accumulator.
7363 On some targets it's possible to vectorize this pattern operating on 8
7364 shorts at a time (hence, the vectype for purposes of determining the
7365 vectorization factor should be V8HI); on the other hand, the vectype that
7366 is used to create the vector form is actually V4SI (the type of the result).
7368 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7369 indicates what is the actual level of parallelism (V8HI in the example), so
7370 that the right vectorization factor would be derived. This vectype
7371 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7372 be used to create the vectorized stmt. The right vectype for the vectorized
7373 stmt is obtained from the type of the result X:
7374 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7376 This means that, contrary to "regular" reductions (or "regular" stmts in
7377 general), the following equation:
7378 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7379 does *NOT* necessarily hold for reduction patterns. */
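/* An informal sketch of the condition reduction described above (not the
   exact IL we generate): inside the loop we maintain

     vec_data = VEC_COND (vec_a < vec_value, vec_a, vec_data);
     vec_idx  = VEC_COND (vec_a < vec_value, vec_i + 1, vec_idx);

   and in the epilogue

     max_idx = REDUC_MAX (vec_idx);

   selects the lane of VEC_DATA whose entry in VEC_IDX equals MAX_IDX,
   with index zero reserved for "no match" so that the initial value is
   used when the condition never held.  */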
7381 bool
7382 vectorizable_reduction (loop_vec_info loop_vinfo,
7383 stmt_vec_info stmt_info, slp_tree slp_node,
7384 slp_instance slp_node_instance,
7385 stmt_vector_for_cost *cost_vec)
7387 tree vectype_in = NULL_TREE;
7388 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7389 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7390 stmt_vec_info cond_stmt_vinfo = NULL;
7391 int i;
7392 int ncopies;
7393 bool single_defuse_cycle = false;
7394 bool nested_cycle = false;
7395 bool double_reduc = false;
7396 int vec_num;
7397 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7398 tree cond_reduc_val = NULL_TREE;
7400 /* Make sure it was already recognized as a reduction computation. */
7401 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7402 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7403 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7404 return false;
7406 /* The stmt we store reduction analysis meta on. */
7407 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7408 reduc_info->is_reduc_info = true;
7410 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7412 if (is_a <gphi *> (stmt_info->stmt))
7414 if (slp_node)
7416 /* We eventually need to set a vector type on invariant
7417 arguments. */
7418 unsigned j;
7419 slp_tree child;
7420 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7421 if (!vect_maybe_update_slp_op_vectype
7422 (child, SLP_TREE_VECTYPE (slp_node)))
7424 if (dump_enabled_p ())
7425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7426 "incompatible vector types for "
7427 "invariants\n");
7428 return false;
7431 /* Analysis for double-reduction is done on the outer
7432 loop PHI, nested cycles have no further restrictions. */
7433 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7435 else
7436 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7437 return true;
7440 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7441 stmt_vec_info phi_info = stmt_info;
7442 if (!is_a <gphi *> (stmt_info->stmt))
7444 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7445 return true;
7447 if (slp_node)
7449 slp_node_instance->reduc_phis = slp_node;
7450 /* ??? We're leaving slp_node to point to the PHIs, we only
7451 need it to get at the number of vector stmts which wasn't
7452 yet initialized for the instance root. */
7454 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7456 use_operand_p use_p;
7457 gimple *use_stmt;
7458 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7459 &use_p, &use_stmt);
7460 gcc_assert (res);
7461 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7464 /* PHIs should not participate in patterns. */
7465 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7466 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7468 /* Verify that following REDUC_IDX from the latch def leads us back to the
7469    PHI and compute the reduction chain length. Discover the real
7470    reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7471 tree reduc_def
7472 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7473 loop_latch_edge
7474 (gimple_bb (reduc_def_phi)->loop_father));
7475 unsigned reduc_chain_length = 0;
7476 bool only_slp_reduc_chain = true;
7477 stmt_info = NULL;
7478 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7479 while (reduc_def != PHI_RESULT (reduc_def_phi))
7481 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7482 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7483 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7485 if (dump_enabled_p ())
7486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7487 "reduction chain broken by patterns.\n");
7488 return false;
7490 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7491 only_slp_reduc_chain = false;
7492 /* For epilogue generation live members of the chain need
7493 to point back to the PHI via their original stmt for
7494 info_for_reduction to work. For SLP we need to look at
7495 all lanes here - even though we only will vectorize from
7496 the SLP node with live lane zero the other live lanes also
7497 need to be identified as part of a reduction to be able
7498 to skip code generation for them. */
7499 if (slp_for_stmt_info)
7501 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7502 if (STMT_VINFO_LIVE_P (s))
7503 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7505 else if (STMT_VINFO_LIVE_P (vdef))
7506 STMT_VINFO_REDUC_DEF (def) = phi_info;
7507 gimple_match_op op;
7508 if (!gimple_extract_op (vdef->stmt, &op))
7510 if (dump_enabled_p ())
7511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7512 "reduction chain includes unsupported"
7513 " statement type.\n");
7514 return false;
7516 if (CONVERT_EXPR_CODE_P (op.code))
7518 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7520 if (dump_enabled_p ())
7521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7522 "conversion in the reduction chain.\n");
7523 return false;
7526 else if (!stmt_info)
7527 /* First non-conversion stmt. */
7528 stmt_info = vdef;
7529 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7530 reduc_chain_length++;
7531 if (!stmt_info && slp_node)
7532 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7534 /* PHIs should not participate in patterns. */
7535 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7537 if (nested_in_vect_loop_p (loop, stmt_info))
7539 loop = loop->inner;
7540 nested_cycle = true;
7543 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7544 element. */
7545 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7547 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7548 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7550 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7551 gcc_assert (slp_node
7552 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7554 /* 1. Is vectorizable reduction? */
7555 /* Not supportable if the reduction variable is used in the loop, unless
7556 it's a reduction chain. */
7557 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7558 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7559 return false;
7561 /* Reductions that are not used even in an enclosing outer-loop,
7562 are expected to be "live" (used out of the loop). */
7563 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7564 && !STMT_VINFO_LIVE_P (stmt_info))
7565 return false;
7567 /* 2. Has this been recognized as a reduction pattern?
7569 Check if STMT represents a pattern that has been recognized
7570 in earlier analysis stages. For stmts that represent a pattern,
7571 the STMT_VINFO_RELATED_STMT field records the last stmt in
7572 the original sequence that constitutes the pattern. */
7574 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7575 if (orig_stmt_info)
7577 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7578 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7581 /* 3. Check the operands of the operation. The first operands are defined
7582 inside the loop body. The last operand is the reduction variable,
7583 which is defined by the loop-header-phi. */
7585 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7586 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7587 gimple_match_op op;
7588 if (!gimple_extract_op (stmt_info->stmt, &op))
7589 gcc_unreachable ();
7590 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7591 || op.code == WIDEN_SUM_EXPR
7592 || op.code == SAD_EXPR);
7594 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7595 && !SCALAR_FLOAT_TYPE_P (op.type))
7596 return false;
7598 /* Do not try to vectorize bit-precision reductions. */
7599 if (!type_has_mode_precision_p (op.type))
7600 return false;
7602 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7603    which means the only use of the PHI may be in the lane-reducing operation. */
7604 if (lane_reduc_code_p
7605 && reduc_chain_length != 1
7606 && !only_slp_reduc_chain)
7608 if (dump_enabled_p ())
7609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7610 "lane-reducing reduction with extra stmts.\n");
7611 return false;
7614 /* All uses but the last are expected to be defined in the loop.
7615 The last use is the reduction variable. In case of nested cycle this
7616 assumption is not true: we use reduc_index to record the index of the
7617 reduction variable. */
7618 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7619 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7620 /* We need to skip an extra operand for COND_EXPRs with embedded
7621 comparison. */
7622 unsigned opno_adjust = 0;
7623 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7624 opno_adjust = 1;
7625 for (i = 0; i < (int) op.num_ops; i++)
7627 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7628 if (i == 0 && op.code == COND_EXPR)
7629 continue;
7631 stmt_vec_info def_stmt_info;
7632 enum vect_def_type dt;
7633 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7634 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7635 &vectype_op[i], &def_stmt_info))
7637 if (dump_enabled_p ())
7638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7639 "use not simple.\n");
7640 return false;
7642 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7643 continue;
7645 /* For an IFN_COND_OP we might hit the reduction definition operand
7646 twice (once as definition, once as else). */
7647 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7648 continue;
7650 /* There should be only one cycle def in the stmt, the one
7651 leading to reduc_def. */
7652 if (VECTORIZABLE_CYCLE_DEF (dt))
7653 return false;
7655 if (!vectype_op[i])
7656 vectype_op[i]
7657 = get_vectype_for_scalar_type (loop_vinfo,
7658 TREE_TYPE (op.ops[i]), slp_op[i]);
7660 /* To properly compute ncopies we are interested in the widest
7661 non-reduction input type in case we're looking at a widening
7662 accumulation that we later handle in vect_transform_reduction. */
7663 if (lane_reduc_code_p
7664 && vectype_op[i]
7665 && (!vectype_in
7666 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7667 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7668 vectype_in = vectype_op[i];
7670 if (op.code == COND_EXPR)
7672 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7673 if (dt == vect_constant_def)
7675 cond_reduc_dt = dt;
7676 cond_reduc_val = op.ops[i];
7678 if (dt == vect_induction_def
7679 && def_stmt_info
7680 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7682 cond_reduc_dt = dt;
7683 cond_stmt_vinfo = def_stmt_info;
7687 if (!vectype_in)
7688 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7689 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7691 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7692 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7693 /* If we have a condition reduction, see if we can simplify it further. */
7694 if (v_reduc_type == COND_REDUCTION)
7696 if (slp_node)
7697 return false;
7699 /* If the condition uses the reduction value, fail. */
7700 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7702 if (dump_enabled_p ())
7703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7704 "condition depends on previous iteration\n");
7705 return false;
7708 if (reduc_chain_length == 1
7709 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7710 OPTIMIZE_FOR_SPEED)
7711 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7712 vectype_in,
7713 OPTIMIZE_FOR_SPEED)))
7715 if (dump_enabled_p ())
7716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7717 "optimizing condition reduction with"
7718 " FOLD_EXTRACT_LAST.\n");
7719 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7721 else if (cond_reduc_dt == vect_induction_def)
7723 tree base
7724 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7725 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7727 gcc_assert (TREE_CODE (base) == INTEGER_CST
7728 && TREE_CODE (step) == INTEGER_CST);
7729 cond_reduc_val = NULL_TREE;
7730 enum tree_code cond_reduc_op_code = ERROR_MARK;
7731 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7732 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7734 /* Find a suitable value: below base for MAX_EXPR, above base for
7735    MIN_EXPR; punt for now if base is the minimum value of the type for
7736    MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7737 else if (tree_int_cst_sgn (step) == -1)
7739 cond_reduc_op_code = MIN_EXPR;
7740 if (tree_int_cst_sgn (base) == -1)
7741 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7742 else if (tree_int_cst_lt (base,
7743 TYPE_MAX_VALUE (TREE_TYPE (base))))
7744 cond_reduc_val
7745 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7747 else
7749 cond_reduc_op_code = MAX_EXPR;
7750 if (tree_int_cst_sgn (base) == 1)
7751 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7752 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7753 base))
7754 cond_reduc_val
7755 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7757 if (cond_reduc_val)
7759 if (dump_enabled_p ())
7760 dump_printf_loc (MSG_NOTE, vect_location,
7761 "condition expression based on "
7762 "integer induction.\n");
7763 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7764 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7765 = cond_reduc_val;
7766 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7769 else if (cond_reduc_dt == vect_constant_def)
7771 enum vect_def_type cond_initial_dt;
7772 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7773 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7774 if (cond_initial_dt == vect_constant_def
7775 && types_compatible_p (TREE_TYPE (cond_initial_val),
7776 TREE_TYPE (cond_reduc_val)))
7778 tree e = fold_binary (LE_EXPR, boolean_type_node,
7779 cond_initial_val, cond_reduc_val);
7780 if (e && (integer_onep (e) || integer_zerop (e)))
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_NOTE, vect_location,
7784 "condition expression based on "
7785 "compile time constant.\n");
7786 /* Record reduction code at analysis stage. */
7787 STMT_VINFO_REDUC_CODE (reduc_info)
7788 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7789 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7795 if (STMT_VINFO_LIVE_P (phi_info))
7796 return false;
7798 if (slp_node)
7799 ncopies = 1;
7800 else
7801 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7803 gcc_assert (ncopies >= 1);
7805 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7807 if (nested_cycle)
7809 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7810 == vect_double_reduction_def);
7811 double_reduc = true;
7814 /* 4.2. Check support for the epilog operation.
7816 If STMT represents a reduction pattern, then the type of the
7817 reduction variable may be different than the type of the rest
7818 of the arguments. For example, consider the case of accumulation
7819 of shorts into an int accumulator; the original code:
7820 S1: int_a = (int) short_a;
7821 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7823 was replaced with:
7824 STMT: int_acc = widen_sum <short_a, int_acc>
7826 This means that:
7827 1. The tree-code that is used to create the vector operation in the
7828 epilog code (that reduces the partial results) is not the
7829 tree-code of STMT, but is rather the tree-code of the original
7830 stmt from the pattern that STMT is replacing. I.e, in the example
7831 above we want to use 'widen_sum' in the loop, but 'plus' in the
7832 epilog.
7833 2. The type (mode) we use to check available target support
7834 for the vector operation to be created in the *epilog*, is
7835 determined by the type of the reduction variable (in the example
7836 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7837 However the type (mode) we use to check available target support
7838 for the vector operation to be created *inside the loop*, is
7839 determined by the type of the other arguments to STMT (in the
7840 example we'd check this: optab_handler (widen_sum_optab,
7841 vect_short_mode)).
7843 This is contrary to "regular" reductions, in which the types of all
7844 the arguments are the same as the type of the reduction variable.
7845 For "regular" reductions we can therefore use the same vector type
7846 (and also the same tree-code) when generating the epilog code and
7847 when generating the code inside the loop. */
7849 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7851 /* A conversion might already have created a conditional operation like
7852    IFN_COND_ADD; if so, use the corresponding tree code for the following checks. */
7853 if (orig_code.is_internal_fn ())
7855 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7856 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7859 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7861 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7862 if (reduction_type == TREE_CODE_REDUCTION)
7864 /* Check whether it's ok to change the order of the computation.
7865 Generally, when vectorizing a reduction we change the order of the
7866 computation. This may change the behavior of the program in some
7867 cases, so we need to check that this is ok. One exception is when
7868 vectorizing an outer-loop: the inner-loop is executed sequentially,
7869 and therefore vectorizing reductions in the inner-loop during
7870 outer-loop vectorization is safe. Likewise when we are vectorizing
7871 a series of reductions using SLP and the VF is one the reductions
7872 are performed in scalar order. */
7873 if (slp_node
7874 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7875 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7877 else if (needs_fold_left_reduction_p (op.type, orig_code))
7879 /* When vectorizing a reduction chain without SLP the reduction PHI
7880    is not directly used in stmt. */
7881 if (!only_slp_reduc_chain
7882 && reduc_chain_length != 1)
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "in-order reduction chain without SLP.\n");
7887 return false;
7889 STMT_VINFO_REDUC_TYPE (reduc_info)
7890 = reduction_type = FOLD_LEFT_REDUCTION;
7892 else if (!commutative_binary_op_p (orig_code, op.type)
7893 || !associative_binary_op_p (orig_code, op.type))
7895 if (dump_enabled_p ())
7896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7897 "reduction: not commutative/associative\n");
7898 return false;
7902 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7903 && ncopies > 1)
7905 if (dump_enabled_p ())
7906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7907 "multiple types in double reduction or condition "
7908 "reduction or fold-left reduction.\n");
7909 return false;
7912 internal_fn reduc_fn = IFN_LAST;
7913 if (reduction_type == TREE_CODE_REDUCTION
7914 || reduction_type == FOLD_LEFT_REDUCTION
7915 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7916 || reduction_type == CONST_COND_REDUCTION)
7918 if (reduction_type == FOLD_LEFT_REDUCTION
7919 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7920 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7922 if (reduc_fn != IFN_LAST
7923 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7924 OPTIMIZE_FOR_SPEED))
7926 if (dump_enabled_p ())
7927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7928 "reduc op not supported by target.\n");
7930 reduc_fn = IFN_LAST;
7933 else
7935 if (!nested_cycle || double_reduc)
7937 if (dump_enabled_p ())
7938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7939 "no reduc code for scalar code.\n");
7941 return false;
7945 else if (reduction_type == COND_REDUCTION)
7947 int scalar_precision
7948 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7949 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7950 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7951 vectype_out);
7953 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7954 OPTIMIZE_FOR_SPEED))
7955 reduc_fn = IFN_REDUC_MAX;
7957 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7959 if (reduction_type != EXTRACT_LAST_REDUCTION
7960 && (!nested_cycle || double_reduc)
7961 && reduc_fn == IFN_LAST
7962 && !nunits_out.is_constant ())
7964 if (dump_enabled_p ())
7965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966 "missing target support for reduction on"
7967 " variable-length vectors.\n");
7968 return false;
7971 /* For SLP reductions, see if there is a neutral value we can use. */
7972 tree neutral_op = NULL_TREE;
7973 if (slp_node)
7975 tree initial_value = NULL_TREE;
7976 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7977 initial_value = vect_phi_initial_value (reduc_def_phi);
7978 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7979 orig_code, initial_value);
7982 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7984 /* We can't support in-order reductions of code such as this:
7986 for (int i = 0; i < n1; ++i)
7987 for (int j = 0; j < n2; ++j)
7988 l += a[j];
7990 since GCC effectively transforms the loop when vectorizing:
7992 for (int i = 0; i < n1 / VF; ++i)
7993 for (int j = 0; j < n2; ++j)
7994 for (int k = 0; k < VF; ++k)
7995 l += a[j];
7997 which is a reassociation of the original operation. */
7998 if (dump_enabled_p ())
7999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8000 "in-order double reduction not supported.\n");
8002 return false;
8005 if (reduction_type == FOLD_LEFT_REDUCTION
8006 && slp_node
8007 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8009 /* We cannot use in-order reductions in this case because there is
8010 an implicit reassociation of the operations involved. */
8011 if (dump_enabled_p ())
8012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8013 "in-order unchained SLP reductions not supported.\n");
8014 return false;
8017 /* For double reductions, and for SLP reductions with a neutral value,
8018 we construct a variable-length initial vector by loading a vector
8019 full of the neutral value and then shift-and-inserting the start
8020 values into the low-numbered elements. */
8021 if ((double_reduc || neutral_op)
8022 && !nunits_out.is_constant ()
8023 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8024 vectype_out, OPTIMIZE_FOR_SPEED))
8026 if (dump_enabled_p ())
8027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8028 "reduction on variable-length vectors requires"
8029 " target support for a vector-shift-and-insert"
8030 " operation.\n");
8031 return false;
8034 /* Check extra constraints for variable-length unchained SLP reductions. */
8035 if (slp_node
8036 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8037 && !nunits_out.is_constant ())
8039 /* We checked above that we could build the initial vector when
8040 there's a neutral element value. Check here for the case in
8041 which each SLP statement has its own initial value and in which
8042 that value needs to be repeated for every instance of the
8043 statement within the initial vector. */
8044 unsigned int group_size = SLP_TREE_LANES (slp_node);
8045 if (!neutral_op
8046 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8047 TREE_TYPE (vectype_out)))
8049 if (dump_enabled_p ())
8050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051 "unsupported form of SLP reduction for"
8052 " variable-length vectors: cannot build"
8053 " initial vector.\n");
8054 return false;
8056 /* The epilogue code relies on the number of elements being a multiple
8057 of the group size. The duplicate-and-interleave approach to setting
8058 up the initial vector does too. */
8059 if (!multiple_p (nunits_out, group_size))
8061 if (dump_enabled_p ())
8062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8063 "unsupported form of SLP reduction for"
8064 " variable-length vectors: the vector size"
8065 " is not a multiple of the number of results.\n");
8066 return false;
8070 if (reduction_type == COND_REDUCTION)
8072 widest_int ni;
8074 if (! max_loop_iterations (loop, &ni))
8076 if (dump_enabled_p ())
8077 dump_printf_loc (MSG_NOTE, vect_location,
8078 "loop count not known, cannot create cond "
8079 "reduction.\n");
8080 return false;
8082 /* Convert backedges to iterations. */
8083 ni += 1;
8085 /* The additional index will have the same type as the condition. Check
8086    that the iteration count fits into this type less one (because we use
8087    up the zero slot for when there are no matches). */
8088 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8089 if (wi::geu_p (ni, wi::to_widest (max_index)))
8091 if (dump_enabled_p ())
8092 dump_printf_loc (MSG_NOTE, vect_location,
8093 "loop size is greater than data size.\n");
8094 return false;
8098 /* In case the vectorization factor (VF) is bigger than the number
8099    of elements that we can fit in a vectype (nunits), we have to generate
8100    more than one vector stmt, i.e. we need to "unroll" the
8101    vector stmt by a factor of VF/nunits. For more details see the
8102    documentation in vectorizable_operation. */
8104 /* If the reduction is used in an outer loop we need to generate
8105 VF intermediate results, like so (e.g. for ncopies=2):
8106 r0 = phi (init, r0)
8107 r1 = phi (init, r1)
8108 r0 = x0 + r0;
8109 r1 = x1 + r1;
8110 (i.e. we generate VF results in 2 registers).
8111 In this case we have a separate def-use cycle for each copy, and therefore
8112 for each copy we get the vector def for the reduction variable from the
8113 respective phi node created for this copy.
8115 Otherwise (the reduction is unused in the loop nest), we can combine
8116 together intermediate results, like so (e.g. for ncopies=2):
8117 r = phi (init, r)
8118 r = x0 + r;
8119 r = x1 + r;
8120 (i.e. we generate VF/2 results in a single register).
8121 In this case for each copy we get the vector def for the reduction variable
8122 from the vectorized reduction operation generated in the previous iteration.
8124 This only works when we see both the reduction PHI and its only consumer
8125 in vectorizable_reduction and there are no intermediate stmts
8126 participating. When unrolling we want each unrolled iteration to have its
8127 own reduction accumulator since one of the main goals of unrolling a
8128 reduction is to reduce the aggregate loop-carried latency. */
8129 if (ncopies > 1
8130 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8131 && reduc_chain_length == 1
8132 && loop_vinfo->suggested_unroll_factor == 1)
8133 single_defuse_cycle = true;
8135 if (single_defuse_cycle || lane_reduc_code_p)
8137 gcc_assert (op.code != COND_EXPR);
8139 /* 4. Supportable by target? */
8140 bool ok = true;
8142 /* 4.1. check support for the operation in the loop
8144 This isn't necessary for the lane reduction codes, since they
8145 can only be produced by pattern matching, and it's up to the
8146 pattern matcher to test for support. The main reason for
8147 specifically skipping this step is to avoid rechecking whether
8148 mixed-sign dot-products can be implemented using signed
8149 dot-products. */
8150 machine_mode vec_mode = TYPE_MODE (vectype_in);
8151 if (!lane_reduc_code_p
8152 && !directly_supported_p (op.code, vectype_in, optab_vector))
8154 if (dump_enabled_p ())
8155 dump_printf (MSG_NOTE, "op not supported by target.\n");
8156 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8157 || !vect_can_vectorize_without_simd_p (op.code))
8158 ok = false;
8159 else
8160 if (dump_enabled_p ())
8161 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8164 if (vect_emulated_vector_p (vectype_in)
8165 && !vect_can_vectorize_without_simd_p (op.code))
8167 if (dump_enabled_p ())
8168 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8169 return false;
8172 /* lane-reducing operations have to go through vect_transform_reduction.
8173 For the other cases try without the single cycle optimization. */
8174 if (!ok)
8176 if (lane_reduc_code_p)
8177 return false;
8178 else
8179 single_defuse_cycle = false;
8182 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8184 /* If the reduction stmt is one of the patterns that have lane
8185 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8186 if ((ncopies > 1 && ! single_defuse_cycle)
8187 && lane_reduc_code_p)
8189 if (dump_enabled_p ())
8190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8191 "multi def-use cycle not possible for lane-reducing "
8192 "reduction operation\n");
8193 return false;
8196 if (slp_node
8197 && !(!single_defuse_cycle
8198 && !lane_reduc_code_p
8199 && reduction_type != FOLD_LEFT_REDUCTION))
8200 for (i = 0; i < (int) op.num_ops; i++)
8201 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8203 if (dump_enabled_p ())
8204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8205 "incompatible vector types for invariants\n");
8206 return false;
8209 if (slp_node)
8210 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8211 else
8212 vec_num = 1;
8214 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8215 reduction_type, ncopies, cost_vec);
8216 /* Cost the reduction op inside the loop if transformed via
8217 vect_transform_reduction. Otherwise this is costed by the
8218 separate vectorizable_* routines. */
8219 if (single_defuse_cycle || lane_reduc_code_p)
8221 int factor = 1;
8222 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8223 /* Three dot-products and a subtraction. */
8224 factor = 4;
8225 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8226 stmt_info, 0, vect_body);
8229 if (dump_enabled_p ()
8230 && reduction_type == FOLD_LEFT_REDUCTION)
8231 dump_printf_loc (MSG_NOTE, vect_location,
8232 "using an in-order (fold-left) reduction.\n");
8233 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8234 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8235 reductions go through their own vectorizable_* routines. */
8236 if (!single_defuse_cycle
8237 && !lane_reduc_code_p
8238 && reduction_type != FOLD_LEFT_REDUCTION)
8240 stmt_vec_info tem
8241 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8242 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8244 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8245 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8247 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8248 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8250 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8252 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8253 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8254 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8256 if (reduction_type != FOLD_LEFT_REDUCTION
8257 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8258 && (cond_fn == IFN_LAST
8259 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8260 OPTIMIZE_FOR_SPEED)))
8262 if (dump_enabled_p ())
8263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8264 "can't operate on partial vectors because"
8265 " no conditional operation is available.\n");
8266 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8268 else if (reduction_type == FOLD_LEFT_REDUCTION
8269 && reduc_fn == IFN_LAST
8270 && !expand_vec_cond_expr_p (vectype_in,
8271 truth_type_for (vectype_in),
8272 SSA_NAME))
8274 if (dump_enabled_p ())
8275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8276 "can't operate on partial vectors because"
8277 " no conditional operation is available.\n");
8278 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8280 else if (reduction_type == FOLD_LEFT_REDUCTION
8281 && internal_fn_mask_index (reduc_fn) == -1
8282 && FLOAT_TYPE_P (vectype_in)
8283 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8285 if (dump_enabled_p ())
8286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8287 "can't operate on partial vectors because"
8288 " signed zeros cannot be preserved.\n");
8289 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8291 else
8293 internal_fn mask_reduc_fn
8294 = get_masked_reduction_fn (reduc_fn, vectype_in);
8296 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8297 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8298 vectype_in, 1);
8299 else
8300 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8301 vectype_in, NULL);
8304 return true;
8307 /* STMT_INFO is a dot-product reduction whose multiplication operands
8308 have different signs. Emit a sequence to emulate the operation
8309 using a series of signed DOT_PROD_EXPRs and return the last
8310 statement generated. VEC_DEST is the result of the vector operation
8311 and VOP lists its inputs. */
8313 static gassign *
8314 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8315 gimple_stmt_iterator *gsi, tree vec_dest,
8316 tree vop[3])
8318 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8319 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8320 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8321 gimple *new_stmt;
8323 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8324 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8325 std::swap (vop[0], vop[1]);
8327 /* Convert all inputs to signed types. */
8328 for (int i = 0; i < 3; ++i)
8329 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8331 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8332 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8333 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8334 vop[i] = tmp;
8337 /* In the comments below we assume 8-bit inputs for simplicity,
8338 but the approach works for any full integer type. */
8340 /* Create a vector of -128. */
8341 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8342 tree min_narrow = build_vector_from_val (narrow_vectype,
8343 min_narrow_elttype);
8345 /* Create a vector of 64. */
8346 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8347 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8348 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8350 /* Emit: SUB_RES = VOP[0] - 128. */
8351 tree sub_res = make_ssa_name (narrow_vectype);
8352 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8353 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8355 /* Emit:
8357 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8358 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8359 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8361 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8362 Doing the two 64 * y steps first allows more time to compute x. */
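/* As a concrete check, take the unsigned input x = 200 and signed y = -3:
200 * -3 = (200 - 128) * -3 + 64 * -3 + 64 * -3
= -216 + -192 + -192 = -600,
and SUB_RES wraps to 200 - 128 = 72 in 8 bits, so every factor fits the
signed 8-bit range required by the signed DOT_PROD_EXPRs below. */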
8363 tree stage1 = make_ssa_name (wide_vectype);
8364 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8365 vop[1], half_narrow, vop[2]);
8366 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8368 tree stage2 = make_ssa_name (wide_vectype);
8369 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8370 vop[1], half_narrow, stage1);
8371 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8373 tree stage3 = make_ssa_name (wide_vectype);
8374 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8375 sub_res, vop[1], stage2);
8376 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8378 /* Convert STAGE3 to the reduction type. */
8379 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8382 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8383 value. */
8385 bool
8386 vect_transform_reduction (loop_vec_info loop_vinfo,
8387 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8388 gimple **vec_stmt, slp_tree slp_node)
8390 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8391 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8392 int i;
8393 int ncopies;
8394 int vec_num;
8396 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8397 gcc_assert (reduc_info->is_reduc_info);
8399 if (nested_in_vect_loop_p (loop, stmt_info))
8401 loop = loop->inner;
8402 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8405 gimple_match_op op;
8406 if (!gimple_extract_op (stmt_info->stmt, &op))
8407 gcc_unreachable ();
8409 /* All uses but the last are expected to be defined in the loop.
8410 The last use is the reduction variable. In case of nested cycle this
8411 assumption is not true: we use reduc_index to record the index of the
8412 reduction variable. */
8413 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8414 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8415 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8416 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8418 if (slp_node)
8420 ncopies = 1;
8421 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8423 else
8425 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8426 vec_num = 1;
8429 code_helper code = canonicalize_code (op.code, op.type);
8430 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8432 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8433 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8434 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8436 /* Transform. */
8437 tree new_temp = NULL_TREE;
8438 auto_vec<tree> vec_oprnds0;
8439 auto_vec<tree> vec_oprnds1;
8440 auto_vec<tree> vec_oprnds2;
8441 tree def0;
8443 if (dump_enabled_p ())
8444 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8446 /* FORNOW: Multiple types are not supported for condition. */
8447 if (code == COND_EXPR)
8448 gcc_assert (ncopies == 1);
8450 /* A binary COND_OP reduction must have the same definition and else
8451 value. */
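/* For example IFN_COND_ADD <mask, res, x, res> adds X to RES where MASK
is set and keeps RES unchanged elsewhere, so the reduction operand
doubles as the else value; the assertion below checks exactly that. */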
8452 bool cond_fn_p = code.is_internal_fn ()
8453 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8454 if (cond_fn_p)
8456 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8457 || code == IFN_COND_MUL || code == IFN_COND_AND
8458 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8459 gcc_assert (op.num_ops == 4
8460 && (op.ops[reduc_index]
8461 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8464 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8466 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8467 if (reduction_type == FOLD_LEFT_REDUCTION)
8469 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8470 gcc_assert (code.is_tree_code () || cond_fn_p);
8471 return vectorize_fold_left_reduction
8472 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8473 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8474 reduc_index, masks, lens);
8477 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8478 gcc_assert (single_defuse_cycle
8479 || code == DOT_PROD_EXPR
8480 || code == WIDEN_SUM_EXPR
8481 || code == SAD_EXPR);
8483 /* Create the destination vector */
8484 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8485 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8487 /* Get NCOPIES vector definitions for all operands except the reduction
8488 definition. */
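/* Passing NULL_TREE for an operand makes vect_get_vec_defs skip it; for a
single def-use cycle the reduction operand's vector definition is instead
the single PHI result fetched below via vect_get_vec_defs_for_operand. */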
8489 if (!cond_fn_p)
8491 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8492 single_defuse_cycle && reduc_index == 0
8493 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8494 single_defuse_cycle && reduc_index == 1
8495 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8496 op.num_ops == 3
8497 && !(single_defuse_cycle && reduc_index == 2)
8498 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8500 else
8502 /* For a conditional operation pass the truth type as mask
8503 vectype. */
8504 gcc_assert (single_defuse_cycle
8505 && (reduc_index == 1 || reduc_index == 2));
8506 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8507 op.ops[0], &vec_oprnds0, truth_type_for (vectype_in),
8508 reduc_index == 1 ? NULL_TREE : op.ops[1],
8509 &vec_oprnds1, NULL_TREE,
8510 reduc_index == 2 ? NULL_TREE : op.ops[2],
8511 &vec_oprnds2, NULL_TREE);
8514 /* For single def-use cycles get one copy of the vectorized reduction
8515 definition. */
8516 if (single_defuse_cycle)
8518 gcc_assert (!slp_node);
8519 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8520 op.ops[reduc_index],
8521 reduc_index == 0 ? &vec_oprnds0
8522 : (reduc_index == 1 ? &vec_oprnds1
8523 : &vec_oprnds2));
8526 bool emulated_mixed_dot_prod
8527 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8528 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8530 gimple *new_stmt;
8531 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8532 if (masked_loop_p && !mask_by_cond_expr)
8534 /* No conditional ifns have been defined for dot-product yet. */
8535 gcc_assert (code != DOT_PROD_EXPR);
8537 /* Make sure that the reduction accumulator is vop[0]. */
8538 if (reduc_index == 1)
8540 gcc_assert (commutative_binary_op_p (code, op.type));
8541 std::swap (vop[0], vop[1]);
8543 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8544 vec_num * ncopies, vectype_in, i);
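/* Emit MASK ? VOP[0] OP VOP[1] : VOP[0]; using the accumulator as the
else value leaves masked-off lanes unchanged, which is what a partial
vector iteration of the reduction needs. */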
8545 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8546 vop[0], vop[1], vop[0]);
8547 new_temp = make_ssa_name (vec_dest, call);
8548 gimple_call_set_lhs (call, new_temp);
8549 gimple_call_set_nothrow (call, true);
8550 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8551 new_stmt = call;
8553 else
8555 if (op.num_ops >= 3)
8556 vop[2] = vec_oprnds2[i];
8558 if (masked_loop_p && mask_by_cond_expr)
8560 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8561 vec_num * ncopies, vectype_in, i);
8562 build_vect_cond_expr (code, vop, mask, gsi);
8565 if (emulated_mixed_dot_prod)
8566 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8567 vec_dest, vop);
8569 else if (code.is_internal_fn () && !cond_fn_p)
8570 new_stmt = gimple_build_call_internal (internal_fn (code),
8571 op.num_ops,
8572 vop[0], vop[1], vop[2]);
8573 else if (code.is_internal_fn () && cond_fn_p)
8574 new_stmt = gimple_build_call_internal (internal_fn (code),
8575 op.num_ops,
8576 vop[0], vop[1], vop[2],
8577 vop[1]);
8578 else
8579 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8580 vop[0], vop[1], vop[2]);
8581 new_temp = make_ssa_name (vec_dest, new_stmt);
8582 gimple_set_lhs (new_stmt, new_temp);
8583 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8586 if (slp_node)
8587 slp_node->push_vec_def (new_stmt);
8588 else if (single_defuse_cycle
8589 && i < ncopies - 1)
8591 if (reduc_index == 0)
8592 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8593 else if (reduc_index == 1)
8594 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8595 else if (reduc_index == 2)
8596 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8598 else
8599 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8602 if (!slp_node)
8603 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8605 return true;
8608 /* Transform phase of a cycle PHI. */
8610 bool
8611 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8612 stmt_vec_info stmt_info, gimple **vec_stmt,
8613 slp_tree slp_node, slp_instance slp_node_instance)
8615 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8616 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8617 int i;
8618 int ncopies;
8619 int j;
8620 bool nested_cycle = false;
8621 int vec_num;
8623 if (nested_in_vect_loop_p (loop, stmt_info))
8625 loop = loop->inner;
8626 nested_cycle = true;
8629 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8630 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8631 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8632 gcc_assert (reduc_info->is_reduc_info);
8634 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8635 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8636 /* Leave the scalar phi in place. */
8637 return true;
8639 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8640 /* For a nested cycle the above is not filled in; fall back to the PHI's vectype. */
8641 if (!vectype_in)
8642 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8643 gcc_assert (vectype_in);
8645 if (slp_node)
8647 /* The size vect_schedule_slp_instance computes is off for us. */
8648 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8649 * SLP_TREE_LANES (slp_node), vectype_in);
8650 ncopies = 1;
8652 else
8654 vec_num = 1;
8655 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8658 /* Check whether we should use a single PHI node and accumulate
8659 vectors to one before the backedge. */
8660 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8661 ncopies = 1;
8663 /* Create the destination vector */
8664 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8665 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8666 vectype_out);
8668 /* Get the loop-entry arguments. */
8669 tree vec_initial_def = NULL_TREE;
8670 auto_vec<tree> vec_initial_defs;
8671 if (slp_node)
8673 vec_initial_defs.reserve (vec_num);
8674 if (nested_cycle)
8676 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8677 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8678 &vec_initial_defs);
8680 else
8682 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8683 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8684 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8686 unsigned int num_phis = stmts.length ();
8687 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8688 num_phis = 1;
8689 initial_values.reserve (num_phis);
8690 for (unsigned int i = 0; i < num_phis; ++i)
8692 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8693 initial_values.quick_push (vect_phi_initial_value (this_phi));
8695 if (vec_num == 1)
8696 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8697 if (!initial_values.is_empty ())
8699 tree initial_value
8700 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8701 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8702 tree neutral_op
8703 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8704 code, initial_value);
8705 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8706 &vec_initial_defs, vec_num,
8707 stmts.length (), neutral_op);
8711 else
8713 /* Get at the scalar def before the loop, that defines the initial
8714 value of the reduction variable. */
8715 tree initial_def = vect_phi_initial_value (phi);
8716 reduc_info->reduc_initial_values.safe_push (initial_def);
8717 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8718 and we can't use zero for induc_val, use initial_def. Similarly
8719 for REDUC_MIN and initial_def larger than the base. */
8720 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8722 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8723 if (TREE_CODE (initial_def) == INTEGER_CST
8724 && !integer_zerop (induc_val)
8725 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8726 && tree_int_cst_lt (initial_def, induc_val))
8727 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8728 && tree_int_cst_lt (induc_val, initial_def))))
8730 induc_val = initial_def;
8731 /* Communicate that we used the initial_def to the epilogue
8732 generation. */
8733 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8735 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8737 else if (nested_cycle)
8739 /* Do not use an adjustment def as that case is not supported
8740 correctly if ncopies is not one. */
8741 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8742 ncopies, initial_def,
8743 &vec_initial_defs);
8745 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8746 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8747 /* Fill the initial vector with the initial scalar value. */
8748 vec_initial_def
8749 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8750 initial_def, initial_def);
8751 else
8753 if (ncopies == 1)
8754 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8755 if (!reduc_info->reduc_initial_values.is_empty ())
8757 initial_def = reduc_info->reduc_initial_values[0];
8758 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8759 tree neutral_op
8760 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8761 code, initial_def);
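/* E.g. zero for PLUS_EXPR, one for MULT_EXPR, all-ones for BIT_AND_EXPR
and the initial value itself for MIN/MAX reductions. */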
8762 gcc_assert (neutral_op);
8763 /* Try to simplify the vector initialization by applying an
8764 adjustment after the reduction has been performed. */
8765 if (!reduc_info->reused_accumulator
8766 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8767 && !operand_equal_p (neutral_op, initial_def))
8769 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8770 = initial_def;
8771 initial_def = neutral_op;
8773 vec_initial_def
8774 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8775 initial_def, neutral_op);
8780 if (vec_initial_def)
8782 vec_initial_defs.create (ncopies);
8783 for (i = 0; i < ncopies; ++i)
8784 vec_initial_defs.quick_push (vec_initial_def);
8787 if (auto *accumulator = reduc_info->reused_accumulator)
8789 tree def = accumulator->reduc_input;
8790 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8792 unsigned int nreduc;
8793 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8794 (TREE_TYPE (def)),
8795 TYPE_VECTOR_SUBPARTS (vectype_out),
8796 &nreduc);
8797 gcc_assert (res);
8798 gimple_seq stmts = NULL;
8799 /* Reduce the single vector to a smaller one. */
8800 if (nreduc != 1)
8802 /* Perform the reduction in the appropriate type. */
8803 tree rvectype = vectype_out;
8804 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8805 TREE_TYPE (TREE_TYPE (def))))
8806 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8807 TYPE_VECTOR_SUBPARTS
8808 (vectype_out));
8809 def = vect_create_partial_epilog (def, rvectype,
8810 STMT_VINFO_REDUC_CODE
8811 (reduc_info),
8812 &stmts);
8814 /* The epilogue loop might use a different vector mode, like
8815 VNx2DI vs. V2DI. */
8816 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8818 tree reduc_type = build_vector_type_for_mode
8819 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8820 def = gimple_convert (&stmts, reduc_type, def);
8822 /* Adjust the input so we pick up the partially reduced value
8823 for the skip edge in vect_create_epilog_for_reduction. */
8824 accumulator->reduc_input = def;
8825 /* And the reduction could be carried out using a different sign. */
8826 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8827 def = gimple_convert (&stmts, vectype_out, def);
8828 if (loop_vinfo->main_loop_edge)
8830 /* While we'd like to insert on the edge, this would split
8831 blocks and disturb bookkeeping; we will also eventually
8832 need this on the skip edge. Rely on sinking to
8833 fix up the optimal placement and insert in the predecessor. */
8834 gimple_stmt_iterator gsi
8835 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8836 /* Insert before a cond that eventually skips the
8837 epilogue. */
8838 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8839 gsi_prev (&gsi);
8840 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8842 else
8843 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8844 stmts);
8846 if (loop_vinfo->main_loop_edge)
8847 vec_initial_defs[0]
8848 = vect_get_main_loop_result (loop_vinfo, def,
8849 vec_initial_defs[0]);
8850 else
8851 vec_initial_defs.safe_push (def);
8854 /* Generate the reduction PHIs upfront. */
8855 for (i = 0; i < vec_num; i++)
8857 tree vec_init_def = vec_initial_defs[i];
8858 for (j = 0; j < ncopies; j++)
8860 /* Create the reduction-phi that defines the reduction
8861 operand. */
8862 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8864 /* Set the loop-entry arg of the reduction-phi. */
8865 if (j != 0 && nested_cycle)
8866 vec_init_def = vec_initial_defs[j];
8867 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8868 UNKNOWN_LOCATION);
8870 /* The loop-latch arg is set in epilogue processing. */
8872 if (slp_node)
8873 slp_node->push_vec_def (new_phi);
8874 else
8876 if (j == 0)
8877 *vec_stmt = new_phi;
8878 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8883 return true;
8886 /* Vectorizes LC PHIs. */
8888 bool
8889 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8890 stmt_vec_info stmt_info, gimple **vec_stmt,
8891 slp_tree slp_node)
8893 if (!loop_vinfo
8894 || !is_a <gphi *> (stmt_info->stmt)
8895 || gimple_phi_num_args (stmt_info->stmt) != 1)
8896 return false;
8898 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8899 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8900 return false;
8902 if (!vec_stmt) /* transformation not required. */
8904 /* Deal with copies from externs or constants that are disguised as
8905 loop-closed PHI nodes (PR97886). */
8906 if (slp_node
8907 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8908 SLP_TREE_VECTYPE (slp_node)))
8910 if (dump_enabled_p ())
8911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8912 "incompatible vector types for invariants\n");
8913 return false;
8915 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8916 return true;
8919 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8920 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8921 basic_block bb = gimple_bb (stmt_info->stmt);
8922 edge e = single_pred_edge (bb);
8923 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8924 auto_vec<tree> vec_oprnds;
8925 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8926 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8927 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8928 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8930 /* Create the vectorized LC PHI node. */
8931 gphi *new_phi = create_phi_node (vec_dest, bb);
8932 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8933 if (slp_node)
8934 slp_node->push_vec_def (new_phi);
8935 else
8936 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8938 if (!slp_node)
8939 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8941 return true;
8944 /* Vectorizes PHIs. */
8946 bool
8947 vectorizable_phi (vec_info *,
8948 stmt_vec_info stmt_info, gimple **vec_stmt,
8949 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8951 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8952 return false;
8954 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8955 return false;
8957 tree vectype = SLP_TREE_VECTYPE (slp_node);
8959 if (!vec_stmt) /* transformation not required. */
8961 slp_tree child;
8962 unsigned i;
8963 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8964 if (!child)
8966 if (dump_enabled_p ())
8967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8968 "PHI node with unvectorized backedge def\n");
8969 return false;
8971 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8973 if (dump_enabled_p ())
8974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8975 "incompatible vector types for invariants\n");
8976 return false;
8978 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8979 && !useless_type_conversion_p (vectype,
8980 SLP_TREE_VECTYPE (child)))
8982 /* With bools we can have mask and non-mask precision vectors
8983 or different non-mask precisions. While pattern recognition is
8984 supposed to guarantee consistency here, bugs in it can cause
8985 mismatches (PR103489 and PR103800 for example).
8986 Deal with them here instead of ICEing later. */
8987 if (dump_enabled_p ())
8988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8989 "incompatible vector type setup from "
8990 "bool pattern detection\n");
8991 return false;
8994 /* For single-argument PHIs assume coalescing, which means zero cost
8995 for both the scalar and the vector PHIs. This avoids artificially
8996 favoring the vector path (but may pessimize it in some cases). */
8997 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8998 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8999 vector_stmt, stmt_info, vectype, 0, vect_body);
9000 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9001 return true;
9004 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9005 basic_block bb = gimple_bb (stmt_info->stmt);
9006 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9007 auto_vec<gphi *> new_phis;
9008 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9010 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9012 /* Skip not yet vectorized defs. */
9013 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9014 && SLP_TREE_VEC_DEFS (child).is_empty ())
9015 continue;
9017 auto_vec<tree> vec_oprnds;
9018 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9019 if (!new_phis.exists ())
9021 new_phis.create (vec_oprnds.length ());
9022 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9024 /* Create the vectorized PHI node. */
9025 new_phis.quick_push (create_phi_node (vec_dest, bb));
9026 slp_node->push_vec_def (new_phis[j]);
9029 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9030 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9031 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9033 /* We should have at least one already vectorized child. */
9034 gcc_assert (new_phis.exists ());
9036 return true;
9039 /* Vectorizes first order recurrences. An overview of the transformation
9040 is described below. Suppose we have the following loop.
9042 int t = 0;
9043 for (int i = 0; i < n; ++i)
9045 b[i] = a[i] - t;
9046 t = a[i];
9049 There is a first-order recurrence on 't': each iteration uses the value of
9050 a[i] from the previous iteration. For this loop, the scalar IR looks (simplified) like:
9052 scalar.preheader:
9053 init = 0;
9055 scalar.body:
9056 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9057 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9058 _1 = a[i]
9059 b[i] = _1 - _2
9060 if (i < n) goto scalar.body
9062 In this example, _2 is a recurrence because its value depends on the
9063 previous iteration. We vectorize this as (VF = 4)
9065 vector.preheader:
9066 vect_init = vect_cst(..., ..., ..., 0)
9068 vector.body
9069 i = PHI <0(vector.preheader), i+4(vector.body)>
9070 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9071 vect_2 = a[i, i+1, i+2, i+3];
9072 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9073 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9074 if (..) goto vector.body
9076 In this function, vectorizable_recurr, we code generate both the
9077 vector PHI node and the permute since those together compute the
9078 vectorized value of the scalar PHI. We do not yet have the
9079 backedge value to fill in there nor into the vec_perm. Those
9080 are filled in maybe_set_vectorized_backedge_value and
9081 vect_schedule_scc.
9083 TODO: Since the scalar loop does not have a use of the recurrence
9084 outside of the loop, the natural way to implement peeling via
9085 vectorizing the live value doesn't work. For now peeling of loops
9086 with a recurrence is not implemented. For SLP the supported cases
9087 are restricted to those requiring a single vector recurrence PHI. */
9089 bool
9090 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9091 gimple **vec_stmt, slp_tree slp_node,
9092 stmt_vector_for_cost *cost_vec)
9094 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9095 return false;
9097 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9099 /* So far we only support first-order recurrence auto-vectorization. */
9100 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9101 return false;
9103 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9104 unsigned ncopies;
9105 if (slp_node)
9106 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9107 else
9108 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9109 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9110 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9111 /* We need to be able to make progress with a single vector. */
9112 if (maybe_gt (dist * 2, nunits))
9114 if (dump_enabled_p ())
9115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9116 "first order recurrence exceeds half of "
9117 "a vector\n");
9118 return false;
9121 /* First-order recurrence autovectorization needs to handle permutation
9122 with indices = [nunits-1, nunits, nunits+1, ...]. */
9123 vec_perm_builder sel (nunits, 1, 3);
9124 for (int i = 0; i < 3; ++i)
9125 sel.quick_push (nunits - dist + i);
9126 vec_perm_indices indices (sel, 2, nunits);
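/* For example, with nunits = 4 and dist = 1 this encodes the indices
{ 3, 4, 5, 6 }: lane 3 of the previous vector followed by lanes 0-2
of the current one, matching the VEC_PERM in the overview above. */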
9128 if (!vec_stmt) /* transformation not required. */
9130 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9131 indices))
9132 return false;
9134 if (slp_node)
9136 /* We eventually need to set a vector type on invariant
9137 arguments. */
9138 unsigned j;
9139 slp_tree child;
9140 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9141 if (!vect_maybe_update_slp_op_vectype
9142 (child, SLP_TREE_VECTYPE (slp_node)))
9144 if (dump_enabled_p ())
9145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9146 "incompatible vector types for "
9147 "invariants\n");
9148 return false;
9151 /* The recurrence costs the initialization vector and one permute
9152 for each copy. */
9153 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9154 stmt_info, 0, vect_prologue);
9155 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9156 stmt_info, 0, vect_body);
9157 if (dump_enabled_p ())
9158 dump_printf_loc (MSG_NOTE, vect_location,
9159 "vectorizable_recurr: inside_cost = %d, "
9160 "prologue_cost = %d .\n", inside_cost,
9161 prologue_cost);
9163 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9164 return true;
9167 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9168 basic_block bb = gimple_bb (phi);
9169 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9170 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9172 gimple_seq stmts = NULL;
9173 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9174 gsi_insert_seq_on_edge_immediate (pe, stmts);
9176 tree vec_init = build_vector_from_val (vectype, preheader);
9177 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9179 /* Create the vectorized first-order PHI node. */
9180 tree vec_dest = vect_get_new_vect_var (vectype,
9181 vect_simple_var, "vec_recur_");
9182 gphi *new_phi = create_phi_node (vec_dest, bb);
9183 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9185 /* Insert the shuffles for the first-order recurrence autovectorization:
9186 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9187 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9189 /* Insert the required permute after the latch definition. The
9190 second and later operands are tentative and will be updated when we have
9191 vectorized the latch definition. */
9192 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9193 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9194 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9195 gsi_next (&gsi2);
9197 for (unsigned i = 0; i < ncopies; ++i)
9199 vec_dest = make_ssa_name (vectype);
9200 gassign *vperm
9201 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9202 i == 0 ? gimple_phi_result (new_phi) : NULL,
9203 NULL, perm);
9204 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9206 if (slp_node)
9207 slp_node->push_vec_def (vperm);
9208 else
9209 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9212 if (!slp_node)
9213 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9214 return true;
9217 /* Return true if VECTYPE represents a vector that requires lowering
9218 by the vector lowering pass. */
9220 bool
9221 vect_emulated_vector_p (tree vectype)
9223 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9224 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9225 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9228 /* Return true if we can emulate CODE on an integer mode representation
9229 of a vector. */
9231 bool
9232 vect_can_vectorize_without_simd_p (tree_code code)
9234 switch (code)
9236 case PLUS_EXPR:
9237 case MINUS_EXPR:
9238 case NEGATE_EXPR:
9239 case BIT_AND_EXPR:
9240 case BIT_IOR_EXPR:
9241 case BIT_XOR_EXPR:
9242 case BIT_NOT_EXPR:
9243 return true;
9245 default:
9246 return false;
9250 /* Likewise, but taking a code_helper. */
9252 bool
9253 vect_can_vectorize_without_simd_p (code_helper code)
9255 return (code.is_tree_code ()
9256 && vect_can_vectorize_without_simd_p (tree_code (code)));
9259 /* Create vector init for vectorized iv. */
9260 static tree
9261 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9262 tree step_expr, poly_uint64 nunits,
9263 tree vectype,
9264 enum vect_induction_op_type induction_type)
9266 unsigned HOST_WIDE_INT const_nunits;
9267 tree vec_shift, vec_init, new_name;
9268 unsigned i;
9269 tree itype = TREE_TYPE (vectype);
9271 /* iv_loop is the loop to be vectorized. Create the first VF values of the IV:
9272 e.g. [X, X>>S, X>>2*S, ...] for shr, [X, -X, X, -X, ...] for neg and [X, X*S, X*S^2, ...] for mul (S = step_expr, X = init_expr). */
9273 new_name = gimple_convert (stmts, itype, init_expr);
9274 switch (induction_type)
9276 case vect_step_op_shr:
9277 case vect_step_op_shl:
9278 /* Build the initial value: splat INIT and shift it by the series [0, S, 2*S, ...]. */
9279 vec_init = gimple_build_vector_from_val (stmts,
9280 vectype,
9281 new_name);
9282 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9283 build_zero_cst (itype), step_expr);
9284 vec_init = gimple_build (stmts,
9285 (induction_type == vect_step_op_shr
9286 ? RSHIFT_EXPR : LSHIFT_EXPR),
9287 vectype, vec_init, vec_shift);
9288 break;
9290 case vect_step_op_neg:
9292 vec_init = gimple_build_vector_from_val (stmts,
9293 vectype,
9294 new_name);
9295 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9296 vectype, vec_init);
9297 /* The encoding has 2 interleaved stepped patterns. */
9298 vec_perm_builder sel (nunits, 2, 3);
9299 sel.quick_grow (6);
9300 for (i = 0; i < 3; i++)
9302 sel[2 * i] = i;
9303 sel[2 * i + 1] = i + nunits;
9305 vec_perm_indices indices (sel, 2, nunits);
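/* For nunits = 4 this yields the indices { 0, 4, 1, 5 }, interleaving
vec_init and vec_neg to produce the initial vector [X, -X, X, -X]. */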
9306 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9307 fail when vec_init is a const vector. In that situation the vec_perm is
9308 not really needed. */
9309 tree perm_mask_even
9310 = vect_gen_perm_mask_any (vectype, indices);
9311 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9312 vectype,
9313 vec_init, vec_neg,
9314 perm_mask_even);
9316 break;
9318 case vect_step_op_mul:
9320 /* Use an unsigned multiplication to avoid undefined behavior on signed overflow. */
9321 gcc_assert (nunits.is_constant (&const_nunits));
9322 tree utype = unsigned_type_for (itype);
9323 tree uvectype = build_vector_type (utype,
9324 TYPE_VECTOR_SUBPARTS (vectype));
9325 new_name = gimple_convert (stmts, utype, new_name);
9326 vec_init = gimple_build_vector_from_val (stmts,
9327 uvectype,
9328 new_name);
9329 tree_vector_builder elts (uvectype, const_nunits, 1);
9330 tree elt_step = build_one_cst (utype);
9332 elts.quick_push (elt_step);
9333 for (i = 1; i < const_nunits; i++)
9335 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
9336 elt_step = gimple_build (stmts, MULT_EXPR,
9337 utype, elt_step, step_expr);
9338 elts.quick_push (elt_step);
9340 /* Create a vector from [1, step_expr, pow (step_expr, 2), ...,
9341 pow (step_expr, nunits-1)]. */
9342 tree vec_mul = gimple_build_vector (stmts, &elts);
9343 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9344 vec_init, vec_mul);
9345 vec_init = gimple_convert (stmts, vectype, vec_init);
9347 break;
9349 default:
9350 gcc_unreachable ();
9353 return vec_init;
9356 /* Peel init_expr by skip_niters iterations according to induction_type. */
9357 tree
9358 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9359 tree skip_niters, tree step_expr,
9360 enum vect_induction_op_type induction_type)
9362 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9363 tree type = TREE_TYPE (init_expr);
9364 unsigned prec = TYPE_PRECISION (type);
9365 switch (induction_type)
9367 case vect_step_op_neg:
9368 if (TREE_INT_CST_LOW (skip_niters) % 2)
9369 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9370 /* else no change. */
9371 break;
9373 case vect_step_op_shr:
9374 case vect_step_op_shl:
9375 skip_niters = gimple_convert (stmts, type, skip_niters);
9376 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9377 /* When the shift amount is >= the precision we need to avoid undefined behavior.
9378 In the original loop there is no UB, and according to the semantics
9379 init_expr becomes 0 for lshr and ashl, and is shifted right by (prec - 1) for ashr. */
9380 if (!tree_fits_uhwi_p (step_expr)
9381 || tree_to_uhwi (step_expr) >= prec)
9383 if (induction_type == vect_step_op_shl
9384 || TYPE_UNSIGNED (type))
9385 init_expr = build_zero_cst (type);
9386 else
9387 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9388 init_expr,
9389 wide_int_to_tree (type, prec - 1));
9391 else
9392 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9393 ? RSHIFT_EXPR : LSHIFT_EXPR),
9394 type, init_expr, step_expr);
9395 break;
9397 case vect_step_op_mul:
9399 tree utype = unsigned_type_for (type);
9400 init_expr = gimple_convert (stmts, utype, init_expr);
9401 wide_int skipn = wi::to_wide (skip_niters);
9402 wide_int begin = wi::to_wide (step_expr);
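/* Compute pow (step_expr, skip_niters) modulo 2^precision with mpz_powm;
peeling SKIP_NITERS iterations of a mul IV multiplies the initial value
by step^skip_niters, and the modulus keeps the result within the type. */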
9403 auto_mpz base, exp, mod, res;
9404 wi::to_mpz (begin, base, TYPE_SIGN (type));
9405 wi::to_mpz (skipn, exp, UNSIGNED);
9406 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9407 mpz_powm (res, base, exp, mod);
9408 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9409 tree mult_expr = wide_int_to_tree (utype, begin);
9410 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9411 init_expr, mult_expr);
9412 init_expr = gimple_convert (stmts, type, init_expr);
9414 break;
9416 default:
9417 gcc_unreachable ();
9420 return init_expr;
9423 /* Create vector step for vectorized iv. */
9424 static tree
9425 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9426 poly_uint64 vf,
9427 enum vect_induction_op_type induction_type)
9429 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9430 tree new_name = NULL;
9431 /* Step should be pow (step, vf) for mult induction. */
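/* E.g. with VF = 4 each vector iteration advances the IV by four scalar
iterations, so the vector step is step^4. */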
9432 if (induction_type == vect_step_op_mul)
9434 gcc_assert (vf.is_constant ());
9435 wide_int begin = wi::to_wide (step_expr);
9437 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9438 begin = wi::mul (begin, wi::to_wide (step_expr));
9440 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9442 else if (induction_type == vect_step_op_neg)
9443 /* Do nothing. */
9445 else
9446 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9447 expr, step_expr);
9448 return new_name;
9451 static tree
9452 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9453 stmt_vec_info stmt_info,
9454 tree new_name, tree vectype,
9455 enum vect_induction_op_type induction_type)
9457 /* No step is needed for neg induction. */
9458 if (induction_type == vect_step_op_neg)
9459 return NULL;
9461 tree t = unshare_expr (new_name);
9462 gcc_assert (CONSTANT_CLASS_P (new_name)
9463 || TREE_CODE (new_name) == SSA_NAME);
9464 tree new_vec = build_vector_from_val (vectype, t);
9465 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9466 new_vec, vectype, NULL);
9467 return vec_step;
9470 /* Update the vectorized IV INDUC_DEF by VEC_STEP according to INDUCTION_TYPE. */
9471 static tree
9472 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9473 tree induc_def, tree vec_step,
9474 enum vect_induction_op_type induction_type)
9476 tree vec_def = induc_def;
9477 switch (induction_type)
9479 case vect_step_op_mul:
9481 /* Use an unsigned multiplication to avoid undefined behavior on signed overflow. */
9482 tree uvectype
9483 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9484 TYPE_VECTOR_SUBPARTS (vectype));
9485 vec_def = gimple_convert (stmts, uvectype, vec_def);
9486 vec_step = gimple_convert (stmts, uvectype, vec_step);
9487 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9488 vec_def, vec_step);
9489 vec_def = gimple_convert (stmts, vectype, vec_def);
9491 break;
9493 case vect_step_op_shr:
9494 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9495 vec_def, vec_step);
9496 break;
9498 case vect_step_op_shl:
9499 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9500 vec_def, vec_step);
9501 break;
9502 case vect_step_op_neg:
9503 vec_def = induc_def;
9504 /* Do nothing. */
9505 break;
9506 default:
9507 gcc_unreachable ();
9510 return vec_def;
9514 /* Function vectorizable_nonlinear_induction
9516 Check if STMT_INFO performs a nonlinear induction computation that can be
9517 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9518 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9519 basic block.
9520 Return true if STMT_INFO is vectorizable in this way. */
9522 static bool
9523 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9524 stmt_vec_info stmt_info,
9525 gimple **vec_stmt, slp_tree slp_node,
9526 stmt_vector_for_cost *cost_vec)
9528 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9529 unsigned ncopies;
9530 bool nested_in_vect_loop = false;
9531 class loop *iv_loop;
9532 tree vec_def;
9533 edge pe = loop_preheader_edge (loop);
9534 basic_block new_bb;
9535 tree vec_init, vec_step;
9536 tree new_name;
9537 gimple *new_stmt;
9538 gphi *induction_phi;
9539 tree induc_def, vec_dest;
9540 tree init_expr, step_expr;
9541 tree niters_skip;
9542 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9543 unsigned i;
9544 gimple_stmt_iterator si;
9546 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9548 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9549 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9550 enum vect_induction_op_type induction_type
9551 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9553 gcc_assert (induction_type > vect_step_op_add);
9555 if (slp_node)
9556 ncopies = 1;
9557 else
9558 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9559 gcc_assert (ncopies >= 1);
9561 /* FORNOW. Only handle nonlinear induction in the same loop. */
9562 if (nested_in_vect_loop_p (loop, stmt_info))
9564 if (dump_enabled_p ())
9565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9566 "nonlinear induction in nested loop.\n");
9567 return false;
9570 iv_loop = loop;
9571 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9573 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9574 update for each iv and a permutation to generate wanted vector iv. */
9575 if (slp_node)
9577 if (dump_enabled_p ())
9578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9579 "SLP induction not supported for nonlinear"
9580 " induction.\n");
9581 return false;
9584 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9586 if (dump_enabled_p ())
9587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9588 "floating point nonlinear induction vectorization"
9589 " not supported.\n");
9590 return false;
9593 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9594 init_expr = vect_phi_initial_value (phi);
9595 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9596 && TREE_CODE (step_expr) == INTEGER_CST);
9597 /* step_expr should have the same type as init_expr,
9598 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9599 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9601 if (TREE_CODE (init_expr) == INTEGER_CST)
9602 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9603 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9605 /* INIT_EXPR could be a bit_field; bail out in that case. */
9606 if (dump_enabled_p ())
9607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9608 "nonlinear induction vectorization failed:"
9609 " component type of vectype is not a nop conversion"
9610 " from type of init_expr.\n");
9611 return false;
9614 switch (induction_type)
9616 case vect_step_op_neg:
9617 if (TREE_CODE (init_expr) != INTEGER_CST
9618 && TREE_CODE (init_expr) != REAL_CST)
9620 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9621 if (!directly_supported_p (NEGATE_EXPR, vectype))
9622 return false;
9624 /* The encoding has 2 interleaved stepped patterns. */
9625 vec_perm_builder sel (nunits, 2, 3);
9626 machine_mode mode = TYPE_MODE (vectype);
9627 sel.quick_grow (6);
9628 for (i = 0; i < 3; i++)
9630 sel[i * 2] = i;
9631 sel[i * 2 + 1] = i + nunits;
9633 vec_perm_indices indices (sel, 2, nunits);
9634 if (!can_vec_perm_const_p (mode, mode, indices))
9635 return false;
9637 break;
9639 case vect_step_op_mul:
9641 /* Check for backend support of MULT_EXPR. */
9642 if (!directly_supported_p (MULT_EXPR, vectype))
9643 return false;
9645 /* ??? It is unclear how to construct the vector step for variable-length
9646 vectors: [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9647 if (!vf.is_constant ())
9648 return false;
9650 break;
9652 case vect_step_op_shr:
9653 /* Check for backend support of RSHIFT_EXPR. */
9654 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9655 return false;
9657 /* Don't shift more than the type precision to avoid undefined behavior. */
9658 if (!tree_fits_uhwi_p (step_expr)
9659 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9660 TYPE_PRECISION (TREE_TYPE (init_expr))))
9661 return false;
9662 break;
9664 case vect_step_op_shl:
9665 /* Check for backend support of LSHIFT_EXPR. */
9666 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9667 return false;
9669 /* Don't shift more than the type precision to avoid undefined behavior. */
9670 if (!tree_fits_uhwi_p (step_expr)
9671 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9672 TYPE_PRECISION (TREE_TYPE (init_expr))))
9673 return false;
9675 break;
9677 default:
9678 gcc_unreachable ();
9681 if (!vec_stmt) /* transformation not required. */
9683 unsigned inside_cost = 0, prologue_cost = 0;
9684 /* loop cost for vec_loop. */
9686 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9687 stmt_info, 0, vect_body);
9689 /* Neg induction doesn't have any inside_cost. */
9691 if (induction_type == vect_step_op_neg)
9692 inside_cost = 0;
9694 /* prologue cost for vec_init and vec_step. */
9695 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9696 stmt_info, 0, vect_prologue);
9698 if (dump_enabled_p ())
9699 dump_printf_loc (MSG_NOTE, vect_location,
9700 "vect_model_induction_cost: inside_cost = %d, "
9701 "prologue_cost = %d. \n", inside_cost,
9702 prologue_cost);
9704 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9705 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9706 return true;
9709 /* Transform. */
9711 /* Compute a vector variable, initialized with the first VF values of
9712 the induction variable. E.g., for an iv with IV_PHI='X' and
9713 evolution S, for a vector of 4 units, we want to compute e.g. for a mul IV:
9714 [X, X*S, X*S^2, X*S^3] (shr, shl and neg IVs analogously). */
9716 if (dump_enabled_p ())
9717 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9719 pe = loop_preheader_edge (iv_loop);
9720 /* Find the first insertion point in the BB. */
9721 basic_block bb = gimple_bb (phi);
9722 si = gsi_after_labels (bb);
9724 gimple_seq stmts = NULL;
9726 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9727 /* If we are using the loop mask to "peel" for alignment then we need
9728 to adjust the start value here. */
9729 if (niters_skip != NULL_TREE)
9730 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9731 step_expr, induction_type);
9733 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9734 step_expr, nunits, vectype,
9735 induction_type);
9736 if (stmts)
9738 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9739 gcc_assert (!new_bb);
9742 stmts = NULL;
9743 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9744 vf, induction_type);
9745 if (stmts)
9747 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9748 gcc_assert (!new_bb);
9751 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9752 new_name, vectype,
9753 induction_type);
9754 /* Create the following def-use cycle:
9755 loop prolog:
9756 vec_init = ...
9757 vec_step = ...
9758 loop:
9759 vec_iv = PHI <vec_init, vec_loop>
9761 STMT
9763 vec_loop = vec_iv + vec_step; */
9765 /* Create the induction-phi that defines the induction-operand. */
9766 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9767 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9768 induc_def = PHI_RESULT (induction_phi);
9770 /* Create the iv update inside the loop. */
9771 stmts = NULL;
9772 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9773 induc_def, vec_step,
9774 induction_type);
9776 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9777 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9779 /* Set the arguments of the phi node: */
9780 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9781 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9782 UNKNOWN_LOCATION);
9784 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9785 *vec_stmt = induction_phi;
9787 /* In case the vectorization factor (VF) is bigger than the number
9788 of elements that we can fit in a vectype (nunits), we have to generate
9789 more than one vector stmt, i.e. we need to "unroll" the
9790 vector stmt by a factor of VF/nunits. For more details see the
9791 documentation in vectorizable_operation. */
9793 if (ncopies > 1)
9795 stmts = NULL;
9796 /* FORNOW. This restriction should be relaxed. */
9797 gcc_assert (!nested_in_vect_loop);
9799 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9800 nunits, induction_type);
9802 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9803 new_name, vectype,
9804 induction_type);
9805 vec_def = induc_def;
9806 for (i = 1; i < ncopies; i++)
9808 /* vec_i = vec_prev + vec_step. */
9809 stmts = NULL;
9810 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9811 vec_def, vec_step,
9812 induction_type);
9813 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9814 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9815 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9819 if (dump_enabled_p ())
9820 dump_printf_loc (MSG_NOTE, vect_location,
9821 "transform induction: created def-use cycle: %G%G",
9822 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9824 return true;
9827 /* Function vectorizable_induction
9829 Check if STMT_INFO performs an induction computation that can be vectorized.
9830 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9831 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9832 Return true if STMT_INFO is vectorizable in this way. */
9834 bool
9835 vectorizable_induction (loop_vec_info loop_vinfo,
9836 stmt_vec_info stmt_info,
9837 gimple **vec_stmt, slp_tree slp_node,
9838 stmt_vector_for_cost *cost_vec)
9840 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9841 unsigned ncopies;
9842 bool nested_in_vect_loop = false;
9843 class loop *iv_loop;
9844 tree vec_def;
9845 edge pe = loop_preheader_edge (loop);
9846 basic_block new_bb;
9847 tree new_vec, vec_init, vec_step, t;
9848 tree new_name;
9849 gimple *new_stmt;
9850 gphi *induction_phi;
9851 tree induc_def, vec_dest;
9852 tree init_expr, step_expr;
9853 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9854 unsigned i;
9855 tree expr;
9856 gimple_stmt_iterator si;
9857 enum vect_induction_op_type induction_type
9858 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9860 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9861 if (!phi)
9862 return false;
9864 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9865 return false;
9867 /* Make sure it was recognized as induction computation. */
9868 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9869 return false;
9871 /* Handle nonlinear induction in a separate place. */
9872 if (induction_type != vect_step_op_add)
9873 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9874 vec_stmt, slp_node, cost_vec);
9876 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9877 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9879 if (slp_node)
9880 ncopies = 1;
9881 else
9882 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9883 gcc_assert (ncopies >= 1);
9885 /* FORNOW. These restrictions should be relaxed. */
9886 if (nested_in_vect_loop_p (loop, stmt_info))
9888 imm_use_iterator imm_iter;
9889 use_operand_p use_p;
9890 gimple *exit_phi;
9891 edge latch_e;
9892 tree loop_arg;
9894 if (ncopies > 1)
9896 if (dump_enabled_p ())
9897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9898 "multiple types in nested loop.\n");
9899 return false;
9902 exit_phi = NULL;
9903 latch_e = loop_latch_edge (loop->inner);
9904 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9905 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9907 gimple *use_stmt = USE_STMT (use_p);
9908 if (is_gimple_debug (use_stmt))
9909 continue;
9911 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9913 exit_phi = use_stmt;
9914 break;
9917 if (exit_phi)
9919 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9920 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9921 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9923 if (dump_enabled_p ())
9924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9925 "inner-loop induction only used outside "
9926 "of the outer vectorized loop.\n");
9927 return false;
9931 nested_in_vect_loop = true;
9932 iv_loop = loop->inner;
9934 else
9935 iv_loop = loop;
9936 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9938 if (slp_node && !nunits.is_constant ())
9940 /* The current SLP code creates the step value element-by-element. */
9941 if (dump_enabled_p ())
9942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9943 "SLP induction not supported for variable-length"
9944 " vectors.\n");
9945 return false;
9948 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9950 if (dump_enabled_p ())
9951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9952 "floating point induction vectorization disabled\n");
9953 return false;
9956 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9957 gcc_assert (step_expr != NULL_TREE);
9958 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9960 /* Check for backend support of PLUS/MINUS_EXPR. */
9961 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9962 || !directly_supported_p (MINUS_EXPR, step_vectype))
9963 return false;
9965 if (!vec_stmt) /* transformation not required. */
9967 unsigned inside_cost = 0, prologue_cost = 0;
9968 if (slp_node)
9970 /* We eventually need to set a vector type on invariant
9971 arguments. */
9972 unsigned j;
9973 slp_tree child;
9974 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9975 if (!vect_maybe_update_slp_op_vectype
9976 (child, SLP_TREE_VECTYPE (slp_node)))
9978 if (dump_enabled_p ())
9979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9980 "incompatible vector types for "
9981 "invariants\n");
9982 return false;
9984 /* loop cost for vec_loop. */
9985 inside_cost
9986 = record_stmt_cost (cost_vec,
9987 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9988 vector_stmt, stmt_info, 0, vect_body);
9989 /* prologue cost for vec_init (if not nested) and step. */
9990 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9991 scalar_to_vec,
9992 stmt_info, 0, vect_prologue);
9994 else /* if (!slp_node) */
9996 /* loop cost for vec_loop. */
9997 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9998 stmt_info, 0, vect_body);
9999 /* prologue cost for vec_init and vec_step. */
10000 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10001 stmt_info, 0, vect_prologue);
10003 if (dump_enabled_p ())
10004 dump_printf_loc (MSG_NOTE, vect_location,
10005 "vect_model_induction_cost: inside_cost = %d, "
10006 "prologue_cost = %d .\n", inside_cost,
10007 prologue_cost);
10009 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10010 DUMP_VECT_SCOPE ("vectorizable_induction");
10011 return true;
10014 /* Transform. */
10016 /* Compute a vector variable, initialized with the first VF values of
10017 the induction variable. E.g., for an iv with IV_PHI='X' and
10018 evolution S, for a vector of 4 units, we want to compute:
10019 [X, X + S, X + 2*S, X + 3*S]. */
10021 if (dump_enabled_p ())
10022 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10024 pe = loop_preheader_edge (iv_loop);
10025 /* Find the first insertion point in the BB. */
10026 basic_block bb = gimple_bb (phi);
10027 si = gsi_after_labels (bb);
10029 /* For SLP induction we have to generate several IVs as for example
10030 with group size 3 we need
10031 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10032 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10033 if (slp_node)
10035 /* Enforced above. */
10036 unsigned int const_nunits = nunits.to_constant ();
10038 /* The initial values are vectorized, but any lanes > group_size
10039 need adjustment. */
10040 slp_tree init_node
10041 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10043 /* Gather steps. Since we do not vectorize inductions as
10044 cycles we have to reconstruct the step from SCEV data. */
10045 unsigned group_size = SLP_TREE_LANES (slp_node);
10046 tree *steps = XALLOCAVEC (tree, group_size);
10047 tree *inits = XALLOCAVEC (tree, group_size);
10048 stmt_vec_info phi_info;
10049 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10051 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10052 if (!init_node)
10053 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10054 pe->dest_idx);
10057 /* Now generate the IVs. */
10058 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10059 gcc_assert ((const_nunits * nvects) % group_size == 0);
10060 unsigned nivs;
10061 if (nested_in_vect_loop)
10062 nivs = nvects;
10063 else
10065 /* Compute the number of distinct IVs we need. First reduce
10066 group_size if it is a multiple of const_nunits so we get
10067 one IV for a group_size of 4 but const_nunits 2. */
10068 unsigned group_sizep = group_size;
10069 if (group_sizep % const_nunits == 0)
10070 group_sizep = group_sizep / const_nunits;
10071 nivs = least_common_multiple (group_sizep,
10072 const_nunits) / const_nunits;
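/* Illustrative example (numbers are hypothetical, not from the original
   code): for group_size == 6 and const_nunits == 4 the group size is not
   a multiple of the vector size, so group_sizep stays 6 and
   nivs = lcm (6, 4) / 4 = 3 distinct IVs are needed; for the
   group_size == 4, const_nunits == 2 case mentioned above we get
   group_sizep = 2 and nivs = lcm (2, 2) / 2 = 1.  */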
10074 tree stept = TREE_TYPE (step_vectype);
10075 tree lupdate_mul = NULL_TREE;
10076 if (!nested_in_vect_loop)
10078 /* The number of iterations covered in one vector iteration. */
10079 unsigned lup_mul = (nvects * const_nunits) / group_size;
10080 lupdate_mul
10081 = build_vector_from_val (step_vectype,
10082 SCALAR_FLOAT_TYPE_P (stept)
10083 ? build_real_from_wide (stept, lup_mul,
10084 UNSIGNED)
10085 : build_int_cstu (stept, lup_mul));
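/* Worked example (hypothetical values): with the group size 3,
   const_nunits 4 layout shown at the top of this block, nvects == 3 and
   lup_mul = (3 * 4) / 3 = 4, i.e. one vector iteration consumes four
   group iterations, so every IV lane advances by four scalar steps per
   latch execution.  */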
10087 tree peel_mul = NULL_TREE;
10088 gimple_seq init_stmts = NULL;
10089 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10091 if (SCALAR_FLOAT_TYPE_P (stept))
10092 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10093 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10094 else
10095 peel_mul = gimple_convert (&init_stmts, stept,
10096 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10097 peel_mul = gimple_build_vector_from_val (&init_stmts,
10098 step_vectype, peel_mul);
10100 unsigned ivn;
10101 auto_vec<tree> vec_steps;
10102 for (ivn = 0; ivn < nivs; ++ivn)
10104 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10105 tree_vector_builder init_elts (vectype, const_nunits, 1);
10106 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10107 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10109 /* The scalar steps of the IVs. */
10110 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10111 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10112 step_elts.quick_push (elt);
10113 if (!init_node)
10115 /* The scalar inits of the IVs if not vectorized. */
10116 elt = inits[(ivn*const_nunits + eltn) % group_size];
10117 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10118 TREE_TYPE (elt)))
10119 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10120 TREE_TYPE (vectype), elt);
10121 init_elts.quick_push (elt);
10123 /* The number of steps to add to the initial values. */
10124 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10125 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10126 ? build_real_from_wide (stept,
10127 mul_elt, UNSIGNED)
10128 : build_int_cstu (stept, mul_elt));
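/* Illustration (hypothetical values, matching the group size 3,
   const_nunits 4 example above): for ivn == 0 this builds
   step_elts = { S0, S1, S2, S0 }, init_elts = { i0, i1, i2, i0 } and
   mul_elts = { 0, 0, 0, 1 }, so the adjusted initial vector computed
   below becomes { i0, i1, i2, i0 + S0 }, the first vector of that
   example.  */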
10130 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10131 vec_steps.safe_push (vec_step);
10132 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10133 if (peel_mul)
10134 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10135 step_mul, peel_mul);
10136 if (!init_node)
10137 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10139 /* Create the induction-phi that defines the induction-operand. */
10140 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10141 "vec_iv_");
10142 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10143 induc_def = PHI_RESULT (induction_phi);
10145 /* Create the iv update inside the loop */
10146 tree up = vec_step;
10147 if (lupdate_mul)
10148 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10149 vec_step, lupdate_mul);
10150 gimple_seq stmts = NULL;
10151 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10152 vec_def = gimple_build (&stmts,
10153 PLUS_EXPR, step_vectype, vec_def, up);
10154 vec_def = gimple_convert (&stmts, vectype, vec_def);
10155 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10156 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10157 UNKNOWN_LOCATION);
10159 if (init_node)
10160 vec_init = vect_get_slp_vect_def (init_node, ivn);
10161 if (!nested_in_vect_loop
10162 && !integer_zerop (step_mul))
10164 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10165 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10166 vec_step, step_mul);
10167 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10168 vec_def, up);
10169 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10172 /* Set the arguments of the phi node: */
10173 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10175 slp_node->push_vec_def (induction_phi);
10177 if (!nested_in_vect_loop)
10179 /* Fill up to the number of vectors we need for the whole group. */
10180 nivs = least_common_multiple (group_size,
10181 const_nunits) / const_nunits;
10182 vec_steps.reserve (nivs-ivn);
10183 for (; ivn < nivs; ++ivn)
10185 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10186 vec_steps.quick_push (vec_steps[0]);
10190 /* Re-use IVs when we can. We are generating further vector
10191 stmts by adding VF' * stride to the IVs generated above. */
10192 if (ivn < nvects)
10194 unsigned vfp
10195 = least_common_multiple (group_size, const_nunits) / group_size;
10196 tree lupdate_mul
10197 = build_vector_from_val (step_vectype,
10198 SCALAR_FLOAT_TYPE_P (stept)
10199 ? build_real_from_wide (stept,
10200 vfp, UNSIGNED)
10201 : build_int_cstu (stept, vfp));
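/* Illustration (hypothetical values): with group_size == 2 and
   const_nunits == 4 only nivs == 1 IV was generated above and
   vfp = lcm (2, 4) / 2 = 2, so each further vector below is the
   previous vector plus twice the per-lane step, covering the next
   two group iterations.  */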
10202 for (; ivn < nvects; ++ivn)
10204 gimple *iv
10205 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10206 tree def = gimple_get_lhs (iv);
10207 if (ivn < 2*nivs)
10208 vec_steps[ivn - nivs]
10209 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10210 vec_steps[ivn - nivs], lupdate_mul);
10211 gimple_seq stmts = NULL;
10212 def = gimple_convert (&stmts, step_vectype, def);
10213 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10214 def, vec_steps[ivn % nivs]);
10215 def = gimple_convert (&stmts, vectype, def);
10216 if (gimple_code (iv) == GIMPLE_PHI)
10217 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10218 else
10220 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10221 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10223 slp_node->push_vec_def (def);
10227 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10228 gcc_assert (!new_bb);
10230 return true;
10233 init_expr = vect_phi_initial_value (phi);
10235 gimple_seq stmts = NULL;
10236 if (!nested_in_vect_loop)
10238 /* Convert the initial value to the IV update type. */
10239 tree new_type = TREE_TYPE (step_expr);
10240 init_expr = gimple_convert (&stmts, new_type, init_expr);
10242 /* If we are using the loop mask to "peel" for alignment then we need
10243 to adjust the start value here. */
10244 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10245 if (skip_niters != NULL_TREE)
10247 if (FLOAT_TYPE_P (vectype))
10248 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10249 skip_niters);
10250 else
10251 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10252 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10253 skip_niters, step_expr);
10254 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10255 init_expr, skip_step);
10259 if (stmts)
10261 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10262 gcc_assert (!new_bb);
10265 /* Create the vector that holds the initial_value of the induction. */
10266 if (nested_in_vect_loop)
10268 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10269 been created during vectorization of previous stmts. We obtain it
10270 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10271 auto_vec<tree> vec_inits;
10272 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10273 init_expr, &vec_inits);
10274 vec_init = vec_inits[0];
10275 /* If the initial value is not of proper type, convert it. */
10276 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10278 new_stmt
10279 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10280 vect_simple_var,
10281 "vec_iv_"),
10282 VIEW_CONVERT_EXPR,
10283 build1 (VIEW_CONVERT_EXPR, vectype,
10284 vec_init));
10285 vec_init = gimple_assign_lhs (new_stmt);
10286 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10287 new_stmt);
10288 gcc_assert (!new_bb);
10291 else
10293 /* iv_loop is the loop to be vectorized. Create:
10294 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10295 stmts = NULL;
10296 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10298 unsigned HOST_WIDE_INT const_nunits;
10299 if (nunits.is_constant (&const_nunits))
10301 tree_vector_builder elts (step_vectype, const_nunits, 1);
10302 elts.quick_push (new_name);
10303 for (i = 1; i < const_nunits; i++)
10305 /* Create: new_name_i = new_name + step_expr */
10306 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10307 new_name, step_expr);
10308 elts.quick_push (new_name);
10310 /* Create a vector from [new_name_0, new_name_1, ...,
10311 new_name_nunits-1] */
10312 vec_init = gimple_build_vector (&stmts, &elts);
10314 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10315 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10316 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10317 new_name, step_expr);
10318 else
10320 /* Build:
10321 [base, base, base, ...]
10322 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10323 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10324 gcc_assert (flag_associative_math);
10325 tree index = build_index_vector (step_vectype, 0, 1);
10326 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10327 new_name);
10328 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10329 step_expr);
10330 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10331 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10332 vec_init, step_vec);
10333 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10334 vec_init, base_vec);
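/* Worked example (hypothetical values): for X == 1.0, S == 0.5 and four
   lanes this computes { 0, 1, 2, 3 } * { 0.5, 0.5, 0.5, 0.5 }
   + { 1.0, 1.0, 1.0, 1.0 } = { 1.0, 1.5, 2.0, 2.5 }.  The reassociation
   implied here is why flag_associative_math is asserted above.  */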
10336 vec_init = gimple_convert (&stmts, vectype, vec_init);
10338 if (stmts)
10340 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10341 gcc_assert (!new_bb);
10346 /* Create the vector that holds the step of the induction. */
10347 gimple_stmt_iterator *step_iv_si = NULL;
10348 if (nested_in_vect_loop)
10349 /* iv_loop is nested in the loop to be vectorized. Generate:
10350 vec_step = [S, S, S, S] */
10351 new_name = step_expr;
10352 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10354 /* When we're using the loop_len produced by SELECT_VL, the non-final
10355 iterations are not always processing VF elements. So instead of
10356 vectorizing the induction-variable update as
10358 _21 = vect_vec_iv_.6_22 + { VF, ... };
10360 We should generate:
10362 _35 = .SELECT_VL (ivtmp_33, VF);
10363 vect_cst__22 = [vec_duplicate_expr] _35;
10364 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10365 gcc_assert (!slp_node);
10366 gimple_seq seq = NULL;
10367 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10368 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10369 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10370 unshare_expr (len)),
10371 &seq, true, NULL_TREE);
10372 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10373 step_expr);
10374 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10375 step_iv_si = &si;
10377 else
10379 /* iv_loop is the loop to be vectorized. Generate:
10380 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10381 gimple_seq seq = NULL;
10382 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10384 expr = build_int_cst (integer_type_node, vf);
10385 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10387 else
10388 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10389 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10390 expr, step_expr);
10391 if (seq)
10393 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10394 gcc_assert (!new_bb);
10398 t = unshare_expr (new_name);
10399 gcc_assert (CONSTANT_CLASS_P (new_name)
10400 || TREE_CODE (new_name) == SSA_NAME);
10401 new_vec = build_vector_from_val (step_vectype, t);
10402 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10403 new_vec, step_vectype, step_iv_si);
10406 /* Create the following def-use cycle:
10407 loop prolog:
10408 vec_init = ...
10409 vec_step = ...
10410 loop:
10411 vec_iv = PHI <vec_init, vec_loop>
10413 STMT
10415 vec_loop = vec_iv + vec_step; */
10417 /* Create the induction-phi that defines the induction-operand. */
10418 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10419 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10420 induc_def = PHI_RESULT (induction_phi);
10422 /* Create the iv update inside the loop */
10423 stmts = NULL;
10424 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10425 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10426 vec_def = gimple_convert (&stmts, vectype, vec_def);
10427 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10428 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10430 /* Set the arguments of the phi node: */
10431 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10432 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10433 UNKNOWN_LOCATION);
10435 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10436 *vec_stmt = induction_phi;
10438 /* In case that vectorization factor (VF) is bigger than the number
10439 of elements that we can fit in a vectype (nunits), we have to generate
10440 more than one vector stmt - i.e - we need to "unroll" the
10441 vector stmt by a factor VF/nunits. For more details see documentation
10442 in vectorizable_operation. */
10444 if (ncopies > 1)
10446 gimple_seq seq = NULL;
10447 /* FORNOW. This restriction should be relaxed. */
10448 gcc_assert (!nested_in_vect_loop);
10449 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10450 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10452 /* Create the vector that holds the step of the induction. */
10453 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10455 expr = build_int_cst (integer_type_node, nunits);
10456 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10458 else
10459 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10460 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10461 expr, step_expr);
10462 if (seq)
10464 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10465 gcc_assert (!new_bb);
10468 t = unshare_expr (new_name);
10469 gcc_assert (CONSTANT_CLASS_P (new_name)
10470 || TREE_CODE (new_name) == SSA_NAME);
10471 new_vec = build_vector_from_val (step_vectype, t);
10472 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10473 new_vec, step_vectype, NULL);
10475 vec_def = induc_def;
10476 for (i = 1; i < ncopies + 1; i++)
10478 /* vec_i = vec_prev + vec_step */
10479 gimple_seq stmts = NULL;
10480 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10481 vec_def = gimple_build (&stmts,
10482 PLUS_EXPR, step_vectype, vec_def, vec_step);
10483 vec_def = gimple_convert (&stmts, vectype, vec_def);
10485 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10486 if (i < ncopies)
10488 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10489 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10491 else
10493 /* vec_1 = vec_iv + (VF/n * S)
10494 vec_2 = vec_1 + (VF/n * S)
10496 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10498 vec_n is used as vec_loop to save the large step register and
10499 related operations. */
10500 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10501 UNKNOWN_LOCATION);
10506 if (dump_enabled_p ())
10507 dump_printf_loc (MSG_NOTE, vect_location,
10508 "transform induction: created def-use cycle: %G%G",
10509 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10511 return true;
10514 /* Function vectorizable_live_operation.
10516 STMT_INFO computes a value that is used outside the loop. Check if
10517 it can be supported. */
10519 bool
10520 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10521 slp_tree slp_node, slp_instance slp_node_instance,
10522 int slp_index, bool vec_stmt_p,
10523 stmt_vector_for_cost *cost_vec)
10525 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10526 imm_use_iterator imm_iter;
10527 tree lhs, lhs_type, bitsize;
10528 tree vectype = (slp_node
10529 ? SLP_TREE_VECTYPE (slp_node)
10530 : STMT_VINFO_VECTYPE (stmt_info));
10531 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10532 int ncopies;
10533 gimple *use_stmt;
10534 auto_vec<tree> vec_oprnds;
10535 int vec_entry = 0;
10536 poly_uint64 vec_index = 0;
10538 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10540 /* If a stmt of a reduction is live, vectorize it via
10541 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10542 validity so just trigger the transform here. */
10543 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10545 if (!vec_stmt_p)
10546 return true;
10547 if (slp_node)
10549 /* For reduction chains the meta-info is attached to
10550 the group leader. */
10551 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10552 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10553 /* For SLP reductions we vectorize the epilogue for
10554 all involved stmts together. */
10555 else if (slp_index != 0)
10556 return true;
10558 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10559 gcc_assert (reduc_info->is_reduc_info);
10560 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10561 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10562 return true;
10563 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10564 slp_node_instance);
10565 return true;
10568 /* If STMT is not relevant and it is a simple assignment and its inputs are
10569 invariant then it can remain in place, unvectorized. The original last
10570 scalar value that it computes will be used. */
10571 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10573 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10574 if (dump_enabled_p ())
10575 dump_printf_loc (MSG_NOTE, vect_location,
10576 "statement is simple and uses invariant. Leaving in "
10577 "place.\n");
10578 return true;
10581 if (slp_node)
10582 ncopies = 1;
10583 else
10584 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10586 if (slp_node)
10588 gcc_assert (slp_index >= 0);
10590 /* Get the last occurrence of the scalar index from the concatenation of
10591 all the slp vectors. Calculate which slp vector it is and the index
10592 within. */
10593 int num_scalar = SLP_TREE_LANES (slp_node);
10594 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10595 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
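/* Illustration (hypothetical values): with 3 lanes, nunits == 4 and
   3 vector stmts the concatenation holds 12 scalar results; the last
   occurrence of lane 1 sits at position 3 * 4 - 3 + 1 = 10, i.e.
   lane 2 of vector 2 as computed by the division below.  */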
10597 /* Calculate which vector contains the result, and which lane of
10598 that vector we need. */
10599 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10601 if (dump_enabled_p ())
10602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10603 "Cannot determine which vector holds the"
10604 " final result.\n");
10605 return false;
10609 if (!vec_stmt_p)
10611 /* No transformation required. */
10612 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10614 if (slp_node)
10616 if (dump_enabled_p ())
10617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10618 "can't operate on partial vectors "
10619 "because an SLP statement is live after "
10620 "the loop.\n");
10621 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10623 else if (ncopies > 1)
10625 if (dump_enabled_p ())
10626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10627 "can't operate on partial vectors "
10628 "because ncopies is greater than 1.\n");
10629 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10631 else
10633 gcc_assert (ncopies == 1 && !slp_node);
10634 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10635 OPTIMIZE_FOR_SPEED))
10636 vect_record_loop_mask (loop_vinfo,
10637 &LOOP_VINFO_MASKS (loop_vinfo),
10638 1, vectype, NULL);
10639 else if (can_vec_extract_var_idx_p (
10640 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10641 vect_record_loop_len (loop_vinfo,
10642 &LOOP_VINFO_LENS (loop_vinfo),
10643 1, vectype, 1);
10644 else
10646 if (dump_enabled_p ())
10647 dump_printf_loc (
10648 MSG_MISSED_OPTIMIZATION, vect_location,
10649 "can't operate on partial vectors "
10650 "because the target doesn't support extract "
10651 "last reduction.\n");
10652 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10656 /* ??? Enable for loop costing as well. */
10657 if (!loop_vinfo)
10658 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10659 0, vect_epilogue);
10660 return true;
10663 /* Use the lhs of the original scalar statement. */
10664 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10665 if (dump_enabled_p ())
10666 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10667 "stmt %G", stmt);
10669 lhs = gimple_get_lhs (stmt);
10670 lhs_type = TREE_TYPE (lhs);
10672 bitsize = vector_element_bits_tree (vectype);
10674 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10675 tree vec_lhs, bitstart;
10676 gimple *vec_stmt;
10677 if (slp_node)
10679 gcc_assert (!loop_vinfo
10680 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10681 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10683 /* Get the correct slp vectorized stmt. */
10684 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10685 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10687 /* Get entry to use. */
10688 bitstart = bitsize_int (vec_index);
10689 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10691 else
10693 /* For multiple copies, get the last copy. */
10694 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10695 vec_lhs = gimple_get_lhs (vec_stmt);
10697 /* Get the last lane in the vector. */
10698 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
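/* E.g. (illustrative) for a vector of four 32-bit elements the last
   lane starts at bit 3 * 32 = 96.  */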
10701 if (loop_vinfo)
10703 /* To ensure that the VEC_LHS for lane-extraction stmts satisfies the
10704 loop-closed PHI requirement, insert one PHI node for it. It looks like:
10705 loop;
10707 # lhs' = PHI <lhs>
10709 loop;
10711 # vec_lhs' = PHI <vec_lhs>
10712 new_tree = lane_extract <vec_lhs', ...>;
10713 lhs' = new_tree; */
10715 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10716 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10717 gcc_assert (single_pred_p (exit_bb));
10719 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10720 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10721 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10723 gimple_seq stmts = NULL;
10724 tree new_tree;
10725 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10727 /* Emit:
10729 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10731 where VEC_LHS is the vectorized live-out result and LEN is
10732 the loop length of the final iteration. */
10733 gcc_assert (ncopies == 1 && !slp_node);
10734 gimple_seq tem = NULL;
10735 gimple_stmt_iterator gsi = gsi_last (tem);
10736 tree len
10737 = vect_get_loop_len (loop_vinfo, &gsi,
10738 &LOOP_VINFO_LENS (loop_vinfo),
10739 1, vectype, 0, 0);
10741 /* BIAS - 1. */
10742 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10743 tree bias_minus_one
10744 = int_const_binop (MINUS_EXPR,
10745 build_int_cst (TREE_TYPE (len), biasval),
10746 build_one_cst (TREE_TYPE (len)));
10748 /* LAST_INDEX = LEN + (BIAS - 1). */
10749 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10750 len, bias_minus_one);
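/* Illustration (hypothetical values): with a zero bias and LEN == 5
   active elements in the final iteration, LAST_INDEX = 5 + (0 - 1) = 4,
   the last active lane.  */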
10752 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10753 tree scalar_res
10754 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10755 vec_lhs_phi, last_index);
10757 /* Convert the extracted vector element to the scalar type. */
10758 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10760 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10762 /* Emit:
10764 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10766 where VEC_LHS is the vectorized live-out result and MASK is
10767 the loop mask for the final iteration. */
10768 gcc_assert (ncopies == 1 && !slp_node);
10769 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10770 gimple_seq tem = NULL;
10771 gimple_stmt_iterator gsi = gsi_last (tem);
10772 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10773 &LOOP_VINFO_MASKS (loop_vinfo),
10774 1, vectype, 0);
10775 gimple_seq_add_seq (&stmts, tem);
10776 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10777 mask, vec_lhs_phi);
10779 /* Convert the extracted vector element to the scalar type. */
10780 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10782 else
10784 tree bftype = TREE_TYPE (vectype);
10785 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10786 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10787 new_tree = build3 (BIT_FIELD_REF, bftype,
10788 vec_lhs_phi, bitsize, bitstart);
10789 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10790 &stmts, true, NULL_TREE);
10793 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10794 if (stmts)
10795 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10797 /* Remove existing phis that copy from lhs and create copies
10798 from new_tree. */
10799 gimple_stmt_iterator gsi;
10800 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10802 gimple *phi = gsi_stmt (gsi);
10803 if (gimple_phi_arg_def (phi, 0) == lhs)
10805 remove_phi_node (&gsi, false);
10806 tree lhs_phi = gimple_phi_result (phi);
10807 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10808 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10810 else
10811 gsi_next (&gsi);
10814 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10815 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10816 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10818 else
10820 /* For basic-block vectorization simply insert the lane-extraction. */
10821 tree bftype = TREE_TYPE (vectype);
10822 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10823 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10824 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10825 vec_lhs, bitsize, bitstart);
10826 gimple_seq stmts = NULL;
10827 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10828 &stmts, true, NULL_TREE);
10829 if (TREE_CODE (new_tree) == SSA_NAME
10830 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10831 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10832 if (is_a <gphi *> (vec_stmt))
10834 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10835 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10837 else
10839 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10840 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10843 /* Replace uses of lhs with the newly computed result. If the use stmt is a
10844 single-argument PHI, just replace all uses of the PHI result. This is
10845 necessary because the LC-SSA PHI defining lhs may be before the newly inserted stmt. */
10846 use_operand_p use_p;
10847 stmt_vec_info use_stmt_info;
10848 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10849 if (!is_gimple_debug (use_stmt)
10850 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10851 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10853 /* ??? This can happen when the live lane ends up being
10854 rooted in a vector construction code-generated by an
10855 external SLP node (and code-generation for that already
10856 happened). See gcc.dg/vect/bb-slp-47.c.
10857 Doing this is what would happen if that vector CTOR
10858 were not code-generated yet so it is not too bad.
10859 ??? In fact we'd likely want to avoid this situation
10860 in the first place. */
10861 if (TREE_CODE (new_tree) == SSA_NAME
10862 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10863 && gimple_code (use_stmt) != GIMPLE_PHI
10864 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10865 use_stmt))
10867 if (dump_enabled_p ())
10868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10869 "Using original scalar computation for "
10870 "live lane because use preceeds vector "
10871 "def\n");
10872 continue;
10874 /* ??? It can also happen that we end up pulling a def into
10875 a loop where replacing out-of-loop uses would require
10876 a new LC SSA PHI node. Retain the original scalar in
10877 those cases as well. PR98064. */
10878 if (TREE_CODE (new_tree) == SSA_NAME
10879 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10880 && (gimple_bb (use_stmt)->loop_father
10881 != gimple_bb (vec_stmt)->loop_father)
10882 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10883 gimple_bb (use_stmt)->loop_father))
10885 if (dump_enabled_p ())
10886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10887 "Using original scalar computation for "
10888 "live lane because there is an out-of-loop "
10889 "definition for it\n");
10890 continue;
10892 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10893 SET_USE (use_p, new_tree);
10894 update_stmt (use_stmt);
10898 return true;
10901 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10903 static void
10904 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10906 ssa_op_iter op_iter;
10907 imm_use_iterator imm_iter;
10908 def_operand_p def_p;
10909 gimple *ustmt;
10911 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10913 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10915 basic_block bb;
10917 if (!is_gimple_debug (ustmt))
10918 continue;
10920 bb = gimple_bb (ustmt);
10922 if (!flow_bb_inside_loop_p (loop, bb))
10924 if (gimple_debug_bind_p (ustmt))
10926 if (dump_enabled_p ())
10927 dump_printf_loc (MSG_NOTE, vect_location,
10928 "killing debug use\n");
10930 gimple_debug_bind_reset_value (ustmt);
10931 update_stmt (ustmt);
10933 else
10934 gcc_unreachable ();
10940 /* Given loop represented by LOOP_VINFO, return true if computation of
10941 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10942 otherwise. */
10944 static bool
10945 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10947 /* Constant case. */
10948 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10950 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10951 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10953 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10954 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10955 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10956 return true;
10959 widest_int max;
10960 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10961 /* Check the upper bound of loop niters. */
10962 if (get_max_loop_iterations (loop, &max))
10964 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10965 signop sgn = TYPE_SIGN (type);
10966 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10967 if (max < type_max)
10968 return true;
10970 return false;
10973 /* Return a mask type with half the number of elements as OLD_TYPE,
10974 given that it should have mode NEW_MODE. */
10976 tree
10977 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10979 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10980 return build_truth_vector_type_for_mode (nunits, new_mode);
10983 /* Return a mask type with twice as many elements as OLD_TYPE,
10984 given that it should have mode NEW_MODE. */
10986 tree
10987 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10989 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10990 return build_truth_vector_type_for_mode (nunits, new_mode);
10993 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10994 contain a sequence of NVECTORS masks that each control a vector of type
10995 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10996 these vector masks with the vector version of SCALAR_MASK. */
10998 void
10999 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11000 unsigned int nvectors, tree vectype, tree scalar_mask)
11002 gcc_assert (nvectors != 0);
11004 if (scalar_mask)
11006 scalar_cond_masked_key cond (scalar_mask, nvectors);
11007 loop_vinfo->scalar_cond_masked_set.add (cond);
11010 masks->mask_set.add (std::make_pair (vectype, nvectors));
11013 /* Given a complete set of masks MASKS, extract mask number INDEX
11014 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11015 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11017 See the comment above vec_loop_masks for more details about the mask
11018 arrangement. */
11020 tree
11021 vect_get_loop_mask (loop_vec_info loop_vinfo,
11022 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11023 unsigned int nvectors, tree vectype, unsigned int index)
11025 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11026 == vect_partial_vectors_while_ult)
11028 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11029 tree mask_type = rgm->type;
11031 /* Populate the rgroup's mask array, if this is the first time we've
11032 used it. */
11033 if (rgm->controls.is_empty ())
11035 rgm->controls.safe_grow_cleared (nvectors, true);
11036 for (unsigned int i = 0; i < nvectors; ++i)
11038 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11039 /* Provide a dummy definition until the real one is available. */
11040 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11041 rgm->controls[i] = mask;
11045 tree mask = rgm->controls[index];
11046 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11047 TYPE_VECTOR_SUBPARTS (vectype)))
11049 /* A loop mask for data type X can be reused for data type Y
11050 if X has N times more elements than Y and if Y's elements
11051 are N times bigger than X's. In this case each sequence
11052 of N elements in the loop mask will be all-zero or all-one.
11053 We can then view-convert the mask so that each sequence of
11054 N elements is replaced by a single element. */
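/* Illustration (hypothetical types): a mask created for a V8HI rgroup
   can be reused for V4SI; each pair of its elements is known to be
   all-zero or all-one, so view-converting it to a 4-element mask
   collapses every pair into a single lane.  */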
11055 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11056 TYPE_VECTOR_SUBPARTS (vectype)));
11057 gimple_seq seq = NULL;
11058 mask_type = truth_type_for (vectype);
11059 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11060 if (seq)
11061 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11063 return mask;
11065 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11066 == vect_partial_vectors_avx512)
11068 /* The number of scalars per iteration and the number of vectors are
11069 both compile-time constants. */
11070 unsigned int nscalars_per_iter
11071 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11072 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11074 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11076 /* The stored nV is dependent on the mask type produced. */
11077 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11078 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11079 == rgm->factor);
11080 nvectors = rgm->factor;
11082 /* Populate the rgroup's mask array, if this is the first time we've
11083 used it. */
11084 if (rgm->controls.is_empty ())
11086 rgm->controls.safe_grow_cleared (nvectors, true);
11087 for (unsigned int i = 0; i < nvectors; ++i)
11089 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11090 /* Provide a dummy definition until the real one is available. */
11091 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11092 rgm->controls[i] = mask;
11095 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11096 TYPE_VECTOR_SUBPARTS (vectype)))
11097 return rgm->controls[index];
11099 /* Split the vector if needed. Since we are dealing with integer-mode
11100 masks with AVX512 we can operate on the integer representation,
11101 shifting the whole vector at once. */
11102 unsigned HOST_WIDE_INT factor;
11103 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11104 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11105 gcc_assert (ok);
11106 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11107 tree mask_type = truth_type_for (vectype);
11108 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11109 unsigned vi = index / factor;
11110 unsigned vpart = index % factor;
11111 tree vec = rgm->controls[vi];
11112 gimple_seq seq = NULL;
11113 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11114 lang_hooks.types.type_for_mode
11115 (TYPE_MODE (rgm->type), 1), vec);
11116 /* For integer mode masks simply shift the right bits into position. */
11117 if (vpart != 0)
11118 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11119 build_int_cst (integer_type_node,
11120 (TYPE_VECTOR_SUBPARTS (vectype)
11121 * vpart)));
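/* Illustration (hypothetical values): if rgm->type has 16 elements and
   VECTYPE has 8, then factor == 2; for index == 3 we use
   rgm->controls[1] (vi == 1) and, since vpart == 1, shift the integer
   view right by 8 * 1 bits before converting to the 8-element mask
   type below.  */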
11122 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11123 (TYPE_MODE (mask_type), 1), vec);
11124 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11125 if (seq)
11126 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11127 return vec;
11129 else
11130 gcc_unreachable ();
11133 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11134 lengths for controlling an operation on VECTYPE. The operation splits
11135 each element of VECTYPE into FACTOR separate subelements, measuring the
11136 length as a number of these subelements. */
11138 void
11139 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11140 unsigned int nvectors, tree vectype, unsigned int factor)
11142 gcc_assert (nvectors != 0);
11143 if (lens->length () < nvectors)
11144 lens->safe_grow_cleared (nvectors, true);
11145 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11147 /* The number of scalars per iteration, the bytes occupied per scalar and
11148 the number of vectors are all compile-time constants. */
11149 unsigned int nscalars_per_iter
11150 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11151 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11153 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11155 /* For now, we only support cases in which all loads and stores fall back
11156 to VnQI or none do. */
11157 gcc_assert (!rgl->max_nscalars_per_iter
11158 || (rgl->factor == 1 && factor == 1)
11159 || (rgl->max_nscalars_per_iter * rgl->factor
11160 == nscalars_per_iter * factor));
11161 rgl->max_nscalars_per_iter = nscalars_per_iter;
11162 rgl->type = vectype;
11163 rgl->factor = factor;
11167 /* Given a complete set of lengths LENS, extract length number INDEX
11168 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11169 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11170 multiplied by the number of elements that should be processed.
11171 Insert any set-up statements before GSI. */
11173 tree
11174 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11175 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11176 unsigned int index, unsigned int factor)
11178 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11179 bool use_bias_adjusted_len =
11180 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11182 /* Populate the rgroup's len array, if this is the first time we've
11183 used it. */
11184 if (rgl->controls.is_empty ())
11186 rgl->controls.safe_grow_cleared (nvectors, true);
11187 for (unsigned int i = 0; i < nvectors; ++i)
11189 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11190 gcc_assert (len_type != NULL_TREE);
11192 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11194 /* Provide a dummy definition until the real one is available. */
11195 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11196 rgl->controls[i] = len;
11198 if (use_bias_adjusted_len)
11200 gcc_assert (i == 0);
11201 tree adjusted_len =
11202 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11203 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11204 rgl->bias_adjusted_ctrl = adjusted_len;
11209 if (use_bias_adjusted_len)
11210 return rgl->bias_adjusted_ctrl;
11212 tree loop_len = rgl->controls[index];
11213 if (rgl->factor == 1 && factor == 1)
11215 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11216 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11217 if (maybe_ne (nunits1, nunits2))
11219 /* A loop len for data type X can be reused for data type Y
11220 if X has N times more elements than Y and if Y's elements
11221 are N times bigger than X's. */
11222 gcc_assert (multiple_p (nunits1, nunits2));
11223 factor = exact_div (nunits1, nunits2).to_constant ();
11224 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11225 gimple_seq seq = NULL;
11226 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11227 build_int_cst (iv_type, factor));
11228 if (seq)
11229 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
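/* Illustration (hypothetical values): if rgl->type has 8 elements and
   VECTYPE has 4, then factor == 2 and e.g. a length of 6 of the finer
   elements corresponds to a length of 3 of the coarser ones.  */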
11232 return loop_len;
11235 /* Scale profiling counters by estimation for LOOP which is vectorized
11236 by factor VF.
11237 If FLAT is true, the loop we started with had unrealistically flat
11238 profile. */
11240 static void
11241 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11243 /* For flat profiles do not scale down proportionally by VF; only
11244 cap by the known iteration count bounds. */
11245 if (flat)
11247 if (dump_file && (dump_flags & TDF_DETAILS))
11248 fprintf (dump_file,
11249 "Vectorized loop profile seems flat; not scaling iteration "
11250 "count down by the vectorization factor %i\n", vf);
11251 scale_loop_profile (loop, profile_probability::always (),
11252 get_likely_max_loop_iterations_int (loop));
11253 return;
11255 /* The loop body executes VF times fewer iterations and the exit probability increases by a factor of VF. */
11256 profile_count entry_count = loop_preheader_edge (loop)->count ();
11258 /* If we have an unreliable loop profile, avoid dropping the entry
11259 count below the header count. This can happen since loops
11260 have unrealistically low trip counts. */
11261 while (vf > 1
11262 && loop->header->count > entry_count
11263 && loop->header->count < entry_count * vf)
11265 if (dump_file && (dump_flags & TDF_DETAILS))
11266 fprintf (dump_file,
11267 "Vectorization factor %i seems too large for profile "
11268 "prevoiusly believed to be consistent; reducing.\n", vf);
11269 vf /= 2;
11272 if (entry_count.nonzero_p ())
11273 set_edge_probability_and_rescale_others
11274 (exit_e,
11275 entry_count.probability_in (loop->header->count / vf));
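/* Illustration (hypothetical counts): with an entry count of 100, a
   header count of 250 and VF == 4 the loop above halves VF to 2
   (250 < 100 * 4 but 250 >= 100 * 2), and the exit probability becomes
   100 / (250 / 2) = 80%.  */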
11276 /* Avoid producing a very large exit probability when we do not have
11277 a sensible profile. */
11278 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11279 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11280 loop->latch->count = single_pred_edge (loop->latch)->count ();
11282 scale_loop_profile (loop, profile_probability::always () / vf,
11283 get_likely_max_loop_iterations_int (loop));
11286 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11287 latch edge values originally defined by it. */
11289 static void
11290 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11291 stmt_vec_info def_stmt_info)
11293 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11294 if (!def || TREE_CODE (def) != SSA_NAME)
11295 return;
11296 stmt_vec_info phi_info;
11297 imm_use_iterator iter;
11298 use_operand_p use_p;
11299 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11301 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11302 if (!phi)
11303 continue;
11304 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11305 && (phi_info = loop_vinfo->lookup_stmt (phi))
11306 && STMT_VINFO_RELEVANT_P (phi_info)))
11307 continue;
11308 loop_p loop = gimple_bb (phi)->loop_father;
11309 edge e = loop_latch_edge (loop);
11310 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11311 continue;
11313 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11314 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11315 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11317 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11318 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11319 gcc_assert (phi_defs.length () == latch_defs.length ());
11320 for (unsigned i = 0; i < phi_defs.length (); ++i)
11321 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11322 gimple_get_lhs (latch_defs[i]), e,
11323 gimple_phi_arg_location (phi, e->dest_idx));
11325 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11327 /* For first order recurrences we have to update both uses of
11328 the latch definition, the one in the PHI node and the one
11329 in the generated VEC_PERM_EXPR. */
11330 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11331 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11332 gcc_assert (phi_defs.length () == latch_defs.length ());
11333 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11334 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11335 for (unsigned i = 0; i < phi_defs.length (); ++i)
11337 gassign *perm = as_a <gassign *> (phi_defs[i]);
11338 if (i > 0)
11339 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11340 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11341 update_stmt (perm);
11343 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11344 gimple_phi_arg_location (phi, e->dest_idx));
11349 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11350 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11351 stmt_vec_info. */
11353 static bool
11354 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11355 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11357 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11358 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11360 if (dump_enabled_p ())
11361 dump_printf_loc (MSG_NOTE, vect_location,
11362 "------>vectorizing statement: %G", stmt_info->stmt);
11364 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11365 vect_loop_kill_debug_uses (loop, stmt_info);
11367 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11368 && !STMT_VINFO_LIVE_P (stmt_info))
11370 if (is_gimple_call (stmt_info->stmt)
11371 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11373 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11374 *seen_store = stmt_info;
11375 return false;
11377 return false;
11380 if (STMT_VINFO_VECTYPE (stmt_info))
11382 poly_uint64 nunits
11383 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11384 if (!STMT_SLP_TYPE (stmt_info)
11385 && maybe_ne (nunits, vf)
11386 && dump_enabled_p ())
11387 /* For SLP the VF is set according to the unrolling factor, and not
11388 to the vector size, hence for SLP this diagnostic is not valid. */
11389 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11392 /* Pure SLP statements have already been vectorized. We still need
11393 to apply loop vectorization to hybrid SLP statements. */
11394 if (PURE_SLP_STMT (stmt_info))
11395 return false;
11397 if (dump_enabled_p ())
11398 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11400 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11401 *seen_store = stmt_info;
11403 return true;
11406 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11407 in the hash_map with their corresponding values. */
11409 static tree
11410 find_in_mapping (tree t, void *context)
11412 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11414 tree *value = mapping->get (t);
11415 return value ? *value : t;
11418 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11419 original loop that has now been vectorized.
11421 The inits of the data_references need to be advanced with the number of
11422 iterations of the main loop. This has been computed in vect_do_peeling and
11423 is stored in parameter ADVANCE. We first restore the data_references
11424 initial offset with the values recorded in ORIG_DRS_INIT.
11426 Since the loop_vec_info of this EPILOGUE was constructed for the original
11427 loop, its stmt_vec_infos all point to the original statements. These need
11428 to be updated to point to their corresponding copies as well as the SSA_NAMES
11429 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11431 The data_references' connections also need to be updated. Their
11432 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11433 stmt_vec_infos, their statements need to point to their corresponding copy,
11434 if they are gather loads or scatter stores then their reference needs to be
11435 updated to point to its corresponding copy and finally we set
11436 'base_misaligned' to false as we have already peeled for alignment in the
11437 prologue of the main loop. */
11439 static void
11440 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11442 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11443 auto_vec<gimple *> stmt_worklist;
11444 hash_map<tree,tree> mapping;
11445 gimple *orig_stmt, *new_stmt;
11446 gimple_stmt_iterator epilogue_gsi;
11447 gphi_iterator epilogue_phi_gsi;
11448 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11449 basic_block *epilogue_bbs = get_loop_body (epilogue);
11450 unsigned i;
11452 free (LOOP_VINFO_BBS (epilogue_vinfo));
11453 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11455 /* Advance data_reference's with the number of iterations of the previous
11456 loop and its prologue. */
11457 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11460 /* The EPILOGUE loop is a copy of the original loop so they share the same
11461 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11462 point to the copied statements. We also create a mapping of all LHS' in
11463 the original loop and all the LHS' in the EPILOGUE and create worklists to
11464 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11465 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11467 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11468 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11470 new_stmt = epilogue_phi_gsi.phi ();
11472 gcc_assert (gimple_uid (new_stmt) > 0);
11473 stmt_vinfo
11474 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11476 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11477 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11479 mapping.put (gimple_phi_result (orig_stmt),
11480 gimple_phi_result (new_stmt));
11481 /* PHI nodes can not have patterns or related statements. */
11482 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11483 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11486 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11487 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11489 new_stmt = gsi_stmt (epilogue_gsi);
11490 if (is_gimple_debug (new_stmt))
11491 continue;
11493 gcc_assert (gimple_uid (new_stmt) > 0);
11494 stmt_vinfo
11495 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11497 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11498 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11500 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11501 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11503 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11505 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11506 for (gimple_stmt_iterator gsi = gsi_start (seq);
11507 !gsi_end_p (gsi); gsi_next (&gsi))
11508 stmt_worklist.safe_push (gsi_stmt (gsi));
11511 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11512 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11514 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11515 stmt_worklist.safe_push (stmt);
11516 /* Set BB such that the assert in
11517 'get_initial_def_for_reduction' is able to determine that
11518 the BB of the related stmt is inside this loop. */
11519 gimple_set_bb (stmt,
11520 gimple_bb (new_stmt));
11521 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11522 gcc_assert (related_vinfo == NULL
11523 || related_vinfo == stmt_vinfo);
11528 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11529 using the original main loop and thus need to be updated to refer to the
11530 cloned variables used in the epilogue. */
11531 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11533 gimple *stmt = stmt_worklist[i];
11534 tree *new_op;
11536 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11538 tree op = gimple_op (stmt, j);
11539 if ((new_op = mapping.get(op)))
11540 gimple_set_op (stmt, j, *new_op);
11541 else
11543 /* PR92429: The last argument of simplify_replace_tree disables
11544 folding when replacing arguments. This is required as
11545 otherwise you might end up with different statements than the
11546 ones analyzed in vect_loop_analyze, leading to different
11547 vectorization. */
11548 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11549 &find_in_mapping, &mapping, false);
11550 gimple_set_op (stmt, j, op);
11555 struct data_reference *dr;
11556 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11557 FOR_EACH_VEC_ELT (datarefs, i, dr)
11559 orig_stmt = DR_STMT (dr);
11560 gcc_assert (gimple_uid (orig_stmt) > 0);
11561 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11562 /* Data references for gather loads and scatter stores do not use the
11563 updated offset we set using ADVANCE. Instead we have to make sure the
11564 reference in the data references point to the corresponding copy of
11565 the original in the epilogue. Make sure to update both
11566 gather/scatters recognized by dataref analysis and also other
11567 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11568 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11569 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11570 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11572 DR_REF (dr)
11573 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11574 &find_in_mapping, &mapping);
11575 DR_BASE_ADDRESS (dr)
11576 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11577 &find_in_mapping, &mapping);
11579 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11580 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11581 /* The vector size of the epilogue is smaller than that of the main loop,
11582 so the alignment requirement is either the same or lower. This means
11583 the dr will by definition be aligned. */
11584 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11587 epilogue_vinfo->shared->datarefs_copy.release ();
11588 epilogue_vinfo->shared->save_datarefs ();
11591 /* Function vect_transform_loop.
11593 The analysis phase has determined that the loop is vectorizable.
11594 Vectorize the loop - created vectorized stmts to replace the scalar
11595 stmts in the loop, and update the loop exit condition.
11596 Returns scalar epilogue loop if any. */
11598 class loop *
11599 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11601 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11602 class loop *epilogue = NULL;
11603 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11604 int nbbs = loop->num_nodes;
11605 int i;
11606 tree niters_vector = NULL_TREE;
11607 tree step_vector = NULL_TREE;
11608 tree niters_vector_mult_vf = NULL_TREE;
11609 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11610 unsigned int lowest_vf = constant_lower_bound (vf);
11611 gimple *stmt;
11612 bool check_profitability = false;
11613 unsigned int th;
11614 bool flat = maybe_flat_loop_profile (loop);
11616 DUMP_VECT_SCOPE ("vec_transform_loop");
11618 loop_vinfo->shared->check_datarefs ();
11620 /* Use the more conservative vectorization threshold. If the number
11621 of iterations is constant, assume the cost check has been performed
11622 by our caller. If the threshold makes all loops profitable that
11623 run at least the (estimated) vectorization factor number of times,
11624 checking is pointless, too. */
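/* For illustration (hypothetical numbers): with a cost-model threshold of 16
   and an unknown iteration count, the runtime guard emitted during peeling or
   versioning sends iteration counts below the threshold to the scalar loop.  */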
11625 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11626 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11628 if (dump_enabled_p ())
11629 dump_printf_loc (MSG_NOTE, vect_location,
11630 "Profitability threshold is %d loop iterations.\n",
11631 th);
11632 check_profitability = true;
11635 /* Make sure there exists a single-predecessor exit bb. Do this before
11636 versioning. */
11637 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11638 if (! single_pred_p (e->dest))
11640 split_loop_exit_edge (e, true);
11641 if (dump_enabled_p ())
11642 dump_printf (MSG_NOTE, "split exit edge\n");
11645 /* Version the loop first, if required, so the profitability check
11646 comes first. */
11648 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11650 class loop *sloop
11651 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11652 sloop->force_vectorize = false;
11653 check_profitability = false;
11656 /* Make sure there exists a single-predecessor exit bb also on the
11657 scalar loop copy. Do this after versioning but before peeling
11658 so the CFG structure is fine for both the scalar and the if-converted
11659 loop, and slpeel_duplicate_current_defs_from_edges sees matching
11660 loop-closed PHI nodes on the exit. */
11661 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11663 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11664 if (! single_pred_p (e->dest))
11666 split_loop_exit_edge (e, true);
11667 if (dump_enabled_p ())
11668 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11672 tree niters = vect_build_loop_niters (loop_vinfo);
11673 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11674 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11675 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11676 tree advance;
11677 drs_init_vec orig_drs_init;
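/* Peel prologue and epilogue iterations as decided during analysis; this may
   create the epilogue loop that is returned at the end of this function, as
   well as the runtime cost-model check when CHECK_PROFITABILITY is set.  */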
11679 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11680 &step_vector, &niters_vector_mult_vf, th,
11681 check_profitability, niters_no_overflow,
11682 &advance);
11683 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11684 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11686 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11687 block after the loop exit. We need to scale all of that. */
11688 basic_block preheader
11689 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11690 preheader->count
11691 = preheader->count.apply_probability
11692 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11693 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11694 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11695 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11696 = preheader->count;
11699 if (niters_vector == NULL_TREE)
11701 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11702 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11703 && known_eq (lowest_vf, vf))
11705 niters_vector
11706 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11707 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11708 step_vector = build_one_cst (TREE_TYPE (niters));
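/* For example (hypothetical values): NITERS = 19 and VF = 4 give
   niters_vector = 19 / 4 = 4 vector iterations with step_vector = 1;
   the remaining 3 scalar iterations are left for the epilogue.  */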
11710 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11711 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11712 &step_vector, niters_no_overflow);
11713 else
11714 /* vect_do_peeling subtracted the number of peeled prologue
11715 iterations from LOOP_VINFO_NITERS. */
11716 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11717 &niters_vector, &step_vector,
11718 niters_no_overflow);
11721 /* 1) Make sure the loop header has exactly two entries
11722 2) Make sure we have a preheader basic block. */
11724 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11726 split_edge (loop_preheader_edge (loop));
11728 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11729 /* This will deal with any possible peeling. */
11730 vect_prepare_for_masked_peels (loop_vinfo);
11732 /* Schedule the SLP instances first, then handle loop vectorization
11733 below. */
11734 if (!loop_vinfo->slp_instances.is_empty ())
11736 DUMP_VECT_SCOPE ("scheduling SLP instances");
11737 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11740 /* FORNOW: the vectorizer supports only loops whose body consists
11741 of one basic block (header + empty latch). When the vectorizer
11742 supports more involved loop forms, the order in which the BBs are
11743 traversed needs to be reconsidered. */
11745 for (i = 0; i < nbbs; i++)
11747 basic_block bb = bbs[i];
11748 stmt_vec_info stmt_info;
11750 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11751 gsi_next (&si))
11753 gphi *phi = si.phi ();
11754 if (dump_enabled_p ())
11755 dump_printf_loc (MSG_NOTE, vect_location,
11756 "------>vectorizing phi: %G", (gimple *) phi);
11757 stmt_info = loop_vinfo->lookup_stmt (phi);
11758 if (!stmt_info)
11759 continue;
11761 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11762 vect_loop_kill_debug_uses (loop, stmt_info);
11764 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11765 && !STMT_VINFO_LIVE_P (stmt_info))
11766 continue;
11768 if (STMT_VINFO_VECTYPE (stmt_info)
11769 && (maybe_ne
11770 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11771 && dump_enabled_p ())
11772 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11774 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11775 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11776 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11777 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11778 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11779 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11780 && ! PURE_SLP_STMT (stmt_info))
11782 if (dump_enabled_p ())
11783 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11784 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11788 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11789 gsi_next (&si))
11791 gphi *phi = si.phi ();
11792 stmt_info = loop_vinfo->lookup_stmt (phi);
11793 if (!stmt_info)
11794 continue;
11796 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11797 && !STMT_VINFO_LIVE_P (stmt_info))
11798 continue;
11800 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11801 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11802 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11803 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11804 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11805 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11806 && ! PURE_SLP_STMT (stmt_info))
11807 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11810 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11811 !gsi_end_p (si);)
11813 stmt = gsi_stmt (si);
11814 /* During vectorization remove existing clobber stmts. */
11815 if (gimple_clobber_p (stmt))
11817 unlink_stmt_vdef (stmt);
11818 gsi_remove (&si, true);
11819 release_defs (stmt);
11821 else
11823 /* Ignore vector stmts created in the outer loop. */
11824 stmt_info = loop_vinfo->lookup_stmt (stmt);
11826 /* vector stmts created in the outer-loop during vectorization of
11827 stmts in an inner-loop may not have a stmt_info, and do not
11828 need to be vectorized. */
11829 stmt_vec_info seen_store = NULL;
11830 if (stmt_info)
11832 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
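/* For statements replaced by a pattern, first emit any statements in the
   pattern definition sequence and then the main pattern statement that
   replaced the original scalar statement.  */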
11834 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11835 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11836 !gsi_end_p (subsi); gsi_next (&subsi))
11838 stmt_vec_info pat_stmt_info
11839 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11840 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11841 &si, &seen_store);
11843 stmt_vec_info pat_stmt_info
11844 = STMT_VINFO_RELATED_STMT (stmt_info);
11845 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11846 &si, &seen_store))
11847 maybe_set_vectorized_backedge_value (loop_vinfo,
11848 pat_stmt_info);
11850 else
11852 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11853 &seen_store))
11854 maybe_set_vectorized_backedge_value (loop_vinfo,
11855 stmt_info);
11858 gsi_next (&si);
11859 if (seen_store)
11861 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11862 /* Interleaving. The vectorization of the
11863 interleaving chain has been completed -
11864 free all the stores in the chain. */
11865 vect_remove_stores (loop_vinfo,
11866 DR_GROUP_FIRST_ELEMENT (seen_store));
11867 else
11868 /* Free the attached stmt_vec_info and remove the stmt. */
11869 loop_vinfo->remove_stmt (stmt_info);
11874 /* Stub out scalar statements that must not survive vectorization.
11875 Doing this here helps with grouped statements, or statements that
11876 are involved in patterns. */
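/* For illustration (hypothetical SSA names): a leftover scalar
   _5 = .MASK_LOAD (ptr_6, 8B, mask_7) is replaced below by _5 = 0, and a
   conditional call such as _9 = .COND_ADD (mask_7, _1, _2, _3) is replaced
   by _9 = _3, i.e. by its else value.  */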
11877 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11878 !gsi_end_p (gsi); gsi_next (&gsi))
11880 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11881 if (!call || !gimple_call_internal_p (call))
11882 continue;
11883 internal_fn ifn = gimple_call_internal_fn (call);
11884 if (ifn == IFN_MASK_LOAD)
11886 tree lhs = gimple_get_lhs (call);
11887 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11889 tree zero = build_zero_cst (TREE_TYPE (lhs));
11890 gimple *new_stmt = gimple_build_assign (lhs, zero);
11891 gsi_replace (&gsi, new_stmt, true);
11894 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11896 tree lhs = gimple_get_lhs (call);
11897 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11899 tree else_arg
11900 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11901 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11902 gsi_replace (&gsi, new_stmt, true);
11906 } /* BBs in loop */
11908 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11909 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11910 if (integer_onep (step_vector))
11911 niters_no_overflow = true;
11912 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11913 niters_vector, step_vector, niters_vector_mult_vf,
11914 !niters_no_overflow);
11916 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11918 /* True if the final iteration might not handle a full vector's
11919 worth of scalar iterations. */
11920 bool final_iter_may_be_partial
11921 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11922 /* The minimum number of iterations performed by the epilogue. This
11923 is 1 when peeling for gaps because we always need a final scalar
11924 iteration. */
11925 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11926 /* +1 to convert latch counts to loop iteration counts,
11927 -min_epilogue_iters to remove iterations that cannot be performed
11928 by the vector code. */
11929 int bias_for_lowest = 1 - min_epilogue_iters;
11930 int bias_for_assumed = bias_for_lowest;
11931 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11932 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11934 /* When the amount of peeling is known at compile time, the first
11935 iteration will have exactly alignment_npeels active elements.
11936 In the worst case it will have at least one. */
11937 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11938 bias_for_lowest += lowest_vf - min_first_active;
11939 bias_for_assumed += assumed_vf - min_first_active;
11941 /* In these calculations the "- 1" converts loop iteration counts
11942 back to latch counts. */
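/* For example (hypothetical values): a scalar latch bound of 99 (100
   iterations), VF = lowest_vf = 4, no peeling for gaps and no partial
   vectors give bias_for_lowest = 1 and a new latch bound of
   (99 + 1) / 4 - 1 = 24, i.e. at most 25 vector iterations.  */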
11943 if (loop->any_upper_bound)
11945 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11946 loop->nb_iterations_upper_bound
11947 = (final_iter_may_be_partial
11948 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11949 lowest_vf) - 1
11950 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11951 lowest_vf) - 1);
11952 if (main_vinfo
11953 /* Both peeling for alignment and peeling for gaps can end up
11954 with the scalar epilogue running for more than VF-1 iterations. */
11955 && !main_vinfo->peeling_for_alignment
11956 && !main_vinfo->peeling_for_gaps)
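/* MAIN_ITERS is (roughly) the largest number of scalar iterations that can
   reach this epilogue loop: the maximum of the main loop's VF and its
   cost-model and versioning thresholds.  Dividing it by this loop's VF
   bounds the epilogue's iteration count.  */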
11958 unsigned int bound;
11959 poly_uint64 main_iters
11960 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11961 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11962 main_iters
11963 = upper_bound (main_iters,
11964 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11965 if (can_div_away_from_zero_p (main_iters,
11966 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11967 &bound))
11968 loop->nb_iterations_upper_bound
11969 = wi::umin ((bound_wide_int) (bound - 1),
11970 loop->nb_iterations_upper_bound);
11973 if (loop->any_likely_upper_bound)
11974 loop->nb_iterations_likely_upper_bound
11975 = (final_iter_may_be_partial
11976 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11977 + bias_for_lowest, lowest_vf) - 1
11978 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11979 + bias_for_lowest, lowest_vf) - 1);
11980 if (loop->any_estimate)
11981 loop->nb_iterations_estimate
11982 = (final_iter_may_be_partial
11983 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11984 assumed_vf) - 1
11985 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11986 assumed_vf) - 1);
11987 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11988 assumed_vf, flat);
11990 if (dump_enabled_p ())
11992 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11994 dump_printf_loc (MSG_NOTE, vect_location,
11995 "LOOP VECTORIZED\n");
11996 if (loop->inner)
11997 dump_printf_loc (MSG_NOTE, vect_location,
11998 "OUTER LOOP VECTORIZED\n");
11999 dump_printf (MSG_NOTE, "\n");
12001 else
12002 dump_printf_loc (MSG_NOTE, vect_location,
12003 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12004 GET_MODE_NAME (loop_vinfo->vector_mode));
12007 /* Loops vectorized with a variable factor won't benefit from
12008 unrolling/peeling. */
12009 if (!vf.is_constant ())
12011 loop->unroll = 1;
12012 if (dump_enabled_p ())
12013 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12014 " variable-length vectorization factor\n");
12016 /* Free SLP instances here because otherwise stmt reference counting
12017 won't work. */
12018 slp_instance instance;
12019 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12020 vect_free_slp_instance (instance);
12021 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12022 /* Clear the safelen field since its value is invalid after vectorization:
12023 the vectorized loop can have loop-carried dependencies. */
12024 loop->safelen = 0;
12026 if (epilogue)
12028 update_epilogue_loop_vinfo (epilogue, advance);
12030 epilogue->simduid = loop->simduid;
12031 epilogue->force_vectorize = loop->force_vectorize;
12032 epilogue->dont_vectorize = false;
12035 return epilogue;
12038 /* The code below performs a simple optimization - it reverts
12039 if-conversion for masked stores, i.e. if the mask of a store is zero,
12040 do not perform the store and, if possible, skip the stored value producers as well.
12041 For example,
12042 for (i=0; i<n; i++)
12043 if (c[i])
12045 p1[i] += 1;
12046 p2[i] = p3[i] + 2;
12048 this transformation will produce the following semi-hammock:
12050 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12052 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12053 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12054 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12055 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12056 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12057 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
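     (The loads, adds and stores are thus executed only when the mask is not
     all zeros, and the vdef chain is rewired through a new PHI node in the
     join block, as done below.)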
12061 void
12062 optimize_mask_stores (class loop *loop)
12064 basic_block *bbs = get_loop_body (loop);
12065 unsigned nbbs = loop->num_nodes;
12066 unsigned i;
12067 basic_block bb;
12068 class loop *bb_loop;
12069 gimple_stmt_iterator gsi;
12070 gimple *stmt;
12071 auto_vec<gimple *> worklist;
12072 auto_purge_vect_location sentinel;
12074 vect_location = find_loop_location (loop);
12075 /* Pick up all masked stores in loop if any. */
12076 for (i = 0; i < nbbs; i++)
12078 bb = bbs[i];
12079 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12080 gsi_next (&gsi))
12082 stmt = gsi_stmt (gsi);
12083 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12084 worklist.safe_push (stmt);
12088 free (bbs);
12089 if (worklist.is_empty ())
12090 return;
12092 /* Loop has masked stores. */
12093 while (!worklist.is_empty ())
12095 gimple *last, *last_store;
12096 edge e, efalse;
12097 tree mask;
12098 basic_block store_bb, join_bb;
12099 gimple_stmt_iterator gsi_to;
12100 tree vdef, new_vdef;
12101 gphi *phi;
12102 tree vectype;
12103 tree zero;
12105 last = worklist.pop ();
12106 mask = gimple_call_arg (last, 2);
12107 bb = gimple_bb (last);
12108 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12109 the same loop as if_bb. It can be different from LOOP when a two-
12110 level loop nest is vectorized and the mask_store belongs to the inner
12111 one. */
12112 e = split_block (bb, last);
12113 bb_loop = bb->loop_father;
12114 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12115 join_bb = e->dest;
12116 store_bb = create_empty_bb (bb);
12117 add_bb_to_loop (store_bb, bb_loop);
12118 e->flags = EDGE_TRUE_VALUE;
12119 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12120 /* Put STORE_BB on the likely path. */
12121 efalse->probability = profile_probability::likely ();
12122 e->probability = efalse->probability.invert ();
12123 store_bb->count = efalse->count ();
12124 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12125 if (dom_info_available_p (CDI_DOMINATORS))
12126 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12127 if (dump_enabled_p ())
12128 dump_printf_loc (MSG_NOTE, vect_location,
12129 "Create new block %d to sink mask stores.",
12130 store_bb->index);
12131 /* Create vector comparison with boolean result. */
12132 vectype = TREE_TYPE (mask);
12133 zero = build_zero_cst (vectype);
12134 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12135 gsi = gsi_last_bb (bb);
12136 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12137 /* Create new PHI node for vdef of the last masked store:
12138 .MEM_2 = VDEF <.MEM_1>
12139 will be converted to
12140 .MEM_3 = VDEF <.MEM_1>
12141 and new PHI node will be created in join bb
12142 .MEM_2 = PHI <.MEM_1, .MEM_3>
12144 vdef = gimple_vdef (last);
12145 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12146 gimple_set_vdef (last, new_vdef);
12147 phi = create_phi_node (vdef, join_bb);
12148 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12150 /* Put all masked stores with the same mask to STORE_BB if possible. */
12151 while (true)
12153 gimple_stmt_iterator gsi_from;
12154 gimple *stmt1 = NULL;
12156 /* Move masked store to STORE_BB. */
12157 last_store = last;
12158 gsi = gsi_for_stmt (last);
12159 gsi_from = gsi;
12160 /* Shift GSI to the previous stmt for further traversal. */
12161 gsi_prev (&gsi);
12162 gsi_to = gsi_start_bb (store_bb);
12163 gsi_move_before (&gsi_from, &gsi_to);
12164 /* Set GSI_TO to the start of the now non-empty block. */
12165 gsi_to = gsi_start_bb (store_bb);
12166 if (dump_enabled_p ())
12167 dump_printf_loc (MSG_NOTE, vect_location,
12168 "Move stmt to created bb\n%G", last);
12169 /* Move all stored value producers if possible. */
12170 while (!gsi_end_p (gsi))
12172 tree lhs;
12173 imm_use_iterator imm_iter;
12174 use_operand_p use_p;
12175 bool res;
12177 /* Skip debug statements. */
12178 if (is_gimple_debug (gsi_stmt (gsi)))
12180 gsi_prev (&gsi);
12181 continue;
12183 stmt1 = gsi_stmt (gsi);
12184 /* Do not consider statements writing to memory or having a
12185 volatile operand. */
12186 if (gimple_vdef (stmt1)
12187 || gimple_has_volatile_ops (stmt1))
12188 break;
12189 gsi_from = gsi;
12190 gsi_prev (&gsi);
12191 lhs = gimple_get_lhs (stmt1);
12192 if (!lhs)
12193 break;
12195 /* LHS of vectorized stmt must be SSA_NAME. */
12196 if (TREE_CODE (lhs) != SSA_NAME)
12197 break;
12199 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12201 /* Remove dead scalar statement. */
12202 if (has_zero_uses (lhs))
12204 gsi_remove (&gsi_from, true);
12205 continue;
12209 /* Check that LHS does not have uses outside of STORE_BB. */
12210 res = true;
12211 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12213 gimple *use_stmt;
12214 use_stmt = USE_STMT (use_p);
12215 if (is_gimple_debug (use_stmt))
12216 continue;
12217 if (gimple_bb (use_stmt) != store_bb)
12219 res = false;
12220 break;
12223 if (!res)
12224 break;
12226 if (gimple_vuse (stmt1)
12227 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12228 break;
12230 /* Can move STMT1 to STORE_BB. */
12231 if (dump_enabled_p ())
12232 dump_printf_loc (MSG_NOTE, vect_location,
12233 "Move stmt to created bb\n%G", stmt1);
12234 gsi_move_before (&gsi_from, &gsi_to);
12235 /* Shift GSI_TO for further insertion. */
12236 gsi_prev (&gsi_to);
12238 /* Put other masked stores with the same mask to STORE_BB. */
12239 if (worklist.is_empty ()
12240 || gimple_call_arg (worklist.last (), 2) != mask
12241 || worklist.last () != stmt1)
12242 break;
12243 last = worklist.pop ();
12245 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12249 /* Decide whether it is possible to use a zero-based induction variable
12250 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12251 the value that the induction variable must be able to hold in order
12252 to ensure that the rgroups eventually have no active vector elements.
12253 Return -1 otherwise. */
12255 widest_int
12256 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12258 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12259 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12260 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12262 /* Calculate the value that the induction variable must be able
12263 to hit in order to ensure that we end the loop with an all-false mask.
12264 This involves adding the maximum number of inactive trailing scalar
12265 iterations. */
12266 widest_int iv_limit = -1;
12267 if (max_loop_iterations (loop, &iv_limit))
12269 if (niters_skip)
12271 /* Add the maximum number of skipped iterations to the
12272 maximum iteration count. */
12273 if (TREE_CODE (niters_skip) == INTEGER_CST)
12274 iv_limit += wi::to_widest (niters_skip);
12275 else
12276 iv_limit += max_vf - 1;
12278 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12279 /* Make a conservatively-correct assumption. */
12280 iv_limit += max_vf - 1;
12282 /* IV_LIMIT is the maximum number of latch iterations, which is also
12283 the maximum in-range IV value. Round this value down to the previous
12284 vector alignment boundary and then add an extra full iteration. */
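/* For example (hypothetical values): with VF = 4, max_vf = 4 and a latch
   bound of 10, the rounding gives (10 & -4) + 4 = 8 + 4 = 12.  */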
12285 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12286 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12288 return iv_limit;
12291 /* For the given rgroup_controls RGC, check whether an induction variable
12292 would ever hit a value that produces a set of all-false masks or zero
12293 lengths before wrapping around. Return true if it's possible to wrap
12294 around before hitting the desired value, otherwise return false. */
12296 bool
12297 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12299 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12301 if (iv_limit == -1)
12302 return true;
12304 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12305 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12306 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
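   /* For example (hypothetical values): IV_LIMIT = 1000 and NITEMS = 4 need
      wi::min_precision (4000, UNSIGNED) = 12 bits, so a 32-bit compare type
      cannot wrap, whereas an 8-bit compare type could.  */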
12308 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12309 return true;
12311 return false;