gcc/tree-vect-loop.cc (blob 3b28c826b3b9b54cd0a4d45c7ecf4ed30701c937)
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
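/* Illustrative sketch (not from the sources) of the optab query described
   above; the vectype and tree code here are example inputs only:

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!op || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;      no target support, the stmt cannot be vectorized
*/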
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with vector size (VS) 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
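/* A worked instance of the strip-mining above (illustrative only), with
   4-byte elements and 16-byte vectors, i.e. VF = 4:

     for (i = 0; i + 4 <= N; i += 4)
       a[i:4] = b[i:4] + c[i:4];       vectorized loop, 4 lanes at a time
     for (; i < N; i++)
       a[i] = b[i] + c[i];             scalar epilogue for the remainder
*/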
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
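/* Illustrative example of a "simple" IV evolution as accepted above:
   an affine scalar evolution {init, +, step}, e.g.

     for (i = 7; i < n; i += 4)        access_fn of i is {7, +, 4}_loop
       ... use (i) ...

   yields init = 7 and step = 4.  A step that is itself a chrec (degree
   >= 2) is rejected by the tree_is_chrec check above.  */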
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
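/* Illustrative scalar shapes of the nonlinear inductions recognized
   below (x of integer type; names are examples only):

     x = -x;        neg                 -> vect_step_op_neg, fake step -1
     x = x * 3;     mul by constant     -> vect_step_op_mul
     x = x << 1;    lshift by constant  -> vect_step_op_shl
     x = x >> 1;    rshift by constant  -> vect_step_op_shr
*/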
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
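/* An illustrative source-level loop nest with this double-reduction
   shape (names are examples only):

     sum = 0;                          x_1 in outer1
     for (j = 0; j < M; j++)           outer loop
       for (i = 0; i < N; i++)         inner loop
         sum += a[j][i];               x_2 .. x_3, merged by x_4 in outer2
*/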
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
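/* Illustrative first-order recurrence (names are examples only): the
   PHI for "prev" carries last iteration's value, and every use of it is
   dominated by the latch definition, as required below:

     prev = init;
     for (i = 0; i < n; i++)
       {
         cur = a[i];                   latch definition of the recurrence
         b[i] = cur - prev;            use of the previous iteration's value
         prev = cur;
       }
*/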
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs a vector shuffle. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns, or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment. Analyze
968 all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1052 /* CHECKME: We want to visit all BBs before their successors (except for
1053 latch blocks, for which this assertion wouldn't hold). In the simple
1054 case of the loop forms we allow, a dfs order of the BBs would be the same
1055 as reversed postorder traversal, so we are safe. */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition, when 0,
1082 loop shouldn't be vectorized, when non-zero constant, it should
1083 be vectorized normally, otherwise versioned with vectorized loop
1084 done if the condition is non-zero at runtime. */
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1133 /* When we release an epilogue vinfo that we do not intend to use
1134 avoid clearing AUX of the main loop which should continue to
1135 point to the main loop vinfo since otherwise we'll leak that. */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
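/* Worked example (illustrative numbers): with a refined estimate of at
   most 1000 loop header iterations and FACTOR == 2, the limit is 2000,
   and wi::min_precision (2000, UNSIGNED) returns 11, since
   2^10 < 2000 <= 2^11.  */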
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
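/* Worked example (illustrative numbers): with known NITERS == 100,
   VF == 8 and 3 iterations peeled for alignment (no peeling for gaps),
   100 - 3 = 97 is not a multiple of 8, so peeling or partial vectors is
   needed; with NITERS == 99 the remaining 96 iterations divide evenly
   and the function returns false.  */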
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
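/* Worked example (illustrative numbers): with VF == 16 and a mask
   requirement of 2 vectors of an 8-lane vectype per iteration,
   nscalars_per_iter is exact_div (2 * 8, 16) == 1; the rgroup for
   nvectors == 2 then needs IFN_WHILE_ULT support from the chosen
   comparison type to that vectype's truth type.  */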
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce a differently organized rgc_vec and check differently whether
1389 we can produce the masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1467 ok = false;
1468 break;
1471 /* If iv_type is usable as compare type use that - we can elide the
1472 saturation in that case. */
1473 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1475 tree cmp_vectype
1476 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1477 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1478 rgc.compare_type = cmp_vectype;
1480 if (!rgc.compare_type)
1481 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1483 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1484 if (cmp_bits >= min_ni_width
1485 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1487 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1488 if (!cmp_type)
1489 continue;
1491 /* Check whether we can produce the mask with cmp_type. */
1492 tree cmp_vectype
1493 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1494 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1496 rgc.compare_type = cmp_vectype;
1497 break;
1501 if (!rgc.compare_type)
1503 ok = false;
1504 break;
1507 if (!ok)
1509 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1510 return false;
1513 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1514 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1515 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1516 return true;
1519 /* Check whether we can use vector access with length based on precision
1520 comparison. So far, to keep it simple, we only allow the case that the
1521 precision of the target supported length is larger than the precision
1522 required by loop niters. */
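/* Worked example (illustrative numbers): if the loop runs at most 2^20
   iterations and an rgroup handles at most 4 items per iteration,
   min_ni_prec below is wi::min_precision (2^20 * 4, UNSIGNED) == 23; with
   a 32-bit niters type and 64-bit Pmode, the IV type chosen below ends up
   being the 64-bit unsigned type.  */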
1524 static bool
1525 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1527 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1528 return false;
1530 machine_mode len_load_mode, len_store_mode;
1531 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1532 .exists (&len_load_mode))
1533 return false;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1535 .exists (&len_store_mode))
1536 return false;
1538 signed char partial_load_bias = internal_len_load_store_bias
1539 (IFN_LEN_LOAD, len_load_mode);
1541 signed char partial_store_bias = internal_len_load_store_bias
1542 (IFN_LEN_STORE, len_store_mode);
1544 gcc_assert (partial_load_bias == partial_store_bias);
1546 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1547 return false;
1549 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1550 len_loads with a length of zero. In order to avoid that we prohibit
1551 more than one loop length here. */
1552 if (partial_load_bias == -1
1553 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1554 return false;
1556 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1558 unsigned int max_nitems_per_iter = 1;
1559 unsigned int i;
1560 rgroup_controls *rgl;
1561 /* Find the maximum number of items per iteration for every rgroup. */
1562 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1564 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1565 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1568 /* Work out how many bits we need to represent the length limit. */
1569 unsigned int min_ni_prec
1570 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1572 /* Now use the maximum of the precisions below for one suitable IV type:
1573 - the IV's natural precision
1574 - the precision needed to hold: the maximum number of scalar
1575 iterations multiplied by the scale factor (min_ni_prec above)
1576 - the Pmode precision
1578 If min_ni_prec is less than the precision of the current niters,
1579 we prefer to still use the niters type. Prefer to use Pmode and
1580 wider IV to avoid narrow conversions. */
1582 unsigned int ni_prec
1583 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1584 min_ni_prec = MAX (min_ni_prec, ni_prec);
1585 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1587 tree iv_type = NULL_TREE;
1588 opt_scalar_int_mode tmode_iter;
1589 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1591 scalar_mode tmode = tmode_iter.require ();
1592 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1594 /* ??? Do we really want to construct one IV whose precision exceeds
1595 BITS_PER_WORD? */
1596 if (tbits > BITS_PER_WORD)
1597 break;
1599 /* Find the first available standard integral type. */
1600 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1602 iv_type = build_nonstandard_integer_type (tbits, true);
1603 break;
1607 if (!iv_type)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "can't vectorize with length-based partial vectors"
1612 " because there is no suitable iv type.\n");
1613 return false;
1616 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1617 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1618 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1620 return true;
1623 /* Calculate the cost of one scalar iteration of the loop. */
1624 static void
1625 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1629 int nbbs = loop->num_nodes, factor;
1630 int innerloop_iters, i;
1632 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1634 /* Gather costs for statements in the scalar loop. */
1636 /* FORNOW. */
1637 innerloop_iters = 1;
1638 if (loop->inner)
1639 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1641 for (i = 0; i < nbbs; i++)
1643 gimple_stmt_iterator si;
1644 basic_block bb = bbs[i];
1646 if (bb->loop_father == loop->inner)
1647 factor = innerloop_iters;
1648 else
1649 factor = 1;
1651 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1653 gimple *stmt = gsi_stmt (si);
1654 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1656 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1657 continue;
1659 /* Skip stmts that are not vectorized inside the loop. */
1660 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1661 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1662 && (!STMT_VINFO_LIVE_P (vstmt_info)
1663 || !VECTORIZABLE_CYCLE_DEF
1664 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1665 continue;
1667 vect_cost_for_stmt kind;
1668 if (STMT_VINFO_DATA_REF (stmt_info))
1670 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1671 kind = scalar_load;
1672 else
1673 kind = scalar_store;
1675 else if (vect_nop_conversion_p (stmt_info))
1676 continue;
1677 else
1678 kind = scalar_stmt;
1680 /* We are using vect_prologue here to avoid scaling twice
1681 by the inner loop factor. */
1682 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1683 factor, kind, stmt_info, 0, vect_prologue);
1687 /* Now accumulate cost. */
1688 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1689 add_stmt_costs (loop_vinfo->scalar_costs,
1690 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1691 loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 /* Function vect_analyze_loop_form.
1697 Verify that certain CFG restrictions hold, including:
1698 - the loop has a pre-header
1699 - the loop has a single entry and exit
1700 - the loop exit condition is simple enough
1701 - the number of iterations can be analyzed, i.e., a countable loop. The
1702 niter could be analyzed under some assumptions. */
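/* Illustrative shape of a loop accepted by the checks below (not from
   the sources): a do-while form with all work in the header, an empty
   latch and the exit test at the end:

     i = 0;
     do
       {
         a[i] = b[i] + c[i];
         i++;
       }
     while (i < n);
*/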
1704 opt_result
1705 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1707 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1709 edge exit_e = vec_init_loop_exit_info (loop);
1710 if (!exit_e)
1711 return opt_result::failure_at (vect_location,
1712 "not vectorized:"
1713 " could not determine main exit from"
1714 " loop with multiple exits.\n");
1715 info->loop_exit = exit_e;
1716 if (dump_enabled_p ())
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "using as main loop exit: %d -> %d [AUX: %p]\n",
1719 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1721 /* Different restrictions apply when we are considering an inner-most loop,
1722 vs. an outer (nested) loop.
1723 (FORNOW. May want to relax some of these restrictions in the future). */
1725 info->inner_loop_cond = NULL;
1726 if (!loop->inner)
1728 /* Inner-most loop. We currently require that the number of BBs is
1729 exactly 2 (the header and latch). Vectorizable inner-most loops
1730 look like this:
1732 (pre-header)
1734 header <--------+
1735 | | |
1736 | +--> latch --+
1738 (exit-bb) */
1740 if (loop->num_nodes != 2)
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized:"
1743 " control flow in loop.\n");
1745 if (empty_block_p (loop->header))
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized: empty loop.\n");
1749 else
1751 class loop *innerloop = loop->inner;
1752 edge entryedge;
1754 /* Nested loop. We currently require that the loop is doubly-nested,
1755 contains a single inner loop, and the number of BBs is exactly 5.
1756 Vectorizable outer-loops look like this:
1758 (pre-header)
1760 header <---+
1762 inner-loop |
1764 tail ------+
1766 (exit-bb)
1768 The inner-loop has the properties expected of inner-most loops
1769 as described above. */
1771 if ((loop->inner)->inner || (loop->inner)->next)
1772 return opt_result::failure_at (vect_location,
1773 "not vectorized:"
1774 " multiple nested loops.\n");
1776 if (loop->num_nodes != 5)
1777 return opt_result::failure_at (vect_location,
1778 "not vectorized:"
1779 " control flow in loop.\n");
1781 entryedge = loop_preheader_edge (innerloop);
1782 if (entryedge->src != loop->header
1783 || !single_exit (innerloop)
1784 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1785 return opt_result::failure_at (vect_location,
1786 "not vectorized:"
1787 " unsupported outerloop form.\n");
1789 /* Analyze the inner-loop. */
1790 vect_loop_form_info inner;
1791 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1792 if (!res)
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1796 "not vectorized: Bad inner loop.\n");
1797 return res;
1800 /* Don't support analyzing niter under assumptions for inner
1801 loop. */
1802 if (!integer_onep (inner.assumptions))
1803 return opt_result::failure_at (vect_location,
1804 "not vectorized: Bad inner loop.\n");
1806 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1807 return opt_result::failure_at (vect_location,
1808 "not vectorized: inner-loop count not"
1809 " invariant.\n");
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Considering outer-loop vectorization.\n");
1814 info->inner_loop_cond = inner.conds[0];
1817 if (!single_exit (loop))
1818 return opt_result::failure_at (vect_location,
1819 "not vectorized: multiple exits.\n");
1820 if (EDGE_COUNT (loop->header->preds) != 2)
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized:"
1823 " too many incoming edges.\n");
1825 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1826 that the loop is represented as a do-while (with a proper if-guard
1827 before the loop if needed), where the loop header contains all the
1828 executable statements, and the latch is empty. */
1829 if (!empty_block_p (loop->latch)
1830 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: latch block not empty.\n");
1834 /* Make sure the exit is not abnormal. */
1835 if (exit_e->flags & EDGE_ABNORMAL)
1836 return opt_result::failure_at (vect_location,
1837 "not vectorized:"
1838 " abnormal loop exit edge.\n");
1840 info->conds
1841 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1842 &info->number_of_iterations,
1843 &info->number_of_iterationsm1);
1845 if (info->conds.is_empty ())
1846 return opt_result::failure_at
1847 (vect_location,
1848 "not vectorized: complicated exit condition.\n");
1850 /* Determine what the primary and alternate exit conds are. */
1851 for (unsigned i = 0; i < info->conds.length (); i++)
1853 gcond *cond = info->conds[i];
1854 if (exit_e->src == gimple_bb (cond))
1855 std::swap (info->conds[0], info->conds[i]);
1858 if (integer_zerop (info->assumptions)
1859 || !info->number_of_iterations
1860 || chrec_contains_undetermined (info->number_of_iterations))
1861 return opt_result::failure_at
1862 (info->conds[0],
1863 "not vectorized: number of iterations cannot be computed.\n");
1865 if (integer_zerop (info->number_of_iterations))
1866 return opt_result::failure_at
1867 (info->conds[0],
1868 "not vectorized: number of iterations = 0.\n");
1870 if (!(tree_fits_shwi_p (info->number_of_iterations)
1871 && tree_to_shwi (info->number_of_iterations) > 0))
1873 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "Symbolic number of iterations is ");
1877 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1878 dump_printf (MSG_NOTE, "\n");
1882 return opt_result::success ();
1885 /* Create a loop_vec_info for LOOP with SHARED and the
1886 vect_analyze_loop_form result. */
1888 loop_vec_info
1889 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1890 const vect_loop_form_info *info,
1891 loop_vec_info main_loop_info)
1893 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1894 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1895 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1896 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1897 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1898 /* Also record the assumptions for versioning. */
1899 if (!integer_onep (info->assumptions) && !main_loop_info)
1900 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1902 for (gcond *cond : info->conds)
1904 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1905 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1908 for (unsigned i = 1; i < info->conds.length (); i ++)
1909 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1910 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1912 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1914 if (info->inner_loop_cond)
1916 stmt_vec_info inner_loop_cond_info
1917 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1918 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1919 /* If we have an estimate on the number of iterations of the inner
1920 loop, use that to limit the scale for costing; otherwise use
1921 --param vect-inner-loop-cost-factor literally. */
1922 widest_int nit;
1923 if (estimated_stmt_executions (loop->inner, &nit))
1924 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1925 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1928 return loop_vinfo;
1933 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1934 statements, update the vectorization factor. */
1936 static void
1937 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1939 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1940 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1941 int nbbs = loop->num_nodes;
1942 poly_uint64 vectorization_factor;
1943 int i;
1945 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1947 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 gcc_assert (known_ne (vectorization_factor, 0U));
1950 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1951 vectorization factor of the loop is the unrolling factor required by
1952 the SLP instances. If that unrolling factor is 1, we say that we
1953 perform pure SLP on the loop - cross-iteration parallelism is not
1954 exploited. */
1955 bool only_slp_in_loop = true;
1956 for (i = 0; i < nbbs; i++)
1958 basic_block bb = bbs[i];
1959 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1960 gsi_next (&si))
1962 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1963 if (!stmt_info)
1964 continue;
1965 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1966 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1967 && !PURE_SLP_STMT (stmt_info))
1968 /* STMT needs both SLP and loop-based vectorization. */
1969 only_slp_in_loop = false;
1971 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1972 gsi_next (&si))
1974 if (is_gimple_debug (gsi_stmt (si)))
1975 continue;
1976 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1977 stmt_info = vect_stmt_to_vectorize (stmt_info);
1978 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1979 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1980 && !PURE_SLP_STMT (stmt_info))
1981 /* STMT needs both SLP and loop-based vectorization. */
1982 only_slp_in_loop = false;
1986 if (only_slp_in_loop)
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "Loop contains only SLP stmts\n");
1991 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1993 else
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_NOTE, vect_location,
1997 "Loop contains SLP and non-SLP stmts\n");
1998 /* Both the vectorization factor and unroll factor have the form
1999 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2000 so they must have a common multiple. */
2001 vectorization_factor
2002 = force_common_multiple (vectorization_factor,
2003 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
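/* Purely as an assumed numeric illustration: a loop-based factor of 4
   combined with an SLP unrolling factor of 6 yields a common multiple
   of 12 here, while if one factor divides the other the larger of the
   two is simply kept.  */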
2006 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2007 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Updating vectorization factor to ");
2011 dump_dec (MSG_NOTE, vectorization_factor);
2012 dump_printf (MSG_NOTE, ".\n");
2016 /* Return true if STMT_INFO describes a double reduction phi and if
2017 the other phi in the reduction is also relevant for vectorization.
2018 This rejects cases such as:
2020 outer1:
2021 x_1 = PHI <x_3(outer2), ...>;
2024 inner:
2025 x_2 = ...;
2028 outer2:
2029 x_3 = PHI <x_2(inner)>;
2031 if nothing in x_2 or elsewhere makes x_1 relevant. */
2033 static bool
2034 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2036 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2037 return false;
2039 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2042 /* Function vect_analyze_loop_operations.
2044 Scan the loop stmts and make sure they are all vectorizable. */
2046 static opt_result
2047 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2049 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2051 int nbbs = loop->num_nodes;
2052 int i;
2053 stmt_vec_info stmt_info;
2054 bool need_to_vectorize = false;
2055 bool ok;
2057 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2059 auto_vec<stmt_info_for_cost> cost_vec;
2061 for (i = 0; i < nbbs; i++)
2063 basic_block bb = bbs[i];
2065 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2066 gsi_next (&si))
2068 gphi *phi = si.phi ();
2069 ok = true;
2071 stmt_info = loop_vinfo->lookup_stmt (phi);
2072 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2074 (gimple *) phi);
2075 if (virtual_operand_p (gimple_phi_result (phi)))
2076 continue;
2078 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2079 (i.e., a phi in the tail of the outer-loop). */
2080 if (! is_loop_header_bb_p (bb))
2082 /* FORNOW: we currently don't support the case that these phis
2083 are not used in the outer loop (unless it is a double reduction,
2084 i.e., this phi is vect_reduction_def), because this case
2085 would require us to actually do something here. */
2086 if (STMT_VINFO_LIVE_P (stmt_info)
2087 && !vect_active_double_reduction_p (stmt_info))
2088 return opt_result::failure_at (phi,
2089 "Unsupported loop-closed phi"
2090 " in outer-loop.\n");
2092 /* If PHI is used in the outer loop, we check that its operand
2093 is defined in the inner loop. */
2094 if (STMT_VINFO_RELEVANT_P (stmt_info))
2096 tree phi_op;
2098 if (gimple_phi_num_args (phi) != 1)
2099 return opt_result::failure_at (phi, "unsupported phi");
2101 phi_op = PHI_ARG_DEF (phi, 0);
2102 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2103 if (!op_def_info)
2104 return opt_result::failure_at (phi, "unsupported phi\n");
2106 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2107 && (STMT_VINFO_RELEVANT (op_def_info)
2108 != vect_used_in_outer_by_reduction))
2109 return opt_result::failure_at (phi, "unsupported phi\n");
2111 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2112 || (STMT_VINFO_DEF_TYPE (stmt_info)
2113 == vect_double_reduction_def))
2114 && !vectorizable_lc_phi (loop_vinfo,
2115 stmt_info, NULL, NULL))
2116 return opt_result::failure_at (phi, "unsupported phi\n");
2119 continue;
2122 gcc_assert (stmt_info);
2124 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2125 || STMT_VINFO_LIVE_P (stmt_info))
2126 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2127 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2128 /* A scalar-dependence cycle that we don't support. */
2129 return opt_result::failure_at (phi,
2130 "not vectorized:"
2131 " scalar dependence cycle.\n");
2133 if (STMT_VINFO_RELEVANT_P (stmt_info))
2135 need_to_vectorize = true;
2136 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2137 && ! PURE_SLP_STMT (stmt_info))
2138 ok = vectorizable_induction (loop_vinfo,
2139 stmt_info, NULL, NULL,
2140 &cost_vec);
2141 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2142 || (STMT_VINFO_DEF_TYPE (stmt_info)
2143 == vect_double_reduction_def)
2144 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2145 && ! PURE_SLP_STMT (stmt_info))
2146 ok = vectorizable_reduction (loop_vinfo,
2147 stmt_info, NULL, NULL, &cost_vec);
2148 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2149 == vect_first_order_recurrence)
2150 && ! PURE_SLP_STMT (stmt_info))
2151 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2152 &cost_vec);
2155 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2156 if (ok
2157 && STMT_VINFO_LIVE_P (stmt_info)
2158 && !PURE_SLP_STMT (stmt_info))
2159 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2160 -1, false, &cost_vec);
2162 if (!ok)
2163 return opt_result::failure_at (phi,
2164 "not vectorized: relevant phi not "
2165 "supported: %G",
2166 static_cast <gimple *> (phi));
2169 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2170 gsi_next (&si))
2172 gimple *stmt = gsi_stmt (si);
2173 if (!gimple_clobber_p (stmt)
2174 && !is_gimple_debug (stmt))
2176 opt_result res
2177 = vect_analyze_stmt (loop_vinfo,
2178 loop_vinfo->lookup_stmt (stmt),
2179 &need_to_vectorize,
2180 NULL, NULL, &cost_vec);
2181 if (!res)
2182 return res;
2185 } /* bbs */
2187 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2189 /* All operations in the loop are either irrelevant (they deal with loop
2190 control, or are dead), or only used outside the loop and can be moved
2191 out of the loop (e.g. invariants, inductions). The loop can be
2192 optimized away by scalar optimizations. We're better off not
2193 touching this loop. */
2194 if (!need_to_vectorize)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "All the computation can be taken out of the loop.\n");
2199 return opt_result::failure_at
2200 (vect_location,
2201 "not vectorized: redundant loop. no profit to vectorize.\n");
2204 return opt_result::success ();
2207 /* Return true if we know that the iteration count is smaller than the
2208 vectorization factor. Return false if it isn't, or if we can't be sure
2209 either way. */
2211 static bool
2212 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2214 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2216 HOST_WIDE_INT max_niter;
2217 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2218 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2219 else
2220 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2222 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2223 return true;
2225 return false;
2228 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2229 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2230 definitely no, or -1 if it's worth retrying. */
2232 static int
2233 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2234 unsigned *suggested_unroll_factor)
2236 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2239 /* Only loops that can handle partially-populated vectors can have iteration
2240 counts less than the vectorization factor. */
2241 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2242 && vect_known_niters_smaller_than_vf (loop_vinfo))
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "not vectorized: iteration count smaller than "
2247 "vectorization factor.\n");
2248 return 0;
2251 /* If we know the number of iterations we can do better: for the
2252 epilogue we can also decide whether the main loop leaves us
2253 with enough iterations, preferring a smaller vector epilogue that is
2254 then also possibly used for the case where we skip the vector loop. */
2255 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2257 widest_int scalar_niters
2258 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2259 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2261 loop_vec_info orig_loop_vinfo
2262 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2263 unsigned lowest_vf
2264 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2265 int prolog_peeling = 0;
2266 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2267 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2268 if (prolog_peeling >= 0
2269 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2270 lowest_vf))
2272 unsigned gap
2273 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2274 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2275 % lowest_vf + gap);
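/* An assumed numeric illustration: with 1003 scalar iterations, one
   prologue iteration peeled for alignment, no gap and a main-loop VF
   of 4, the epilogue is left with (1003 - 0 - 1) % 4 + 0 == 2 scalar
   iterations.  */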
2278 /* Reject vectorizing for a single scalar iteration, even if
2279 we could in principle implement that using partial vectors. */
2280 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2281 if (scalar_niters <= peeling_gap + 1)
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "not vectorized: loop only has a single "
2286 "scalar iteration.\n");
2287 return 0;
2290 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2292 /* Check that the loop processes at least one full vector. */
2293 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2294 if (known_lt (scalar_niters, vf))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "loop does not have enough iterations "
2299 "to support vectorization.\n");
2300 return 0;
2303 /* If we need to peel an extra epilogue iteration to handle data
2304 accesses with gaps, check that there are enough scalar iterations
2305 available.
2307 The check above is redundant with this one when peeling for gaps,
2308 but the distinction is useful for diagnostics. */
2309 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2310 && known_le (scalar_niters, vf))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "loop does not have enough iterations "
2315 "to support peeling for gaps.\n");
2316 return 0;
2321 /* If using the "very cheap" model, reject cases in which we'd keep
2322 a copy of the scalar code (even if we might be able to vectorize it). */
2323 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2324 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2325 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "some scalar iterations would need to be peeled\n");
2331 return 0;
2334 int min_profitable_iters, min_profitable_estimate;
2335 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2336 &min_profitable_estimate,
2337 suggested_unroll_factor);
2339 if (min_profitable_iters < 0)
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "not vectorized: vectorization not profitable.\n");
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vector version will never be "
2347 "profitable.\n");
2348 return -1;
2351 int min_scalar_loop_bound = (param_min_vect_loop_bound
2352 * assumed_vf);
2354 /* Use the cost model only if it is more conservative than user specified
2355 threshold. */
2356 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2357 min_profitable_iters);
2359 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2361 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2362 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2364 if (dump_enabled_p ())
2365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2366 "not vectorized: vectorization not profitable.\n");
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "not vectorized: iteration count smaller than user "
2370 "specified loop bound parameter or minimum profitable "
2371 "iterations (whichever is more conservative).\n");
2372 return 0;
2375 /* The static profitability threshold min_profitable_estimate includes
2376 the cost of having to check at runtime whether the scalar loop
2377 should be used instead. If it turns out that we don't need or want
2378 such a check, the threshold we should use for the static estimate
2379 is simply the point at which the vector loop becomes more profitable
2380 than the scalar loop. */
2381 if (min_profitable_estimate > min_profitable_iters
2382 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2383 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2384 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2385 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2387 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2389 " choice between the scalar and vector loops\n");
2390 min_profitable_estimate = min_profitable_iters;
2393 /* If the vector loop needs multiple iterations to be beneficial then
2394 things are probably too close to call, and the conservative thing
2395 would be to stick with the scalar code. */
2396 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2397 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "one iteration of the vector loop would be"
2402 " more expensive than the equivalent number of"
2403 " iterations of the scalar loop\n");
2404 return 0;
2407 HOST_WIDE_INT estimated_niter;
2409 /* If we are vectorizing an epilogue then we know the maximum number of
2410 scalar iterations it will cover is at least one lower than the
2411 vectorization factor of the main loop. */
2412 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2413 estimated_niter
2414 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2415 else
2417 estimated_niter = estimated_stmt_executions_int (loop);
2418 if (estimated_niter == -1)
2419 estimated_niter = likely_max_stmt_executions_int (loop);
2421 if (estimated_niter != -1
2422 && ((unsigned HOST_WIDE_INT) estimated_niter
2423 < MAX (th, (unsigned) min_profitable_estimate)))
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "not vectorized: estimated iteration count too "
2428 "small.\n");
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "not vectorized: estimated iteration count smaller "
2432 "than specified loop bound parameter or minimum "
2433 "profitable iterations (whichever is more "
2434 "conservative).\n");
2435 return -1;
2438 return 1;
2441 static opt_result
2442 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2443 vec<data_reference_p> *datarefs,
2444 unsigned int *n_stmts)
2446 *n_stmts = 0;
2447 for (unsigned i = 0; i < loop->num_nodes; i++)
2448 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2449 !gsi_end_p (gsi); gsi_next (&gsi))
2451 gimple *stmt = gsi_stmt (gsi);
2452 if (is_gimple_debug (stmt))
2453 continue;
2454 ++(*n_stmts);
2455 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2456 NULL, 0);
2457 if (!res)
2459 if (is_gimple_call (stmt) && loop->safelen)
2461 tree fndecl = gimple_call_fndecl (stmt), op;
2462 if (fndecl == NULL_TREE
2463 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2465 fndecl = gimple_call_arg (stmt, 0);
2466 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2467 fndecl = TREE_OPERAND (fndecl, 0);
2468 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2470 if (fndecl != NULL_TREE)
2472 cgraph_node *node = cgraph_node::get (fndecl);
2473 if (node != NULL && node->simd_clones != NULL)
2475 unsigned int j, n = gimple_call_num_args (stmt);
2476 for (j = 0; j < n; j++)
2478 op = gimple_call_arg (stmt, j);
2479 if (DECL_P (op)
2480 || (REFERENCE_CLASS_P (op)
2481 && get_base_address (op)))
2482 break;
2484 op = gimple_call_lhs (stmt);
2485 /* Ignore #pragma omp declare simd functions
2486 if they don't have data references in the
2487 call stmt itself. */
2488 if (j == n
2489 && !(op
2490 && (DECL_P (op)
2491 || (REFERENCE_CLASS_P (op)
2492 && get_base_address (op)))))
2493 continue;
2497 return res;
2499 /* If dependence analysis will give up due to the limit on the
2500 number of datarefs, stop here and fail fatally. */
2501 if (datarefs->length ()
2502 > (unsigned)param_loop_max_datarefs_for_datadeps)
2503 return opt_result::failure_at (stmt, "exceeded param "
2504 "loop-max-datarefs-for-datadeps\n");
2506 return opt_result::success ();
2509 /* Look for SLP-only access groups and turn each individual access into its own
2510 group. */
2511 static void
2512 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2514 unsigned int i;
2515 struct data_reference *dr;
2517 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2519 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2520 FOR_EACH_VEC_ELT (datarefs, i, dr)
2522 gcc_assert (DR_REF (dr));
2523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2525 /* Check if the load is a part of an interleaving chain. */
2526 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2528 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2529 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2530 unsigned int group_size = DR_GROUP_SIZE (first_element);
2532 /* Check if this is an SLP-only group. */
2533 if (!STMT_SLP_TYPE (stmt_info)
2534 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2536 /* Dissolve the group. */
2537 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2539 stmt_vec_info vinfo = first_element;
2540 while (vinfo)
2542 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2543 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2544 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2545 DR_GROUP_SIZE (vinfo) = 1;
2546 if (STMT_VINFO_STRIDED_P (first_element)
2547 /* We cannot handle stores with gaps. */
2548 || DR_IS_WRITE (dr_info->dr))
2550 STMT_VINFO_STRIDED_P (vinfo) = true;
2551 DR_GROUP_GAP (vinfo) = 0;
2553 else
2554 DR_GROUP_GAP (vinfo) = group_size - 1;
2555 /* Duplicate and adjust alignment info; it needs to
2556 be present on each group leader, see dr_misalignment. */
2557 if (vinfo != first_element)
2559 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2560 dr_info2->target_alignment = dr_info->target_alignment;
2561 int misalignment = dr_info->misalignment;
2562 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2564 HOST_WIDE_INT diff
2565 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2566 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2567 unsigned HOST_WIDE_INT align_c
2568 = dr_info->target_alignment.to_constant ();
2569 misalignment = (misalignment + diff) % align_c;
2571 dr_info2->misalignment = misalignment;
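/* An assumed numeric illustration: a group leader misaligned by 4 bytes
   against a 16-byte target alignment, with a DR_INIT difference of
   8 bytes for this element, gives (4 + 8) % 16 == 12 here.  */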
2573 vinfo = next;
2580 /* Determine if operating on full vectors for LOOP_VINFO might leave
2581 some scalar iterations still to do. If so, decide how we should
2582 handle those scalar iterations. The possibilities are:
2584 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2585 In this case:
2587 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2588 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2589 LOOP_VINFO_PEELING_FOR_NITER == false
2591 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2592 to handle the remaining scalar iterations. In this case:
2594 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2595 LOOP_VINFO_PEELING_FOR_NITER == true
2597 There are two choices:
2599 (2a) Consider vectorizing the epilogue loop at the same VF as the
2600 main loop, but using partial vectors instead of full vectors.
2601 In this case:
2603 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2605 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2606 In this case:
2608 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
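/* A purely illustrative example with assumed numbers: for VF == 4 and
   1003 scalar iterations, option (1) executes 251 vector iterations,
   the last of which has inactive lanes, while option (2) executes 250
   full-vector iterations and leaves 1003 % 4 == 3 scalar iterations to
   the epilogue loop, which may in turn be handled as in (2a) or (2b).  */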
2611 opt_result
2612 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2614 /* Determine whether there would be any scalar iterations left over. */
2615 bool need_peeling_or_partial_vectors_p
2616 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2618 /* Decide whether to vectorize the loop with partial vectors. */
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2621 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2622 && need_peeling_or_partial_vectors_p)
2624 /* For partial-vector-usage=1, try to push the handling of partial
2625 vectors to the epilogue, with the main loop continuing to operate
2626 on full vectors.
2628 If we are unrolling, we also do not want to use partial vectors. This
2629 is to avoid the overhead of generating multiple masks and also to
2630 avoid having to execute entire iterations of FALSE masked instructions
2631 when dealing with one or fewer full iterations.
2633 ??? We could then end up failing to use partial vectors if we
2634 decide to peel iterations into a prologue, and if the main loop
2635 then ends up processing fewer than VF iterations. */
2636 if ((param_vect_partial_vector_usage == 1
2637 || loop_vinfo->suggested_unroll_factor > 1)
2638 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2639 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2641 else
2642 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "operating on %s vectors%s.\n",
2648 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2649 ? "partial" : "full",
2650 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651 ? " for epilogue loop" : "");
2653 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2654 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2655 && need_peeling_or_partial_vectors_p);
2657 return opt_result::success ();
2660 /* Function vect_analyze_loop_2.
2662 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2663 analyses will record information in some members of LOOP_VINFO. FATAL
2664 indicates if some analysis meets a fatal error. If a non-NULL pointer
2665 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2666 worked-out suggested unroll factor, while a NULL pointer indicates we are
2667 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2668 is to hold the SLP decision when the suggested unroll factor is worked
2669 out. */
2670 static opt_result
2671 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2672 unsigned *suggested_unroll_factor,
2673 bool& slp_done_for_suggested_uf)
2675 opt_result ok = opt_result::success ();
2676 int res;
2677 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2678 poly_uint64 min_vf = 2;
2679 loop_vec_info orig_loop_vinfo = NULL;
2681 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2682 loop_vec_info of the first vectorized loop. */
2683 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2684 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2685 else
2686 orig_loop_vinfo = loop_vinfo;
2687 gcc_assert (orig_loop_vinfo);
2689 /* The first group of checks is independent of the vector size. */
2690 fatal = true;
2692 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2693 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2694 return opt_result::failure_at (vect_location,
2695 "not vectorized: simd if(0)\n");
2697 /* Find all data references in the loop (which correspond to vdefs/vuses)
2698 and analyze their evolution in the loop. */
2700 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2702 /* Gather the data references and count stmts in the loop. */
2703 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2705 opt_result res
2706 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2707 &LOOP_VINFO_DATAREFS (loop_vinfo),
2708 &LOOP_VINFO_N_STMTS (loop_vinfo));
2709 if (!res)
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2713 "not vectorized: loop contains function "
2714 "calls or data references that cannot "
2715 "be analyzed\n");
2716 return res;
2718 loop_vinfo->shared->save_datarefs ();
2720 else
2721 loop_vinfo->shared->check_datarefs ();
2723 /* Analyze the data references and also adjust the minimal
2724 vectorization factor according to the loads and stores. */
2726 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2727 if (!ok)
2729 if (dump_enabled_p ())
2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2731 "bad data references.\n");
2732 return ok;
2735 /* Check if we are applying unroll factor now. */
2736 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2737 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2739 /* If the SLP decision was false when the suggested unroll factor was
2740 worked out, and we are now applying the suggested unroll factor, we can
2741 simply skip all SLP-related analyses this time. */
2742 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2744 /* Classify all cross-iteration scalar data-flow cycles.
2745 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2746 vect_analyze_scalar_cycles (loop_vinfo, slp);
2748 vect_pattern_recog (loop_vinfo);
2750 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2752 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2753 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2755 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2756 if (!ok)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data access.\n");
2761 return ok;
2764 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2766 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2767 if (!ok)
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2771 "unexpected pattern.\n");
2772 return ok;
2775 /* The rest of the analysis below, however, depends on the vector size in some way, so from here on a failure is not fatal. */
2776 fatal = false;
2778 /* Analyze data dependences between the data-refs in the loop
2779 and adjust the maximum vectorization factor according to
2780 the dependences.
2781 FORNOW: fail at the first data dependence that we encounter. */
2783 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2784 if (!ok)
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788 "bad data dependence.\n");
2789 return ok;
2791 if (max_vf != MAX_VECTORIZATION_FACTOR
2792 && maybe_lt (max_vf, min_vf))
2793 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2794 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2796 ok = vect_determine_vectorization_factor (loop_vinfo);
2797 if (!ok)
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801 "can't determine vectorization factor.\n");
2802 return ok;
2804 if (max_vf != MAX_VECTORIZATION_FACTOR
2805 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2806 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2808 /* Compute the scalar iteration cost. */
2809 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2811 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2813 if (slp)
2815 /* Check the SLP opportunities in the loop, analyze and build
2816 SLP trees. */
2817 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2818 if (!ok)
2819 return ok;
2821 /* If there are any SLP instances mark them as pure_slp. */
2822 slp = vect_make_slp_decision (loop_vinfo);
2823 if (slp)
2825 /* Find stmts that need to be both vectorized and SLPed. */
2826 vect_detect_hybrid_slp (loop_vinfo);
2828 /* Update the vectorization factor based on the SLP decision. */
2829 vect_update_vf_for_slp (loop_vinfo);
2831 /* Optimize the SLP graph with the vectorization factor fixed. */
2832 vect_optimize_slp (loop_vinfo);
2834 /* Gather the loads reachable from the SLP graph entries. */
2835 vect_gather_slp_loads (loop_vinfo);
2839 bool saved_can_use_partial_vectors_p
2840 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2842 /* We don't expect to have to roll back to anything other than an empty
2843 set of rgroups. */
2844 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2846 /* This is the point where we can re-start analysis with SLP forced off. */
2847 start_over:
2849 /* Apply the suggested unrolling factor; this was determined by the backend
2850 during finish_cost the first time we ran the analysis for this
2851 vector mode. */
2852 if (applying_suggested_uf)
2853 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2855 /* Now the vectorization factor is final. */
2856 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857 gcc_assert (known_ne (vectorization_factor, 0U));
2859 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2861 dump_printf_loc (MSG_NOTE, vect_location,
2862 "vectorization_factor = ");
2863 dump_dec (MSG_NOTE, vectorization_factor);
2864 dump_printf (MSG_NOTE, ", niters = %wd\n",
2865 LOOP_VINFO_INT_NITERS (loop_vinfo));
2868 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2870 /* Analyze the alignment of the data-refs in the loop.
2871 Fail if a data reference is found that cannot be vectorized. */
2873 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2874 if (!ok)
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "bad data alignment.\n");
2879 return ok;
2882 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2883 It is important to call pruning after vect_analyze_data_ref_accesses,
2884 since we use grouping information gathered by interleaving analysis. */
2885 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2886 if (!ok)
2887 return ok;
2889 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2890 vectorization, since we do not want to add extra peeling or
2891 add versioning for alignment. */
2892 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2893 /* This pass will decide on using loop versioning and/or loop peeling in
2894 order to enhance the alignment of data references in the loop. */
2895 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2896 if (!ok)
2897 return ok;
2899 if (slp)
2901 /* Analyze operations in the SLP instances. Note this may
2902 remove unsupported SLP instances which makes the above
2903 SLP kind detection invalid. */
2904 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2905 vect_slp_analyze_operations (loop_vinfo);
2906 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2908 ok = opt_result::failure_at (vect_location,
2909 "unsupported SLP instances\n");
2910 goto again;
2913 /* Check whether any load in ALL SLP instances is possibly permuted. */
2914 slp_tree load_node, slp_root;
2915 unsigned i, x;
2916 slp_instance instance;
2917 bool can_use_lanes = true;
2918 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2920 slp_root = SLP_INSTANCE_TREE (instance);
2921 int group_size = SLP_TREE_LANES (slp_root);
2922 tree vectype = SLP_TREE_VECTYPE (slp_root);
2923 bool loads_permuted = false;
2924 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2926 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2927 continue;
2928 unsigned j;
2929 stmt_vec_info load_info;
2930 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2931 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2933 loads_permuted = true;
2934 break;
2938 /* If the loads and stores can be handled with load/store-lane
2939 instructions record it and move on to the next instance. */
2940 if (loads_permuted
2941 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2942 && vect_store_lanes_supported (vectype, group_size, false)
2943 != IFN_LAST)
2945 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2947 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2948 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2949 /* Use SLP for strided accesses (or if we can't
2950 load-lanes). */
2951 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2952 || vect_load_lanes_supported
2953 (STMT_VINFO_VECTYPE (stmt_vinfo),
2954 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2955 break;
2958 can_use_lanes
2959 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2961 if (can_use_lanes && dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "SLP instance %p can use load/store-lanes\n",
2964 (void *) instance);
2966 else
2968 can_use_lanes = false;
2969 break;
2973 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2974 with SLP disabled. */
2975 if (can_use_lanes)
2977 ok = opt_result::failure_at (vect_location,
2978 "Built SLP cancelled: can use "
2979 "load/store-lanes\n");
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "Built SLP cancelled: all SLP instances support "
2983 "load/store-lanes\n");
2984 goto again;
2988 /* Dissolve SLP-only groups. */
2989 vect_dissolve_slp_only_groups (loop_vinfo);
2991 /* Scan all the remaining operations in the loop that are not subject
2992 to SLP and make sure they are vectorizable. */
2993 ok = vect_analyze_loop_operations (loop_vinfo);
2994 if (!ok)
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "bad operation or unsupported loop bound.\n");
2999 return ok;
3002 /* For now, we don't expect to mix both masking and length approaches for one
3003 loop; disable it if both are recorded. */
3004 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3005 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3006 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010 "can't vectorize a loop with partial vectors"
3011 " because we don't expect to mix different"
3012 " approaches with partial vectors for the"
3013 " same loop.\n");
3014 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3017 /* If we still have the option of using partial vectors,
3018 check whether we can generate the necessary loop controls. */
3019 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3021 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3023 if (!vect_verify_full_masking (loop_vinfo)
3024 && !vect_verify_full_masking_avx512 (loop_vinfo))
3025 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3027 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3028 if (!vect_verify_loop_lens (loop_vinfo))
3029 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3032 /* If we're vectorizing a loop that uses length "controls" and
3033 can iterate more than once, we apply the decrementing IV approach
3034 to the loop control. */
3035 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3036 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3037 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3038 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3039 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3040 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3041 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3043 /* If a loop uses length controls and has a decrementing loop control IV,
3044 we will normally pass that IV through a MIN_EXPR to calculate the
3045 basis for the length controls. E.g. in a loop that processes one
3046 element per scalar iteration, the number of elements would be
3047 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3049 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3050 step, since only the final iteration of the vector loop can have
3051 inactive lanes.
3053 However, some targets have a dedicated instruction for calculating the
3054 preferred length, given the total number of elements that still need to
3055 be processed. This is encapsulated in the SELECT_VL internal function.
3057 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3058 to determine the basis for the length controls. However, unlike the
3059 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3060 lanes inactive in any iteration of the vector loop, not just the last
3061 iteration. This SELECT_VL approach therefore requires us to use pointer
3062 IVs with variable steps.
3064 Once we've decided how many elements should be processed by one
3065 iteration of the vector loop, we need to populate the rgroup controls.
3066 If a loop has multiple rgroups, we need to make sure that those rgroups
3067 "line up" (that is, they must be consistent about which elements are
3068 active and which aren't). This is done by vect_adjust_loop_lens_control.
3070 In principle, it would be possible to use vect_adjust_loop_lens_control
3071 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3072 However:
3074 (1) In practice, it only makes sense to use SELECT_VL when a vector
3075 operation will be controlled directly by the result. It is not
3076 worth using SELECT_VL if it would only be the input to other
3077 calculations.
3079 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3080 pointer IV will need N updates by a variable amount (N-1 updates
3081 within the iteration and 1 update to move to the next iteration).
3083 Because of this, we prefer to use the MIN_EXPR approach whenever there
3084 is more than one length control.
3086 In addition, SELECT_VL always operates to a granularity of 1 unit.
3087 If we wanted to use it to control an SLP operation on N consecutive
3088 elements, we would need to make the SELECT_VL inputs measure scalar
3089 iterations (rather than elements) and then multiply the SELECT_VL
3090 result by N. But using SELECT_VL this way is inefficient because
3091 of (1) above.
3093 Finally, we don't apply SELECT_VL on a single-rgroup loop when both
3094 of the following are satisfied:
3096 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3097 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3099 Since SELECT_VL (variable step) will make SCEV analysis fail and we will
3100 then fail to gain the benefits of subsequent unroll optimizations, we
3101 prefer using the MIN_EXPR approach in this situation. */
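/* A rough sketch only (pseudo code, not the GIMPLE actually generated;
   REMAIN and LEN are assumed names) of the two schemes for a loop that
   processes N elements:

     MIN_EXPR approach:
       remain = N;
       do {
         len = MIN (remain, VF);
         ... process LEN elements; pointer IVs advance by the
             loop-invariant VF ...
         remain -= len;
       } while (remain > 0);

     SELECT_VL approach:
       remain = N;
       do {
         len = SELECT_VL (remain, VF);
         ... process LEN elements; pointer IVs advance by the
             variable LEN ...
         remain -= len;
       } while (remain > 0);

   With MIN_EXPR only the final iteration can have inactive lanes;
   SELECT_VL may pick a shorter length in any iteration.  */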
3102 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3104 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3105 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3106 OPTIMIZE_FOR_SPEED)
3107 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3108 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3109 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3110 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3111 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3114 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3115 assuming that the loop will be used as a main loop. We will redo
3116 this analysis later if we instead decide to use the loop as an
3117 epilogue loop. */
3118 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3119 if (!ok)
3120 return ok;
3122 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3123 to be able to handle fewer than VF scalars, or needs to have a lower VF
3124 than the main loop. */
3125 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3126 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3128 poly_uint64 unscaled_vf
3129 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3130 orig_loop_vinfo->suggested_unroll_factor);
3131 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3132 return opt_result::failure_at (vect_location,
3133 "Vectorization factor too high for"
3134 " epilogue loop.\n");
3137 /* Check the costings of the loop make vectorizing worthwhile. */
3138 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3139 if (res < 0)
3141 ok = opt_result::failure_at (vect_location,
3142 "Loop costings may not be worthwhile.\n");
3143 goto again;
3145 if (!res)
3146 return opt_result::failure_at (vect_location,
3147 "Loop costings not worthwhile.\n");
3149 /* If an epilogue loop is required make sure we can create one. */
3150 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3151 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3153 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3155 if (!vect_can_advance_ivs_p (loop_vinfo)
3156 || !slpeel_can_duplicate_loop_p (loop,
3157 LOOP_VINFO_IV_EXIT (loop_vinfo),
3158 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3160 ok = opt_result::failure_at (vect_location,
3161 "not vectorized: can't create required "
3162 "epilog loop\n");
3163 goto again;
3167 /* During peeling, we need to check whether the number of loop iterations
3168 is enough for both the peeled prolog loop and the vector loop. This
3169 check can be merged with the threshold check of loop versioning, so
3170 increase the threshold for this case if necessary.
3172 If we are analyzing an epilogue we still want to check what its
3173 versioning threshold would be. If we decide to vectorize the epilogues we
3174 will want to use the lowest versioning threshold of all epilogues and main
3175 loop. This will enable us to enter a vectorized epilogue even when
3176 versioning the loop. We can't simply check whether the epilogue requires
3177 versioning though since we may have skipped some versioning checks when
3178 analyzing the epilogue. For instance, checks for alias versioning will be
3179 skipped when dealing with epilogues as we assume we already checked them
3180 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
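/* Assumed numbers, purely for illustration: with VF == 4, an unknown
   misalignment peeled via a V4SI access (up to 4 - 1 == 3 prologue
   iterations), full vectors, and peeling for gaps, NITERS_TH below
   works out to 3 + 4 + 1 == 8, and is then raised to TH if the runtime
   profitability check applies and TH is larger.  */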
3181 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3183 poly_uint64 niters_th = 0;
3184 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3186 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3188 /* Niters for peeled prolog loop. */
3189 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3191 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3192 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3193 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3195 else
3196 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3199 /* Niters for at least one iteration of vectorized loop. */
3200 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3201 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3202 /* One additional iteration because of peeling for gap. */
3203 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3204 niters_th += 1;
3206 /* Use the same condition as vect_transform_loop to decide when to use
3207 the cost to determine a versioning threshold. */
3208 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3209 && ordered_p (th, niters_th))
3210 niters_th = ordered_max (poly_uint64 (th), niters_th);
3212 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3215 gcc_assert (known_eq (vectorization_factor,
3216 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3218 slp_done_for_suggested_uf = slp;
3220 /* Ok to vectorize! */
3221 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3222 return opt_result::success ();
3224 again:
3225 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3226 gcc_assert (!ok);
3228 /* Try again with SLP forced off, but if we didn't do any SLP there is
3229 no point in re-trying. */
3230 if (!slp)
3231 return ok;
3233 /* If the SLP decision was true when the suggested unroll factor was
3234 worked out, and we are applying the suggested unroll factor, we don't
3235 need to re-try any more. */
3236 if (applying_suggested_uf && slp_done_for_suggested_uf)
3237 return ok;
3239 /* If there are reduction chains re-trying will fail anyway. */
3240 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3241 return ok;
3243 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3244 via interleaving or lane instructions. */
3245 slp_instance instance;
3246 slp_tree node;
3247 unsigned i, j;
3248 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3250 stmt_vec_info vinfo;
3251 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3252 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3253 continue;
3254 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3255 unsigned int size = DR_GROUP_SIZE (vinfo);
3256 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3257 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3258 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3259 && ! vect_grouped_store_supported (vectype, size))
3260 return opt_result::failure_at (vinfo->stmt,
3261 "unsupported grouped store\n");
3262 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3264 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3265 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3266 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3267 size = DR_GROUP_SIZE (vinfo);
3268 vectype = STMT_VINFO_VECTYPE (vinfo);
3269 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3270 && ! vect_grouped_load_supported (vectype, single_element_p,
3271 size))
3272 return opt_result::failure_at (vinfo->stmt,
3273 "unsupported grouped load\n");
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_NOTE, vect_location,
3279 "re-trying with SLP disabled\n");
3281 /* Roll back state appropriately. No SLP this time. */
3282 slp = false;
3283 /* Restore the vectorization factor as it was without SLP. */
3284 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3285 /* Free the SLP instances. */
3286 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3287 vect_free_slp_instance (instance);
3288 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3289 /* Reset SLP type to loop_vect on all stmts. */
3290 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3292 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3293 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3294 !gsi_end_p (si); gsi_next (&si))
3296 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3297 STMT_SLP_TYPE (stmt_info) = loop_vect;
3298 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3299 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3301 /* vectorizable_reduction adjusts reduction stmt def-types,
3302 restore them to that of the PHI. */
3303 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3304 = STMT_VINFO_DEF_TYPE (stmt_info);
3305 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3306 (STMT_VINFO_REDUC_DEF (stmt_info)))
3307 = STMT_VINFO_DEF_TYPE (stmt_info);
3310 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3311 !gsi_end_p (si); gsi_next (&si))
3313 if (is_gimple_debug (gsi_stmt (si)))
3314 continue;
3315 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3316 STMT_SLP_TYPE (stmt_info) = loop_vect;
3317 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3319 stmt_vec_info pattern_stmt_info
3320 = STMT_VINFO_RELATED_STMT (stmt_info);
3321 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3322 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3324 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3325 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3326 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3327 !gsi_end_p (pi); gsi_next (&pi))
3328 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3329 = loop_vect;
3333 /* Free optimized alias test DDRS. */
3334 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3335 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3336 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3337 /* Reset target cost data. */
3338 delete loop_vinfo->vector_costs;
3339 loop_vinfo->vector_costs = nullptr;
3340 /* Reset accumulated rgroup information. */
3341 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3342 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3343 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3344 /* Reset assorted flags. */
3345 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3346 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3347 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3348 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3349 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3350 = saved_can_use_partial_vectors_p;
3352 goto start_over;
3355 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3356 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3357 OLD_LOOP_VINFO is better unless something specifically indicates
3358 otherwise.
3360 Note that this deliberately isn't a partial order. */
3362 static bool
3363 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3364 loop_vec_info old_loop_vinfo)
3366 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3367 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3369 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3370 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3372 /* Always prefer a VF of loop->simdlen over any other VF. */
3373 if (loop->simdlen)
3375 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3376 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3377 if (new_simdlen_p != old_simdlen_p)
3378 return new_simdlen_p;
3381 const auto *old_costs = old_loop_vinfo->vector_costs;
3382 const auto *new_costs = new_loop_vinfo->vector_costs;
3383 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3384 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3386 return new_costs->better_main_loop_than_p (old_costs);
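/* As an illustration of the simdlen rule above (hypothetical loop): for a
   loop annotated with "#pragma omp simd simdlen(8)" a candidate with
   VF 8 is preferred over one with VF 4 or 16 regardless of the cost
   comparison, while two candidates that both match (or both miss) the
   requested simdlen fall through to the cost-based comparison.  */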
3389 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3390 true if we should. */
3392 static bool
3393 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3394 loop_vec_info old_loop_vinfo)
3396 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3397 return false;
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location,
3401 "***** Preferring vector mode %s to vector mode %s\n",
3402 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3403 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3404 return true;
3407 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as an epilogue if MAIN_LOOP_VINFO is
3408 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3409 MODE_I to the next mode useful to analyze.
3410 Return the loop_vinfo on success and wrapped null on failure. */
3412 static opt_loop_vec_info
3413 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3414 const vect_loop_form_info *loop_form_info,
3415 loop_vec_info main_loop_vinfo,
3416 const vector_modes &vector_modes, unsigned &mode_i,
3417 machine_mode &autodetected_vector_mode,
3418 bool &fatal)
3420 loop_vec_info loop_vinfo
3421 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3423 machine_mode vector_mode = vector_modes[mode_i];
3424 loop_vinfo->vector_mode = vector_mode;
3425 unsigned int suggested_unroll_factor = 1;
3426 bool slp_done_for_suggested_uf = false;
3428 /* Run the main analysis. */
3429 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3430 &suggested_unroll_factor,
3431 slp_done_for_suggested_uf);
3432 if (dump_enabled_p ())
3433 dump_printf_loc (MSG_NOTE, vect_location,
3434 "***** Analysis %s with vector mode %s\n",
3435 res ? "succeeded" : "failed",
3436 GET_MODE_NAME (loop_vinfo->vector_mode));
3438 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3440 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_NOTE, vect_location,
3442 "***** Re-trying analysis for unrolling"
3443 " with unroll factor %d and slp %s.\n",
3444 suggested_unroll_factor,
3445 slp_done_for_suggested_uf ? "on" : "off");
3446 loop_vec_info unroll_vinfo
3447 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3448 unroll_vinfo->vector_mode = vector_mode;
3449 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3450 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3451 slp_done_for_suggested_uf);
3452 if (new_res)
3454 delete loop_vinfo;
3455 loop_vinfo = unroll_vinfo;
3457 else
3458 delete unroll_vinfo;
3461 /* Remember the autodetected vector mode. */
3462 if (vector_mode == VOIDmode)
3463 autodetected_vector_mode = loop_vinfo->vector_mode;
3465 /* Advance mode_i, first skipping modes that would result in the
3466 same analysis result. */
3467 while (mode_i + 1 < vector_modes.length ()
3468 && vect_chooses_same_modes_p (loop_vinfo,
3469 vector_modes[mode_i + 1]))
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "***** The result for vector mode %s would"
3474 " be the same\n",
3475 GET_MODE_NAME (vector_modes[mode_i + 1]));
3476 mode_i += 1;
3478 if (mode_i + 1 < vector_modes.length ()
3479 && VECTOR_MODE_P (autodetected_vector_mode)
3480 && (related_vector_mode (vector_modes[mode_i + 1],
3481 GET_MODE_INNER (autodetected_vector_mode))
3482 == autodetected_vector_mode)
3483 && (related_vector_mode (autodetected_vector_mode,
3484 GET_MODE_INNER (vector_modes[mode_i + 1]))
3485 == vector_modes[mode_i + 1]))
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "***** Skipping vector mode %s, which would"
3490 " repeat the analysis for %s\n",
3491 GET_MODE_NAME (vector_modes[mode_i + 1]),
3492 GET_MODE_NAME (autodetected_vector_mode));
3493 mode_i += 1;
3495 mode_i++;
3497 if (!res)
3499 delete loop_vinfo;
3500 if (fatal)
3501 gcc_checking_assert (main_loop_vinfo == NULL);
3502 return opt_loop_vec_info::propagate_failure (res);
3505 return opt_loop_vec_info::success (loop_vinfo);
3508 /* Function vect_analyze_loop.
3510 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3511 for it. The different analyses will record information in the
3512 loop_vec_info struct. */
3513 opt_loop_vec_info
3514 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3516 DUMP_VECT_SCOPE ("analyze_loop_nest");
3518 if (loop_outer (loop)
3519 && loop_vec_info_for_loop (loop_outer (loop))
3520 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3521 return opt_loop_vec_info::failure_at (vect_location,
3522 "outer-loop already vectorized.\n");
3524 if (!find_loop_nest (loop, &shared->loop_nest))
3525 return opt_loop_vec_info::failure_at
3526 (vect_location,
3527 "not vectorized: loop nest containing two or more consecutive inner"
3528 " loops cannot be vectorized\n");
3530 /* Analyze the loop form. */
3531 vect_loop_form_info loop_form_info;
3532 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3533 if (!res)
3535 if (dump_enabled_p ())
3536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3537 "bad loop form.\n");
3538 return opt_loop_vec_info::propagate_failure (res);
3540 if (!integer_onep (loop_form_info.assumptions))
3542 /* We consider to vectorize this loop by versioning it under
3543 some assumptions. In order to do this, we need to clear
3544 existing information computed by scev and niter analyzer. */
3545 scev_reset_htab ();
3546 free_numbers_of_iterations_estimates (loop);
3547 /* Also set flag for this loop so that following scev and niter
3548 analysis are done under the assumptions. */
3549 loop_constraint_set (loop, LOOP_C_FINITE);
3552 auto_vector_modes vector_modes;
3553 /* Autodetect first vector size we try. */
3554 vector_modes.safe_push (VOIDmode);
3555 unsigned int autovec_flags
3556 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3557 loop->simdlen != 0);
3558 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3559 && !unlimited_cost_model (loop));
3560 machine_mode autodetected_vector_mode = VOIDmode;
3561 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3562 unsigned int mode_i = 0;
3563 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3565 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3566 a mode has not been analyzed. */
3567 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3568 for (unsigned i = 0; i < vector_modes.length (); ++i)
3569 cached_vf_per_mode.safe_push (0);
3571 /* First determine the main loop vectorization mode, either the first
3572 one that works, starting with auto-detecting the vector mode and then
3573 following the targets order of preference, or the one with the
3574 lowest cost if pick_lowest_cost_p. */
3575 while (1)
3577 bool fatal;
3578 unsigned int last_mode_i = mode_i;
3579 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3580 failed. */
3581 cached_vf_per_mode[last_mode_i] = -1;
3582 opt_loop_vec_info loop_vinfo
3583 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3584 NULL, vector_modes, mode_i,
3585 autodetected_vector_mode, fatal);
3586 if (fatal)
3587 break;
3589 if (loop_vinfo)
3591 /* Analysis has been successful so update the VF value. The
3592 VF should always be a multiple of unroll_factor and we want to
3593 capture the original VF here. */
3594 cached_vf_per_mode[last_mode_i]
3595 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3596 loop_vinfo->suggested_unroll_factor);
3597 /* Once we hit the desired simdlen for the first time,
3598 discard any previous attempts. */
3599 if (simdlen
3600 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3602 delete first_loop_vinfo;
3603 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3604 simdlen = 0;
3606 else if (pick_lowest_cost_p
3607 && first_loop_vinfo
3608 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3610 /* Pick loop_vinfo over first_loop_vinfo. */
3611 delete first_loop_vinfo;
3612 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3614 if (first_loop_vinfo == NULL)
3615 first_loop_vinfo = loop_vinfo;
3616 else
3618 delete loop_vinfo;
3619 loop_vinfo = opt_loop_vec_info::success (NULL);
3622 /* Commit to first_loop_vinfo if we have no reason to try
3623 alternatives. */
3624 if (!simdlen && !pick_lowest_cost_p)
3625 break;
3627 if (mode_i == vector_modes.length ()
3628 || autodetected_vector_mode == VOIDmode)
3629 break;
3631 /* Try the next biggest vector size. */
3632 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_NOTE, vect_location,
3634 "***** Re-trying analysis with vector mode %s\n",
3635 GET_MODE_NAME (vector_modes[mode_i]));
3637 if (!first_loop_vinfo)
3638 return opt_loop_vec_info::propagate_failure (res);
3640 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "***** Choosing vector mode %s\n",
3643 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3645 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3646 enabled, SIMDUID is not set, it is the innermost loop and we have
3647 either already found the loop's SIMDLEN or there was no SIMDLEN to
3648 begin with.
3649 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3650 bool vect_epilogues = (!simdlen
3651 && loop->inner == NULL
3652 && param_vect_epilogues_nomask
3653 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3654 && !loop->simduid);
3655 if (!vect_epilogues)
3656 return first_loop_vinfo;
3658 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3659 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3661 /* For epilogues start the analysis from the first mode. The motivation
3662 behind starting from the beginning comes from cases where the VECTOR_MODES
3663 array may contain length-agnostic and length-specific modes. Their
3664 ordering is not guaranteed, so we could end up picking a mode for the main
3665 loop that is after the epilogue's optimal mode. */
3666 vector_modes[0] = autodetected_vector_mode;
3667 mode_i = 0;
3669 bool supports_partial_vectors =
3670 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3671 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3673 while (1)
3675 /* If the target does not support partial vectors we can shorten the
3676 number of modes to analyze for the epilogue as we know we can't pick a
3677 mode that would lead to a VF at least as big as the
3678 FIRST_VINFO_VF. */
3679 if (!supports_partial_vectors
3680 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3682 mode_i++;
3683 if (mode_i == vector_modes.length ())
3684 break;
3685 continue;
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_NOTE, vect_location,
3690 "***** Re-trying epilogue analysis with vector "
3691 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3693 bool fatal;
3694 opt_loop_vec_info loop_vinfo
3695 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3696 first_loop_vinfo,
3697 vector_modes, mode_i,
3698 autodetected_vector_mode, fatal);
3699 if (fatal)
3700 break;
3702 if (loop_vinfo)
3704 if (pick_lowest_cost_p)
3706 /* Keep trying to roll back vectorization attempts while the
3707 loop_vec_infos they produced were worse than this one. */
3708 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3709 while (!vinfos.is_empty ()
3710 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3712 gcc_assert (vect_epilogues);
3713 delete vinfos.pop ();
3716 /* For now only allow one epilogue loop. */
3717 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3719 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3720 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3721 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3722 || maybe_ne (lowest_th, 0U));
3723 /* Keep track of the known smallest versioning
3724 threshold. */
3725 if (ordered_p (lowest_th, th))
3726 lowest_th = ordered_min (lowest_th, th);
3728 else
3730 delete loop_vinfo;
3731 loop_vinfo = opt_loop_vec_info::success (NULL);
3734 /* For now only allow one epilogue loop, but allow
3735 pick_lowest_cost_p to replace it, so commit to the
3736 first epilogue if we have no reason to try alternatives. */
3737 if (!pick_lowest_cost_p)
3738 break;
3741 if (mode_i == vector_modes.length ())
3742 break;
3746 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3748 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "***** Choosing epilogue vector mode %s\n",
3752 GET_MODE_NAME
3753 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3756 return first_loop_vinfo;
3759 /* Return true if there is an in-order reduction function for CODE, storing
3760 it in *REDUC_FN if so. */
3762 static bool
3763 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3765 /* We support MINUS_EXPR by negating the operand. This also preserves an
3766 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3767 (-0.0) = -0.0. */
3768 if (code == PLUS_EXPR || code == MINUS_EXPR)
3770 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3771 return true;
3773 return false;
3776 /* Function reduction_fn_for_scalar_code
3778 Input:
3779 CODE - tree_code of a reduction operation.
3781 Output:
3782 REDUC_FN - the corresponding internal function to be used to reduce the
3783 vector of partial results into a single scalar result, or IFN_LAST
3784 if the operation is a supported reduction operation, but does not have
3785 such an internal function.
3787 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3789 bool
3790 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3792 if (code.is_tree_code ())
3793 switch (tree_code (code))
3795 case MAX_EXPR:
3796 *reduc_fn = IFN_REDUC_MAX;
3797 return true;
3799 case MIN_EXPR:
3800 *reduc_fn = IFN_REDUC_MIN;
3801 return true;
3803 case PLUS_EXPR:
3804 *reduc_fn = IFN_REDUC_PLUS;
3805 return true;
3807 case BIT_AND_EXPR:
3808 *reduc_fn = IFN_REDUC_AND;
3809 return true;
3811 case BIT_IOR_EXPR:
3812 *reduc_fn = IFN_REDUC_IOR;
3813 return true;
3815 case BIT_XOR_EXPR:
3816 *reduc_fn = IFN_REDUC_XOR;
3817 return true;
3819 case MULT_EXPR:
3820 case MINUS_EXPR:
3821 *reduc_fn = IFN_LAST;
3822 return true;
3824 default:
3825 return false;
3827 else
3828 switch (combined_fn (code))
3830 CASE_CFN_FMAX:
3831 *reduc_fn = IFN_REDUC_FMAX;
3832 return true;
3834 CASE_CFN_FMIN:
3835 *reduc_fn = IFN_REDUC_FMIN;
3836 return true;
3838 default:
3839 return false;
3843 /* If there is a neutral value X such that a reduction would not be affected
3844 by the introduction of additional X elements, return that X, otherwise
3845 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3846 of the scalar elements. If the reduction has just a single initial value
3847 then INITIAL_VALUE is that value, otherwise it is null.
3848 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3849 In that case no signed zero is returned. */
3851 tree
3852 neutral_op_for_reduction (tree scalar_type, code_helper code,
3853 tree initial_value, bool as_initial)
3855 if (code.is_tree_code ())
3856 switch (tree_code (code))
3858 case DOT_PROD_EXPR:
3859 case SAD_EXPR:
3860 case MINUS_EXPR:
3861 case BIT_IOR_EXPR:
3862 case BIT_XOR_EXPR:
3863 return build_zero_cst (scalar_type);
3864 case WIDEN_SUM_EXPR:
3865 case PLUS_EXPR:
3866 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3867 return build_real (scalar_type, dconstm0);
3868 else
3869 return build_zero_cst (scalar_type);
3871 case MULT_EXPR:
3872 return build_one_cst (scalar_type);
3874 case BIT_AND_EXPR:
3875 return build_all_ones_cst (scalar_type);
3877 case MAX_EXPR:
3878 case MIN_EXPR:
3879 return initial_value;
3881 default:
3882 return NULL_TREE;
3884 else
3885 switch (combined_fn (code))
3887 CASE_CFN_FMIN:
3888 CASE_CFN_FMAX:
3889 return initial_value;
3891 default:
3892 return NULL_TREE;
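/* To illustrate the mapping above with hypothetical scalar loops:

     int sum = 0;   for (...) sum += a[i];         pad with 0
     int prd = 1;   for (...) prd *= a[i];         pad with 1
     int all = ~0;  for (...) all &= a[i];         pad with ~0
     int m = init;  for (...) m = MIN (m, a[i]);   pad with the initial value

   padding the vector with the listed neutral element leaves the final
   result unchanged.  For a float PLUS_EXPR reduction that honors signed
   zeros the neutral element used mid-reduction is -0.0, because adding
   +0.0 would turn an intermediate -0.0 result into +0.0.  */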
3896 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3897 STMT is printed with a message MSG. */
3899 static void
3900 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3902 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3905 /* Return true if we need an in-order reduction for operation CODE
3906 on type TYPE, i.e. whether it is unsafe to reassociate the
3907 operation for that type. */
3909 bool
3910 needs_fold_left_reduction_p (tree type, code_helper code)
3912 /* CHECKME: check for !flag_finite_math_only too? */
3913 if (SCALAR_FLOAT_TYPE_P (type))
3915 if (code.is_tree_code ())
3916 switch (tree_code (code))
3918 case MIN_EXPR:
3919 case MAX_EXPR:
3920 return false;
3922 default:
3923 return !flag_associative_math;
3925 else
3926 switch (combined_fn (code))
3928 CASE_CFN_FMIN:
3929 CASE_CFN_FMAX:
3930 return false;
3932 default:
3933 return !flag_associative_math;
3937 if (INTEGRAL_TYPE_P (type))
3938 return (!code.is_tree_code ()
3939 || !operation_no_trapping_overflow (type, tree_code (code)));
3941 if (SAT_FIXED_POINT_TYPE_P (type))
3942 return true;
3944 return false;
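/* For example, without -fassociative-math a float add reduction has to be
   computed strictly in order because FP addition does not reassociate:

     (1.0 + 1e100) - 1e100 == 0.0    but    1.0 + (1e100 - 1e100) == 1.0

   whereas MIN/MAX and FMIN/FMAX reductions give the same result in any
   order, and an integer addition needs in-order handling when overflow
   can trap (e.g. with -ftrapv).  */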
3947 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3948 has a handled computation expression. Store the main reduction
3949 operation in *CODE. */
3951 static bool
3952 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3953 tree loop_arg, code_helper *code,
3954 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3956 auto_bitmap visited;
3957 tree lookfor = PHI_RESULT (phi);
3958 ssa_op_iter curri;
3959 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3960 while (USE_FROM_PTR (curr) != loop_arg)
3961 curr = op_iter_next_use (&curri);
3962 curri.i = curri.numops;
3965 path.safe_push (std::make_pair (curri, curr));
3966 tree use = USE_FROM_PTR (curr);
3967 if (use == lookfor)
3968 break;
3969 gimple *def = SSA_NAME_DEF_STMT (use);
3970 if (gimple_nop_p (def)
3971 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3973 pop:
3976 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3977 curri = x.first;
3978 curr = x.second;
3980 curr = op_iter_next_use (&curri);
3981 /* Skip already visited or non-SSA operands (from iterating
3982 over PHI args). */
3983 while (curr != NULL_USE_OPERAND_P
3984 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3985 || ! bitmap_set_bit (visited,
3986 SSA_NAME_VERSION
3987 (USE_FROM_PTR (curr)))));
3989 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3990 if (curr == NULL_USE_OPERAND_P)
3991 break;
3993 else
3995 if (gimple_code (def) == GIMPLE_PHI)
3996 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3997 else
3998 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3999 while (curr != NULL_USE_OPERAND_P
4000 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4001 || ! bitmap_set_bit (visited,
4002 SSA_NAME_VERSION
4003 (USE_FROM_PTR (curr)))))
4004 curr = op_iter_next_use (&curri);
4005 if (curr == NULL_USE_OPERAND_P)
4006 goto pop;
4009 while (1);
4010 if (dump_file && (dump_flags & TDF_DETAILS))
4012 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4013 unsigned i;
4014 std::pair<ssa_op_iter, use_operand_p> *x;
4015 FOR_EACH_VEC_ELT (path, i, x)
4016 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4017 dump_printf (MSG_NOTE, "\n");
4020 /* Check whether the reduction path detected is valid. */
4021 bool fail = path.length () == 0;
4022 bool neg = false;
4023 int sign = -1;
4024 *code = ERROR_MARK;
4025 for (unsigned i = 1; i < path.length (); ++i)
4027 gimple *use_stmt = USE_STMT (path[i].second);
4028 gimple_match_op op;
4029 if (!gimple_extract_op (use_stmt, &op))
4031 fail = true;
4032 break;
4034 unsigned int opi = op.num_ops;
4035 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4037 /* The following makes sure we can compute the operand index
4038 easily; it also mostly disallows chaining via COND_EXPR condition
4039 operands. */
4040 for (opi = 0; opi < op.num_ops; ++opi)
4041 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4042 break;
4044 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4046 for (opi = 0; opi < op.num_ops; ++opi)
4047 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4048 break;
4050 if (opi == op.num_ops)
4052 fail = true;
4053 break;
4055 op.code = canonicalize_code (op.code, op.type);
4056 if (op.code == MINUS_EXPR)
4058 op.code = PLUS_EXPR;
4059 /* Track whether we negate the reduction value each iteration. */
4060 if (op.ops[1] == op.ops[opi])
4061 neg = ! neg;
4063 if (CONVERT_EXPR_CODE_P (op.code)
4064 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4066 else if (*code == ERROR_MARK)
4068 *code = op.code;
4069 sign = TYPE_SIGN (op.type);
4071 else if (op.code != *code)
4073 fail = true;
4074 break;
4076 else if ((op.code == MIN_EXPR
4077 || op.code == MAX_EXPR)
4078 && sign != TYPE_SIGN (op.type))
4080 fail = true;
4081 break;
4083 /* Check that the op is used in only a single stmt. For the
4084 non-value-changing tail and the last stmt allow out-of-loop uses.
4085 ??? We could relax this and handle arbitrary live stmts by
4086 forcing a scalar epilogue for example. */
4087 imm_use_iterator imm_iter;
4088 use_operand_p use_p;
4089 gimple *op_use_stmt;
4090 unsigned cnt = 0;
4091 bool cond_fn_p = op.code.is_internal_fn ()
4092 && (conditional_internal_fn_code (internal_fn (op.code))
4093 != ERROR_MARK);
4095 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4097 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4098 op1 twice (once as definition, once as else) in the same operation.
4099 Allow this. */
4100 if (cond_fn_p)
4102 gcall *call = dyn_cast<gcall *> (use_stmt);
4103 unsigned else_pos
4104 = internal_fn_else_index (internal_fn (op.code));
4106 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4108 if (j == else_pos)
4109 continue;
4110 if (gimple_call_arg (call, j) == op.ops[opi])
4111 cnt++;
4114 else if (!is_gimple_debug (op_use_stmt)
4115 && (*code != ERROR_MARK
4116 || flow_bb_inside_loop_p (loop,
4117 gimple_bb (op_use_stmt))))
4118 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4119 cnt++;
4122 if (cnt != 1)
4124 fail = true;
4125 break;
4128 return ! fail && ! neg && *code != ERROR_MARK;
4131 bool
4132 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4133 tree loop_arg, enum tree_code code)
4135 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4136 code_helper code_;
4137 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4138 && code_ == code);
4143 /* Function vect_is_simple_reduction
4145 (1) Detect a cross-iteration def-use cycle that represents a simple
4146 reduction computation. We look for the following pattern:
4148 loop_header:
4149 a1 = phi < a0, a2 >
4150 a3 = ...
4151 a2 = operation (a3, a1)
4155 a3 = ...
4156 loop_header:
4157 a1 = phi < a0, a2 >
4158 a2 = operation (a3, a1)
4160 such that:
4161 1. operation is commutative and associative and it is safe to
4162 change the order of the computation
4163 2. no uses for a2 in the loop (a2 is used out of the loop)
4164 3. no uses of a1 in the loop besides the reduction operation
4165 4. no uses of a1 outside the loop.
4167 Conditions 1,4 are tested here.
4168 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4170 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4171 nested cycles.
4173 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4174 reductions:
4176 a1 = phi < a0, a2 >
4177 inner loop (def of a3)
4178 a2 = phi < a3 >
4180 (4) Detect condition expressions, i.e.:
4181 for (int i = 0; i < N; i++)
4182 if (a[i] < val)
4183 ret_val = a[i];
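   As a concrete source-level example, pattern (1) above corresponds to a
   loop such as

   int sum = init;
   for (i = 0; i < N; i++)
   sum += a[i];

   where a1 is the loop PHI of sum, a3 is the value added in the current
   iteration (here the load of a[i]) and a2 is the updated sum.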
4187 static stmt_vec_info
4188 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4189 bool *double_reduc, bool *reduc_chain_p, bool slp)
4191 gphi *phi = as_a <gphi *> (phi_info->stmt);
4192 gimple *phi_use_stmt = NULL;
4193 imm_use_iterator imm_iter;
4194 use_operand_p use_p;
4196 *double_reduc = false;
4197 *reduc_chain_p = false;
4198 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4200 tree phi_name = PHI_RESULT (phi);
4201 /* ??? If there are no uses of the PHI result the inner loop reduction
4202 won't be detected as possibly double-reduction by vectorizable_reduction
4203 because that tries to walk the PHI arg from the preheader edge which
4204 can be constant. See PR60382. */
4205 if (has_zero_uses (phi_name))
4206 return NULL;
4207 class loop *loop = (gimple_bb (phi))->loop_father;
4208 unsigned nphi_def_loop_uses = 0;
4209 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4211 gimple *use_stmt = USE_STMT (use_p);
4212 if (is_gimple_debug (use_stmt))
4213 continue;
4215 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4217 if (dump_enabled_p ())
4218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4219 "intermediate value used outside loop.\n");
4221 return NULL;
4224 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4225 op1 twice (once as definition, once as else) in the same operation.
4226 Only count it as one. */
4227 if (use_stmt != phi_use_stmt)
4229 nphi_def_loop_uses++;
4230 phi_use_stmt = use_stmt;
4234 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4235 if (TREE_CODE (latch_def) != SSA_NAME)
4237 if (dump_enabled_p ())
4238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4239 "reduction: not ssa_name: %T\n", latch_def);
4240 return NULL;
4243 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4244 if (!def_stmt_info
4245 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4246 return NULL;
4248 bool nested_in_vect_loop
4249 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4250 unsigned nlatch_def_loop_uses = 0;
4251 auto_vec<gphi *, 3> lcphis;
4252 bool inner_loop_of_double_reduc = false;
4253 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4255 gimple *use_stmt = USE_STMT (use_p);
4256 if (is_gimple_debug (use_stmt))
4257 continue;
4258 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4259 nlatch_def_loop_uses++;
4260 else
4262 /* We can have more than one loop-closed PHI. */
4263 lcphis.safe_push (as_a <gphi *> (use_stmt));
4264 if (nested_in_vect_loop
4265 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4266 == vect_double_reduction_def))
4267 inner_loop_of_double_reduc = true;
4271 /* If we are vectorizing an inner reduction we execute it in the
4272 original order only when we are not dealing with a
4273 double reduction. */
4274 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4276 if (dump_enabled_p ())
4277 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4278 "detected nested cycle: ");
4279 return def_stmt_info;
4282 /* When the inner loop of a double reduction ends up with more than
4283 one loop-closed PHI we have failed to classify alternate such
4284 PHIs as double reduction, leading to wrong code. See PR103237. */
4285 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4287 if (dump_enabled_p ())
4288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4289 "unhandle double reduction\n");
4290 return NULL;
4293 /* If this isn't a nested cycle or if the nested cycle reduction value
4294 is used outside of the inner loop we cannot handle uses of the reduction
4295 value. */
4296 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4298 if (dump_enabled_p ())
4299 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4300 "reduction used in loop.\n");
4301 return NULL;
4304 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4305 defined in the inner loop. */
4306 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4308 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4309 if (gimple_phi_num_args (def_stmt) != 1
4310 || TREE_CODE (op1) != SSA_NAME)
4312 if (dump_enabled_p ())
4313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4314 "unsupported phi node definition.\n");
4316 return NULL;
4319 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4320 and the latch definition op1. */
4321 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4322 if (gimple_bb (def1)
4323 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4324 && loop->inner
4325 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4326 && (is_gimple_assign (def1) || is_gimple_call (def1))
4327 && is_a <gphi *> (phi_use_stmt)
4328 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4329 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4330 loop_latch_edge (loop->inner))))
4332 if (dump_enabled_p ())
4333 report_vect_op (MSG_NOTE, def_stmt,
4334 "detected double reduction: ");
4336 *double_reduc = true;
4337 return def_stmt_info;
4340 return NULL;
4343 /* Look for the expression computing latch_def from the loop PHI result. */
4344 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4345 code_helper code;
4346 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4347 path))
4349 STMT_VINFO_REDUC_CODE (phi_info) = code;
4350 if (code == COND_EXPR && !nested_in_vect_loop)
4351 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4353 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4354 reduction chain for which the additional restriction is that
4355 all operations in the chain are the same. */
4356 auto_vec<stmt_vec_info, 8> reduc_chain;
4357 unsigned i;
4358 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4359 for (i = path.length () - 1; i >= 1; --i)
4361 gimple *stmt = USE_STMT (path[i].second);
4362 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4363 gimple_match_op op;
4364 if (!gimple_extract_op (stmt, &op))
4365 gcc_unreachable ();
4366 if (gassign *assign = dyn_cast<gassign *> (stmt))
4367 STMT_VINFO_REDUC_IDX (stmt_info)
4368 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4369 else
4371 gcall *call = as_a<gcall *> (stmt);
4372 STMT_VINFO_REDUC_IDX (stmt_info)
4373 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4375 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4376 && (i == 1 || i == path.length () - 1));
4377 if ((op.code != code && !leading_conversion)
4378 /* We can only handle the final value in epilogue
4379 generation for reduction chains. */
4380 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4381 is_slp_reduc = false;
4382 /* For reduction chains we support trailing/leading
4383 conversions. We do not store those in the actual chain. */
4384 if (leading_conversion)
4385 continue;
4386 reduc_chain.safe_push (stmt_info);
4388 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4390 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4392 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4393 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4395 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4396 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4398 /* Save the chain for further analysis in SLP detection. */
4399 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4400 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4402 *reduc_chain_p = true;
4403 if (dump_enabled_p ())
4404 dump_printf_loc (MSG_NOTE, vect_location,
4405 "reduction: detected reduction chain\n");
4407 else if (dump_enabled_p ())
4408 dump_printf_loc (MSG_NOTE, vect_location,
4409 "reduction: detected reduction\n");
4411 return def_stmt_info;
4414 if (dump_enabled_p ())
4415 dump_printf_loc (MSG_NOTE, vect_location,
4416 "reduction: unknown pattern\n");
4418 return NULL;
4421 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4422 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4423 or -1 if not known. */
4425 static int
4426 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4428 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4429 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4431 if (dump_enabled_p ())
4432 dump_printf_loc (MSG_NOTE, vect_location,
4433 "cost model: epilogue peel iters set to vf/2 "
4434 "because loop iterations are unknown .\n");
4435 return assumed_vf / 2;
4437 else
4439 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4440 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4441 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4442 /* If we need to peel for gaps but no epilogue peeling would otherwise
4443 be required, we have to peel VF iterations. */
4444 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4445 peel_iters_epilogue = assumed_vf;
4446 return peel_iters_epilogue;
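/* For example (hypothetical numbers): with NITERS = 100, PEEL_ITERS_PROLOGUE
   = 3 and an assumed VF of 8 the epilogue gets (100 - 3) % 8 = 1 peeled
   iteration; had the remainder been 0 with LOOP_VINFO_PEELING_FOR_GAPS
   set, a full VF = 8 epilogue iterations would be assumed instead.  */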
4450 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4451 int
4452 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4453 int *peel_iters_epilogue,
4454 stmt_vector_for_cost *scalar_cost_vec,
4455 stmt_vector_for_cost *prologue_cost_vec,
4456 stmt_vector_for_cost *epilogue_cost_vec)
4458 int retval = 0;
4460 *peel_iters_epilogue
4461 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4463 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4465 /* If peeled iterations are known but the number of scalar loop
4466 iterations is unknown, count a taken branch per peeled loop. */
4467 if (peel_iters_prologue > 0)
4468 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4469 vect_prologue);
4470 if (*peel_iters_epilogue > 0)
4471 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4472 vect_epilogue);
4475 stmt_info_for_cost *si;
4476 int j;
4477 if (peel_iters_prologue)
4478 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4479 retval += record_stmt_cost (prologue_cost_vec,
4480 si->count * peel_iters_prologue,
4481 si->kind, si->stmt_info, si->misalign,
4482 vect_prologue);
4483 if (*peel_iters_epilogue)
4484 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4485 retval += record_stmt_cost (epilogue_cost_vec,
4486 si->count * *peel_iters_epilogue,
4487 si->kind, si->stmt_info, si->misalign,
4488 vect_epilogue);
4490 return retval;
4493 /* Function vect_estimate_min_profitable_iters
4495 Return the number of iterations required for the vector version of the
4496 loop to be profitable relative to the cost of the scalar version of the
4497 loop.
4499 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4500 of iterations for vectorization. A value of -1 means loop vectorization
4501 is not profitable. This returned value may be used for a dynamic
4502 profitability check.
4504 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4505 for static check against estimated number of iterations. */
4507 static void
4508 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4509 int *ret_min_profitable_niters,
4510 int *ret_min_profitable_estimate,
4511 unsigned *suggested_unroll_factor)
4513 int min_profitable_iters;
4514 int min_profitable_estimate;
4515 int peel_iters_prologue;
4516 int peel_iters_epilogue;
4517 unsigned vec_inside_cost = 0;
4518 int vec_outside_cost = 0;
4519 unsigned vec_prologue_cost = 0;
4520 unsigned vec_epilogue_cost = 0;
4521 int scalar_single_iter_cost = 0;
4522 int scalar_outside_cost = 0;
4523 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4524 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4525 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4527 /* Cost model disabled. */
4528 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4530 if (dump_enabled_p ())
4531 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4532 *ret_min_profitable_niters = 0;
4533 *ret_min_profitable_estimate = 0;
4534 return;
4537 /* Requires loop versioning tests to handle misalignment. */
4538 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4540 /* FIXME: Make cost depend on complexity of individual check. */
4541 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4542 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4543 if (dump_enabled_p ())
4544 dump_printf (MSG_NOTE,
4545 "cost model: Adding cost of checks for loop "
4546 "versioning to treat misalignment.\n");
4549 /* Requires loop versioning with alias checks. */
4550 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4552 /* FIXME: Make cost depend on complexity of individual check. */
4553 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4554 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4555 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4556 if (len)
4557 /* Count LEN - 1 ANDs and LEN comparisons. */
4558 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4559 scalar_stmt, vect_prologue);
4560 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4561 if (len)
4563 /* Count LEN - 1 ANDs and LEN comparisons. */
4564 unsigned int nstmts = len * 2 - 1;
4565 /* +1 for each bias that needs adding. */
4566 for (unsigned int i = 0; i < len; ++i)
4567 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4568 nstmts += 1;
4569 (void) add_stmt_cost (target_cost_data, nstmts,
4570 scalar_stmt, vect_prologue);
4572 if (dump_enabled_p ())
4573 dump_printf (MSG_NOTE,
4574 "cost model: Adding cost of checks for loop "
4575 "versioning aliasing.\n");
4578 /* Requires loop versioning with niter checks. */
4579 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4581 /* FIXME: Make cost depend on complexity of individual check. */
4582 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4583 NULL, NULL, NULL_TREE, 0, vect_prologue);
4584 if (dump_enabled_p ())
4585 dump_printf (MSG_NOTE,
4586 "cost model: Adding cost of checks for loop "
4587 "versioning niters.\n");
4590 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4591 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4592 vect_prologue);
4594 /* Count statements in scalar loop. Using this as scalar cost for a single
4595 iteration for now.
4597 TODO: Add outer loop support.
4599 TODO: Consider assigning different costs to different scalar
4600 statements. */
4602 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4604 /* Add additional cost for the peeled instructions in prologue and epilogue
4605 loop. (For fully-masked loops there will be no peeling.)
4607 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4608 at compile time, we assume it's vf/2 (the worst case would be vf-1).
4610 TODO: Build an expression that represents peel_iters for prologue and
4611 epilogue to be used in a run-time test. */
4613 bool prologue_need_br_taken_cost = false;
4614 bool prologue_need_br_not_taken_cost = false;
4616 /* Calculate peel_iters_prologue. */
4617 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4618 peel_iters_prologue = 0;
4619 else if (npeel < 0)
4621 peel_iters_prologue = assumed_vf / 2;
4622 if (dump_enabled_p ())
4623 dump_printf (MSG_NOTE, "cost model: "
4624 "prologue peel iters set to vf/2.\n");
4626 /* If peeled iterations are unknown, count a taken branch and a not taken
4627 branch per peeled loop. Even if scalar loop iterations are known,
4628 vector iterations are not known since peeled prologue iterations are
4629 not known. Hence guards remain the same. */
4630 prologue_need_br_taken_cost = true;
4631 prologue_need_br_not_taken_cost = true;
4633 else
4635 peel_iters_prologue = npeel;
4636 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4637 /* If peeled iterations are known but the number of scalar loop
4638 iterations is unknown, count a taken branch per peeled loop. */
4639 prologue_need_br_taken_cost = true;
4642 bool epilogue_need_br_taken_cost = false;
4643 bool epilogue_need_br_not_taken_cost = false;
4645 /* Calculate peel_iters_epilogue. */
4646 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4647 /* We need to peel exactly one iteration for gaps. */
4648 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4649 else if (npeel < 0)
4651 /* If peeling for alignment is unknown, the loop bound of the main loop
4652 becomes unknown. */
4653 peel_iters_epilogue = assumed_vf / 2;
4654 if (dump_enabled_p ())
4655 dump_printf (MSG_NOTE, "cost model: "
4656 "epilogue peel iters set to vf/2 because "
4657 "peeling for alignment is unknown.\n");
4659 /* See the same reason above in peel_iters_prologue calculation. */
4660 epilogue_need_br_taken_cost = true;
4661 epilogue_need_br_not_taken_cost = true;
4663 else
4665 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4666 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4667 /* If peeled iterations are known but the number of scalar loop
4668 iterations is unknown, count a taken branch per peeled loop. */
4669 epilogue_need_br_taken_cost = true;
4672 stmt_info_for_cost *si;
4673 int j;
4674 /* Add costs associated with peel_iters_prologue. */
4675 if (peel_iters_prologue)
4676 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4678 (void) add_stmt_cost (target_cost_data,
4679 si->count * peel_iters_prologue, si->kind,
4680 si->stmt_info, si->node, si->vectype,
4681 si->misalign, vect_prologue);
4684 /* Add costs associated with peel_iters_epilogue. */
4685 if (peel_iters_epilogue)
4686 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4688 (void) add_stmt_cost (target_cost_data,
4689 si->count * peel_iters_epilogue, si->kind,
4690 si->stmt_info, si->node, si->vectype,
4691 si->misalign, vect_epilogue);
4694 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4696 if (prologue_need_br_taken_cost)
4697 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4698 vect_prologue);
4700 if (prologue_need_br_not_taken_cost)
4701 (void) add_stmt_cost (target_cost_data, 1,
4702 cond_branch_not_taken, vect_prologue);
4704 if (epilogue_need_br_taken_cost)
4705 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4706 vect_epilogue);
4708 if (epilogue_need_br_not_taken_cost)
4709 (void) add_stmt_cost (target_cost_data, 1,
4710 cond_branch_not_taken, vect_epilogue);
4712 /* Take care of special costs for rgroup controls of partial vectors. */
4713 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4714 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4715 == vect_partial_vectors_avx512))
4717 /* Calculate how many masks we need to generate. */
4718 unsigned int num_masks = 0;
4719 bool need_saturation = false;
4720 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4721 if (rgm.type)
4723 unsigned nvectors = rgm.factor;
4724 num_masks += nvectors;
4725 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4726 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4727 need_saturation = true;
4730 /* ??? The target isn't able to identify the costs below as
4731 producing masks so it cannot penalize cases where we'd run
4732 out of mask registers for example. */
4734 /* ??? We are also failing to account for smaller vector masks
4735 we generate by splitting larger masks in vect_get_loop_mask. */
4737 /* In the worst case, we need to generate each mask in the prologue
4738 and in the loop body. We need one splat per group and one
4739 compare per mask.
4741 Sometimes the prologue mask will fold to a constant,
4742 so the actual prologue cost might be smaller. However, it's
4743 simpler and safer to use the worst-case cost; if this ends up
4744 being the tie-breaker between vectorizing or not, then it's
4745 probably better not to vectorize. */
4746 (void) add_stmt_cost (target_cost_data,
4747 num_masks
4748 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4749 vector_stmt, NULL, NULL, NULL_TREE, 0,
4750 vect_prologue);
4751 (void) add_stmt_cost (target_cost_data,
4752 num_masks
4753 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4754 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4756 /* When we need saturation we need it both in the prologue and
4757 the epilogue. */
4758 if (need_saturation)
4760 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4761 NULL, NULL, NULL_TREE, 0, vect_prologue);
4762 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4763 NULL, NULL, NULL_TREE, 0, vect_body);
4766 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4767 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4768 == vect_partial_vectors_while_ult))
4770 /* Calculate how many masks we need to generate. */
4771 unsigned int num_masks = 0;
4772 rgroup_controls *rgm;
4773 unsigned int num_vectors_m1;
4774 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4775 num_vectors_m1, rgm)
4776 if (rgm->type)
4777 num_masks += num_vectors_m1 + 1;
4778 gcc_assert (num_masks > 0);
4780 /* In the worst case, we need to generate each mask in the prologue
4781 and in the loop body. One of the loop body mask instructions
4782 replaces the comparison in the scalar loop, and since we don't
4783 count the scalar comparison against the scalar body, we shouldn't
4784 count that vector instruction against the vector body either.
4786 Sometimes we can use unpacks instead of generating prologue
4787 masks and sometimes the prologue mask will fold to a constant,
4788 so the actual prologue cost might be smaller. However, it's
4789 simpler and safer to use the worst-case cost; if this ends up
4790 being the tie-breaker between vectorizing or not, then it's
4791 probably better not to vectorize. */
4792 (void) add_stmt_cost (target_cost_data, num_masks,
4793 vector_stmt, NULL, NULL, NULL_TREE, 0,
4794 vect_prologue);
4795 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4796 vector_stmt, NULL, NULL, NULL_TREE, 0,
4797 vect_body);
4799 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4801 /* Referring to the functions vect_set_loop_condition_partial_vectors
4802 and vect_set_loop_controls_directly, we need to generate each
4803 length in the prologue and in the loop body if required. Although
4804 there are some possible optimizations, we consider the worst case
4805 here. */
4807 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4808 signed char partial_load_store_bias
4809 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4810 bool need_iterate_p
4811 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4812 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4814 /* Calculate how many statements to be added. */
4815 unsigned int prologue_stmts = 0;
4816 unsigned int body_stmts = 0;
4818 rgroup_controls *rgc;
4819 unsigned int num_vectors_m1;
4820 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4821 if (rgc->type)
4823 /* May need one SHIFT for nitems_total computation. */
4824 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4825 if (nitems != 1 && !niters_known_p)
4826 prologue_stmts += 1;
4828 /* May need one MAX and one MINUS for wrap around. */
4829 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4830 prologue_stmts += 2;
4832 /* Need one MAX and one MINUS for each batch limit except for
4833 the 1st one. */
4834 prologue_stmts += num_vectors_m1 * 2;
4836 unsigned int num_vectors = num_vectors_m1 + 1;
4838 /* Need to set up lengths in the prologue; only one MIN is required
4839 for each since the start index is zero. */
4840 prologue_stmts += num_vectors;
4842 /* If we have a non-zero partial load bias, we need one PLUS
4843 to adjust the load length. */
4844 if (partial_load_store_bias != 0)
4845 body_stmts += 1;
4847 /* Each may need two MINs and one MINUS to update lengths in body
4848 for next iteration. */
4849 if (need_iterate_p)
4850 body_stmts += 3 * num_vectors;
4853 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4854 scalar_stmt, vect_prologue);
4855 (void) add_stmt_cost (target_cost_data, body_stmts,
4856 scalar_stmt, vect_body);
4859 /* FORNOW: The scalar outside cost is incremented in one of the
4860 following ways:
4862 1. The vectorizer checks for alignment and aliasing and generates
4863 a condition that allows dynamic vectorization. A cost model
4864 check is ANDED with the versioning condition. Hence scalar code
4865 path now has the added cost of the versioning check.
4867 if (cost > th & versioning_check)
4868 jmp to vector code
4870 Hence run-time scalar is incremented by not-taken branch cost.
4872 2. The vectorizer then checks if a prologue is required. If the
4873 cost model check was not done before during versioning, it has to
4874 be done before the prologue check.
4876 if (cost <= th)
4877 prologue = scalar_iters
4878 if (prologue == 0)
4879 jmp to vector code
4880 else
4881 execute prologue
4882 if (prologue == num_iters)
4883 go to exit
4885 Hence the run-time scalar cost is incremented by a taken branch,
4886 plus a not-taken branch, plus a taken branch cost.
4888 3. The vectorizer then checks if an epilogue is required. If the
4889 cost model check was not done before during prologue check, it
4890 has to be done with the epilogue check.
4892 if (prologue == 0)
4893 jmp to vector code
4894 else
4895 execute prologue
4896 if (prologue == num_iters)
4897 go to exit
4898 vector code:
4899 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4900 jmp to epilogue
4902 Hence the run-time scalar cost should be incremented by 2 taken
4903 branches.
4905 TODO: The back end may reorder the BBs differently and reverse
4906 conditions/branch directions. Change the estimates below to
4907 something more reasonable. */
4909 /* If the number of iterations is known and we do not do versioning, we can
4910 decide whether to vectorize at compile time. Hence the scalar version
4911 does not carry cost model guard costs. */
4912 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4913 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4915 /* Cost model check occurs at versioning. */
4916 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4917 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4918 else
4920 /* Cost model check occurs at prologue generation. */
4921 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4922 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4923 + vect_get_stmt_cost (cond_branch_not_taken);
4924 /* Cost model check occurs at epilogue generation. */
4925 else
4926 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4930 /* Complete the target-specific cost calculations. */
4931 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4932 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4933 suggested_unroll_factor);
4935 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4936 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4937 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4938 *suggested_unroll_factor,
4939 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4941 if (dump_enabled_p ())
4942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4943 "can't unroll as unrolled vectorization factor larger"
4944 " than maximum vectorization factor: "
4945 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4946 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4947 *suggested_unroll_factor = 1;
4950 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4952 if (dump_enabled_p ())
4954 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4955 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4956 vec_inside_cost);
4957 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4958 vec_prologue_cost);
4959 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4960 vec_epilogue_cost);
4961 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4962 scalar_single_iter_cost);
4963 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4964 scalar_outside_cost);
4965 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4966 vec_outside_cost);
4967 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4968 peel_iters_prologue);
4969 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4970 peel_iters_epilogue);
4973 /* Calculate number of iterations required to make the vector version
4974 profitable, relative to the loop bodies only. The following condition
4975 must hold true:
4976 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4977 where
4978 SIC = scalar iteration cost, VIC = vector iteration cost,
4979 VOC = vector outside cost, VF = vectorization factor,
4980 NPEEL = prologue iterations + epilogue iterations,
4981 SOC = scalar outside cost for run time cost model check. */
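/* As a worked example with hypothetical numbers, SIC = 4, VIC = 6,
   VOC = 40, SOC = 0, VF = 4 and NPEEL = 0 give the condition

     4 * niters > 6 * (niters / 4) + 40

   which first holds at niters = 17; the integer computation below
   arrives at the same threshold for these values.  */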
4983 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4984 - vec_inside_cost);
4985 if (saving_per_viter <= 0)
4987 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4988 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4989 "vectorization did not happen for a simd loop");
4991 if (dump_enabled_p ())
4992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4993 "cost model: the vector iteration cost = %d "
4994 "divided by the scalar iteration cost = %d "
4995 "is greater or equal to the vectorization factor = %d"
4996 ".\n",
4997 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4998 *ret_min_profitable_niters = -1;
4999 *ret_min_profitable_estimate = -1;
5000 return;
5003 /* ??? The "if" arm is written to handle all cases; see below for what
5004 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5005 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5007 /* Rewriting the condition above in terms of the number of
5008 vector iterations (vniters) rather than the number of
5009 scalar iterations (niters) gives:
5011 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5013 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5015 For integer N, X and Y when X > 0:
5017 N * X > Y <==> N >= (Y /[floor] X) + 1. */
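/* E.g. with a per-iteration saving of X = 10 and an outside overhead of
   Y = 25, N * 10 > 25 first holds for N = 25/10 + 1 = 3 vector
   iterations.  */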
5018 int outside_overhead = (vec_outside_cost
5019 - scalar_single_iter_cost * peel_iters_prologue
5020 - scalar_single_iter_cost * peel_iters_epilogue
5021 - scalar_outside_cost);
5022 /* We're only interested in cases that require at least one
5023 vector iteration. */
5024 int min_vec_niters = 1;
5025 if (outside_overhead > 0)
5026 min_vec_niters = outside_overhead / saving_per_viter + 1;
5028 if (dump_enabled_p ())
5029 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5030 min_vec_niters);
5032 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5034 /* Now that we know the minimum number of vector iterations,
5035 find the minimum niters for which the scalar cost is larger:
5037 SIC * niters > VIC * vniters + VOC - SOC
5039 We know that the minimum niters is no more than
5040 vniters * VF + NPEEL, but it might be (and often is) less
5041 than that if a partial vector iteration is cheaper than the
5042 equivalent scalar code. */
5043 int threshold = (vec_inside_cost * min_vec_niters
5044 + vec_outside_cost
5045 - scalar_outside_cost);
5046 if (threshold <= 0)
5047 min_profitable_iters = 1;
5048 else
5049 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5051 else
5052 /* Convert the number of vector iterations into a number of
5053 scalar iterations. */
5054 min_profitable_iters = (min_vec_niters * assumed_vf
5055 + peel_iters_prologue
5056 + peel_iters_epilogue);
5058 else
5060 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5061 * assumed_vf
5062 - vec_inside_cost * peel_iters_prologue
5063 - vec_inside_cost * peel_iters_epilogue);
5064 if (min_profitable_iters <= 0)
5065 min_profitable_iters = 0;
5066 else
5068 min_profitable_iters /= saving_per_viter;
5070 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5071 <= (((int) vec_inside_cost * min_profitable_iters)
5072 + (((int) vec_outside_cost - scalar_outside_cost)
5073 * assumed_vf)))
5074 min_profitable_iters++;
5078 if (dump_enabled_p ())
5079 dump_printf (MSG_NOTE,
5080 " Calculated minimum iters for profitability: %d\n",
5081 min_profitable_iters);
5083 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5084 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5085 /* We want the vectorized loop to execute at least once. */
5086 min_profitable_iters = assumed_vf + peel_iters_prologue;
5087 else if (min_profitable_iters < peel_iters_prologue)
5088 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5089 vectorized loop executes at least once. */
5090 min_profitable_iters = peel_iters_prologue;
5092 if (dump_enabled_p ())
5093 dump_printf_loc (MSG_NOTE, vect_location,
5094 " Runtime profitability threshold = %d\n",
5095 min_profitable_iters);
5097 *ret_min_profitable_niters = min_profitable_iters;
5099 /* Calculate number of iterations required to make the vector version
5100 profitable, relative to the loop bodies only.
5102 The non-vectorized variant costs SIC * niters and it must win over the
5103 vector variant on the expected loop trip count, i.e. the following must hold:
5104 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5106 if (vec_outside_cost <= 0)
5107 min_profitable_estimate = 0;
5108 /* ??? This "else if" arm is written to handle all cases; see below for
5109 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5110 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5112 /* This is a repeat of the code above, but with + SOC rather
5113 than - SOC. */
5114 int outside_overhead = (vec_outside_cost
5115 - scalar_single_iter_cost * peel_iters_prologue
5116 - scalar_single_iter_cost * peel_iters_epilogue
5117 + scalar_outside_cost);
5118 int min_vec_niters = 1;
5119 if (outside_overhead > 0)
5120 min_vec_niters = outside_overhead / saving_per_viter + 1;
5122 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5124 int threshold = (vec_inside_cost * min_vec_niters
5125 + vec_outside_cost
5126 + scalar_outside_cost);
5127 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5129 else
5130 min_profitable_estimate = (min_vec_niters * assumed_vf
5131 + peel_iters_prologue
5132 + peel_iters_epilogue);
5134 else
5136 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5137 * assumed_vf
5138 - vec_inside_cost * peel_iters_prologue
5139 - vec_inside_cost * peel_iters_epilogue)
5140 / ((scalar_single_iter_cost * assumed_vf)
5141 - vec_inside_cost);
5143 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5144 if (dump_enabled_p ())
5145 dump_printf_loc (MSG_NOTE, vect_location,
5146 " Static estimate profitability threshold = %d\n",
5147 min_profitable_estimate);
5149 *ret_min_profitable_estimate = min_profitable_estimate;
5152 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5153 vector elements (not bits) for a vector with NELT elements. */
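/* For instance, with OFFSET == 2 and NELT == 8 the encoded pattern expands
   to the selector {2, 3, 4, 5, 6, 7, 8, 9}; indices >= NELT select from the
   second input of the VEC_PERM_EXPR, which the reduction epilogue below
   passes as a zero vector, so the net effect there is a shift down by two
   elements, shifting in zeros.  */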
5154 static void
5155 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5156 vec_perm_builder *sel)
5158 /* The encoding is a single stepped pattern. Any wrap-around is handled
5159 by vec_perm_indices. */
5160 sel->new_vector (nelt, 1, 3);
5161 for (unsigned int i = 0; i < 3; i++)
5162 sel->quick_push (i + offset);
5165 /* Checks whether the target supports whole-vector shifts for vectors of mode
5166 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5167 it supports vec_perm_const with masks for all necessary shift amounts. */
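/* For example, for a vector of eight elements this checks shifts by 4, 2
   and 1 elements, which are exactly the shift amounts the reduction
   epilogue below needs.  */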
5168 static bool
5169 have_whole_vector_shift (machine_mode mode)
5171 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5172 return true;
5174 /* Variable-length vectors should be handled via the optab. */
5175 unsigned int nelt;
5176 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5177 return false;
5179 vec_perm_builder sel;
5180 vec_perm_indices indices;
5181 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5183 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5184 indices.new_vector (sel, 2, nelt);
5185 if (!can_vec_perm_const_p (mode, mode, indices, false))
5186 return false;
5188 return true;
5191 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5192 multiplication operands have differing signs and (b) we intend
5193 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5194 See vect_emulate_mixed_dot_prod for the actual sequence used. */
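/* A typical case would be a dot-product reduction in which a signed char
   operand is multiplied by an unsigned char operand and accumulated into
   an int, on a target that only provides the signed and unsigned
   DOT_PROD_EXPR variants and not the mixed-sign one
   (optab_vector_mixed_sign).  */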
5196 static bool
5197 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5198 stmt_vec_info stmt_info)
5200 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5201 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5202 return false;
5204 tree rhs1 = gimple_assign_rhs1 (assign);
5205 tree rhs2 = gimple_assign_rhs2 (assign);
5206 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5207 return false;
5209 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5210 gcc_assert (reduc_info->is_reduc_info);
5211 return !directly_supported_p (DOT_PROD_EXPR,
5212 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5213 optab_vector_mixed_sign);
5216 /* TODO: There is a close dependency between the vect_model_*_cost and
5217 vectorizable_* functions. Redesign to avoid maintenance issues. */
5219 /* Function vect_model_reduction_cost.
5221 Models cost for a reduction operation, including the vector ops
5222 generated within the strip-mine loop in some cases, the initial
5223 definition before the loop, and the epilogue code that must be generated. */
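/* For instance, for a plain integer add reduction with a direct reduction
   (reduc_fn != IFN_LAST) and ncopies == 1, this amounts to one
   scalar_to_vec in the prologue and one vector_stmt plus one vec_to_scalar
   in the epilogue; the vector statements in the loop body are costed
   separately.  */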
5225 static void
5226 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5227 stmt_vec_info stmt_info, internal_fn reduc_fn,
5228 vect_reduction_type reduction_type,
5229 int ncopies, stmt_vector_for_cost *cost_vec)
5231 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5232 tree vectype;
5233 machine_mode mode;
5234 class loop *loop = NULL;
5236 if (loop_vinfo)
5237 loop = LOOP_VINFO_LOOP (loop_vinfo);
5239 /* Condition reductions generate two reductions in the loop. */
5240 if (reduction_type == COND_REDUCTION)
5241 ncopies *= 2;
5243 vectype = STMT_VINFO_VECTYPE (stmt_info);
5244 mode = TYPE_MODE (vectype);
5245 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5247 gimple_match_op op;
5248 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5249 gcc_unreachable ();
5251 bool emulated_mixed_dot_prod
5252 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5253 if (reduction_type == EXTRACT_LAST_REDUCTION)
5254 /* No extra instructions are needed in the prologue. The loop body
5255 operations are costed in vectorizable_condition. */
5256 inside_cost = 0;
5257 else if (reduction_type == FOLD_LEFT_REDUCTION)
5259 /* No extra instructions needed in the prologue. */
5260 prologue_cost = 0;
5262 if (reduc_fn != IFN_LAST)
5263 /* Count one reduction-like operation per vector. */
5264 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5265 stmt_info, 0, vect_body);
5266 else
5268 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5269 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5270 inside_cost = record_stmt_cost (cost_vec, nelements,
5271 vec_to_scalar, stmt_info, 0,
5272 vect_body);
5273 inside_cost += record_stmt_cost (cost_vec, nelements,
5274 scalar_stmt, stmt_info, 0,
5275 vect_body);
5278 else
5280 /* Add in the cost of the initial definitions. */
5281 int prologue_stmts;
5282 if (reduction_type == COND_REDUCTION)
5283 /* For cond reductions we have four vectors: initial index, step,
5284 initial result of the data reduction, initial value of the index
5285 reduction. */
5286 prologue_stmts = 4;
5287 else if (emulated_mixed_dot_prod)
5288 /* We need the initial reduction value and two invariants:
5289 one that contains the minimum signed value and one that
5290 contains half of its negative. */
5291 prologue_stmts = 3;
5292 else
5293 prologue_stmts = 1;
5294 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5295 scalar_to_vec, stmt_info, 0,
5296 vect_prologue);
5299 /* Determine cost of epilogue code.
5301 We have a reduction operator that will reduce the vector in one statement.
5302 Also requires scalar extract. */
5304 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5306 if (reduc_fn != IFN_LAST)
5308 if (reduction_type == COND_REDUCTION)
5310 /* An EQ stmt and a COND_EXPR stmt. */
5311 epilogue_cost += record_stmt_cost (cost_vec, 2,
5312 vector_stmt, stmt_info, 0,
5313 vect_epilogue);
5314 /* Reduction of the max index and a reduction of the found
5315 values. */
5316 epilogue_cost += record_stmt_cost (cost_vec, 2,
5317 vec_to_scalar, stmt_info, 0,
5318 vect_epilogue);
5319 /* A broadcast of the max value. */
5320 epilogue_cost += record_stmt_cost (cost_vec, 1,
5321 scalar_to_vec, stmt_info, 0,
5322 vect_epilogue);
5324 else
5326 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5327 stmt_info, 0, vect_epilogue);
5328 epilogue_cost += record_stmt_cost (cost_vec, 1,
5329 vec_to_scalar, stmt_info, 0,
5330 vect_epilogue);
5333 else if (reduction_type == COND_REDUCTION)
5335 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5336 /* Extraction of scalar elements. */
5337 epilogue_cost += record_stmt_cost (cost_vec,
5338 2 * estimated_nunits,
5339 vec_to_scalar, stmt_info, 0,
5340 vect_epilogue);
5341 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5342 epilogue_cost += record_stmt_cost (cost_vec,
5343 2 * estimated_nunits - 3,
5344 scalar_stmt, stmt_info, 0,
5345 vect_epilogue);
5347 else if (reduction_type == EXTRACT_LAST_REDUCTION
5348 || reduction_type == FOLD_LEFT_REDUCTION)
5349 /* No extra instructions needed in the epilogue. */
5351 else
5353 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5354 tree bitsize = TYPE_SIZE (op.type);
5355 int element_bitsize = tree_to_uhwi (bitsize);
5356 int nelements = vec_size_in_bits / element_bitsize;
5358 if (op.code == COND_EXPR)
5359 op.code = MAX_EXPR;
5361 /* We have a whole vector shift available. */
5362 if (VECTOR_MODE_P (mode)
5363 && directly_supported_p (op.code, vectype)
5364 && have_whole_vector_shift (mode))
5366 /* Final reduction via vector shifts and the reduction operator.
5367 Also requires scalar extract. */
5368 epilogue_cost += record_stmt_cost (cost_vec,
5369 exact_log2 (nelements) * 2,
5370 vector_stmt, stmt_info, 0,
5371 vect_epilogue);
5372 epilogue_cost += record_stmt_cost (cost_vec, 1,
5373 vec_to_scalar, stmt_info, 0,
5374 vect_epilogue);
5376 else
5377 /* Use extracts and reduction op for final reduction. For N
5378 elements, we have N extracts and N-1 reduction ops. */
5379 epilogue_cost += record_stmt_cost (cost_vec,
5380 nelements + nelements - 1,
5381 vector_stmt, stmt_info, 0,
5382 vect_epilogue);
5386 if (dump_enabled_p ())
5387 dump_printf (MSG_NOTE,
5388 "vect_model_reduction_cost: inside_cost = %d, "
5389 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5390 prologue_cost, epilogue_cost);
5393 /* SEQ is a sequence of instructions that initialize the reduction
5394 described by REDUC_INFO. Emit them in the appropriate place. */
5396 static void
5397 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5398 stmt_vec_info reduc_info, gimple *seq)
5400 if (reduc_info->reused_accumulator)
5402 /* When reusing an accumulator from the main loop, we only need
5403 initialization instructions if the main loop can be skipped.
5404 In that case, emit the initialization instructions at the end
5405 of the guard block that does the skip. */
5406 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5407 gcc_assert (skip_edge);
5408 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5409 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5411 else
5413 /* The normal case: emit the initialization instructions on the
5414 preheader edge. */
5415 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5416 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5420 /* Function get_initial_def_for_reduction
5422 Input:
5423 REDUC_INFO - the info_for_reduction
5424 INIT_VAL - the initial value of the reduction variable
5425 NEUTRAL_OP - a value that has no effect on the reduction, as per
5426 neutral_op_for_reduction
5428 Output:
5429 Return a vector variable, initialized according to the reduction that
5430 REDUC_INFO describes. This vector will be used as the initial value
5431 of the vector of partial results.
5433 The value we need is a vector in which element 0 has value INIT_VAL
5434 and every other element has value NEUTRAL_OP. */
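/* For example, for a sum reduction with INIT_VAL s and a four-element
   vector the result is {s, 0, 0, 0}, whereas for a MIN or MAX reduction
   the neutral value is INIT_VAL itself and the result is simply the
   splat {s, s, s, s}.  */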
5436 static tree
5437 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5438 stmt_vec_info reduc_info,
5439 tree init_val, tree neutral_op)
5441 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5442 tree scalar_type = TREE_TYPE (init_val);
5443 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5444 tree init_def;
5445 gimple_seq stmts = NULL;
5447 gcc_assert (vectype);
5449 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5450 || SCALAR_FLOAT_TYPE_P (scalar_type));
5452 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5453 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5455 if (operand_equal_p (init_val, neutral_op))
5457 /* If both elements are equal then the vector described above is
5458 just a splat. */
5459 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5460 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5462 else
5464 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5465 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5466 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5468 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5469 element 0. */
5470 init_def = gimple_build_vector_from_val (&stmts, vectype,
5471 neutral_op);
5472 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5473 vectype, init_def, init_val);
5475 else
5477 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5478 tree_vector_builder elts (vectype, 1, 2);
5479 elts.quick_push (init_val);
5480 elts.quick_push (neutral_op);
5481 init_def = gimple_build_vector (&stmts, &elts);
5485 if (stmts)
5486 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5487 return init_def;
5490 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5491 which performs a reduction involving GROUP_SIZE scalar statements.
5492 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5493 is nonnull, introducing extra elements of that value will not change the
5494 result. */
5496 static void
5497 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5498 stmt_vec_info reduc_info,
5499 vec<tree> *vec_oprnds,
5500 unsigned int number_of_vectors,
5501 unsigned int group_size, tree neutral_op)
5503 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5504 unsigned HOST_WIDE_INT nunits;
5505 unsigned j, number_of_places_left_in_vector;
5506 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5507 unsigned int i;
5509 gcc_assert (group_size == initial_values.length () || neutral_op);
5511 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5512 created vectors. It is greater than 1 if unrolling is performed.
5514 For example, we have two scalar operands, s1 and s2 (e.g., group of
5515 strided accesses of size two), while NUNITS is four (i.e., four scalars
5516 of this type can be packed in a vector). The output vector will contain
5517 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5518 will be 2).
5520 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5521 vectors containing the operands.
5523 For example, NUNITS is four as before, and the group size is 8
5524 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5525 {s5, s6, s7, s8}. */
5527 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5528 nunits = group_size;
5530 number_of_places_left_in_vector = nunits;
5531 bool constant_p = true;
5532 tree_vector_builder elts (vector_type, nunits, 1);
5533 elts.quick_grow (nunits);
5534 gimple_seq ctor_seq = NULL;
5535 for (j = 0; j < nunits * number_of_vectors; ++j)
5537 tree op;
5538 i = j % group_size;
5540 /* Get the def before the loop. In a reduction chain we have only
5541 one initial value; otherwise we have as many as there are PHIs in the group. */
5542 if (i >= initial_values.length () || (j > i && neutral_op))
5543 op = neutral_op;
5544 else
5545 op = initial_values[i];
5547 /* Create 'vect_ = {op0,op1,...,opn}'. */
5548 number_of_places_left_in_vector--;
5549 elts[nunits - number_of_places_left_in_vector - 1] = op;
5550 if (!CONSTANT_CLASS_P (op))
5551 constant_p = false;
5553 if (number_of_places_left_in_vector == 0)
5555 tree init;
5556 if (constant_p && !neutral_op
5557 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5558 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5559 /* Build the vector directly from ELTS. */
5560 init = gimple_build_vector (&ctor_seq, &elts);
5561 else if (neutral_op)
5563 /* Build a vector of the neutral value and shift the
5564 other elements into place. */
5565 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5566 neutral_op);
5567 int k = nunits;
5568 while (k > 0 && elts[k - 1] == neutral_op)
5569 k -= 1;
5570 while (k > 0)
5572 k -= 1;
5573 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5574 vector_type, init, elts[k]);
5577 else
5579 /* First time round, duplicate ELTS to fill the
5580 required number of vectors. */
5581 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5582 elts, number_of_vectors, *vec_oprnds);
5583 break;
5585 vec_oprnds->quick_push (init);
5587 number_of_places_left_in_vector = nunits;
5588 elts.new_vector (vector_type, nunits, 1);
5589 elts.quick_grow (nunits);
5590 constant_p = true;
5593 if (ctor_seq != NULL)
5594 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5597 /* For a statement STMT_INFO taking part in a reduction operation return
5598 the stmt_vec_info the meta information is stored on. */
5600 stmt_vec_info
5601 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5603 stmt_info = vect_orig_stmt (stmt_info);
5604 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5605 if (!is_a <gphi *> (stmt_info->stmt)
5606 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5607 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5608 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5609 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5611 if (gimple_phi_num_args (phi) == 1)
5612 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5614 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5616 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5617 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5618 stmt_info = info;
5620 return stmt_info;
5623 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5624 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5625 return false. */
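/* For instance (hypothetically), if the main loop accumulated into a V8SI
   vector and this epilogue loop uses V4SI, the accumulator can be reused
   provided the reduction code is directly supported on the intermediate
   V4SI type and a V4SI half can be extracted from a V8SI vector, so that
   the wider accumulator can later be folded down to the narrower vector
   type.  */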
5627 static bool
5628 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5629 stmt_vec_info reduc_info)
5631 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5632 if (!main_loop_vinfo)
5633 return false;
5635 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5636 return false;
5638 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5639 auto_vec<tree, 16> main_loop_results (num_phis);
5640 auto_vec<tree, 16> initial_values (num_phis);
5641 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5643 /* The epilogue loop can be entered either from the main loop or
5644 from an earlier guard block. */
5645 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5646 for (tree incoming_value : reduc_info->reduc_initial_values)
5648 /* Look for:
5650 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5651 INITIAL_VALUE(guard block)>. */
5652 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5654 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5655 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5657 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5658 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5660 main_loop_results.quick_push (from_main_loop);
5661 initial_values.quick_push (from_skip);
5664 else
5665 /* The main loop dominates the epilogue loop. */
5666 main_loop_results.splice (reduc_info->reduc_initial_values);
5668 /* See if the main loop has the kind of accumulator we need. */
5669 vect_reusable_accumulator *accumulator
5670 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5671 if (!accumulator
5672 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5673 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5674 accumulator->reduc_info->reduc_scalar_results.begin ()))
5675 return false;
5677 /* Handle the case where we can reduce wider vectors to narrower ones. */
5678 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5679 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5680 unsigned HOST_WIDE_INT m;
5681 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5682 TYPE_VECTOR_SUBPARTS (vectype), &m))
5683 return false;
5684 /* Check the intermediate vector types and operations are available. */
5685 tree prev_vectype = old_vectype;
5686 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5687 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5689 intermediate_nunits = exact_div (intermediate_nunits, 2);
5690 tree intermediate_vectype = get_related_vectype_for_scalar_type
5691 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5692 if (!intermediate_vectype
5693 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5694 intermediate_vectype)
5695 || !can_vec_extract (TYPE_MODE (prev_vectype),
5696 TYPE_MODE (intermediate_vectype)))
5697 return false;
5698 prev_vectype = intermediate_vectype;
5701 /* Non-SLP reductions might apply an adjustment after the reduction
5702 operation, in order to simplify the initialization of the accumulator.
5703 If the epilogue loop carries on from where the main loop left off,
5704 it should apply the same adjustment to the final reduction result.
5706 If the epilogue loop can also be entered directly (rather than via
5707 the main loop), we need to be able to handle that case in the same way,
5708 with the same adjustment. (In principle we could add a PHI node
5709 to select the correct adjustment, but in practice that shouldn't be
5710 necessary.) */
5711 tree main_adjustment
5712 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5713 if (loop_vinfo->main_loop_edge && main_adjustment)
5715 gcc_assert (num_phis == 1);
5716 tree initial_value = initial_values[0];
5717 /* Check that we can use INITIAL_VALUE as the adjustment and
5718 initialize the accumulator with a neutral value instead. */
5719 if (!operand_equal_p (initial_value, main_adjustment))
5720 return false;
5721 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5722 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5723 code, initial_value);
5725 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5726 reduc_info->reduc_initial_values.truncate (0);
5727 reduc_info->reduc_initial_values.splice (initial_values);
5728 reduc_info->reused_accumulator = accumulator;
5729 return true;
5732 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5733 CODE, emitting any new stmts to SEQ. Returns a vector def of VECTYPE. */
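/* As an illustration, reducing a V8SI VEC_DEF to a V4SI VECTYPE with a
   PLUS code extracts the low and high V4SI halves (directly via
   vec_extract, or by punning through an integer vector mode) and adds
   them; wider inputs are handled by repeating the halving step.  */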
5735 static tree
5736 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5737 gimple_seq *seq)
5739 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5740 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5741 tree stype = TREE_TYPE (vectype);
5742 tree new_temp = vec_def;
5743 while (nunits > nunits1)
5745 nunits /= 2;
5746 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5747 stype, nunits);
5748 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5750 /* The target has to make sure we support lowpart/highpart
5751 extraction, either via direct vector extract or through
5752 an integer mode punning. */
5753 tree dst1, dst2;
5754 gimple *epilog_stmt;
5755 if (convert_optab_handler (vec_extract_optab,
5756 TYPE_MODE (TREE_TYPE (new_temp)),
5757 TYPE_MODE (vectype1))
5758 != CODE_FOR_nothing)
5760 /* Extract sub-vectors directly once vec_extract becomes
5761 a conversion optab. */
5762 dst1 = make_ssa_name (vectype1);
5763 epilog_stmt
5764 = gimple_build_assign (dst1, BIT_FIELD_REF,
5765 build3 (BIT_FIELD_REF, vectype1,
5766 new_temp, TYPE_SIZE (vectype1),
5767 bitsize_int (0)));
5768 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5769 dst2 = make_ssa_name (vectype1);
5770 epilog_stmt
5771 = gimple_build_assign (dst2, BIT_FIELD_REF,
5772 build3 (BIT_FIELD_REF, vectype1,
5773 new_temp, TYPE_SIZE (vectype1),
5774 bitsize_int (bitsize)));
5775 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5777 else
5779 /* Extract via punning to appropriately sized integer mode
5780 vector. */
5781 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5782 tree etype = build_vector_type (eltype, 2);
5783 gcc_assert (convert_optab_handler (vec_extract_optab,
5784 TYPE_MODE (etype),
5785 TYPE_MODE (eltype))
5786 != CODE_FOR_nothing);
5787 tree tem = make_ssa_name (etype);
5788 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5789 build1 (VIEW_CONVERT_EXPR,
5790 etype, new_temp));
5791 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5792 new_temp = tem;
5793 tem = make_ssa_name (eltype);
5794 epilog_stmt
5795 = gimple_build_assign (tem, BIT_FIELD_REF,
5796 build3 (BIT_FIELD_REF, eltype,
5797 new_temp, TYPE_SIZE (eltype),
5798 bitsize_int (0)));
5799 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5800 dst1 = make_ssa_name (vectype1);
5801 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5802 build1 (VIEW_CONVERT_EXPR,
5803 vectype1, tem));
5804 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5805 tem = make_ssa_name (eltype);
5806 epilog_stmt
5807 = gimple_build_assign (tem, BIT_FIELD_REF,
5808 build3 (BIT_FIELD_REF, eltype,
5809 new_temp, TYPE_SIZE (eltype),
5810 bitsize_int (bitsize)));
5811 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5812 dst2 = make_ssa_name (vectype1);
5813 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5814 build1 (VIEW_CONVERT_EXPR,
5815 vectype1, tem));
5816 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5819 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5822 return new_temp;
5825 /* Function vect_create_epilog_for_reduction
5827 Create code at the loop-epilog to finalize the result of a reduction
5828 computation.
5830 STMT_INFO is the scalar reduction stmt that is being vectorized.
5831 SLP_NODE is an SLP node containing a group of reduction statements. The
5832 first one in this group is STMT_INFO.
5833 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5834 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5835 (counting from 0)
5837 This function:
5838 1. Completes the reduction def-use cycles.
5839 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5840 by calling the function specified by REDUC_FN if available, or by
5841 other means (whole-vector shifts or a scalar loop).
5842 The function also creates a new phi node at the loop exit to preserve
5843 loop-closed form, as illustrated below.
5845 The flow at the entry to this function:
5847 loop:
5848 vec_def = phi <vec_init, null> # REDUCTION_PHI
5849 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5850 s_loop = scalar_stmt # (scalar) STMT_INFO
5851 loop_exit:
5852 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5853 use <s_out0>
5854 use <s_out0>
5856 The above is transformed by this function into:
5858 loop:
5859 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5860 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5861 s_loop = scalar_stmt # (scalar) STMT_INFO
5862 loop_exit:
5863 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5864 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5865 v_out2 = reduce <v_out1>
5866 s_out3 = extract_field <v_out2, 0>
5867 s_out4 = adjust_result <s_out3>
5868 use <s_out4>
5869 use <s_out4>
5872 static void
5873 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5874 stmt_vec_info stmt_info,
5875 slp_tree slp_node,
5876 slp_instance slp_node_instance)
5878 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5879 gcc_assert (reduc_info->is_reduc_info);
5880 /* For double reductions we need to get at the inner loop reduction
5881 stmt which has the meta info attached. Our stmt_info is that of the
5882 loop-closed PHI of the inner loop which we remember as
5883 def for the reduction PHI generation. */
5884 bool double_reduc = false;
5885 stmt_vec_info rdef_info = stmt_info;
5886 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5888 gcc_assert (!slp_node);
5889 double_reduc = true;
5890 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5891 (stmt_info->stmt, 0));
5892 stmt_info = vect_stmt_to_vectorize (stmt_info);
5894 gphi *reduc_def_stmt
5895 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5896 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5897 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5898 tree vectype;
5899 machine_mode mode;
5900 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5901 basic_block exit_bb;
5902 tree scalar_dest;
5903 tree scalar_type;
5904 gimple *new_phi = NULL, *phi = NULL;
5905 gimple_stmt_iterator exit_gsi;
5906 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5907 gimple *epilog_stmt = NULL;
5908 gimple *exit_phi;
5909 tree bitsize;
5910 tree def;
5911 tree orig_name, scalar_result;
5912 imm_use_iterator imm_iter, phi_imm_iter;
5913 use_operand_p use_p, phi_use_p;
5914 gimple *use_stmt;
5915 auto_vec<tree> reduc_inputs;
5916 int j, i;
5917 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5918 unsigned int group_size = 1, k;
5919 auto_vec<gimple *> phis;
5920 /* SLP reduction without reduction chain, e.g.,
5921 # a1 = phi <a2, a0>
5922 # b1 = phi <b2, b0>
5923 a2 = operation (a1)
5924 b2 = operation (b1) */
5925 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5926 bool direct_slp_reduc;
5927 tree induction_index = NULL_TREE;
5929 if (slp_node)
5930 group_size = SLP_TREE_LANES (slp_node);
5932 if (nested_in_vect_loop_p (loop, stmt_info))
5934 outer_loop = loop;
5935 loop = loop->inner;
5936 gcc_assert (!slp_node && double_reduc);
5939 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5940 gcc_assert (vectype);
5941 mode = TYPE_MODE (vectype);
5943 tree induc_val = NULL_TREE;
5944 tree adjustment_def = NULL;
5945 if (slp_node)
5947 else
5949 /* Optimize: for induction condition reduction, if we can't use zero
5950 for induc_val, use initial_def. */
5951 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5952 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5953 else if (double_reduc)
5955 else
5956 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5959 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5960 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5961 if (slp_reduc)
5962 /* All statements produce live-out values. */
5963 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5964 else if (slp_node)
5966 /* The last statement in the reduction chain produces the live-out
5967 value. Note SLP optimization can shuffle scalar stmts to
5968 optimize permutations so we have to search for the last stmt. */
5969 for (k = 0; k < group_size; ++k)
5970 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5972 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5973 break;
5977 unsigned vec_num;
5978 int ncopies;
5979 if (slp_node)
5981 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5982 ncopies = 1;
5984 else
5986 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5987 vec_num = 1;
5988 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5991 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5992 which is updated with the current index of the loop for every match of
5993 the original loop's cond_expr (VEC_STMT). This results in a vector
5994 containing, for each vector lane, the index of the last time the condition passed.
5995 The first match will be a 1 to allow 0 to be used for non-matching
5996 indexes. If there are no matches at all then the vector will be all
5997 zeroes.
5999 PR92772: This algorithm is broken for architectures that support
6000 masked vectors, but do not provide fold_extract_last. */
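/* As a hypothetical illustration with four lanes: the index IV takes the
   values {1, 2, 3, 4}, {5, 6, 7, 8}, ... in successive vector iterations;
   a lane whose condition last matched in the second vector iteration ends
   up holding its value from {5, 6, 7, 8}, while a lane that never matched
   still holds 0.  */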
6001 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6003 auto_vec<std::pair<tree, bool>, 2> ccompares;
6004 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6005 cond_info = vect_stmt_to_vectorize (cond_info);
6006 while (cond_info != reduc_info)
6008 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6010 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6011 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6012 ccompares.safe_push
6013 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6014 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6016 cond_info
6017 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6018 1 + STMT_VINFO_REDUC_IDX
6019 (cond_info)));
6020 cond_info = vect_stmt_to_vectorize (cond_info);
6022 gcc_assert (ccompares.length () != 0);
6024 tree indx_before_incr, indx_after_incr;
6025 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6026 int scalar_precision
6027 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6028 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6029 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6030 (TYPE_MODE (vectype), cr_index_scalar_type,
6031 TYPE_VECTOR_SUBPARTS (vectype));
6033 /* First we create a simple vector induction variable which starts
6034 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6035 vector size (STEP). */
6037 /* Create a {1,2,3,...} vector. */
6038 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6040 /* Create a vector of the step value. */
6041 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6042 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6044 /* Create an induction variable. */
6045 gimple_stmt_iterator incr_gsi;
6046 bool insert_after;
6047 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6048 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6049 insert_after, &indx_before_incr, &indx_after_incr);
6051 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6052 filled with zeros (VEC_ZERO). */
6054 /* Create a vector of 0s. */
6055 tree zero = build_zero_cst (cr_index_scalar_type);
6056 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6058 /* Create a vector phi node. */
6059 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6060 new_phi = create_phi_node (new_phi_tree, loop->header);
6061 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6062 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6064 /* Now take the condition from the loop's original cond_exprs
6065 and produce a new cond_expr (INDEX_COND_EXPR) which for
6066 every match uses values from the induction variable
6067 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6068 (NEW_PHI_TREE).
6069 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6070 the new cond_expr (INDEX_COND_EXPR). */
6071 gimple_seq stmts = NULL;
6072 for (int i = ccompares.length () - 1; i != -1; --i)
6074 tree ccompare = ccompares[i].first;
6075 if (ccompares[i].second)
6076 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6077 cr_index_vector_type,
6078 ccompare,
6079 indx_before_incr, new_phi_tree);
6080 else
6081 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6082 cr_index_vector_type,
6083 ccompare,
6084 new_phi_tree, indx_before_incr);
6086 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6088 /* Update the phi with the vec cond. */
6089 induction_index = new_phi_tree;
6090 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6091 loop_latch_edge (loop), UNKNOWN_LOCATION);
6094 /* 2. Create epilog code.
6095 The reduction epilog code operates across the elements of the vector
6096 of partial results computed by the vectorized loop.
6097 The reduction epilog code consists of:
6099 step 1: compute the scalar result in a vector (v_out2)
6100 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6101 step 3: adjust the scalar result (s_out3) if needed.
6103 Step 1 can be accomplished using one of the following three schemes:
6104 (scheme 1) using reduc_fn, if available.
6105 (scheme 2) using whole-vector shifts, if available.
6106 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6107 combined.
6109 The overall epilog code looks like this:
6111 s_out0 = phi <s_loop> # original EXIT_PHI
6112 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6113 v_out2 = reduce <v_out1> # step 1
6114 s_out3 = extract_field <v_out2, 0> # step 2
6115 s_out4 = adjust_result <s_out3> # step 3
6117 (step 3 is optional, and steps 1 and 2 may be combined).
6118 Lastly, the uses of s_out0 are replaced by s_out4. */
6121 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6122 v_out1 = phi <VECT_DEF>
6123 Store them in NEW_PHIS. */
6124 if (double_reduc)
6125 loop = outer_loop;
6126 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6127 exit_gsi = gsi_after_labels (exit_bb);
6128 reduc_inputs.create (slp_node ? vec_num : ncopies);
6129 for (unsigned i = 0; i < vec_num; i++)
6131 gimple_seq stmts = NULL;
6132 if (slp_node)
6133 def = vect_get_slp_vect_def (slp_node, i);
6134 else
6135 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6136 for (j = 0; j < ncopies; j++)
6138 tree new_def = copy_ssa_name (def);
6139 phi = create_phi_node (new_def, exit_bb);
6140 if (j)
6141 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6142 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6143 new_def = gimple_convert (&stmts, vectype, new_def);
6144 reduc_inputs.quick_push (new_def);
6146 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6149 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6150 (i.e. when reduc_fn is not available) and in the final adjustment
6151 code (if needed). Also get the original scalar reduction variable as
6152 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6153 represents a reduction pattern), the tree-code and scalar-def are
6154 taken from the original stmt that the pattern-stmt (STMT) replaces.
6155 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6156 are taken from STMT. */
6158 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6159 if (orig_stmt_info != stmt_info)
6161 /* Reduction pattern */
6162 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6163 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6166 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6167 scalar_type = TREE_TYPE (scalar_dest);
6168 scalar_results.truncate (0);
6169 scalar_results.reserve_exact (group_size);
6170 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6171 bitsize = TYPE_SIZE (scalar_type);
6173 /* True if we should implement SLP_REDUC using native reduction operations
6174 instead of scalar operations. */
6175 direct_slp_reduc = (reduc_fn != IFN_LAST
6176 && slp_reduc
6177 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6179 /* In case of reduction chain, e.g.,
6180 # a1 = phi <a3, a0>
6181 a2 = operation (a1)
6182 a3 = operation (a2),
6184 we may end up with more than one vector result. Here we reduce them
6185 to one vector.
6187 The same is true for an SLP reduction, e.g.,
6188 # a1 = phi <a2, a0>
6189 # b1 = phi <b2, b0>
6190 a2 = operation (a1)
6191 b2 = operation (b1),
6193 where we can end up with more than one vector as well. We can
6194 easily accumulate vectors when the number of vector elements is
6195 a multiple of the SLP group size.
6197 The same is true if we couldn't use a single defuse cycle. */
6198 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6199 || direct_slp_reduc
6200 || (slp_reduc
6201 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6202 || ncopies > 1)
6204 gimple_seq stmts = NULL;
6205 tree single_input = reduc_inputs[0];
6206 for (k = 1; k < reduc_inputs.length (); k++)
6207 single_input = gimple_build (&stmts, code, vectype,
6208 single_input, reduc_inputs[k]);
6209 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6211 reduc_inputs.truncate (0);
6212 reduc_inputs.safe_push (single_input);
6215 tree orig_reduc_input = reduc_inputs[0];
6217 /* If this loop is an epilogue loop that can be skipped after the
6218 main loop, we can only share a reduction operation between the
6219 main loop and the epilogue if we put it at the target of the
6220 skip edge.
6222 We can still reuse accumulators if this check fails. Doing so has
6223 the minor(?) benefit of making the epilogue loop's scalar result
6224 independent of the main loop's scalar result. */
6225 bool unify_with_main_loop_p = false;
6226 if (reduc_info->reused_accumulator
6227 && loop_vinfo->skip_this_loop_edge
6228 && single_succ_p (exit_bb)
6229 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6231 unify_with_main_loop_p = true;
6233 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6234 reduc_inputs[0] = make_ssa_name (vectype);
6235 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6236 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6237 UNKNOWN_LOCATION);
6238 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6239 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6240 exit_gsi = gsi_after_labels (reduc_block);
6243 /* Shouldn't be used beyond this point. */
6244 exit_bb = nullptr;
6246 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6247 && reduc_fn != IFN_LAST)
6249 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6250 various data values where the condition matched and another vector
6251 (INDUCTION_INDEX) containing all the indexes of those matches. We
6252 need to extract the last matching index (which will be the index with
6253 highest value) and use this to index into the data vector.
6254 For the case where there were no matches, the data vector will contain
6255 all default values and the index vector will be all zeros. */
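/* For instance (hypothetically), with INDUCTION_INDEX = {0, 3, 0, 2} the
   IFN_REDUC_MAX below yields 3, the comparison selects only the lane whose
   index is 3, and the final unsigned MAX reduction therefore extracts that
   lane's data value, i.e. the value stored by the last iteration whose
   condition matched.  */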
6257 /* Get various versions of the type of the vector of indexes. */
6258 tree index_vec_type = TREE_TYPE (induction_index);
6259 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6260 tree index_scalar_type = TREE_TYPE (index_vec_type);
6261 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6263 /* Get an unsigned integer version of the type of the data vector. */
6264 int scalar_precision
6265 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6266 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6267 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6268 vectype);
6270 /* First we need to create a vector (ZERO_VEC) of zeros and another
6271 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6272 can create using a MAX reduction and then expanding.
6273 In the case where the loop never made any matches, the max index will
6274 be zero. */
6276 /* Vector of {0, 0, 0,...}. */
6277 tree zero_vec = build_zero_cst (vectype);
6279 /* Find maximum value from the vector of found indexes. */
6280 tree max_index = make_ssa_name (index_scalar_type);
6281 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6282 1, induction_index);
6283 gimple_call_set_lhs (max_index_stmt, max_index);
6284 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6286 /* Vector of {max_index, max_index, max_index,...}. */
6287 tree max_index_vec = make_ssa_name (index_vec_type);
6288 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6289 max_index);
6290 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6291 max_index_vec_rhs);
6292 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6294 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6295 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6296 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6297 otherwise. Only one value should match, resulting in a vector
6298 (VEC_COND) with one data value and the rest zeros.
6299 In the case where the loop never made any matches, every index will
6300 match, resulting in a vector with all data values (which will all be
6301 the default value). */
6303 /* Compare the max index vector to the vector of found indexes to find
6304 the position of the max value. */
6305 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6306 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6307 induction_index,
6308 max_index_vec);
6309 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6311 /* Use the compare to choose either values from the data vector or
6312 zero. */
6313 tree vec_cond = make_ssa_name (vectype);
6314 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6315 vec_compare,
6316 reduc_inputs[0],
6317 zero_vec);
6318 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6320 /* Finally we need to extract the data value from the vector (VEC_COND)
6321 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6322 reduction, but because this doesn't exist, we can use a MAX reduction
6323 instead. The data value might be signed or a float so we need to cast
6324 it first.
6325 In the case where the loop never made any matches, the data values are
6326 all identical, and so will reduce down correctly. */
6328 /* Make the matched data values unsigned. */
6329 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6330 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6331 vec_cond);
6332 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6333 VIEW_CONVERT_EXPR,
6334 vec_cond_cast_rhs);
6335 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6337 /* Reduce down to a scalar value. */
6338 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6339 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6340 1, vec_cond_cast);
6341 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6342 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6344 /* Convert the reduced value back to the result type and set as the
6345 result. */
6346 gimple_seq stmts = NULL;
6347 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6348 data_reduc);
6349 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6350 scalar_results.safe_push (new_temp);
6352 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6353 && reduc_fn == IFN_LAST)
6355 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6356 idx = 0;
6357 idx_val = induction_index[0];
6358 val = data_reduc[0];
6359 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6360 if (induction_index[i] > idx_val)
6361 val = data_reduc[i], idx_val = induction_index[i];
6362 return val; */
6364 tree data_eltype = TREE_TYPE (vectype);
6365 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6366 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6367 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6368 /* Enforced by vectorizable_reduction, which ensures we have target
6369 support before allowing a conditional reduction on variable-length
6370 vectors. */
6371 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6372 tree idx_val = NULL_TREE, val = NULL_TREE;
6373 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6375 tree old_idx_val = idx_val;
6376 tree old_val = val;
6377 idx_val = make_ssa_name (idx_eltype);
6378 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6379 build3 (BIT_FIELD_REF, idx_eltype,
6380 induction_index,
6381 bitsize_int (el_size),
6382 bitsize_int (off)));
6383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6384 val = make_ssa_name (data_eltype);
6385 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6386 build3 (BIT_FIELD_REF,
6387 data_eltype,
6388 reduc_inputs[0],
6389 bitsize_int (el_size),
6390 bitsize_int (off)));
6391 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6392 if (off != 0)
6394 tree new_idx_val = idx_val;
6395 if (off != v_size - el_size)
6397 new_idx_val = make_ssa_name (idx_eltype);
6398 epilog_stmt = gimple_build_assign (new_idx_val,
6399 MAX_EXPR, idx_val,
6400 old_idx_val);
6401 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6403 tree cond = make_ssa_name (boolean_type_node);
6404 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6405 idx_val, old_idx_val);
6406 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6407 tree new_val = make_ssa_name (data_eltype);
6408 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6409 cond, val, old_val);
6410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6411 idx_val = new_idx_val;
6412 val = new_val;
6415 /* Convert the reduced value back to the result type and set as the
6416 result. */
6417 gimple_seq stmts = NULL;
6418 val = gimple_convert (&stmts, scalar_type, val);
6419 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6420 scalar_results.safe_push (val);
6423 /* 2.3 Create the reduction code, using one of the three schemes described
6424 above. In SLP we simply need to extract all the elements from the
6425 vector (without reducing them), so we use scalar shifts. */
6426 else if (reduc_fn != IFN_LAST && !slp_reduc)
6428 tree tmp;
6429 tree vec_elem_type;
6431 /* Case 1: Create:
6432 v_out2 = reduc_expr <v_out1> */
6434 if (dump_enabled_p ())
6435 dump_printf_loc (MSG_NOTE, vect_location,
6436 "Reduce using direct vector reduction.\n");
6438 gimple_seq stmts = NULL;
6439 vec_elem_type = TREE_TYPE (vectype);
6440 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6441 vec_elem_type, reduc_inputs[0]);
6442 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6443 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6445 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6446 && induc_val)
6448 /* Earlier we set the initial value to be a vector of induc_val
6449 values. Check the result and if it is induc_val then replace it
6450 with the original initial value, unless induc_val is
6451 the same as initial_def already. */
6452 tree zcompare = make_ssa_name (boolean_type_node);
6453 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6454 new_temp, induc_val);
6455 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6456 tree initial_def = reduc_info->reduc_initial_values[0];
6457 tmp = make_ssa_name (new_scalar_dest);
6458 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6459 initial_def, new_temp);
6460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461 new_temp = tmp;
6464 scalar_results.safe_push (new_temp);
6466 else if (direct_slp_reduc)
6468 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6469 with the elements for other SLP statements replaced with the
6470 neutral value. We can then do a normal reduction on each vector. */
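/* For example (hypothetically), for a two-statement SLP reduction whose
   single input vector interleaves the lanes as {a0, b0, a1, b1, ...},
   the first iteration of the loop below builds {a0, neutral, a1, neutral,
   ...} and reduces it for the "a" result, and the second iteration builds
   {neutral, b0, neutral, b1, ...} for the "b" result.  */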
6472 /* Enforced by vectorizable_reduction. */
6473 gcc_assert (reduc_inputs.length () == 1);
6474 gcc_assert (pow2p_hwi (group_size));
6476 gimple_seq seq = NULL;
6478 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6479 and the same element size as VECTYPE. */
6480 tree index = build_index_vector (vectype, 0, 1);
6481 tree index_type = TREE_TYPE (index);
6482 tree index_elt_type = TREE_TYPE (index_type);
6483 tree mask_type = truth_type_for (index_type);
6485 /* Create a vector that, for each element, identifies which of
6486 the REDUC_GROUP_SIZE results should use it. */
6487 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6488 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6489 build_vector_from_val (index_type, index_mask));
6491 /* Get a neutral vector value. This is simply a splat of the neutral
6492 scalar value if we have one, otherwise the initial scalar value
6493 is itself a neutral value. */
6494 tree vector_identity = NULL_TREE;
6495 tree neutral_op = NULL_TREE;
6496 if (slp_node)
6498 tree initial_value = NULL_TREE;
6499 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6500 initial_value = reduc_info->reduc_initial_values[0];
6501 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6502 initial_value, false);
6504 if (neutral_op)
6505 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6506 neutral_op);
6507 for (unsigned int i = 0; i < group_size; ++i)
6509 /* If there's no universal neutral value, we can use the
6510 initial scalar value from the original PHI. This is used
6511 for MIN and MAX reduction, for example. */
6512 if (!neutral_op)
6514 tree scalar_value = reduc_info->reduc_initial_values[i];
6515 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6516 scalar_value);
6517 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6518 scalar_value);
6521 /* Calculate the equivalent of:
6523 sel[j] = (index[j] == i);
6525 which selects the elements of REDUC_INPUTS[0] that should
6526 be included in the result. */
6527 tree compare_val = build_int_cst (index_elt_type, i);
6528 compare_val = build_vector_from_val (index_type, compare_val);
6529 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6530 index, compare_val);
6532 /* Calculate the equivalent of:
6534 vec = sel ? reduc_inputs[0] : vector_identity;
6536 VEC is now suitable for a full vector reduction. */
6537 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6538 sel, reduc_inputs[0], vector_identity);
6540 /* Do the reduction and convert it to the appropriate type. */
6541 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6542 TREE_TYPE (vectype), vec);
6543 scalar = gimple_convert (&seq, scalar_type, scalar);
6544 scalar_results.safe_push (scalar);
6546 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6548 else
6550 bool reduce_with_shift;
6551 tree vec_temp;
6553 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6555 /* See if the target wants to do the final (shift) reduction
6556 in a vector mode of smaller size and first reduce upper/lower
6557 halves against each other. */
6558 enum machine_mode mode1 = mode;
6559 tree stype = TREE_TYPE (vectype);
6560 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6561 unsigned nunits1 = nunits;
6562 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6563 && reduc_inputs.length () == 1)
6565 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6566 /* For SLP reductions we have to make sure lanes match up, but
6567 since we're doing an individual-element final reduction, reducing
6568 the vector width here is even more important.
6569 ??? We can also separate lanes with permutes; for the common
6570 case of a power-of-two group size, odd/even extracts would work. */
6571 if (slp_reduc && nunits != nunits1)
6573 nunits1 = least_common_multiple (nunits1, group_size);
6574 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6577 if (!slp_reduc
6578 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6579 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6581 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6582 stype, nunits1);
6583 reduce_with_shift = have_whole_vector_shift (mode1);
6584 if (!VECTOR_MODE_P (mode1)
6585 || !directly_supported_p (code, vectype1))
6586 reduce_with_shift = false;
6588 /* First reduce the vector to the desired vector size we should
6589 do shift reduction on by combining upper and lower halves. */
6590 gimple_seq stmts = NULL;
6591 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6592 code, &stmts);
6593 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6594 reduc_inputs[0] = new_temp;
6596 if (reduce_with_shift && !slp_reduc)
6598 int element_bitsize = tree_to_uhwi (bitsize);
6599 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6600 for variable-length vectors and also requires direct target support
6601 for loop reductions. */
6602 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6603 int nelements = vec_size_in_bits / element_bitsize;
6604 vec_perm_builder sel;
6605 vec_perm_indices indices;
6607 int elt_offset;
6609 tree zero_vec = build_zero_cst (vectype1);
6610 /* Case 2: Create:
6611 for (offset = nelements/2; offset >= 1; offset/=2)
6613 Create: va' = vec_shift <va, offset>
6614 Create: va = vop <va, va'>
6615 } */
6617 tree rhs;
6619 if (dump_enabled_p ())
6620 dump_printf_loc (MSG_NOTE, vect_location,
6621 "Reduce using vector shifts\n");
6623 gimple_seq stmts = NULL;
6624 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6625 for (elt_offset = nelements / 2;
6626 elt_offset >= 1;
6627 elt_offset /= 2)
6629 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6630 indices.new_vector (sel, 2, nelements);
6631 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6632 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6633 new_temp, zero_vec, mask);
6634 new_temp = gimple_build (&stmts, code,
6635 vectype1, new_name, new_temp);
6637 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6639 /* 2.4 Extract the final scalar result. Create:
6640 s_out3 = extract_field <v_out2, bitpos> */
6642 if (dump_enabled_p ())
6643 dump_printf_loc (MSG_NOTE, vect_location,
6644 "extract scalar result\n");
6646 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6647 bitsize, bitsize_zero_node);
6648 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6649 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6650 gimple_assign_set_lhs (epilog_stmt, new_temp);
6651 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6652 scalar_results.safe_push (new_temp);
6654 else
6656 /* Case 3: Create:
6657 s = extract_field <v_out2, 0>
6658 for (offset = element_size;
6659 offset < vector_size;
6660 offset += element_size;)
6662 Create: s' = extract_field <v_out2, offset>
6663 Create: s = op <s, s'> // For non SLP cases
6664 } */
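/* Illustrative scalar sketch (not vectorizer code) of the open-coded
   extraction below, assuming a PLUS reduction over one 4-element input:

     int lanes[4] = { 1, 2, 3, 4 };    // vec_temp
     int s = lanes[0];                 // extract_field <v_out2, 0>
     for (int off = 1; off < 4; off++)
       s += lanes[off];                // s = op <s, s'>

   In the SLP case the individual extracts are instead collected as
   separate scalar results.  */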
6666 if (dump_enabled_p ())
6667 dump_printf_loc (MSG_NOTE, vect_location,
6668 "Reduce using scalar code.\n");
6670 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6671 int element_bitsize = tree_to_uhwi (bitsize);
6672 tree compute_type = TREE_TYPE (vectype);
6673 gimple_seq stmts = NULL;
6674 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6676 int bit_offset;
6677 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6678 vec_temp, bitsize, bitsize_zero_node);
6680 /* In SLP we don't need to apply the reduction operation, so we just
6681 collect s' values in SCALAR_RESULTS. */
6682 if (slp_reduc)
6683 scalar_results.safe_push (new_temp);
6685 for (bit_offset = element_bitsize;
6686 bit_offset < vec_size_in_bits;
6687 bit_offset += element_bitsize)
6689 tree bitpos = bitsize_int (bit_offset);
6690 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6691 compute_type, vec_temp,
6692 bitsize, bitpos);
6693 if (slp_reduc)
6695 /* In SLP we don't need to apply the reduction operation, so
6696 we just collect s' values in SCALAR_RESULTS. */
6697 new_temp = new_name;
6698 scalar_results.safe_push (new_name);
6700 else
6701 new_temp = gimple_build (&stmts, code, compute_type,
6702 new_name, new_temp);
6706 /* The only case where we need to reduce scalar results in SLP is
6707 unrolling. If the size of SCALAR_RESULTS is greater than
6708 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6709 REDUC_GROUP_SIZE. */
6710 if (slp_reduc)
6712 tree res, first_res, new_res;
6714 /* Reduce multiple scalar results in case of SLP unrolling. */
6715 for (j = group_size; scalar_results.iterate (j, &res);
6716 j++)
6718 first_res = scalar_results[j % group_size];
6719 new_res = gimple_build (&stmts, code, compute_type,
6720 first_res, res);
6721 scalar_results[j % group_size] = new_res;
6723 scalar_results.truncate (group_size);
6724 for (k = 0; k < group_size; k++)
6725 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6726 scalar_results[k]);
6728 else
6730 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6731 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6732 scalar_results.safe_push (new_temp);
6735 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6738 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6739 && induc_val)
6741 /* Earlier we set the initial value to be a vector of induc_val
6742 values. Check the result and if it is induc_val then replace
6743 with the original initial value, unless induc_val is
6744 the same as initial_def already. */
6745 tree zcompare = make_ssa_name (boolean_type_node);
6746 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6747 induc_val);
6748 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6749 tree initial_def = reduc_info->reduc_initial_values[0];
6750 tree tmp = make_ssa_name (new_scalar_dest);
6751 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6752 initial_def, new_temp);
6753 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6754 scalar_results[0] = tmp;
6758 /* 2.5 Adjust the final result by the initial value of the reduction
6759 variable. (When such adjustment is not needed, then
6760 'adjustment_def' is zero). For example, if code is PLUS we create:
6761 new_temp = loop_exit_def + adjustment_def */
6763 if (adjustment_def)
6765 gcc_assert (!slp_reduc);
6766 gimple_seq stmts = NULL;
6767 if (double_reduc)
6769 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6770 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6771 new_temp = gimple_build (&stmts, code, vectype,
6772 reduc_inputs[0], adjustment_def);
6774 else
6776 new_temp = scalar_results[0];
6777 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6778 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6779 adjustment_def);
6780 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6781 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6782 new_temp, adjustment_def);
6783 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6786 epilog_stmt = gimple_seq_last_stmt (stmts);
6787 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6788 scalar_results[0] = new_temp;
6791 /* Record this operation if it could be reused by the epilogue loop. */
6792 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6793 && reduc_inputs.length () == 1)
6794 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6795 { orig_reduc_input, reduc_info });
6797 if (double_reduc)
6798 loop = outer_loop;
6800 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6801 phis with new adjusted scalar results, i.e., replace use <s_out0>
6802 with use <s_out4>.
6804 Transform:
6805 loop_exit:
6806 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6807 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6808 v_out2 = reduce <v_out1>
6809 s_out3 = extract_field <v_out2, 0>
6810 s_out4 = adjust_result <s_out3>
6811 use <s_out0>
6812 use <s_out0>
6814 into:
6816 loop_exit:
6817 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6818 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6819 v_out2 = reduce <v_out1>
6820 s_out3 = extract_field <v_out2, 0>
6821 s_out4 = adjust_result <s_out3>
6822 use <s_out4>
6823 use <s_out4> */
6825 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6826 for (k = 0; k < live_out_stmts.size (); k++)
6828 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6829 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6831 phis.create (3);
6832 /* Find the loop-closed-use at the loop exit of the original scalar
6833 result. (The reduction result is expected to have two immediate uses,
6834 one at the latch block, and one at the loop exit). For double
6835 reductions we are looking for exit phis of the outer loop. */
6836 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6838 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6840 if (!is_gimple_debug (USE_STMT (use_p)))
6841 phis.safe_push (USE_STMT (use_p));
6843 else
6845 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6847 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6849 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6851 if (!flow_bb_inside_loop_p (loop,
6852 gimple_bb (USE_STMT (phi_use_p)))
6853 && !is_gimple_debug (USE_STMT (phi_use_p)))
6854 phis.safe_push (USE_STMT (phi_use_p));
6860 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6862 /* Replace the uses: */
6863 orig_name = PHI_RESULT (exit_phi);
6865 /* Look for a single use at the target of the skip edge. */
6866 if (unify_with_main_loop_p)
6868 use_operand_p use_p;
6869 gimple *user;
6870 if (!single_imm_use (orig_name, &use_p, &user))
6871 gcc_unreachable ();
6872 orig_name = gimple_get_lhs (user);
6875 scalar_result = scalar_results[k];
6876 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6878 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6879 SET_USE (use_p, scalar_result);
6880 update_stmt (use_stmt);
6884 phis.release ();
6888 /* Return a vector of type VECTYPE that is equal to the vector select
6889 operation "MASK ? VEC : IDENTITY". Insert the select statements
6890 before GSI. */
6892 static tree
6893 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6894 tree vec, tree identity)
6896 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6897 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6898 mask, vec, identity);
6899 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6900 return cond;
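/* Per-lane sketch of the VEC_COND_EXPR built above (illustration only;
   NUNITS stands for the number of vector elements):

     for (unsigned i = 0; i < nunits; ++i)
       cond[i] = mask[i] ? vec[i] : identity[i];
*/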
6903 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6904 order, starting with LHS. Insert the extraction statements before GSI and
6905 associate the new scalar SSA names with variable SCALAR_DEST.
6906 Return the SSA name for the result. */
6908 static tree
6909 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6910 tree_code code, tree lhs, tree vector_rhs)
6912 tree vectype = TREE_TYPE (vector_rhs);
6913 tree scalar_type = TREE_TYPE (vectype);
6914 tree bitsize = TYPE_SIZE (scalar_type);
6915 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6916 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6918 for (unsigned HOST_WIDE_INT bit_offset = 0;
6919 bit_offset < vec_size_in_bits;
6920 bit_offset += element_bitsize)
6922 tree bitpos = bitsize_int (bit_offset);
6923 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6924 bitsize, bitpos);
6926 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6927 rhs = make_ssa_name (scalar_dest, stmt);
6928 gimple_assign_set_lhs (stmt, rhs);
6929 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6931 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6932 tree new_name = make_ssa_name (scalar_dest, stmt);
6933 gimple_assign_set_lhs (stmt, new_name);
6934 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6935 lhs = new_name;
6937 return lhs;
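/* A scalar sketch of what vect_expand_fold_left expands to for a
   4-element vector and PLUS_EXPR (illustration only; the function and
   array names are made up):

     double expand_fold_left_plus (double lhs, const double v[4])
     {
       for (int i = 0; i < 4; i++)
         lhs = lhs + v[i];    // strictly left-to-right association
       return lhs;
     }

   The fixed left-to-right order is what distinguishes this from the
   tree-shaped reductions generated elsewhere in this file.  */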
6940 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6941 type of the vector input. */
6943 static internal_fn
6944 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6946 internal_fn mask_reduc_fn;
6947 internal_fn mask_len_reduc_fn;
6949 switch (reduc_fn)
6951 case IFN_FOLD_LEFT_PLUS:
6952 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6953 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6954 break;
6956 default:
6957 return IFN_LAST;
6960 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6961 OPTIMIZE_FOR_SPEED))
6962 return mask_reduc_fn;
6963 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6964 OPTIMIZE_FOR_SPEED))
6965 return mask_len_reduc_fn;
6966 return IFN_LAST;
6969 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6970 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6971 statement. CODE is the operation performed by STMT_INFO and OPS are
6972 its scalar operands. REDUC_INDEX is the index of the operand in
6973 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6974 implements in-order reduction, or IFN_LAST if we should open-code it.
6975 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6976 that should be used to control the operation in a fully-masked loop. */
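/* For illustration, a typical scalar loop that reaches this function
   (assumed example, not taken from a testcase):

     double res = init;
     for (int i = 0; i < n; i++)
       res += a[i];    // FP addition without -ffast-math

   Preserving the scalar semantics requires folding each vector of A
   into RES in lane order instead of reducing pairwise, which the code
   below arranges either via REDUC_FN or by open-coding the extraction
   with vect_expand_fold_left.  */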
6978 static bool
6979 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6980 stmt_vec_info stmt_info,
6981 gimple_stmt_iterator *gsi,
6982 gimple **vec_stmt, slp_tree slp_node,
6983 gimple *reduc_def_stmt,
6984 code_helper code, internal_fn reduc_fn,
6985 tree *ops, int num_ops, tree vectype_in,
6986 int reduc_index, vec_loop_masks *masks,
6987 vec_loop_lens *lens)
6989 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6990 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6991 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6993 int ncopies;
6994 if (slp_node)
6995 ncopies = 1;
6996 else
6997 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6999 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7000 gcc_assert (ncopies == 1);
7002 bool is_cond_op = false;
7003 if (!code.is_tree_code ())
7005 code = conditional_internal_fn_code (internal_fn (code));
7006 gcc_assert (code != ERROR_MARK);
7007 is_cond_op = true;
7010 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7012 if (slp_node)
7014 if (is_cond_op)
7016 if (dump_enabled_p ())
7017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7018 "fold-left reduction on SLP not supported.\n");
7019 return false;
7022 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7023 TYPE_VECTOR_SUBPARTS (vectype_in)));
7026 /* The operands either come from a binary operation or an IFN_COND operation.
7027 The former is a gimple assign with binary rhs and the latter is a
7028 gimple call with four arguments. */
7029 gcc_assert (num_ops == 2 || num_ops == 4);
7030 tree op0, opmask;
7031 if (!is_cond_op)
7032 op0 = ops[1 - reduc_index];
7033 else
7035 op0 = ops[2];
7036 opmask = ops[0];
7037 gcc_assert (!slp_node);
7040 int group_size = 1;
7041 stmt_vec_info scalar_dest_def_info;
7042 auto_vec<tree> vec_oprnds0, vec_opmask;
7043 if (slp_node)
7045 auto_vec<vec<tree> > vec_defs (2);
7046 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7047 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7048 vec_defs[0].release ();
7049 vec_defs[1].release ();
7050 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7051 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7053 else
7055 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7056 op0, &vec_oprnds0);
7057 scalar_dest_def_info = stmt_info;
7059 /* For an IFN_COND_OP we also need the vector mask operand. */
7060 if (is_cond_op)
7061 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7062 opmask, &vec_opmask);
7065 gimple *sdef = scalar_dest_def_info->stmt;
7066 tree scalar_dest = gimple_get_lhs (sdef);
7067 tree scalar_type = TREE_TYPE (scalar_dest);
7068 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7070 int vec_num = vec_oprnds0.length ();
7071 gcc_assert (vec_num == 1 || slp_node);
7072 tree vec_elem_type = TREE_TYPE (vectype_out);
7073 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7075 tree vector_identity = NULL_TREE;
7076 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7078 vector_identity = build_zero_cst (vectype_out);
7079 if (!HONOR_SIGNED_ZEROS (vectype_out))
7081 else
7083 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7084 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7085 vector_identity);
7089 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7090 int i;
7091 tree def0;
7092 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7094 gimple *new_stmt;
7095 tree mask = NULL_TREE;
7096 tree len = NULL_TREE;
7097 tree bias = NULL_TREE;
7098 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7099 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7100 else if (is_cond_op)
7101 mask = vec_opmask[0];
7102 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7104 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7105 i, 1);
7106 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7107 bias = build_int_cst (intQI_type_node, biasval);
7108 if (!is_cond_op)
7109 mask = build_minus_one_cst (truth_type_for (vectype_in));
7112 /* Handle MINUS by adding the negative. */
7113 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7115 tree negated = make_ssa_name (vectype_out);
7116 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7117 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7118 def0 = negated;
7121 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7122 && mask && mask_reduc_fn == IFN_LAST)
7123 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7124 vector_identity);
7126 /* On the first iteration the input is simply the scalar phi
7127 result, and for subsequent iterations it is the output of
7128 the preceding operation. */
7129 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7131 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7132 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7133 def0, mask, len, bias);
7134 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7135 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7136 def0, mask);
7137 else
7138 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7139 def0);
7140 /* For chained SLP reductions the output of the previous reduction
7141 operation serves as the input of the next. For the final statement
7142 the output cannot be a temporary - we reuse the original
7143 scalar destination of the last statement. */
7144 if (i != vec_num - 1)
7146 gimple_set_lhs (new_stmt, scalar_dest_var);
7147 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7148 gimple_set_lhs (new_stmt, reduc_var);
7151 else
7153 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7154 tree_code (code), reduc_var, def0);
7155 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7156 /* Remove the statement, so that we can use the same code paths
7157 as for statements that we've just created. */
7158 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7159 gsi_remove (&tmp_gsi, true);
7162 if (i == vec_num - 1)
7164 gimple_set_lhs (new_stmt, scalar_dest);
7165 vect_finish_replace_stmt (loop_vinfo,
7166 scalar_dest_def_info,
7167 new_stmt);
7169 else
7170 vect_finish_stmt_generation (loop_vinfo,
7171 scalar_dest_def_info,
7172 new_stmt, gsi);
7174 if (slp_node)
7175 slp_node->push_vec_def (new_stmt);
7176 else
7178 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7179 *vec_stmt = new_stmt;
7183 return true;
7186 /* Function is_nonwrapping_integer_induction.
7188 Check that STMT_VINFO (which is part of loop LOOP) both increments and
7189 does not cause overflow. */
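/* Worked example of the check below (illustration only): for an
   induction in a 16-bit unsigned type with BASE == 0 and STEP == 4,
   the largest value reached is roughly

     base + step * max_stmt_executions = 0 + 4 * niter_max

   and the induction is accepted only if that value still fits in the
   16-bit precision, i.e. niter_max <= 16383.  */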
7191 static bool
7192 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7194 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7195 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7196 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7197 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7198 widest_int ni, max_loop_value, lhs_max;
7199 wi::overflow_type overflow = wi::OVF_NONE;
7201 /* Make sure the loop is integer based. */
7202 if (TREE_CODE (base) != INTEGER_CST
7203 || TREE_CODE (step) != INTEGER_CST)
7204 return false;
7206 /* Check that the max size of the loop will not wrap. */
7208 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7209 return true;
7211 if (! max_stmt_executions (loop, &ni))
7212 return false;
7214 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7215 &overflow);
7216 if (overflow)
7217 return false;
7219 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7220 TYPE_SIGN (lhs_type), &overflow);
7221 if (overflow)
7222 return false;
7224 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7225 <= TYPE_PRECISION (lhs_type));
7228 /* Check if masking can be supported by inserting a conditional expression.
7229 CODE is the code for the operation. COND_FN is the conditional internal
7230 function, if it exists. VECTYPE_IN is the type of the vector input. */
7231 static bool
7232 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7233 tree vectype_in)
7235 if (cond_fn != IFN_LAST
7236 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7237 OPTIMIZE_FOR_SPEED))
7238 return false;
7240 if (code.is_tree_code ())
7241 switch (tree_code (code))
7243 case DOT_PROD_EXPR:
7244 case SAD_EXPR:
7245 return true;
7247 default:
7248 break;
7250 return false;
7253 /* Insert a conditional expression to enable masked vectorization. CODE is the
7254 code for the operation. VOP is the array of operands. MASK is the loop
7255 mask. GSI is a statement iterator used to place the new conditional
7256 expression. */
7257 static void
7258 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7259 gimple_stmt_iterator *gsi)
7261 switch (tree_code (code))
7263 case DOT_PROD_EXPR:
7265 tree vectype = TREE_TYPE (vop[1]);
7266 tree zero = build_zero_cst (vectype);
7267 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7268 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7269 mask, vop[1], zero);
7270 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7271 vop[1] = masked_op1;
7272 break;
7275 case SAD_EXPR:
7277 tree vectype = TREE_TYPE (vop[1]);
7278 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7279 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7280 mask, vop[1], vop[0]);
7281 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7282 vop[1] = masked_op1;
7283 break;
7286 default:
7287 gcc_unreachable ();
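/* Scalar view of the selects emitted above (illustration only):

     // DOT_PROD_EXPR: an inactive lane must contribute 0 to the sum,
     // so the multiplied operand is zeroed.
     op1 = mask ? op1 : 0;

     // SAD_EXPR: an inactive lane must contribute |op0 - op1| == 0,
     // so op1 is replaced by op0 instead of by 0.
     op1 = mask ? op1 : op0;
*/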
7291 /* Function vectorizable_reduction.
7293 Check if STMT_INFO performs a reduction operation that can be vectorized.
7294 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7295 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7296 Return true if STMT_INFO is vectorizable in this way.
7298 This function also handles reduction idioms (patterns) that have been
7299 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7300 may be of this form:
7301 X = pattern_expr (arg0, arg1, ..., X)
7302 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7303 sequence that had been detected and replaced by the pattern-stmt
7304 (STMT_INFO).
7306 This function also handles reduction of condition expressions, for example:
7307 for (int i = 0; i < N; i++)
7308 if (a[i] < value)
7309 last = a[i];
7310 This is handled by vectorising the loop and creating an additional vector
7311 containing the loop indexes for which "a[i] < value" was true. In the
7312 function epilogue this is reduced to a single max value and then used to
7313 index into the vector of results.
7315 In some cases of reduction patterns, the type of the reduction variable X is
7316 different than the type of the other arguments of STMT_INFO.
7317 In such cases, the vectype that is used when transforming STMT_INFO into
7318 a vector stmt is different than the vectype that is used to determine the
7319 vectorization factor, because it consists of a different number of elements
7320 than the actual number of elements that are being operated upon in parallel.
7322 For example, consider an accumulation of shorts into an int accumulator.
7323 On some targets it's possible to vectorize this pattern operating on 8
7324 shorts at a time (hence, the vectype for purposes of determining the
7325 vectorization factor should be V8HI); on the other hand, the vectype that
7326 is used to create the vector form is actually V4SI (the type of the result).
7328 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7329 indicates what is the actual level of parallelism (V8HI in the example), so
7330 that the right vectorization factor would be derived. This vectype
7331 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7332 be used to create the vectorized stmt. The right vectype for the vectorized
7333 stmt is obtained from the type of the result X:
7334 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7336 This means that, contrary to "regular" reductions (or "regular" stmts in
7337 general), the following equation:
7338 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7339 does *NOT* necessarily hold for reduction patterns. */
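/* For illustration, the widening accumulation described above
   (assumed example):

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];    // recognized as WIDEN_SUM <a[i], sum>

   Here the vectorization factor is derived from V8HI (the type of the
   elements of A) while the vector statement itself produces a V4SI
   accumulator, matching the discussion above.  */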
7341 bool
7342 vectorizable_reduction (loop_vec_info loop_vinfo,
7343 stmt_vec_info stmt_info, slp_tree slp_node,
7344 slp_instance slp_node_instance,
7345 stmt_vector_for_cost *cost_vec)
7347 tree vectype_in = NULL_TREE;
7348 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7349 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7350 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7351 stmt_vec_info cond_stmt_vinfo = NULL;
7352 int i;
7353 int ncopies;
7354 bool single_defuse_cycle = false;
7355 bool nested_cycle = false;
7356 bool double_reduc = false;
7357 int vec_num;
7358 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7359 tree cond_reduc_val = NULL_TREE;
7361 /* Make sure it was already recognized as a reduction computation. */
7362 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7363 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7364 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7365 return false;
7367 /* The stmt we store reduction analysis meta on. */
7368 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7369 reduc_info->is_reduc_info = true;
7371 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7373 if (is_a <gphi *> (stmt_info->stmt))
7375 if (slp_node)
7377 /* We eventually need to set a vector type on invariant
7378 arguments. */
7379 unsigned j;
7380 slp_tree child;
7381 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7382 if (!vect_maybe_update_slp_op_vectype
7383 (child, SLP_TREE_VECTYPE (slp_node)))
7385 if (dump_enabled_p ())
7386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7387 "incompatible vector types for "
7388 "invariants\n");
7389 return false;
7392 /* Analysis for double-reduction is done on the outer
7393 loop PHI, nested cycles have no further restrictions. */
7394 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7396 else
7397 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7398 return true;
7401 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7402 stmt_vec_info phi_info = stmt_info;
7403 if (!is_a <gphi *> (stmt_info->stmt))
7405 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7406 return true;
7408 if (slp_node)
7410 slp_node_instance->reduc_phis = slp_node;
7411 /* ??? We're leaving slp_node to point to the PHIs, we only
7412 need it to get at the number of vector stmts which wasn't
7413 yet initialized for the instance root. */
7415 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7417 use_operand_p use_p;
7418 gimple *use_stmt;
7419 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7420 &use_p, &use_stmt);
7421 gcc_assert (res);
7422 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7425 /* PHIs should not participate in patterns. */
7426 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7427 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7429 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
7430 and compute the reduction chain length. Discover the real
7431 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7432 tree reduc_def
7433 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7434 loop_latch_edge
7435 (gimple_bb (reduc_def_phi)->loop_father));
7436 unsigned reduc_chain_length = 0;
7437 bool only_slp_reduc_chain = true;
7438 stmt_info = NULL;
7439 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7440 while (reduc_def != PHI_RESULT (reduc_def_phi))
7442 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7443 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7444 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7446 if (dump_enabled_p ())
7447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7448 "reduction chain broken by patterns.\n");
7449 return false;
7451 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7452 only_slp_reduc_chain = false;
7453 /* For epilogue generation live members of the chain need
7454 to point back to the PHI via their original stmt for
7455 info_for_reduction to work. For SLP we need to look at
7456 all lanes here - even though we only will vectorize from
7457 the SLP node with live lane zero the other live lanes also
7458 need to be identified as part of a reduction to be able
7459 to skip code generation for them. */
7460 if (slp_for_stmt_info)
7462 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7463 if (STMT_VINFO_LIVE_P (s))
7464 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7466 else if (STMT_VINFO_LIVE_P (vdef))
7467 STMT_VINFO_REDUC_DEF (def) = phi_info;
7468 gimple_match_op op;
7469 if (!gimple_extract_op (vdef->stmt, &op))
7471 if (dump_enabled_p ())
7472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7473 "reduction chain includes unsupported"
7474 " statement type.\n");
7475 return false;
7477 if (CONVERT_EXPR_CODE_P (op.code))
7479 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7481 if (dump_enabled_p ())
7482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7483 "conversion in the reduction chain.\n");
7484 return false;
7487 else if (!stmt_info)
7488 /* First non-conversion stmt. */
7489 stmt_info = vdef;
7490 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7491 reduc_chain_length++;
7492 if (!stmt_info && slp_node)
7493 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7495 /* PHIs should not participate in patterns. */
7496 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7498 if (nested_in_vect_loop_p (loop, stmt_info))
7500 loop = loop->inner;
7501 nested_cycle = true;
7504 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7505 element. */
7506 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7508 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7509 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7511 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7512 gcc_assert (slp_node
7513 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7515 /* 1. Is vectorizable reduction? */
7516 /* Not supportable if the reduction variable is used in the loop, unless
7517 it's a reduction chain. */
7518 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7519 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7520 return false;
7522 /* Reductions that are not used even in an enclosing outer-loop,
7523 are expected to be "live" (used out of the loop). */
7524 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7525 && !STMT_VINFO_LIVE_P (stmt_info))
7526 return false;
7528 /* 2. Has this been recognized as a reduction pattern?
7530 Check if STMT represents a pattern that has been recognized
7531 in earlier analysis stages. For stmts that represent a pattern,
7532 the STMT_VINFO_RELATED_STMT field records the last stmt in
7533 the original sequence that constitutes the pattern. */
7535 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7536 if (orig_stmt_info)
7538 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7539 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7542 /* 3. Check the operands of the operation. The first operands are defined
7543 inside the loop body. The last operand is the reduction variable,
7544 which is defined by the loop-header-phi. */
7546 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7547 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7548 gimple_match_op op;
7549 if (!gimple_extract_op (stmt_info->stmt, &op))
7550 gcc_unreachable ();
7551 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7552 || op.code == WIDEN_SUM_EXPR
7553 || op.code == SAD_EXPR);
7555 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7556 && !SCALAR_FLOAT_TYPE_P (op.type))
7557 return false;
7559 /* Do not try to vectorize bit-precision reductions. */
7560 if (!type_has_mode_precision_p (op.type))
7561 return false;
7563 /* For lane-reducing ops we're reducing the number of reduction PHIs
7564 which means the only use of that may be in the lane-reducing operation. */
7565 if (lane_reduc_code_p
7566 && reduc_chain_length != 1
7567 && !only_slp_reduc_chain)
7569 if (dump_enabled_p ())
7570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7571 "lane-reducing reduction with extra stmts.\n");
7572 return false;
7575 /* All uses but the last are expected to be defined in the loop.
7576 The last use is the reduction variable. In case of nested cycle this
7577 assumption is not true: we use reduc_index to record the index of the
7578 reduction variable. */
7579 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7580 /* We need to skip an extra operand for COND_EXPRs with embedded
7581 comparison. */
7582 unsigned opno_adjust = 0;
7583 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7584 opno_adjust = 1;
7585 for (i = 0; i < (int) op.num_ops; i++)
7587 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7588 if (i == 0 && op.code == COND_EXPR)
7589 continue;
7591 stmt_vec_info def_stmt_info;
7592 enum vect_def_type dt;
7593 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7594 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7595 &vectype_op[i], &def_stmt_info))
7597 if (dump_enabled_p ())
7598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7599 "use not simple.\n");
7600 return false;
7602 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7603 continue;
7605 /* For an IFN_COND_OP we might hit the reduction definition operand
7606 twice (once as definition, once as else). */
7607 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7608 continue;
7610 /* There should be only one cycle def in the stmt, the one
7611 leading to reduc_def. */
7612 if (VECTORIZABLE_CYCLE_DEF (dt))
7613 return false;
7615 if (!vectype_op[i])
7616 vectype_op[i]
7617 = get_vectype_for_scalar_type (loop_vinfo,
7618 TREE_TYPE (op.ops[i]), slp_op[i]);
7620 /* To properly compute ncopies we are interested in the widest
7621 non-reduction input type in case we're looking at a widening
7622 accumulation that we later handle in vect_transform_reduction. */
7623 if (lane_reduc_code_p
7624 && vectype_op[i]
7625 && (!vectype_in
7626 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7627 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7628 vectype_in = vectype_op[i];
7630 if (op.code == COND_EXPR)
7632 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7633 if (dt == vect_constant_def)
7635 cond_reduc_dt = dt;
7636 cond_reduc_val = op.ops[i];
7638 if (dt == vect_induction_def
7639 && def_stmt_info
7640 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7642 cond_reduc_dt = dt;
7643 cond_stmt_vinfo = def_stmt_info;
7647 if (!vectype_in)
7648 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7649 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7651 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7652 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7653 /* If we have a condition reduction, see if we can simplify it further. */
7654 if (v_reduc_type == COND_REDUCTION)
7656 if (slp_node)
7657 return false;
7659 /* When the condition itself uses the reduction value, fail. */
7660 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7662 if (dump_enabled_p ())
7663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7664 "condition depends on previous iteration\n");
7665 return false;
7668 if (reduc_chain_length == 1
7669 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7670 OPTIMIZE_FOR_SPEED)
7671 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7672 vectype_in,
7673 OPTIMIZE_FOR_SPEED)))
7675 if (dump_enabled_p ())
7676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7677 "optimizing condition reduction with"
7678 " FOLD_EXTRACT_LAST.\n");
7679 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7681 else if (cond_reduc_dt == vect_induction_def)
7683 tree base
7684 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7685 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7687 gcc_assert (TREE_CODE (base) == INTEGER_CST
7688 && TREE_CODE (step) == INTEGER_CST);
7689 cond_reduc_val = NULL_TREE;
7690 enum tree_code cond_reduc_op_code = ERROR_MARK;
7691 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7692 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7694 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7695 above base; punt if base is the minimum value of the type for
7696 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7697 else if (tree_int_cst_sgn (step) == -1)
7699 cond_reduc_op_code = MIN_EXPR;
7700 if (tree_int_cst_sgn (base) == -1)
7701 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7702 else if (tree_int_cst_lt (base,
7703 TYPE_MAX_VALUE (TREE_TYPE (base))))
7704 cond_reduc_val
7705 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7707 else
7709 cond_reduc_op_code = MAX_EXPR;
7710 if (tree_int_cst_sgn (base) == 1)
7711 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7712 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7713 base))
7714 cond_reduc_val
7715 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7717 if (cond_reduc_val)
7719 if (dump_enabled_p ())
7720 dump_printf_loc (MSG_NOTE, vect_location,
7721 "condition expression based on "
7722 "integer induction.\n");
7723 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7724 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7725 = cond_reduc_val;
7726 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7729 else if (cond_reduc_dt == vect_constant_def)
7731 enum vect_def_type cond_initial_dt;
7732 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7733 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7734 if (cond_initial_dt == vect_constant_def
7735 && types_compatible_p (TREE_TYPE (cond_initial_val),
7736 TREE_TYPE (cond_reduc_val)))
7738 tree e = fold_binary (LE_EXPR, boolean_type_node,
7739 cond_initial_val, cond_reduc_val);
7740 if (e && (integer_onep (e) || integer_zerop (e)))
7742 if (dump_enabled_p ())
7743 dump_printf_loc (MSG_NOTE, vect_location,
7744 "condition expression based on "
7745 "compile time constant.\n");
7746 /* Record reduction code at analysis stage. */
7747 STMT_VINFO_REDUC_CODE (reduc_info)
7748 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7749 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7755 if (STMT_VINFO_LIVE_P (phi_info))
7756 return false;
7758 if (slp_node)
7759 ncopies = 1;
7760 else
7761 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7763 gcc_assert (ncopies >= 1);
7765 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7767 if (nested_cycle)
7769 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7770 == vect_double_reduction_def);
7771 double_reduc = true;
7774 /* 4.2. Check support for the epilog operation.
7776 If STMT represents a reduction pattern, then the type of the
7777 reduction variable may be different than the type of the rest
7778 of the arguments. For example, consider the case of accumulation
7779 of shorts into an int accumulator; The original code:
7780 S1: int_a = (int) short_a;
7781 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7783 was replaced with:
7784 STMT: int_acc = widen_sum <short_a, int_acc>
7786 This means that:
7787 1. The tree-code that is used to create the vector operation in the
7788 epilog code (that reduces the partial results) is not the
7789 tree-code of STMT, but is rather the tree-code of the original
7790 stmt from the pattern that STMT is replacing. I.e, in the example
7791 above we want to use 'widen_sum' in the loop, but 'plus' in the
7792 epilog.
7793 2. The type (mode) we use to check available target support
7794 for the vector operation to be created in the *epilog*, is
7795 determined by the type of the reduction variable (in the example
7796 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7797 However the type (mode) we use to check available target support
7798 for the vector operation to be created *inside the loop*, is
7799 determined by the type of the other arguments to STMT (in the
7800 example we'd check this: optab_handler (widen_sum_optab,
7801 vect_short_mode)).
7803 This is contrary to "regular" reductions, in which the types of all
7804 the arguments are the same as the type of the reduction variable.
7805 For "regular" reductions we can therefore use the same vector type
7806 (and also the same tree-code) when generating the epilog code and
7807 when generating the code inside the loop. */
7809 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7811 /* If-conversion might have created a conditional operation like
7812 IFN_COND_ADD already. Use the internal code for the following checks. */
7813 if (orig_code.is_internal_fn ())
7815 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7816 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7819 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7821 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7822 if (reduction_type == TREE_CODE_REDUCTION)
7824 /* Check whether it's ok to change the order of the computation.
7825 Generally, when vectorizing a reduction we change the order of the
7826 computation. This may change the behavior of the program in some
7827 cases, so we need to check that this is ok. One exception is when
7828 vectorizing an outer-loop: the inner-loop is executed sequentially,
7829 and therefore vectorizing reductions in the inner-loop during
7830 outer-loop vectorization is safe. Likewise when we are vectorizing
7831 a series of reductions using SLP and the VF is one the reductions
7832 are performed in scalar order. */
7833 if (slp_node
7834 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7835 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7837 else if (needs_fold_left_reduction_p (op.type, orig_code))
7839 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7840 is not directly used in stmt. */
7841 if (!only_slp_reduc_chain
7842 && reduc_chain_length != 1)
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7846 "in-order reduction chain without SLP.\n");
7847 return false;
7849 STMT_VINFO_REDUC_TYPE (reduc_info)
7850 = reduction_type = FOLD_LEFT_REDUCTION;
7852 else if (!commutative_binary_op_p (orig_code, op.type)
7853 || !associative_binary_op_p (orig_code, op.type))
7855 if (dump_enabled_p ())
7856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7857 "reduction: not commutative/associative\n");
7858 return false;
7862 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7863 && ncopies > 1)
7865 if (dump_enabled_p ())
7866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7867 "multiple types in double reduction or condition "
7868 "reduction or fold-left reduction.\n");
7869 return false;
7872 internal_fn reduc_fn = IFN_LAST;
7873 if (reduction_type == TREE_CODE_REDUCTION
7874 || reduction_type == FOLD_LEFT_REDUCTION
7875 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7876 || reduction_type == CONST_COND_REDUCTION)
7878 if (reduction_type == FOLD_LEFT_REDUCTION
7879 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7880 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7882 if (reduc_fn != IFN_LAST
7883 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7884 OPTIMIZE_FOR_SPEED))
7886 if (dump_enabled_p ())
7887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7888 "reduc op not supported by target.\n");
7890 reduc_fn = IFN_LAST;
7893 else
7895 if (!nested_cycle || double_reduc)
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "no reduc code for scalar code.\n");
7901 return false;
7905 else if (reduction_type == COND_REDUCTION)
7907 int scalar_precision
7908 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7909 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7910 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7911 vectype_out);
7913 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7914 OPTIMIZE_FOR_SPEED))
7915 reduc_fn = IFN_REDUC_MAX;
7917 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7919 if (reduction_type != EXTRACT_LAST_REDUCTION
7920 && (!nested_cycle || double_reduc)
7921 && reduc_fn == IFN_LAST
7922 && !nunits_out.is_constant ())
7924 if (dump_enabled_p ())
7925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7926 "missing target support for reduction on"
7927 " variable-length vectors.\n");
7928 return false;
7931 /* For SLP reductions, see if there is a neutral value we can use. */
7932 tree neutral_op = NULL_TREE;
7933 if (slp_node)
7935 tree initial_value = NULL_TREE;
7936 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7937 initial_value = vect_phi_initial_value (reduc_def_phi);
7938 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7939 orig_code, initial_value);
7942 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7944 /* We can't support in-order reductions of code such as this:
7946 for (int i = 0; i < n1; ++i)
7947 for (int j = 0; j < n2; ++j)
7948 l += a[j];
7950 since GCC effectively transforms the loop when vectorizing:
7952 for (int i = 0; i < n1 / VF; ++i)
7953 for (int j = 0; j < n2; ++j)
7954 for (int k = 0; k < VF; ++k)
7955 l += a[j];
7957 which is a reassociation of the original operation. */
7958 if (dump_enabled_p ())
7959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7960 "in-order double reduction not supported.\n");
7962 return false;
7965 if (reduction_type == FOLD_LEFT_REDUCTION
7966 && slp_node
7967 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7969 /* We cannot use in-order reductions in this case because there is
7970 an implicit reassociation of the operations involved. */
7971 if (dump_enabled_p ())
7972 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7973 "in-order unchained SLP reductions not supported.\n");
7974 return false;
7977 /* For double reductions, and for SLP reductions with a neutral value,
7978 we construct a variable-length initial vector by loading a vector
7979 full of the neutral value and then shift-and-inserting the start
7980 values into the low-numbered elements. */
7981 if ((double_reduc || neutral_op)
7982 && !nunits_out.is_constant ()
7983 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7984 vectype_out, OPTIMIZE_FOR_SPEED))
7986 if (dump_enabled_p ())
7987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7988 "reduction on variable-length vectors requires"
7989 " target support for a vector-shift-and-insert"
7990 " operation.\n");
7991 return false;
7994 /* Check extra constraints for variable-length unchained SLP reductions. */
7995 if (slp_node
7996 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7997 && !nunits_out.is_constant ())
7999 /* We checked above that we could build the initial vector when
8000 there's a neutral element value. Check here for the case in
8001 which each SLP statement has its own initial value and in which
8002 that value needs to be repeated for every instance of the
8003 statement within the initial vector. */
8004 unsigned int group_size = SLP_TREE_LANES (slp_node);
8005 if (!neutral_op
8006 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8007 TREE_TYPE (vectype_out)))
8009 if (dump_enabled_p ())
8010 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8011 "unsupported form of SLP reduction for"
8012 " variable-length vectors: cannot build"
8013 " initial vector.\n");
8014 return false;
8016 /* The epilogue code relies on the number of elements being a multiple
8017 of the group size. The duplicate-and-interleave approach to setting
8018 up the initial vector does too. */
8019 if (!multiple_p (nunits_out, group_size))
8021 if (dump_enabled_p ())
8022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8023 "unsupported form of SLP reduction for"
8024 " variable-length vectors: the vector size"
8025 " is not a multiple of the number of results.\n");
8026 return false;
8030 if (reduction_type == COND_REDUCTION)
8032 widest_int ni;
8034 if (! max_loop_iterations (loop, &ni))
8036 if (dump_enabled_p ())
8037 dump_printf_loc (MSG_NOTE, vect_location,
8038 "loop count not known, cannot create cond "
8039 "reduction.\n");
8040 return false;
8042 /* Convert backedges to iterations. */
8043 ni += 1;
8045 /* The additional index will be the same type as the condition. Check
8046 that the loop can fit into this less one (because we'll use up the
8047 zero slot for when there are no matches). */
8048 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8049 if (wi::geu_p (ni, wi::to_widest (max_index)))
8051 if (dump_enabled_p ())
8052 dump_printf_loc (MSG_NOTE, vect_location,
8053 "loop size is greater than data size.\n");
8054 return false;
8058 /* In case the vectorization factor (VF) is bigger than the number
8059 of elements that we can fit in a vectype (nunits), we have to generate
8060 more than one vector stmt - i.e - we need to "unroll" the
8061 vector stmt by a factor VF/nunits. For more details see documentation
8062 in vectorizable_operation. */
8064 /* If the reduction is used in an outer loop we need to generate
8065 VF intermediate results, like so (e.g. for ncopies=2):
8066 r0 = phi (init, r0)
8067 r1 = phi (init, r1)
8068 r0 = x0 + r0;
8069 r1 = x1 + r1;
8070 (i.e. we generate VF results in 2 registers).
8071 In this case we have a separate def-use cycle for each copy, and therefore
8072 for each copy we get the vector def for the reduction variable from the
8073 respective phi node created for this copy.
8075 Otherwise (the reduction is unused in the loop nest), we can combine
8076 together intermediate results, like so (e.g. for ncopies=2):
8077 r = phi (init, r)
8078 r = x0 + r;
8079 r = x1 + r;
8080 (i.e. we generate VF/2 results in a single register).
8081 In this case for each copy we get the vector def for the reduction variable
8082 from the vectorized reduction operation generated in the previous iteration.
8084 This only works when we see both the reduction PHI and its only consumer
8085 in vectorizable_reduction and there are no intermediate stmts
8086 participating. When unrolling we want each unrolled iteration to have its
8087 own reduction accumulator since one of the main goals of unrolling a
8088 reduction is to reduce the aggregate loop-carried latency. */
8089 if (ncopies > 1
8090 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8091 && reduc_chain_length == 1
8092 && loop_vinfo->suggested_unroll_factor == 1)
8093 single_defuse_cycle = true;
8095 if (single_defuse_cycle || lane_reduc_code_p)
8097 gcc_assert (op.code != COND_EXPR);
8099 /* 4. Supportable by target? */
8100 bool ok = true;
8102 /* 4.1. check support for the operation in the loop
8104 This isn't necessary for the lane reduction codes, since they
8105 can only be produced by pattern matching, and it's up to the
8106 pattern matcher to test for support. The main reason for
8107 specifically skipping this step is to avoid rechecking whether
8108 mixed-sign dot-products can be implemented using signed
8109 dot-products. */
8110 machine_mode vec_mode = TYPE_MODE (vectype_in);
8111 if (!lane_reduc_code_p
8112 && !directly_supported_p (op.code, vectype_in, optab_vector))
8114 if (dump_enabled_p ())
8115 dump_printf (MSG_NOTE, "op not supported by target.\n");
8116 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8117 || !vect_can_vectorize_without_simd_p (op.code))
8118 ok = false;
8119 else
8120 if (dump_enabled_p ())
8121 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8124 if (vect_emulated_vector_p (vectype_in)
8125 && !vect_can_vectorize_without_simd_p (op.code))
8127 if (dump_enabled_p ())
8128 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8129 return false;
8132 /* lane-reducing operations have to go through vect_transform_reduction.
8133 For the other cases try without the single cycle optimization. */
8134 if (!ok)
8136 if (lane_reduc_code_p)
8137 return false;
8138 else
8139 single_defuse_cycle = false;
8142 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8144 /* If the reduction stmt is one of the patterns that have lane
8145 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8146 if ((ncopies > 1 && ! single_defuse_cycle)
8147 && lane_reduc_code_p)
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8151 "multi def-use cycle not possible for lane-reducing "
8152 "reduction operation\n");
8153 return false;
8156 if (slp_node
8157 && !(!single_defuse_cycle
8158 && !lane_reduc_code_p
8159 && reduction_type != FOLD_LEFT_REDUCTION))
8160 for (i = 0; i < (int) op.num_ops; i++)
8161 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8163 if (dump_enabled_p ())
8164 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8165 "incompatible vector types for invariants\n");
8166 return false;
8169 if (slp_node)
8170 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8171 else
8172 vec_num = 1;
8174 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8175 reduction_type, ncopies, cost_vec);
8176 /* Cost the reduction op inside the loop if transformed via
8177 vect_transform_reduction. Otherwise this is costed by the
8178 separate vectorizable_* routines. */
8179 if (single_defuse_cycle || lane_reduc_code_p)
8181 int factor = 1;
8182 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8183 /* Three dot-products and a subtraction. */
8184 factor = 4;
8185 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8186 stmt_info, 0, vect_body);
8189 if (dump_enabled_p ()
8190 && reduction_type == FOLD_LEFT_REDUCTION)
8191 dump_printf_loc (MSG_NOTE, vect_location,
8192 "using an in-order (fold-left) reduction.\n");
8193 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8194 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8195 reductions go through their own vectorizable_* routines. */
8196 if (!single_defuse_cycle
8197 && !lane_reduc_code_p
8198 && reduction_type != FOLD_LEFT_REDUCTION)
8200 stmt_vec_info tem
8201 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8202 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8204 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8205 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8207 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8208 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8210 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8212 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8213 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8214 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8216 if (reduction_type != FOLD_LEFT_REDUCTION
8217 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8218 && (cond_fn == IFN_LAST
8219 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8220 OPTIMIZE_FOR_SPEED)))
8222 if (dump_enabled_p ())
8223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8224 "can't operate on partial vectors because"
8225 " no conditional operation is available.\n");
8226 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8228 else if (reduction_type == FOLD_LEFT_REDUCTION
8229 && reduc_fn == IFN_LAST
8230 && !expand_vec_cond_expr_p (vectype_in,
8231 truth_type_for (vectype_in),
8232 SSA_NAME))
8234 if (dump_enabled_p ())
8235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8236 "can't operate on partial vectors because"
8237 " no conditional operation is available.\n");
8238 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8240 else if (reduction_type == FOLD_LEFT_REDUCTION
8241 && internal_fn_mask_index (reduc_fn) == -1
8242 && FLOAT_TYPE_P (vectype_in)
8243 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8245 if (dump_enabled_p ())
8246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8247 "can't operate on partial vectors because"
8248 " signed zeros cannot be preserved.\n");
8249 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8251 else
8253 internal_fn mask_reduc_fn
8254 = get_masked_reduction_fn (reduc_fn, vectype_in);
8256 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8257 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8258 vectype_in, 1);
8259 else
8260 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8261 vectype_in, NULL);
8264 return true;
8267 /* STMT_INFO is a dot-product reduction whose multiplication operands
8268 have different signs. Emit a sequence to emulate the operation
8269 using a series of signed DOT_PROD_EXPRs and return the last
8270 statement generated. VEC_DEST is the result of the vector operation
8271 and VOP lists its inputs. */
8273 static gassign *
8274 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8275 gimple_stmt_iterator *gsi, tree vec_dest,
8276 tree vop[3])
8278 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8279 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8280 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8281 gimple *new_stmt;
8283 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8284 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8285 std::swap (vop[0], vop[1]);
8287 /* Convert all inputs to signed types. */
8288 for (int i = 0; i < 3; ++i)
8289 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8291 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8292 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8293 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8294 vop[i] = tmp;
8297 /* In the comments below we assume 8-bit inputs for simplicity,
8298 but the approach works for any full integer type. */
8300 /* Create a vector of -128. */
8301 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8302 tree min_narrow = build_vector_from_val (narrow_vectype,
8303 min_narrow_elttype);
8305 /* Create a vector of 64. */
8306 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8307 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8308 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8310 /* Emit: SUB_RES = VOP[0] - 128. */
8311 tree sub_res = make_ssa_name (narrow_vectype);
8312 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8313 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8315 /* Emit:
8317 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8318 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8319 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8321 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8322 Doing the two 64 * y steps first allows more time to compute x. */
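 /* Editor's note (illustrative, not from the original sources): with 8-bit
 inputs the identity can be checked numerically, e.g. for x = 200 and
 y = -3: (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600,
 which equals 200 * -3. */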
8323 tree stage1 = make_ssa_name (wide_vectype);
8324 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8325 vop[1], half_narrow, vop[2]);
8326 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8328 tree stage2 = make_ssa_name (wide_vectype);
8329 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8330 vop[1], half_narrow, stage1);
8331 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8333 tree stage3 = make_ssa_name (wide_vectype);
8334 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8335 sub_res, vop[1], stage2);
8336 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8338 /* Convert STAGE3 to the reduction type. */
8339 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8342 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8343 value. */
8345 bool
8346 vect_transform_reduction (loop_vec_info loop_vinfo,
8347 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8348 gimple **vec_stmt, slp_tree slp_node)
8350 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8351 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8352 int i;
8353 int ncopies;
8354 int vec_num;
8356 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8357 gcc_assert (reduc_info->is_reduc_info);
8359 if (nested_in_vect_loop_p (loop, stmt_info))
8361 loop = loop->inner;
8362 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8365 gimple_match_op op;
8366 if (!gimple_extract_op (stmt_info->stmt, &op))
8367 gcc_unreachable ();
8369 /* All uses but the last are expected to be defined in the loop.
8370 The last use is the reduction variable. In case of nested cycle this
8371 assumption is not true: we use reduc_index to record the index of the
8372 reduction variable. */
8373 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8374 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8375 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8376 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8378 if (slp_node)
8380 ncopies = 1;
8381 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8383 else
8385 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8386 vec_num = 1;
8389 code_helper code = canonicalize_code (op.code, op.type);
8390 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8392 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8393 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8394 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8396 /* Transform. */
8397 tree new_temp = NULL_TREE;
8398 auto_vec<tree> vec_oprnds0;
8399 auto_vec<tree> vec_oprnds1;
8400 auto_vec<tree> vec_oprnds2;
8401 tree def0;
8403 if (dump_enabled_p ())
8404 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8406 /* FORNOW: Multiple types are not supported for condition. */
8407 if (code == COND_EXPR)
8408 gcc_assert (ncopies == 1);
8410 /* A binary COND_OP reduction must have the same definition and else
8411 value. */
8412 bool cond_fn_p = code.is_internal_fn ()
8413 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8414 if (cond_fn_p)
8416 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8417 || code == IFN_COND_MUL || code == IFN_COND_AND
8418 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8419 gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
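 /* Editor's note (illustrative, not from the original sources): the assert
 above describes a reduction such as acc = .COND_ADD (mask, acc, x, acc),
 i.e. mask ? acc + x : acc, so inactive lanes leave the accumulator
 unchanged. */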
8422 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8424 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8425 if (reduction_type == FOLD_LEFT_REDUCTION)
8427 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8428 gcc_assert (code.is_tree_code () || cond_fn_p);
8429 return vectorize_fold_left_reduction
8430 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8431 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8432 reduc_index, masks, lens);
8435 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8436 gcc_assert (single_defuse_cycle
8437 || code == DOT_PROD_EXPR
8438 || code == WIDEN_SUM_EXPR
8439 || code == SAD_EXPR);
8441 /* Create the destination vector */
8442 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8443 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8445 /* Get NCOPIES vector definitions for all operands except the reduction
8446 definition. */
8447 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8448 single_defuse_cycle && reduc_index == 0
8449 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8450 single_defuse_cycle && reduc_index == 1
8451 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8452 op.num_ops == 4
8453 || (op.num_ops == 3
8454 && !(single_defuse_cycle && reduc_index == 2))
8455 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8457 /* For single def-use cycles get one copy of the vectorized reduction
8458 definition. */
8459 if (single_defuse_cycle)
8461 gcc_assert (!slp_node);
8462 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8463 op.ops[reduc_index],
8464 reduc_index == 0 ? &vec_oprnds0
8465 : (reduc_index == 1 ? &vec_oprnds1
8466 : &vec_oprnds2));
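 /* Editor's note (illustrative, not from the original sources): with a
 single def-use cycle only one vector accumulator is live; further down
 the result of copy i is pushed back as the reduction operand of copy
 i + 1 instead of using ncopies independent accumulators. */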
8469 bool emulated_mixed_dot_prod
8470 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8471 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8473 gimple *new_stmt;
8474 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8475 if (masked_loop_p && !mask_by_cond_expr)
8477 /* No conditional ifns have been defined for dot-product yet. */
8478 gcc_assert (code != DOT_PROD_EXPR);
8480 /* Make sure that the reduction accumulator is vop[0]. */
8481 if (reduc_index == 1)
8483 gcc_assert (commutative_binary_op_p (code, op.type));
8484 std::swap (vop[0], vop[1]);
8486 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8487 vec_num * ncopies, vectype_in, i);
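 /* Editor's note (illustrative, not from the original sources): for a
 PLUS reduction the call built below is e.g. .COND_ADD (mask, acc, x, acc),
 so lanes with a zero mask bit simply carry the accumulator through. */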
8488 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8489 vop[0], vop[1], vop[0]);
8490 new_temp = make_ssa_name (vec_dest, call);
8491 gimple_call_set_lhs (call, new_temp);
8492 gimple_call_set_nothrow (call, true);
8493 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8494 new_stmt = call;
8496 else
8498 if (op.num_ops >= 3)
8499 vop[2] = vec_oprnds2[i];
8501 if (masked_loop_p && mask_by_cond_expr)
8503 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8504 vec_num * ncopies, vectype_in, i);
8505 build_vect_cond_expr (code, vop, mask, gsi);
8508 if (emulated_mixed_dot_prod)
8509 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8510 vec_dest, vop);
8512 else if (code.is_internal_fn () && !cond_fn_p)
8513 new_stmt = gimple_build_call_internal (internal_fn (code),
8514 op.num_ops,
8515 vop[0], vop[1], vop[2]);
8516 else if (code.is_internal_fn () && cond_fn_p)
8517 new_stmt = gimple_build_call_internal (internal_fn (code),
8518 op.num_ops,
8519 vop[0], vop[1], vop[2],
8520 vop[1]);
8521 else
8522 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8523 vop[0], vop[1], vop[2]);
8524 new_temp = make_ssa_name (vec_dest, new_stmt);
8525 gimple_set_lhs (new_stmt, new_temp);
8526 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8529 if (slp_node)
8530 slp_node->push_vec_def (new_stmt);
8531 else if (single_defuse_cycle
8532 && i < ncopies - 1)
8534 if (reduc_index == 0)
8535 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8536 else if (reduc_index == 1)
8537 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8538 else if (reduc_index == 2)
8539 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8541 else
8542 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8545 if (!slp_node)
8546 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8548 return true;
8551 /* Transform phase of a cycle PHI. */
8553 bool
8554 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8555 stmt_vec_info stmt_info, gimple **vec_stmt,
8556 slp_tree slp_node, slp_instance slp_node_instance)
8558 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8559 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8560 int i;
8561 int ncopies;
8562 int j;
8563 bool nested_cycle = false;
8564 int vec_num;
8566 if (nested_in_vect_loop_p (loop, stmt_info))
8568 loop = loop->inner;
8569 nested_cycle = true;
8572 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8573 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8574 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8575 gcc_assert (reduc_info->is_reduc_info);
8577 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8578 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8579 /* Leave the scalar phi in place. */
8580 return true;
8582 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8583 /* For a nested cycle we do not fill the above. */
8584 if (!vectype_in)
8585 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8586 gcc_assert (vectype_in);
8588 if (slp_node)
8590 /* The size vect_schedule_slp_instance computes is off for us. */
8591 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8592 * SLP_TREE_LANES (slp_node), vectype_in);
8593 ncopies = 1;
8595 else
8597 vec_num = 1;
8598 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8601 /* Check whether we should use a single PHI node and accumulate
8602 vectors to one before the backedge. */
8603 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8604 ncopies = 1;
8606 /* Create the destination vector */
8607 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8608 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8609 vectype_out);
8611 /* Get the loop-entry arguments. */
8612 tree vec_initial_def = NULL_TREE;
8613 auto_vec<tree> vec_initial_defs;
8614 if (slp_node)
8616 vec_initial_defs.reserve (vec_num);
8617 if (nested_cycle)
8619 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8620 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8621 &vec_initial_defs);
8623 else
8625 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8626 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8627 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8629 unsigned int num_phis = stmts.length ();
8630 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8631 num_phis = 1;
8632 initial_values.reserve (num_phis);
8633 for (unsigned int i = 0; i < num_phis; ++i)
8635 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8636 initial_values.quick_push (vect_phi_initial_value (this_phi));
8638 if (vec_num == 1)
8639 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8640 if (!initial_values.is_empty ())
8642 tree initial_value
8643 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8644 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8645 tree neutral_op
8646 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8647 code, initial_value);
8648 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8649 &vec_initial_defs, vec_num,
8650 stmts.length (), neutral_op);
8654 else
8656 /* Get at the scalar def before the loop, that defines the initial
8657 value of the reduction variable. */
8658 tree initial_def = vect_phi_initial_value (phi);
8659 reduc_info->reduc_initial_values.safe_push (initial_def);
8660 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8661 and we can't use zero for induc_val, use initial_def. Similarly
8662 for REDUC_MIN and initial_def larger than the base. */
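 /* Editor's note (illustrative, not from the original sources): e.g. for a
 MAX-style condition reduction with induc_val 5 and initial_def 3, the
 vector is filled with 3 instead, and clearing the field below tells the
 epilogue code that initial_def was used. */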
8663 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8665 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8666 if (TREE_CODE (initial_def) == INTEGER_CST
8667 && !integer_zerop (induc_val)
8668 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8669 && tree_int_cst_lt (initial_def, induc_val))
8670 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8671 && tree_int_cst_lt (induc_val, initial_def))))
8673 induc_val = initial_def;
8674 /* Communicate that we used the initial_def to epilogue
8675 generation. */
8676 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8678 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8680 else if (nested_cycle)
8682 /* Do not use an adjustment def as that case is not supported
8683 correctly if ncopies is not one. */
8684 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8685 ncopies, initial_def,
8686 &vec_initial_defs);
8688 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8689 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8690 /* Fill the initial vector with the initial scalar value. */
8691 vec_initial_def
8692 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8693 initial_def, initial_def);
8694 else
8696 if (ncopies == 1)
8697 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8698 if (!reduc_info->reduc_initial_values.is_empty ())
8700 initial_def = reduc_info->reduc_initial_values[0];
8701 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8702 tree neutral_op
8703 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8704 code, initial_def);
8705 gcc_assert (neutral_op);
8706 /* Try to simplify the vector initialization by applying an
8707 adjustment after the reduction has been performed. */
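 /* Editor's note (illustrative, not from the original sources): e.g. a PLUS
 reduction starting from 10 is vectorized as if it started from the
 neutral value 0, and the 10 recorded below is added back when the
 epilogue reduces the vector result. */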
8708 if (!reduc_info->reused_accumulator
8709 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8710 && !operand_equal_p (neutral_op, initial_def))
8712 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8713 = initial_def;
8714 initial_def = neutral_op;
8716 vec_initial_def
8717 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8718 initial_def, neutral_op);
8723 if (vec_initial_def)
8725 vec_initial_defs.create (ncopies);
8726 for (i = 0; i < ncopies; ++i)
8727 vec_initial_defs.quick_push (vec_initial_def);
8730 if (auto *accumulator = reduc_info->reused_accumulator)
8732 tree def = accumulator->reduc_input;
8733 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8735 unsigned int nreduc;
8736 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8737 (TREE_TYPE (def)),
8738 TYPE_VECTOR_SUBPARTS (vectype_out),
8739 &nreduc);
8740 gcc_assert (res);
8741 gimple_seq stmts = NULL;
8742 /* Reduce the single vector to a smaller one. */
8743 if (nreduc != 1)
8745 /* Perform the reduction in the appropriate type. */
8746 tree rvectype = vectype_out;
8747 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8748 TREE_TYPE (TREE_TYPE (def))))
8749 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8750 TYPE_VECTOR_SUBPARTS
8751 (vectype_out));
8752 def = vect_create_partial_epilog (def, rvectype,
8753 STMT_VINFO_REDUC_CODE
8754 (reduc_info),
8755 &stmts);
8757 /* The epilogue loop might use a different vector mode, like
8758 VNx2DI vs. V2DI. */
8759 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8761 tree reduc_type = build_vector_type_for_mode
8762 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8763 def = gimple_convert (&stmts, reduc_type, def);
8765 /* Adjust the input so we pick up the partially reduced value
8766 for the skip edge in vect_create_epilog_for_reduction. */
8767 accumulator->reduc_input = def;
8768 /* And the reduction could be carried out using a different sign. */
8769 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8770 def = gimple_convert (&stmts, vectype_out, def);
8771 if (loop_vinfo->main_loop_edge)
8773 /* While we'd like to insert on the edge, this would split
8774 blocks and disturb bookkeeping; we also will eventually
8775 need this on the skip edge. Rely on sinking to
8776 fix up the optimal placement and insert in the pred. */
8777 gimple_stmt_iterator gsi
8778 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8779 /* Insert before a cond that eventually skips the
8780 epilogue. */
8781 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8782 gsi_prev (&gsi);
8783 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8785 else
8786 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8787 stmts);
8789 if (loop_vinfo->main_loop_edge)
8790 vec_initial_defs[0]
8791 = vect_get_main_loop_result (loop_vinfo, def,
8792 vec_initial_defs[0]);
8793 else
8794 vec_initial_defs.safe_push (def);
8797 /* Generate the reduction PHIs upfront. */
8798 for (i = 0; i < vec_num; i++)
8800 tree vec_init_def = vec_initial_defs[i];
8801 for (j = 0; j < ncopies; j++)
8803 /* Create the reduction-phi that defines the reduction
8804 operand. */
8805 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8807 /* Set the loop-entry arg of the reduction-phi. */
8808 if (j != 0 && nested_cycle)
8809 vec_init_def = vec_initial_defs[j];
8810 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8811 UNKNOWN_LOCATION);
8813 /* The loop-latch arg is set in epilogue processing. */
8815 if (slp_node)
8816 slp_node->push_vec_def (new_phi);
8817 else
8819 if (j == 0)
8820 *vec_stmt = new_phi;
8821 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8826 return true;
8829 /* Vectorizes LC PHIs. */
8831 bool
8832 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8833 stmt_vec_info stmt_info, gimple **vec_stmt,
8834 slp_tree slp_node)
8836 if (!loop_vinfo
8837 || !is_a <gphi *> (stmt_info->stmt)
8838 || gimple_phi_num_args (stmt_info->stmt) != 1)
8839 return false;
8841 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8842 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8843 return false;
8845 if (!vec_stmt) /* transformation not required. */
8847 /* Deal with copies from externs or constants that are disguised as
8848 loop-closed PHI nodes (PR97886). */
8849 if (slp_node
8850 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8851 SLP_TREE_VECTYPE (slp_node)))
8853 if (dump_enabled_p ())
8854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8855 "incompatible vector types for invariants\n");
8856 return false;
8858 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8859 return true;
8862 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8863 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8864 basic_block bb = gimple_bb (stmt_info->stmt);
8865 edge e = single_pred_edge (bb);
8866 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8867 auto_vec<tree> vec_oprnds;
8868 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8869 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8870 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8871 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8873 /* Create the vectorized LC PHI node. */
8874 gphi *new_phi = create_phi_node (vec_dest, bb);
8875 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8876 if (slp_node)
8877 slp_node->push_vec_def (new_phi);
8878 else
8879 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8881 if (!slp_node)
8882 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8884 return true;
8887 /* Vectorizes PHIs. */
8889 bool
8890 vectorizable_phi (vec_info *,
8891 stmt_vec_info stmt_info, gimple **vec_stmt,
8892 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8894 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8895 return false;
8897 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8898 return false;
8900 tree vectype = SLP_TREE_VECTYPE (slp_node);
8902 if (!vec_stmt) /* transformation not required. */
8904 slp_tree child;
8905 unsigned i;
8906 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8907 if (!child)
8909 if (dump_enabled_p ())
8910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8911 "PHI node with unvectorized backedge def\n");
8912 return false;
8914 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8916 if (dump_enabled_p ())
8917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8918 "incompatible vector types for invariants\n");
8919 return false;
8921 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8922 && !useless_type_conversion_p (vectype,
8923 SLP_TREE_VECTYPE (child)))
8925 /* With bools we can have mask and non-mask precision vectors
8926 or different non-mask precisions. While pattern recog is
8927 supposed to guarantee consistency here, bugs in it can cause
8928 mismatches (PR103489 and PR103800 for example).
8929 Deal with them here instead of ICEing later. */
8930 if (dump_enabled_p ())
8931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8932 "incompatible vector type setup from "
8933 "bool pattern detection\n");
8934 return false;
8937 /* For single-argument PHIs assume coalescing which means zero cost
8938 for the scalar and the vector PHIs. This avoids artificially
8939 favoring the vector path (but may pessimize it in some cases). */
8940 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8941 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8942 vector_stmt, stmt_info, vectype, 0, vect_body);
8943 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8944 return true;
8947 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8948 basic_block bb = gimple_bb (stmt_info->stmt);
8949 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8950 auto_vec<gphi *> new_phis;
8951 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8953 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8955 /* Skip not yet vectorized defs. */
8956 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8957 && SLP_TREE_VEC_DEFS (child).is_empty ())
8958 continue;
8960 auto_vec<tree> vec_oprnds;
8961 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8962 if (!new_phis.exists ())
8964 new_phis.create (vec_oprnds.length ());
8965 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8967 /* Create the vectorized PHI node. */
8968 new_phis.quick_push (create_phi_node (vec_dest, bb));
8969 slp_node->push_vec_def (new_phis[j]);
8972 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8973 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8974 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8976 /* We should have at least one already vectorized child. */
8977 gcc_assert (new_phis.exists ());
8979 return true;
8982 /* Vectorizes first order recurrences. An overview of the transformation
8983 is described below. Suppose we have the following loop.
8985 int t = 0;
8986 for (int i = 0; i < n; ++i)
8988 b[i] = a[i] - t;
8989 t = a[i];
8992 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8993 looks (simplified) like:
8995 scalar.preheader:
8996 init = 0;
8998 scalar.body:
8999 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9000 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9001 _1 = a[i]
9002 b[i] = _1 - _2
9003 if (i < n) goto scalar.body
9005 In this example, _2 is a recurrence because its value depends on the
9006 previous iteration. We vectorize this as (VF = 4)
9008 vector.preheader:
9009 vect_init = vect_cst(..., ..., ..., 0)
9011 vector.body
9012 i = PHI <0(vector.preheader), i+4(vector.body)>
9013 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9014 vect_2 = a[i, i+1, i+2, i+3];
9015 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9016 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9017 if (..) goto vector.body
9019 In this function, vectorizable_recurr, we code generate both the
9020 vector PHI node and the permute since those together compute the
9021 vectorized value of the scalar PHI. We do not yet have the
9022 backedge value to fill in there nor into the vec_perm. Those
9023 are filled in maybe_set_vectorized_backedge_value and
9024 vect_schedule_scc.
9026 TODO: Since the scalar loop does not have a use of the recurrence
9027 outside of the loop, the natural way to implement peeling via
9028 vectorizing the live value doesn't work. For now peeling of loops
9029 with a recurrence is not implemented. For SLP the supported cases
9030 are restricted to those requiring a single vector recurrence PHI. */
9032 bool
9033 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9034 gimple **vec_stmt, slp_tree slp_node,
9035 stmt_vector_for_cost *cost_vec)
9037 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9038 return false;
9040 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9042 /* So far we only support first-order recurrence auto-vectorization. */
9043 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9044 return false;
9046 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9047 unsigned ncopies;
9048 if (slp_node)
9049 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9050 else
9051 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9052 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9053 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9054 /* We need to be able to make progress with a single vector. */
9055 if (maybe_gt (dist * 2, nunits))
9057 if (dump_enabled_p ())
9058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9059 "first order recurrence exceeds half of "
9060 "a vector\n");
9061 return false;
9064 /* First-order recurrence autovectorization needs to handle permutation
9065 with indices = [nunits-1, nunits, nunits+1, ...]. */
9066 vec_perm_builder sel (nunits, 1, 3);
9067 for (int i = 0; i < 3; ++i)
9068 sel.quick_push (nunits - dist + i);
9069 vec_perm_indices indices (sel, 2, nunits);
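 /* Editor's note (illustrative, not from the original sources): the single
 three-element pattern above encodes the full linear series
 [nunits - dist, nunits - dist + 1, ...]; e.g. for nunits = 4 and
 dist = 1 this is the { 3, 4, 5, 6 } permute shown in the comment
 before this function. */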
9071 if (!vec_stmt) /* transformation not required. */
9073 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9074 indices))
9075 return false;
9077 if (slp_node)
9079 /* We eventually need to set a vector type on invariant
9080 arguments. */
9081 unsigned j;
9082 slp_tree child;
9083 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9084 if (!vect_maybe_update_slp_op_vectype
9085 (child, SLP_TREE_VECTYPE (slp_node)))
9087 if (dump_enabled_p ())
9088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9089 "incompatible vector types for "
9090 "invariants\n");
9091 return false;
9094 /* The recurrence costs the initialization vector and one permute
9095 for each copy. */
9096 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9097 stmt_info, 0, vect_prologue);
9098 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9099 stmt_info, 0, vect_body);
9100 if (dump_enabled_p ())
9101 dump_printf_loc (MSG_NOTE, vect_location,
9102 "vectorizable_recurr: inside_cost = %d, "
9103 "prologue_cost = %d .\n", inside_cost,
9104 prologue_cost);
9106 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9107 return true;
9110 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9111 basic_block bb = gimple_bb (phi);
9112 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9113 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9115 gimple_seq stmts = NULL;
9116 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9117 gsi_insert_seq_on_edge_immediate (pe, stmts);
9119 tree vec_init = build_vector_from_val (vectype, preheader);
9120 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9122 /* Create the vectorized first-order PHI node. */
9123 tree vec_dest = vect_get_new_vect_var (vectype,
9124 vect_simple_var, "vec_recur_");
9125 gphi *new_phi = create_phi_node (vec_dest, bb);
9126 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9128 /* Insert the shuffles for the first-order recurrence autovectorization:
9129 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9130 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9132 /* Insert the required permute after the latch definition. The
9133 second and later operands are tentative and will be updated when we have
9134 vectorized the latch definition. */
9135 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9136 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9137 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9138 gsi_next (&gsi2);
9140 for (unsigned i = 0; i < ncopies; ++i)
9142 vec_dest = make_ssa_name (vectype);
9143 gassign *vperm
9144 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9145 i == 0 ? gimple_phi_result (new_phi) : NULL,
9146 NULL, perm);
9147 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9149 if (slp_node)
9150 slp_node->push_vec_def (vperm);
9151 else
9152 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9155 if (!slp_node)
9156 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9157 return true;
9160 /* Return true if VECTYPE represents a vector that requires lowering
9161 by the vector lowering pass. */
9163 bool
9164 vect_emulated_vector_p (tree vectype)
9166 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9167 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9168 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9171 /* Return true if we can emulate CODE on an integer mode representation
9172 of a vector. */
9174 bool
9175 vect_can_vectorize_without_simd_p (tree_code code)
9177 switch (code)
9179 case PLUS_EXPR:
9180 case MINUS_EXPR:
9181 case NEGATE_EXPR:
9182 case BIT_AND_EXPR:
9183 case BIT_IOR_EXPR:
9184 case BIT_XOR_EXPR:
9185 case BIT_NOT_EXPR:
9186 return true;
9188 default:
9189 return false;
9193 /* Likewise, but taking a code_helper. */
9195 bool
9196 vect_can_vectorize_without_simd_p (code_helper code)
9198 return (code.is_tree_code ()
9199 && vect_can_vectorize_without_simd_p (tree_code (code)));
9202 /* Create vector init for vectorized iv. */
9203 static tree
9204 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9205 tree step_expr, poly_uint64 nunits,
9206 tree vectype,
9207 enum vect_induction_op_type induction_type)
9209 unsigned HOST_WIDE_INT const_nunits;
9210 tree vec_shift, vec_init, new_name;
9211 unsigned i;
9212 tree itype = TREE_TYPE (vectype);
9214 /* iv_loop is the loop to be vectorized. Create:
9215 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
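 /* Editor's note (illustrative, not from the original sources): for the
 nonlinear cases below and 4 lanes this yields e.g.
 shr, X = 64, S = 1: [64 >> 0, 64 >> 1, 64 >> 2, 64 >> 3] = [64, 32, 16, 8]
 mul, X = 3, S = 2: [3 * 1, 3 * 2, 3 * 4, 3 * 8] = [3, 6, 12, 24]
 neg, X = 7: [7, -7, 7, -7]. */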
9216 new_name = gimple_convert (stmts, itype, init_expr);
9217 switch (induction_type)
9219 case vect_step_op_shr:
9220 case vect_step_op_shl:
9221 /* Build the Initial value from shift_expr. */
9222 vec_init = gimple_build_vector_from_val (stmts,
9223 vectype,
9224 new_name);
9225 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9226 build_zero_cst (itype), step_expr);
9227 vec_init = gimple_build (stmts,
9228 (induction_type == vect_step_op_shr
9229 ? RSHIFT_EXPR : LSHIFT_EXPR),
9230 vectype, vec_init, vec_shift);
9231 break;
9233 case vect_step_op_neg:
9235 vec_init = gimple_build_vector_from_val (stmts,
9236 vectype,
9237 new_name);
9238 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9239 vectype, vec_init);
9240 /* The encoding has 2 interleaved stepped patterns. */
9241 vec_perm_builder sel (nunits, 2, 3);
9242 sel.quick_grow (6);
9243 for (i = 0; i < 3; i++)
9245 sel[2 * i] = i;
9246 sel[2 * i + 1] = i + nunits;
9248 vec_perm_indices indices (sel, 2, nunits);
9249 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9250 fail when vec_init is a const vector. In that situation vec_perm is not
9251 really needed. */
9252 tree perm_mask_even
9253 = vect_gen_perm_mask_any (vectype, indices);
9254 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9255 vectype,
9256 vec_init, vec_neg,
9257 perm_mask_even);
9259 break;
9261 case vect_step_op_mul:
9263 /* Use unsigned mult to avoid UD integer overflow. */
9264 gcc_assert (nunits.is_constant (&const_nunits));
9265 tree utype = unsigned_type_for (itype);
9266 tree uvectype = build_vector_type (utype,
9267 TYPE_VECTOR_SUBPARTS (vectype));
9268 new_name = gimple_convert (stmts, utype, new_name);
9269 vec_init = gimple_build_vector_from_val (stmts,
9270 uvectype,
9271 new_name);
9272 tree_vector_builder elts (uvectype, const_nunits, 1);
9273 tree elt_step = build_one_cst (utype);
9275 elts.quick_push (elt_step);
9276 for (i = 1; i < const_nunits; i++)
9278 /* Create: elt_step_i = elt_step_{i-1} * step_expr. */
9279 elt_step = gimple_build (stmts, MULT_EXPR,
9280 utype, elt_step, step_expr);
9281 elts.quick_push (elt_step);
9283 /* Create a vector from [new_name_0, new_name_1, ...,
9284 new_name_nunits-1]. */
9285 tree vec_mul = gimple_build_vector (stmts, &elts);
9286 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9287 vec_init, vec_mul);
9288 vec_init = gimple_convert (stmts, vectype, vec_init);
9290 break;
9292 default:
9293 gcc_unreachable ();
9296 return vec_init;
9299 /* Peel init_expr by skip_niters for induction_type. */
9300 tree
9301 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9302 tree skip_niters, tree step_expr,
9303 enum vect_induction_op_type induction_type)
9305 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9306 tree type = TREE_TYPE (init_expr);
9307 unsigned prec = TYPE_PRECISION (type);
9308 switch (induction_type)
9310 case vect_step_op_neg:
9311 if (TREE_INT_CST_LOW (skip_niters) % 2)
9312 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9313 /* else no change. */
9314 break;
9316 case vect_step_op_shr:
9317 case vect_step_op_shl:
9318 skip_niters = gimple_convert (stmts, type, skip_niters);
9319 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9320 /* When the shift amount >= precision, we need to avoid UD.
9321 In the original loop there's no UD, and according to the semantics
9322 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
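 /* Editor's note (illustrative, not from the original sources): e.g. peeling
 skip_niters = 3 iterations of an 8-bit unsigned x >>= 3 gives a total
 shift of 9 >= 8, so the peeled initial value is simply 0. */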
9323 if (!tree_fits_uhwi_p (step_expr)
9324 || tree_to_uhwi (step_expr) >= prec)
9326 if (induction_type == vect_step_op_shl
9327 || TYPE_UNSIGNED (type))
9328 init_expr = build_zero_cst (type);
9329 else
9330 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9331 init_expr,
9332 wide_int_to_tree (type, prec - 1));
9334 else
9335 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9336 ? RSHIFT_EXPR : LSHIFT_EXPR),
9337 type, init_expr, step_expr);
9338 break;
9340 case vect_step_op_mul:
9342 tree utype = unsigned_type_for (type);
9343 init_expr = gimple_convert (stmts, utype, init_expr);
9344 wide_int skipn = wi::to_wide (skip_niters);
9345 wide_int begin = wi::to_wide (step_expr);
9346 auto_mpz base, exp, mod, res;
9347 wi::to_mpz (begin, base, TYPE_SIGN (type));
9348 wi::to_mpz (skipn, exp, UNSIGNED);
9349 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9350 mpz_powm (res, base, exp, mod);
9351 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9352 tree mult_expr = wide_int_to_tree (utype, begin);
9353 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9354 init_expr, mult_expr);
9355 init_expr = gimple_convert (stmts, type, init_expr);
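 /* Editor's note (illustrative, not from the original sources): e.g. peeling
 skip_niters = 3 iterations of init *= 5 multiplies init by
 pow (5, 3) = 125, computed modulo 2^prec by the mpz_powm call above. */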
9357 break;
9359 default:
9360 gcc_unreachable ();
9363 return init_expr;
9366 /* Create vector step for vectorized iv. */
9367 static tree
9368 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9369 poly_uint64 vf,
9370 enum vect_induction_op_type induction_type)
9372 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9373 tree new_name = NULL;
9374 /* Step should be pow (step, vf) for mult induction. */
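 /* Editor's note (illustrative, not from the original sources): e.g. with
 vf = 4 and step 2 the vector step is pow (2, 4) = 16, since each lane
 advances by four scalar iterations per vector iteration. */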
9375 if (induction_type == vect_step_op_mul)
9377 gcc_assert (vf.is_constant ());
9378 wide_int begin = wi::to_wide (step_expr);
9380 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9381 begin = wi::mul (begin, wi::to_wide (step_expr));
9383 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9385 else if (induction_type == vect_step_op_neg)
9386 /* Do nothing. */
9388 else
9389 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9390 expr, step_expr);
9391 return new_name;
9394 static tree
9395 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9396 stmt_vec_info stmt_info,
9397 tree new_name, tree vectype,
9398 enum vect_induction_op_type induction_type)
9400 /* No step is needed for neg induction. */
9401 if (induction_type == vect_step_op_neg)
9402 return NULL;
9404 tree t = unshare_expr (new_name);
9405 gcc_assert (CONSTANT_CLASS_P (new_name)
9406 || TREE_CODE (new_name) == SSA_NAME);
9407 tree new_vec = build_vector_from_val (vectype, t);
9408 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9409 new_vec, vectype, NULL);
9410 return vec_step;
9413 /* Update vectorized iv with vect_step, induc_def is init. */
9414 static tree
9415 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9416 tree induc_def, tree vec_step,
9417 enum vect_induction_op_type induction_type)
9419 tree vec_def = induc_def;
9420 switch (induction_type)
9422 case vect_step_op_mul:
9424 /* Use unsigned mult to avoid UD integer overflow. */
9425 tree uvectype
9426 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9427 TYPE_VECTOR_SUBPARTS (vectype));
9428 vec_def = gimple_convert (stmts, uvectype, vec_def);
9429 vec_step = gimple_convert (stmts, uvectype, vec_step);
9430 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9431 vec_def, vec_step);
9432 vec_def = gimple_convert (stmts, vectype, vec_def);
9434 break;
9436 case vect_step_op_shr:
9437 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9438 vec_def, vec_step);
9439 break;
9441 case vect_step_op_shl:
9442 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9443 vec_def, vec_step);
9444 break;
9445 case vect_step_op_neg:
9446 vec_def = induc_def;
9447 /* Do nothing. */
9448 break;
9449 default:
9450 gcc_unreachable ();
9453 return vec_def;
9457 /* Function vectorizable_nonlinear_induction
9459 Check if STMT_INFO performs a nonlinear induction computation that can be
9460 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9461 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9462 basic block.
9463 Return true if STMT_INFO is vectorizable in this way. */
9465 static bool
9466 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9467 stmt_vec_info stmt_info,
9468 gimple **vec_stmt, slp_tree slp_node,
9469 stmt_vector_for_cost *cost_vec)
9471 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9472 unsigned ncopies;
9473 bool nested_in_vect_loop = false;
9474 class loop *iv_loop;
9475 tree vec_def;
9476 edge pe = loop_preheader_edge (loop);
9477 basic_block new_bb;
9478 tree vec_init, vec_step;
9479 tree new_name;
9480 gimple *new_stmt;
9481 gphi *induction_phi;
9482 tree induc_def, vec_dest;
9483 tree init_expr, step_expr;
9484 tree niters_skip;
9485 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9486 unsigned i;
9487 gimple_stmt_iterator si;
9489 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9491 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9492 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9493 enum vect_induction_op_type induction_type
9494 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9496 gcc_assert (induction_type > vect_step_op_add);
9498 if (slp_node)
9499 ncopies = 1;
9500 else
9501 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9502 gcc_assert (ncopies >= 1);
9504 /* FORNOW. Only handle nonlinear induction in the same loop. */
9505 if (nested_in_vect_loop_p (loop, stmt_info))
9507 if (dump_enabled_p ())
9508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9509 "nonlinear induction in nested loop.\n");
9510 return false;
9513 iv_loop = loop;
9514 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9516 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9517 update for each iv and a permutation to generate the wanted vector iv. */
9518 if (slp_node)
9520 if (dump_enabled_p ())
9521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9522 "SLP induction not supported for nonlinear"
9523 " induction.\n");
9524 return false;
9527 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9529 if (dump_enabled_p ())
9530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9531 "floating point nonlinear induction vectorization"
9532 " not supported.\n");
9533 return false;
9536 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9537 init_expr = vect_phi_initial_value (phi);
9538 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9539 && TREE_CODE (step_expr) == INTEGER_CST);
9540 /* step_expr should be aligned with init_expr,
9541 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9542 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9544 if (TREE_CODE (init_expr) == INTEGER_CST)
9545 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9546 else
9547 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9548 TREE_TYPE (init_expr)));
9550 switch (induction_type)
9552 case vect_step_op_neg:
9553 if (TREE_CODE (init_expr) != INTEGER_CST
9554 && TREE_CODE (init_expr) != REAL_CST)
9556 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9557 if (!directly_supported_p (NEGATE_EXPR, vectype))
9558 return false;
9560 /* The encoding has 2 interleaved stepped patterns. */
9561 vec_perm_builder sel (nunits, 2, 3);
9562 machine_mode mode = TYPE_MODE (vectype);
9563 sel.quick_grow (6);
9564 for (i = 0; i < 3; i++)
9566 sel[i * 2] = i;
9567 sel[i * 2 + 1] = i + nunits;
9569 vec_perm_indices indices (sel, 2, nunits);
9570 if (!can_vec_perm_const_p (mode, mode, indices))
9571 return false;
9573 break;
9575 case vect_step_op_mul:
9577 /* Check for backend support of MULT_EXPR. */
9578 if (!directly_supported_p (MULT_EXPR, vectype))
9579 return false;
9581 /* ?? How to construct the vector step for a variable-length vector:
9582 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9583 if (!vf.is_constant ())
9584 return false;
9586 break;
9588 case vect_step_op_shr:
9589 /* Check for backend support of RSHIFT_EXPR. */
9590 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9591 return false;
9593 /* Don't shift more than type precision to avoid UD. */
9594 if (!tree_fits_uhwi_p (step_expr)
9595 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9596 TYPE_PRECISION (TREE_TYPE (init_expr))))
9597 return false;
9598 break;
9600 case vect_step_op_shl:
9601 /* Check for backend support of LSHIFT_EXPR. */
9602 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9603 return false;
9605 /* Don't shift more than type precision to avoid UD. */
9606 if (!tree_fits_uhwi_p (step_expr)
9607 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9608 TYPE_PRECISION (TREE_TYPE (init_expr))))
9609 return false;
9611 break;
9613 default:
9614 gcc_unreachable ();
9617 if (!vec_stmt) /* transformation not required. */
9619 unsigned inside_cost = 0, prologue_cost = 0;
9620 /* loop cost for vec_loop. */
9622 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9623 stmt_info, 0, vect_body);
9625 /* Neg induction doesn't have any inside_cost. */
9627 if (induction_type == vect_step_op_neg)
9628 inside_cost = 0;
9630 /* prologue cost for vec_init and vec_step. */
9631 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9632 stmt_info, 0, vect_prologue);
9634 if (dump_enabled_p ())
9635 dump_printf_loc (MSG_NOTE, vect_location,
9636 "vect_model_induction_cost: inside_cost = %d, "
9637 "prologue_cost = %d. \n", inside_cost,
9638 prologue_cost);
9640 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9641 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9642 return true;
9645 /* Transform. */
9647 /* Compute a vector variable, initialized with the first VF values of
9648 the induction variable. E.g., for an iv with IV_PHI='X' and
9649 evolution S, for a vector of 4 units, we want to compute:
9650 [X, X + S, X + 2*S, X + 3*S]. */
9652 if (dump_enabled_p ())
9653 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9655 pe = loop_preheader_edge (iv_loop);
9656 /* Find the first insertion point in the BB. */
9657 basic_block bb = gimple_bb (phi);
9658 si = gsi_after_labels (bb);
9660 gimple_seq stmts = NULL;
9662 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9663 /* If we are using the loop mask to "peel" for alignment then we need
9664 to adjust the start value here. */
9665 if (niters_skip != NULL_TREE)
9666 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9667 step_expr, induction_type);
9669 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9670 step_expr, nunits, vectype,
9671 induction_type);
9672 if (stmts)
9674 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9675 gcc_assert (!new_bb);
9678 stmts = NULL;
9679 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9680 vf, induction_type);
9681 if (stmts)
9683 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9684 gcc_assert (!new_bb);
9687 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9688 new_name, vectype,
9689 induction_type);
9690 /* Create the following def-use cycle:
9691 loop prolog:
9692 vec_init = ...
9693 vec_step = ...
9694 loop:
9695 vec_iv = PHI <vec_init, vec_loop>
9697 STMT
9699 vec_loop = vec_iv + vec_step; */
9701 /* Create the induction-phi that defines the induction-operand. */
9702 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9703 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9704 induc_def = PHI_RESULT (induction_phi);
9706 /* Create the iv update inside the loop. */
9707 stmts = NULL;
9708 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9709 induc_def, vec_step,
9710 induction_type);
9712 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9713 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9715 /* Set the arguments of the phi node: */
9716 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9717 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9718 UNKNOWN_LOCATION);
9720 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9721 *vec_stmt = induction_phi;
9723 /* In case the vectorization factor (VF) is bigger than the number
9724 of elements that we can fit in a vectype (nunits), we have to generate
9725 more than one vector stmt - i.e. - we need to "unroll" the
9726 vector stmt by a factor VF/nunits. For more details see documentation
9727 in vectorizable_operation. */
9729 if (ncopies > 1)
9731 stmts = NULL;
9732 /* FORNOW. This restriction should be relaxed. */
9733 gcc_assert (!nested_in_vect_loop);
9735 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9736 nunits, induction_type);
9738 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9739 new_name, vectype,
9740 induction_type);
9741 vec_def = induc_def;
9742 for (i = 1; i < ncopies; i++)
9744 /* vec_i = vec_prev op vec_step, with op given by induction_type. */
9745 stmts = NULL;
9746 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9747 vec_def, vec_step,
9748 induction_type);
9749 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9750 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9751 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9755 if (dump_enabled_p ())
9756 dump_printf_loc (MSG_NOTE, vect_location,
9757 "transform induction: created def-use cycle: %G%G",
9758 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9760 return true;
9763 /* Function vectorizable_induction
9765 Check if STMT_INFO performs an induction computation that can be vectorized.
9766 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9767 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9768 Return true if STMT_INFO is vectorizable in this way. */
9770 bool
9771 vectorizable_induction (loop_vec_info loop_vinfo,
9772 stmt_vec_info stmt_info,
9773 gimple **vec_stmt, slp_tree slp_node,
9774 stmt_vector_for_cost *cost_vec)
9776 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9777 unsigned ncopies;
9778 bool nested_in_vect_loop = false;
9779 class loop *iv_loop;
9780 tree vec_def;
9781 edge pe = loop_preheader_edge (loop);
9782 basic_block new_bb;
9783 tree new_vec, vec_init, vec_step, t;
9784 tree new_name;
9785 gimple *new_stmt;
9786 gphi *induction_phi;
9787 tree induc_def, vec_dest;
9788 tree init_expr, step_expr;
9789 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9790 unsigned i;
9791 tree expr;
9792 gimple_stmt_iterator si;
9793 enum vect_induction_op_type induction_type
9794 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9796 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9797 if (!phi)
9798 return false;
9800 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9801 return false;
9803 /* Make sure it was recognized as induction computation. */
9804 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9805 return false;
9807 /* Handle nonlinear induction in a separate place. */
9808 if (induction_type != vect_step_op_add)
9809 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9810 vec_stmt, slp_node, cost_vec);
9812 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9813 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9815 if (slp_node)
9816 ncopies = 1;
9817 else
9818 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9819 gcc_assert (ncopies >= 1);
9821 /* FORNOW. These restrictions should be relaxed. */
9822 if (nested_in_vect_loop_p (loop, stmt_info))
9824 imm_use_iterator imm_iter;
9825 use_operand_p use_p;
9826 gimple *exit_phi;
9827 edge latch_e;
9828 tree loop_arg;
9830 if (ncopies > 1)
9832 if (dump_enabled_p ())
9833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9834 "multiple types in nested loop.\n");
9835 return false;
9838 exit_phi = NULL;
9839 latch_e = loop_latch_edge (loop->inner);
9840 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9841 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9843 gimple *use_stmt = USE_STMT (use_p);
9844 if (is_gimple_debug (use_stmt))
9845 continue;
9847 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9849 exit_phi = use_stmt;
9850 break;
9853 if (exit_phi)
9855 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9856 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9857 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9859 if (dump_enabled_p ())
9860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9861 "inner-loop induction only used outside "
9862 "of the outer vectorized loop.\n");
9863 return false;
9867 nested_in_vect_loop = true;
9868 iv_loop = loop->inner;
9870 else
9871 iv_loop = loop;
9872 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9874 if (slp_node && !nunits.is_constant ())
9876 /* The current SLP code creates the step value element-by-element. */
9877 if (dump_enabled_p ())
9878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9879 "SLP induction not supported for variable-length"
9880 " vectors.\n");
9881 return false;
9884 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9886 if (dump_enabled_p ())
9887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9888 "floating point induction vectorization disabled\n");
9889 return false;
9892 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9893 gcc_assert (step_expr != NULL_TREE);
9894 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9896 /* Check for backend support of PLUS/MINUS_EXPR. */
9897 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9898 || !directly_supported_p (MINUS_EXPR, step_vectype))
9899 return false;
9901 if (!vec_stmt) /* transformation not required. */
9903 unsigned inside_cost = 0, prologue_cost = 0;
9904 if (slp_node)
9906 /* We eventually need to set a vector type on invariant
9907 arguments. */
9908 unsigned j;
9909 slp_tree child;
9910 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9911 if (!vect_maybe_update_slp_op_vectype
9912 (child, SLP_TREE_VECTYPE (slp_node)))
9914 if (dump_enabled_p ())
9915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9916 "incompatible vector types for "
9917 "invariants\n");
9918 return false;
9920 /* loop cost for vec_loop. */
9921 inside_cost
9922 = record_stmt_cost (cost_vec,
9923 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9924 vector_stmt, stmt_info, 0, vect_body);
9925 /* prologue cost for vec_init (if not nested) and step. */
9926 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9927 scalar_to_vec,
9928 stmt_info, 0, vect_prologue);
9930 else /* if (!slp_node) */
9932 /* loop cost for vec_loop. */
9933 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9934 stmt_info, 0, vect_body);
9935 /* prologue cost for vec_init and vec_step. */
9936 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9937 stmt_info, 0, vect_prologue);
9939 if (dump_enabled_p ())
9940 dump_printf_loc (MSG_NOTE, vect_location,
9941 "vect_model_induction_cost: inside_cost = %d, "
9942 "prologue_cost = %d .\n", inside_cost,
9943 prologue_cost);
9945 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9946 DUMP_VECT_SCOPE ("vectorizable_induction");
9947 return true;
9950 /* Transform. */
9952 /* Compute a vector variable, initialized with the first VF values of
9953 the induction variable. E.g., for an iv with IV_PHI='X' and
9954 evolution S, for a vector of 4 units, we want to compute:
9955 [X, X + S, X + 2*S, X + 3*S]. */
9957 if (dump_enabled_p ())
9958 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9960 pe = loop_preheader_edge (iv_loop);
9961 /* Find the first insertion point in the BB. */
9962 basic_block bb = gimple_bb (phi);
9963 si = gsi_after_labels (bb);
9965 /* For SLP induction we have to generate several IVs; for example,
9966 with group size 3 we need
9967 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9968 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9969 if (slp_node)
9971 /* Enforced above. */
9972 unsigned int const_nunits = nunits.to_constant ();
9974 /* The initial values are vectorized, but any lanes > group_size
9975 need adjustment. */
9976 slp_tree init_node
9977 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9979 /* Gather steps. Since we do not vectorize inductions as
9980 cycles we have to reconstruct the step from SCEV data. */
9981 unsigned group_size = SLP_TREE_LANES (slp_node);
9982 tree *steps = XALLOCAVEC (tree, group_size);
9983 tree *inits = XALLOCAVEC (tree, group_size);
9984 stmt_vec_info phi_info;
9985 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9987 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9988 if (!init_node)
9989 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9990 pe->dest_idx);
9993 /* Now generate the IVs. */
9994 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9995 gcc_assert ((const_nunits * nvects) % group_size == 0);
9996 unsigned nivs;
9997 if (nested_in_vect_loop)
9998 nivs = nvects;
9999 else
10001 /* Compute the number of distinct IVs we need. First reduce
10002 group_size if it is a multiple of const_nunits so we get
10003 one IV for a group_size of 4 but const_nunits 2. */
10004 unsigned group_sizep = group_size;
10005 if (group_sizep % const_nunits == 0)
10006 group_sizep = group_sizep / const_nunits;
10007 nivs = least_common_multiple (group_sizep,
10008 const_nunits) / const_nunits;
10010 tree stept = TREE_TYPE (step_vectype);
10011 tree lupdate_mul = NULL_TREE;
10012 if (!nested_in_vect_loop)
10014 /* The number of iterations covered in one vector iteration. */
10015 unsigned lup_mul = (nvects * const_nunits) / group_size;
10016 lupdate_mul
10017 = build_vector_from_val (step_vectype,
10018 SCALAR_FLOAT_TYPE_P (stept)
10019 ? build_real_from_wide (stept, lup_mul,
10020 UNSIGNED)
10021 : build_int_cstu (stept, lup_mul));
10023 tree peel_mul = NULL_TREE;
10024 gimple_seq init_stmts = NULL;
10025 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10027 if (SCALAR_FLOAT_TYPE_P (stept))
10028 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10029 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10030 else
10031 peel_mul = gimple_convert (&init_stmts, stept,
10032 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10033 peel_mul = gimple_build_vector_from_val (&init_stmts,
10034 step_vectype, peel_mul);
10036 unsigned ivn;
10037 auto_vec<tree> vec_steps;
10038 for (ivn = 0; ivn < nivs; ++ivn)
10040 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10041 tree_vector_builder init_elts (vectype, const_nunits, 1);
10042 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10043 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10045 /* The scalar steps of the IVs. */
10046 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10047 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10048 step_elts.quick_push (elt);
10049 if (!init_node)
10051 /* The scalar inits of the IVs if not vectorized. */
10052 elt = inits[(ivn*const_nunits + eltn) % group_size];
10053 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10054 TREE_TYPE (elt)))
10055 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10056 TREE_TYPE (vectype), elt);
10057 init_elts.quick_push (elt);
10059 /* The number of steps to add to the initial values. */
10060 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10061 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10062 ? build_real_from_wide (stept,
10063 mul_elt, UNSIGNED)
10064 : build_int_cstu (stept, mul_elt));
10066 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10067 vec_steps.safe_push (vec_step);
10068 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10069 if (peel_mul)
10070 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10071 step_mul, peel_mul);
10072 if (!init_node)
10073 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10075 /* Create the induction-phi that defines the induction-operand. */
10076 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10077 "vec_iv_");
10078 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10079 induc_def = PHI_RESULT (induction_phi);
10081 /* Create the iv update inside the loop */
10082 tree up = vec_step;
10083 if (lupdate_mul)
10084 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10085 vec_step, lupdate_mul);
10086 gimple_seq stmts = NULL;
10087 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10088 vec_def = gimple_build (&stmts,
10089 PLUS_EXPR, step_vectype, vec_def, up);
10090 vec_def = gimple_convert (&stmts, vectype, vec_def);
10091 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10092 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10093 UNKNOWN_LOCATION);
10095 if (init_node)
10096 vec_init = vect_get_slp_vect_def (init_node, ivn);
10097 if (!nested_in_vect_loop
10098 && !integer_zerop (step_mul))
10100 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10101 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10102 vec_step, step_mul);
10103 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10104 vec_def, up);
10105 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10108 /* Set the arguments of the phi node: */
10109 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10111 slp_node->push_vec_def (induction_phi);
10113 if (!nested_in_vect_loop)
10115 /* Fill up to the number of vectors we need for the whole group. */
10116 nivs = least_common_multiple (group_size,
10117 const_nunits) / const_nunits;
10118 vec_steps.reserve (nivs-ivn);
10119 for (; ivn < nivs; ++ivn)
10121 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10122 vec_steps.quick_push (vec_steps[0]);
10126 /* Re-use IVs when we can. We are generating further vector
10127 stmts by adding VF' * stride to the IVs generated above. */
10128 if (ivn < nvects)
10130 unsigned vfp
10131 = least_common_multiple (group_size, const_nunits) / group_size;
10132 tree lupdate_mul
10133 = build_vector_from_val (step_vectype,
10134 SCALAR_FLOAT_TYPE_P (stept)
10135 ? build_real_from_wide (stept,
10136 vfp, UNSIGNED)
10137 : build_int_cstu (stept, vfp));
10138 for (; ivn < nvects; ++ivn)
10140 gimple *iv
10141 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10142 tree def = gimple_get_lhs (iv);
10143 if (ivn < 2*nivs)
10144 vec_steps[ivn - nivs]
10145 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10146 vec_steps[ivn - nivs], lupdate_mul);
10147 gimple_seq stmts = NULL;
10148 def = gimple_convert (&stmts, step_vectype, def);
10149 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10150 def, vec_steps[ivn % nivs]);
10151 def = gimple_convert (&stmts, vectype, def);
10152 if (gimple_code (iv) == GIMPLE_PHI)
10153 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10154 else
10156 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10157 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10159 slp_node->push_vec_def (def);
10163 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10164 gcc_assert (!new_bb);
10166 return true;
10169 init_expr = vect_phi_initial_value (phi);
10171 gimple_seq stmts = NULL;
10172 if (!nested_in_vect_loop)
10174 /* Convert the initial value to the IV update type. */
10175 tree new_type = TREE_TYPE (step_expr);
10176 init_expr = gimple_convert (&stmts, new_type, init_expr);
10178 /* If we are using the loop mask to "peel" for alignment then we need
10179 to adjust the start value here. */
10180 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10181 if (skip_niters != NULL_TREE)
10183 if (FLOAT_TYPE_P (vectype))
10184 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10185 skip_niters);
10186 else
10187 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10188 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10189 skip_niters, step_expr);
10190 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10191 init_expr, skip_step);
10195 if (stmts)
10197 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10198 gcc_assert (!new_bb);
10201 /* Create the vector that holds the initial_value of the induction. */
10202 if (nested_in_vect_loop)
10204 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10205 been created during vectorization of previous stmts. We obtain it
10206 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10207 auto_vec<tree> vec_inits;
10208 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10209 init_expr, &vec_inits);
10210 vec_init = vec_inits[0];
10211 /* If the initial value is not of proper type, convert it. */
10212 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10214 new_stmt
10215 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10216 vect_simple_var,
10217 "vec_iv_"),
10218 VIEW_CONVERT_EXPR,
10219 build1 (VIEW_CONVERT_EXPR, vectype,
10220 vec_init));
10221 vec_init = gimple_assign_lhs (new_stmt);
10222 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10223 new_stmt);
10224 gcc_assert (!new_bb);
10227 else
10229 /* iv_loop is the loop to be vectorized. Create:
10230 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10231 stmts = NULL;
10232 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10234 unsigned HOST_WIDE_INT const_nunits;
10235 if (nunits.is_constant (&const_nunits))
10237 tree_vector_builder elts (step_vectype, const_nunits, 1);
10238 elts.quick_push (new_name);
10239 for (i = 1; i < const_nunits; i++)
10241 /* Create: new_name_i = new_name + step_expr */
10242 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10243 new_name, step_expr);
10244 elts.quick_push (new_name);
10246 /* Create a vector from [new_name_0, new_name_1, ...,
10247 new_name_nunits-1] */
10248 vec_init = gimple_build_vector (&stmts, &elts);
10250 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10251 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10252 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10253 new_name, step_expr);
10254 else
10256 /* Build:
10257 [base, base, base, ...]
10258 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10259 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10260 gcc_assert (flag_associative_math);
10261 tree index = build_index_vector (step_vectype, 0, 1);
10262 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10263 new_name);
10264 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10265 step_expr);
10266 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10267 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10268 vec_init, step_vec);
10269 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10270 vec_init, base_vec);
10272 vec_init = gimple_convert (&stmts, vectype, vec_init);
10274 if (stmts)
10276 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10277 gcc_assert (!new_bb);
10282 /* Create the vector that holds the step of the induction. */
10283 if (nested_in_vect_loop)
10284 /* iv_loop is nested in the loop to be vectorized. Generate:
10285 vec_step = [S, S, S, S] */
10286 new_name = step_expr;
10287 else
10289 /* iv_loop is the loop to be vectorized. Generate:
10290 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10291 gimple_seq seq = NULL;
10292 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10294 expr = build_int_cst (integer_type_node, vf);
10295 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10297 else
10298 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10299 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10300 expr, step_expr);
10301 if (seq)
10303 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10304 gcc_assert (!new_bb);
10308 t = unshare_expr (new_name);
10309 gcc_assert (CONSTANT_CLASS_P (new_name)
10310 || TREE_CODE (new_name) == SSA_NAME);
10311 new_vec = build_vector_from_val (step_vectype, t);
10312 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10313 new_vec, step_vectype, NULL);
10316 /* Create the following def-use cycle:
10317 loop prolog:
10318 vec_init = ...
10319 vec_step = ...
10320 loop:
10321 vec_iv = PHI <vec_init, vec_loop>
10323 STMT
10325 vec_loop = vec_iv + vec_step; */
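/* For example (illustrative values only): an integer IV with X == 0,
   S == 1 and VF == 4 gets vec_init == { 0, 1, 2, 3 } and
   vec_step == { 4, 4, 4, 4 }, so the PHI carries { 4k, 4k+1, 4k+2, 4k+3 }
   in vector iteration k.  */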
10327 /* Create the induction-phi that defines the induction-operand. */
10328 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10329 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10330 induc_def = PHI_RESULT (induction_phi);
10332 /* Create the iv update inside the loop */
10333 stmts = NULL;
10334 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10335 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10336 vec_def = gimple_convert (&stmts, vectype, vec_def);
10337 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10338 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10340 /* Set the arguments of the phi node: */
10341 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10342 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10343 UNKNOWN_LOCATION);
10345 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10346 *vec_stmt = induction_phi;
10348 /* In case the vectorization factor (VF) is bigger than the number
10349 of elements that we can fit in a vectype (nunits), we have to generate
10350 more than one vector stmt - i.e., we need to "unroll" the
10351 vector stmt by a factor of VF/nunits. For more details see documentation
10352 in vectorizable_operation. */
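/* Illustrative case (values picked for exposition): VF == 8 with 4-lane
   vectors gives ncopies == 2, so the code below records one extra copy
   vec_1 = vec_iv + 4*S and uses vec_2 = vec_1 + 4*S == vec_iv + VF*S as
   the latch value of the induction PHI.  */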
10354 if (ncopies > 1)
10356 gimple_seq seq = NULL;
10357 /* FORNOW. This restriction should be relaxed. */
10358 gcc_assert (!nested_in_vect_loop);
10360 /* Create the vector that holds the step of the induction. */
10361 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10363 expr = build_int_cst (integer_type_node, nunits);
10364 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10366 else
10367 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10368 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10369 expr, step_expr);
10370 if (seq)
10372 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10373 gcc_assert (!new_bb);
10376 t = unshare_expr (new_name);
10377 gcc_assert (CONSTANT_CLASS_P (new_name)
10378 || TREE_CODE (new_name) == SSA_NAME);
10379 new_vec = build_vector_from_val (step_vectype, t);
10380 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10381 new_vec, step_vectype, NULL);
10383 vec_def = induc_def;
10384 for (i = 1; i < ncopies + 1; i++)
10386 /* vec_i = vec_prev + vec_step */
10387 gimple_seq stmts = NULL;
10388 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10389 vec_def = gimple_build (&stmts,
10390 PLUS_EXPR, step_vectype, vec_def, vec_step);
10391 vec_def = gimple_convert (&stmts, vectype, vec_def);
10393 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10394 if (i < ncopies)
10396 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10397 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10399 else
10401 /* vec_1 = vec_iv + (VF/n * S)
10402 vec_2 = vec_1 + (VF/n * S)
10404 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10406 vec_n is used as vec_loop to save the large step register and
10407 related operations. */
10408 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10409 UNKNOWN_LOCATION);
10414 if (dump_enabled_p ())
10415 dump_printf_loc (MSG_NOTE, vect_location,
10416 "transform induction: created def-use cycle: %G%G",
10417 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10419 return true;
10422 /* Function vectorizable_live_operation.
10424 STMT_INFO computes a value that is used outside the loop. Check if
10425 it can be supported. */
10427 bool
10428 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10429 slp_tree slp_node, slp_instance slp_node_instance,
10430 int slp_index, bool vec_stmt_p,
10431 stmt_vector_for_cost *cost_vec)
10433 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10434 imm_use_iterator imm_iter;
10435 tree lhs, lhs_type, bitsize;
10436 tree vectype = (slp_node
10437 ? SLP_TREE_VECTYPE (slp_node)
10438 : STMT_VINFO_VECTYPE (stmt_info));
10439 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10440 int ncopies;
10441 gimple *use_stmt;
10442 auto_vec<tree> vec_oprnds;
10443 int vec_entry = 0;
10444 poly_uint64 vec_index = 0;
10446 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10448 /* If a stmt of a reduction is live, vectorize it via
10449 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10450 validity so just trigger the transform here. */
10451 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10453 if (!vec_stmt_p)
10454 return true;
10455 if (slp_node)
10457 /* For reduction chains the meta-info is attached to
10458 the group leader. */
10459 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10460 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10461 /* For SLP reductions we vectorize the epilogue for
10462 all involved stmts together. */
10463 else if (slp_index != 0)
10464 return true;
10466 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10467 gcc_assert (reduc_info->is_reduc_info);
10468 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10469 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10470 return true;
10471 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10472 slp_node_instance);
10473 return true;
10476 /* If STMT is not relevant and it is a simple assignment and its inputs are
10477 invariant then it can remain in place, unvectorized. The original last
10478 scalar value that it computes will be used. */
10479 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10481 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10482 if (dump_enabled_p ())
10483 dump_printf_loc (MSG_NOTE, vect_location,
10484 "statement is simple and uses invariant. Leaving in "
10485 "place.\n");
10486 return true;
10489 if (slp_node)
10490 ncopies = 1;
10491 else
10492 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10494 if (slp_node)
10496 gcc_assert (slp_index >= 0);
10498 /* Get the last occurrence of the scalar index from the concatenation of
10499 all the slp vectors. Calculate which slp vector it is and the index
10500 within. */
10501 int num_scalar = SLP_TREE_LANES (slp_node);
10502 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10503 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10505 /* Calculate which vector contains the result, and which lane of
10506 that vector we need. */
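/* As an illustrative example (hypothetical numbers): with
   SLP_TREE_LANES == 3, two vector stmts of 4 lanes each and
   slp_index == 2, pos = 2*4 - 3 + 2 == 7, so the division below yields
   vec_entry == 1 and vec_index == 3, i.e. the last lane of the second
   vector.  */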
10507 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10509 if (dump_enabled_p ())
10510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10511 "Cannot determine which vector holds the"
10512 " final result.\n");
10513 return false;
10517 if (!vec_stmt_p)
10519 /* No transformation required. */
10520 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10522 if (slp_node)
10524 if (dump_enabled_p ())
10525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10526 "can't operate on partial vectors "
10527 "because an SLP statement is live after "
10528 "the loop.\n");
10529 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10531 else if (ncopies > 1)
10533 if (dump_enabled_p ())
10534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10535 "can't operate on partial vectors "
10536 "because ncopies is greater than 1.\n");
10537 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10539 else
10541 gcc_assert (ncopies == 1 && !slp_node);
10542 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10543 OPTIMIZE_FOR_SPEED))
10544 vect_record_loop_mask (loop_vinfo,
10545 &LOOP_VINFO_MASKS (loop_vinfo),
10546 1, vectype, NULL);
10547 else if (can_vec_extract_var_idx_p (
10548 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10549 vect_record_loop_len (loop_vinfo,
10550 &LOOP_VINFO_LENS (loop_vinfo),
10551 1, vectype, 1);
10552 else
10554 if (dump_enabled_p ())
10555 dump_printf_loc (
10556 MSG_MISSED_OPTIMIZATION, vect_location,
10557 "can't operate on partial vectors "
10558 "because the target doesn't support extract "
10559 "last reduction.\n");
10560 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10564 /* ??? Enable for loop costing as well. */
10565 if (!loop_vinfo)
10566 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10567 0, vect_epilogue);
10568 return true;
10571 /* Use the lhs of the original scalar statement. */
10572 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10573 if (dump_enabled_p ())
10574 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10575 "stmt %G", stmt);
10577 lhs = gimple_get_lhs (stmt);
10578 lhs_type = TREE_TYPE (lhs);
10580 bitsize = vector_element_bits_tree (vectype);
10582 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10583 tree vec_lhs, bitstart;
10584 gimple *vec_stmt;
10585 if (slp_node)
10587 gcc_assert (!loop_vinfo
10588 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10589 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10591 /* Get the correct slp vectorized stmt. */
10592 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10593 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10595 /* Get entry to use. */
10596 bitstart = bitsize_int (vec_index);
10597 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10599 else
10601 /* For multiple copies, get the last copy. */
10602 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10603 vec_lhs = gimple_get_lhs (vec_stmt);
10605 /* Get the last lane in the vector. */
10606 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
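/* E.g. (illustrative only): for a V4SI vectype bitsize is 32, so the
   last lane starts at bit 32 * (4 - 1) == 96.  */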
10609 if (loop_vinfo)
10611 /* To ensure that the VEC_LHS of lane extraction stmts satisfies the
10612 loop-closed PHI requirement, insert one PHI node for it. It looks like:
10613 loop;
10615 # lhs' = PHI <lhs>
10617 loop;
10619 # vec_lhs' = PHI <vec_lhs>
10620 new_tree = lane_extract <vec_lhs', ...>;
10621 lhs' = new_tree; */
10623 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10624 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10625 gcc_assert (single_pred_p (exit_bb));
10627 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10628 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10629 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10631 gimple_seq stmts = NULL;
10632 tree new_tree;
10633 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10635 /* Emit:
10637 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10639 where VEC_LHS is the vectorized live-out result and LEN is
10640 the loop length for the final iteration. */
10641 gcc_assert (ncopies == 1 && !slp_node);
10642 gimple_seq tem = NULL;
10643 gimple_stmt_iterator gsi = gsi_last (tem);
10644 tree len
10645 = vect_get_loop_len (loop_vinfo, &gsi,
10646 &LOOP_VINFO_LENS (loop_vinfo),
10647 1, vectype, 0, 0);
10649 /* BIAS - 1. */
10650 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10651 tree bias_minus_one
10652 = int_const_binop (MINUS_EXPR,
10653 build_int_cst (TREE_TYPE (len), biasval),
10654 build_one_cst (TREE_TYPE (len)));
10656 /* LAST_INDEX = LEN + (BIAS - 1). */
10657 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10658 len, bias_minus_one);
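/* A minimal scalar sketch of the extraction being built here, assuming
   a bias of 0 (the function name and types below are illustrative
   only, not part of the vectorizer):

     int extract_last (const int *vec_lhs, unsigned int len)
     {
       return vec_lhs[len - 1];   // len + (bias - 1) with bias == 0
     }

   LEN counts the active lanes of the final iteration, so with a zero
   bias the last live value sits at index LEN - 1.  */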
10660 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10661 tree scalar_res
10662 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10663 vec_lhs_phi, last_index);
10665 /* Convert the extracted vector element to the scalar type. */
10666 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10668 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10670 /* Emit:
10672 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10674 where VEC_LHS is the vectorized live-out result and MASK is
10675 the loop mask for the final iteration. */
10676 gcc_assert (ncopies == 1 && !slp_node);
10677 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10678 gimple_seq tem = NULL;
10679 gimple_stmt_iterator gsi = gsi_last (tem);
10680 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10681 &LOOP_VINFO_MASKS (loop_vinfo),
10682 1, vectype, 0);
10683 gimple_seq_add_seq (&stmts, tem);
10684 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10685 mask, vec_lhs_phi);
10687 /* Convert the extracted vector element to the scalar type. */
10688 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10690 else
10692 tree bftype = TREE_TYPE (vectype);
10693 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10694 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10695 new_tree = build3 (BIT_FIELD_REF, bftype,
10696 vec_lhs_phi, bitsize, bitstart);
10697 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10698 &stmts, true, NULL_TREE);
10701 if (stmts)
10703 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10704 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10706 /* Remove existing phi from lhs and create one copy from new_tree. */
10707 tree lhs_phi = NULL_TREE;
10708 gimple_stmt_iterator gsi;
10709 for (gsi = gsi_start_phis (exit_bb);
10710 !gsi_end_p (gsi); gsi_next (&gsi))
10712 gimple *phi = gsi_stmt (gsi);
10713 if ((gimple_phi_arg_def (phi, 0) == lhs))
10715 remove_phi_node (&gsi, false);
10716 lhs_phi = gimple_phi_result (phi);
10717 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10718 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10719 break;
10724 /* Replace use of lhs with newly computed result. If the use stmt is a
10725 single-arg PHI, just replace all uses of the PHI result. This is necessary
10726 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10727 use_operand_p use_p;
10728 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10729 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10730 && !is_gimple_debug (use_stmt))
10732 if (gimple_code (use_stmt) == GIMPLE_PHI
10733 && gimple_phi_num_args (use_stmt) == 1)
10735 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10737 else
10739 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10740 SET_USE (use_p, new_tree);
10742 update_stmt (use_stmt);
10745 else
10747 /* For basic-block vectorization simply insert the lane-extraction. */
10748 tree bftype = TREE_TYPE (vectype);
10749 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10750 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10751 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10752 vec_lhs, bitsize, bitstart);
10753 gimple_seq stmts = NULL;
10754 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10755 &stmts, true, NULL_TREE);
10756 if (TREE_CODE (new_tree) == SSA_NAME
10757 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10758 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10759 if (is_a <gphi *> (vec_stmt))
10761 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10762 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10764 else
10766 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10767 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10770 /* Replace use of lhs with newly computed result. If the use stmt is a
10771 single-arg PHI, just replace all uses of the PHI result. This is necessary
10772 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10773 use_operand_p use_p;
10774 stmt_vec_info use_stmt_info;
10775 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10776 if (!is_gimple_debug (use_stmt)
10777 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10778 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10780 /* ??? This can happen when the live lane ends up being
10781 used in a vector construction code-generated by an
10782 external SLP node (and code-generation for that already
10783 happened). See gcc.dg/vect/bb-slp-47.c.
10784 Doing this is what would happen if that vector CTOR
10785 were not code-generated yet so it is not too bad.
10786 ??? In fact we'd likely want to avoid this situation
10787 in the first place. */
10788 if (TREE_CODE (new_tree) == SSA_NAME
10789 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10790 && gimple_code (use_stmt) != GIMPLE_PHI
10791 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10792 use_stmt))
10794 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10795 gcc_checking_assert (code == SSA_NAME
10796 || code == CONSTRUCTOR
10797 || code == VIEW_CONVERT_EXPR
10798 || CONVERT_EXPR_CODE_P (code));
10799 if (dump_enabled_p ())
10800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10801 "Using original scalar computation for "
10802 "live lane because use preceeds vector "
10803 "def\n");
10804 continue;
10806 /* ??? It can also happen that we end up pulling a def into
10807 a loop where replacing out-of-loop uses would require
10808 a new LC SSA PHI node. Retain the original scalar in
10809 those cases as well. PR98064. */
10810 if (TREE_CODE (new_tree) == SSA_NAME
10811 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10812 && (gimple_bb (use_stmt)->loop_father
10813 != gimple_bb (vec_stmt)->loop_father)
10814 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10815 gimple_bb (use_stmt)->loop_father))
10817 if (dump_enabled_p ())
10818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10819 "Using original scalar computation for "
10820 "live lane because there is an out-of-loop "
10821 "definition for it\n");
10822 continue;
10824 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10825 SET_USE (use_p, new_tree);
10826 update_stmt (use_stmt);
10830 return true;
10833 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10835 static void
10836 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10838 ssa_op_iter op_iter;
10839 imm_use_iterator imm_iter;
10840 def_operand_p def_p;
10841 gimple *ustmt;
10843 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10845 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10847 basic_block bb;
10849 if (!is_gimple_debug (ustmt))
10850 continue;
10852 bb = gimple_bb (ustmt);
10854 if (!flow_bb_inside_loop_p (loop, bb))
10856 if (gimple_debug_bind_p (ustmt))
10858 if (dump_enabled_p ())
10859 dump_printf_loc (MSG_NOTE, vect_location,
10860 "killing debug use\n");
10862 gimple_debug_bind_reset_value (ustmt);
10863 update_stmt (ustmt);
10865 else
10866 gcc_unreachable ();
10872 /* Given loop represented by LOOP_VINFO, return true if computation of
10873 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10874 otherwise. */
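/* Illustrative example (hypothetical constants): for a 32-bit unsigned
   niters type with NITERSM1 == 0xffffffff, NITERS wraps around to 0,
   so the constant check below (nitersm1 < niters) does not hold and,
   unless the loop bound analysis proves a smaller maximum, the
   function conservatively returns false.  */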
10876 static bool
10877 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10879 /* Constant case. */
10880 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10882 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10883 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10885 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10886 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10887 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10888 return true;
10891 widest_int max;
10892 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10893 /* Check the upper bound of loop niters. */
10894 if (get_max_loop_iterations (loop, &max))
10896 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10897 signop sgn = TYPE_SIGN (type);
10898 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10899 if (max < type_max)
10900 return true;
10902 return false;
10905 /* Return a mask type with half the number of elements as OLD_TYPE,
10906 given that it should have mode NEW_MODE. */
10908 tree
10909 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10911 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10912 return build_truth_vector_type_for_mode (nunits, new_mode);
10915 /* Return a mask type with twice as many elements as OLD_TYPE,
10916 given that it should have mode NEW_MODE. */
10918 tree
10919 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10921 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10922 return build_truth_vector_type_for_mode (nunits, new_mode);
10925 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10926 contain a sequence of NVECTORS masks that each control a vector of type
10927 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10928 these vector masks with the vector version of SCALAR_MASK. */
10930 void
10931 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10932 unsigned int nvectors, tree vectype, tree scalar_mask)
10934 gcc_assert (nvectors != 0);
10936 if (scalar_mask)
10938 scalar_cond_masked_key cond (scalar_mask, nvectors);
10939 loop_vinfo->scalar_cond_masked_set.add (cond);
10942 masks->mask_set.add (std::make_pair (vectype, nvectors));
10945 /* Given a complete set of masks MASKS, extract mask number INDEX
10946 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10947 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10949 See the comment above vec_loop_masks for more details about the mask
10950 arrangement. */
10952 tree
10953 vect_get_loop_mask (loop_vec_info loop_vinfo,
10954 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10955 unsigned int nvectors, tree vectype, unsigned int index)
10957 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10958 == vect_partial_vectors_while_ult)
10960 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10961 tree mask_type = rgm->type;
10963 /* Populate the rgroup's mask array, if this is the first time we've
10964 used it. */
10965 if (rgm->controls.is_empty ())
10967 rgm->controls.safe_grow_cleared (nvectors, true);
10968 for (unsigned int i = 0; i < nvectors; ++i)
10970 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10971 /* Provide a dummy definition until the real one is available. */
10972 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10973 rgm->controls[i] = mask;
10977 tree mask = rgm->controls[index];
10978 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10979 TYPE_VECTOR_SUBPARTS (vectype)))
10981 /* A loop mask for data type X can be reused for data type Y
10982 if X has N times more elements than Y and if Y's elements
10983 are N times bigger than X's. In this case each sequence
10984 of N elements in the loop mask will be all-zero or all-one.
10985 We can then view-convert the mask so that each sequence of
10986 N elements is replaced by a single element. */
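/* For instance (sizes chosen purely for illustration): a mask created
   for 8 x HImode data that is reused for 4 x SImode data has N == 2,
   so a mask like { 1,1, 1,1, 1,1, 0,0 } view-converts to
   { 1, 1, 1, 0 }.  */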
10987 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10988 TYPE_VECTOR_SUBPARTS (vectype)));
10989 gimple_seq seq = NULL;
10990 mask_type = truth_type_for (vectype);
10991 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10992 if (seq)
10993 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10995 return mask;
10997 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10998 == vect_partial_vectors_avx512)
11000 /* The number of scalars per iteration and the number of vectors are
11001 both compile-time constants. */
11002 unsigned int nscalars_per_iter
11003 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11004 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11006 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11008 /* The stored nV is dependent on the mask type produced. */
11009 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11010 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11011 == rgm->factor);
11012 nvectors = rgm->factor;
11014 /* Populate the rgroup's mask array, if this is the first time we've
11015 used it. */
11016 if (rgm->controls.is_empty ())
11018 rgm->controls.safe_grow_cleared (nvectors, true);
11019 for (unsigned int i = 0; i < nvectors; ++i)
11021 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11022 /* Provide a dummy definition until the real one is available. */
11023 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11024 rgm->controls[i] = mask;
11027 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11028 TYPE_VECTOR_SUBPARTS (vectype)))
11029 return rgm->controls[index];
11031 /* Split the vector if needed. Since we are dealing with integer mode
11032 masks with AVX512 we can operate on the integer representation,
11033 performing the shift on the whole vector. */
11034 unsigned HOST_WIDE_INT factor;
11035 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11036 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11037 gcc_assert (ok);
11038 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11039 tree mask_type = truth_type_for (vectype);
11040 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11041 unsigned vi = index / factor;
11042 unsigned vpart = index % factor;
11043 tree vec = rgm->controls[vi];
11044 gimple_seq seq = NULL;
11045 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11046 lang_hooks.types.type_for_mode
11047 (TYPE_MODE (rgm->type), 1), vec);
11048 /* For integer mode masks simply shift the right bits into position. */
11049 if (vpart != 0)
11050 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11051 build_int_cst (integer_type_node,
11052 (TYPE_VECTOR_SUBPARTS (vectype)
11053 * vpart)));
11054 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11055 (TYPE_MODE (mask_type), 1), vec);
11056 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
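/* A scalar sketch of the extraction above, with illustrative sizes
   only: if rgm->type has 16 lanes and VECTYPE has 4, then FACTOR == 4;
   for INDEX == 6 we get vi == 1 and vpart == 2, conceptually

     uint16_t word = controls[1];            // vi == 6 / 4
     uint8_t  sub  = word >> (4 * 2);        // shift by nunits * vpart

   before the low bits are view-converted back to the 4-lane mask
   type.  */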
11057 if (seq)
11058 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11059 return vec;
11061 else
11062 gcc_unreachable ();
11065 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11066 lengths for controlling an operation on VECTYPE. The operation splits
11067 each element of VECTYPE into FACTOR separate subelements, measuring the
11068 length as a number of these subelements. */
11070 void
11071 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11072 unsigned int nvectors, tree vectype, unsigned int factor)
11074 gcc_assert (nvectors != 0);
11075 if (lens->length () < nvectors)
11076 lens->safe_grow_cleared (nvectors, true);
11077 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11079 /* The number of scalars per iteration, scalar occupied bytes and
11080 the number of vectors are all compile-time constants. */
11081 unsigned int nscalars_per_iter
11082 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11083 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11085 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11087 /* For now, we only support cases in which all loads and stores fall back
11088 to VnQI or none do. */
11089 gcc_assert (!rgl->max_nscalars_per_iter
11090 || (rgl->factor == 1 && factor == 1)
11091 || (rgl->max_nscalars_per_iter * rgl->factor
11092 == nscalars_per_iter * factor));
11093 rgl->max_nscalars_per_iter = nscalars_per_iter;
11094 rgl->type = vectype;
11095 rgl->factor = factor;
11099 /* Given a complete set of lengths LENS, extract length number INDEX
11100 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11101 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11102 multiplied by the number of elements that should be processed.
11103 Insert any set-up statements before GSI. */
11105 tree
11106 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11107 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11108 unsigned int index, unsigned int factor)
11110 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11111 bool use_bias_adjusted_len =
11112 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11114 /* Populate the rgroup's len array, if this is the first time we've
11115 used it. */
11116 if (rgl->controls.is_empty ())
11118 rgl->controls.safe_grow_cleared (nvectors, true);
11119 for (unsigned int i = 0; i < nvectors; ++i)
11121 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11122 gcc_assert (len_type != NULL_TREE);
11124 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11126 /* Provide a dummy definition until the real one is available. */
11127 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11128 rgl->controls[i] = len;
11130 if (use_bias_adjusted_len)
11132 gcc_assert (i == 0);
11133 tree adjusted_len =
11134 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11135 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11136 rgl->bias_adjusted_ctrl = adjusted_len;
11141 if (use_bias_adjusted_len)
11142 return rgl->bias_adjusted_ctrl;
11144 tree loop_len = rgl->controls[index];
11145 if (rgl->factor == 1 && factor == 1)
11147 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11148 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11149 if (maybe_ne (nunits1, nunits2))
11151 /* A loop len for data type X can be reused for data type Y
11152 if X has N times more elements than Y and if Y's elements
11153 are N times bigger than X's. */
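/* E.g. (illustrative types): a length computed for V16QI that is
   reused for V4SI gives factor == 4, so a byte length of 12 becomes
   12 / 4 == 3 SImode elements.  */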
11154 gcc_assert (multiple_p (nunits1, nunits2));
11155 factor = exact_div (nunits1, nunits2).to_constant ();
11156 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11157 gimple_seq seq = NULL;
11158 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11159 build_int_cst (iv_type, factor));
11160 if (seq)
11161 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11164 return loop_len;
11167 /* Scale profiling counters by estimation for LOOP which is vectorized
11168 by factor VF.
11169 If FLAT is true, the loop we started with had an unrealistically flat
11170 profile. */
11172 static void
11173 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11175 /* For flat profiles do not scale down proportionally by VF and only
11176 cap by known iteration count bounds. */
11177 if (flat)
11179 if (dump_file && (dump_flags & TDF_DETAILS))
11180 fprintf (dump_file,
11181 "Vectorized loop profile seems flat; not scaling iteration "
11182 "count down by the vectorization factor %i\n", vf);
11183 scale_loop_profile (loop, profile_probability::always (),
11184 get_likely_max_loop_iterations_int (loop));
11185 return;
11187 /* The loop body executes VF times fewer and the exit count increases VF times. */
11188 profile_count entry_count = loop_preheader_edge (loop)->count ();
11190 /* If we have an unreliable loop profile, avoid dropping the entry
11191 count below the header count. This can happen when the loop has
11192 an unrealistically low trip count. */
11193 while (vf > 1
11194 && loop->header->count > entry_count
11195 && loop->header->count < entry_count * vf)
11197 if (dump_file && (dump_flags & TDF_DETAILS))
11198 fprintf (dump_file,
11199 "Vectorization factor %i seems too large for profile "
11200 "prevoiusly believed to be consistent; reducing.\n", vf);
11201 vf /= 2;
11204 if (entry_count.nonzero_p ())
11205 set_edge_probability_and_rescale_others
11206 (exit_e,
11207 entry_count.probability_in (loop->header->count / vf));
11208 /* Avoid producing very large exit probability when we do not have
11209 sensible profile. */
11210 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11211 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11212 loop->latch->count = single_pred_edge (loop->latch)->count ();
11214 scale_loop_profile (loop, profile_probability::always () / vf,
11215 get_likely_max_loop_iterations_int (loop));
11218 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11219 latch edge values originally defined by it. */
11221 static void
11222 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11223 stmt_vec_info def_stmt_info)
11225 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11226 if (!def || TREE_CODE (def) != SSA_NAME)
11227 return;
11228 stmt_vec_info phi_info;
11229 imm_use_iterator iter;
11230 use_operand_p use_p;
11231 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11233 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11234 if (!phi)
11235 continue;
11236 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11237 && (phi_info = loop_vinfo->lookup_stmt (phi))
11238 && STMT_VINFO_RELEVANT_P (phi_info)))
11239 continue;
11240 loop_p loop = gimple_bb (phi)->loop_father;
11241 edge e = loop_latch_edge (loop);
11242 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11243 continue;
11245 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11246 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11247 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11249 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11250 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11251 gcc_assert (phi_defs.length () == latch_defs.length ());
11252 for (unsigned i = 0; i < phi_defs.length (); ++i)
11253 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11254 gimple_get_lhs (latch_defs[i]), e,
11255 gimple_phi_arg_location (phi, e->dest_idx));
11257 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11259 /* For first order recurrences we have to update both uses of
11260 the latch definition, the one in the PHI node and the one
11261 in the generated VEC_PERM_EXPR. */
11262 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11263 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11264 gcc_assert (phi_defs.length () == latch_defs.length ());
11265 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11266 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11267 for (unsigned i = 0; i < phi_defs.length (); ++i)
11269 gassign *perm = as_a <gassign *> (phi_defs[i]);
11270 if (i > 0)
11271 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11272 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11273 update_stmt (perm);
11275 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11276 gimple_phi_arg_location (phi, e->dest_idx));
11281 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11282 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11283 stmt_vec_info. */
11285 static bool
11286 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11287 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11289 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11290 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11292 if (dump_enabled_p ())
11293 dump_printf_loc (MSG_NOTE, vect_location,
11294 "------>vectorizing statement: %G", stmt_info->stmt);
11296 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11297 vect_loop_kill_debug_uses (loop, stmt_info);
11299 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11300 && !STMT_VINFO_LIVE_P (stmt_info))
11301 return false;
11303 if (STMT_VINFO_VECTYPE (stmt_info))
11305 poly_uint64 nunits
11306 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11307 if (!STMT_SLP_TYPE (stmt_info)
11308 && maybe_ne (nunits, vf)
11309 && dump_enabled_p ())
11310 /* For SLP the VF is set according to the unrolling factor, and not
11311 to the vector size, hence for SLP this print is not valid. */
11312 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11315 /* Pure SLP statements have already been vectorized. We still need
11316 to apply loop vectorization to hybrid SLP statements. */
11317 if (PURE_SLP_STMT (stmt_info))
11318 return false;
11320 if (dump_enabled_p ())
11321 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11323 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11324 *seen_store = stmt_info;
11326 return true;
11329 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11330 in the hash_map with its corresponding values. */
11332 static tree
11333 find_in_mapping (tree t, void *context)
11335 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11337 tree *value = mapping->get (t);
11338 return value ? *value : t;
11341 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11342 original loop that has now been vectorized.
11344 The inits of the data_references need to be advanced with the number of
11345 iterations of the main loop. This has been computed in vect_do_peeling and
11346 is stored in parameter ADVANCE. We first restore the data_references
11347 initial offset with the values recorded in ORIG_DRS_INIT.
11349 Since the loop_vec_info of this EPILOGUE was constructed for the original
11350 loop, its stmt_vec_infos all point to the original statements. These need
11351 to be updated to point to their corresponding copies as well as the SSA_NAMES
11352 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11354 The data_reference's connections also need to be updated. Their
11355 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11356 stmt_vec_infos, their statements need to point to their corresponding copy;
11357 if they are gather loads or scatter stores, their reference needs to be
11358 updated to point to its corresponding copy; and finally we set
11359 'base_misaligned' to false as we have already peeled for alignment in the
11360 prologue of the main loop. */
11362 static void
11363 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11365 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11366 auto_vec<gimple *> stmt_worklist;
11367 hash_map<tree,tree> mapping;
11368 gimple *orig_stmt, *new_stmt;
11369 gimple_stmt_iterator epilogue_gsi;
11370 gphi_iterator epilogue_phi_gsi;
11371 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11372 basic_block *epilogue_bbs = get_loop_body (epilogue);
11373 unsigned i;
11375 free (LOOP_VINFO_BBS (epilogue_vinfo));
11376 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11378 /* Advance the data_references by the number of iterations of the previous
11379 loop and its prologue. */
11380 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11383 /* The EPILOGUE loop is a copy of the original loop so they share the same
11384 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11385 point to the copied statements. We also create a mapping of all LHS' in
11386 the original loop and all the LHS' in the EPILOGUE and create worklists to
11387 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11388 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11390 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11391 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11393 new_stmt = epilogue_phi_gsi.phi ();
11395 gcc_assert (gimple_uid (new_stmt) > 0);
11396 stmt_vinfo
11397 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11399 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11400 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11402 mapping.put (gimple_phi_result (orig_stmt),
11403 gimple_phi_result (new_stmt));
11404 /* PHI nodes can not have patterns or related statements. */
11405 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11406 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11409 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11410 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11412 new_stmt = gsi_stmt (epilogue_gsi);
11413 if (is_gimple_debug (new_stmt))
11414 continue;
11416 gcc_assert (gimple_uid (new_stmt) > 0);
11417 stmt_vinfo
11418 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11420 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11421 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11423 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11424 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11426 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11428 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11429 for (gimple_stmt_iterator gsi = gsi_start (seq);
11430 !gsi_end_p (gsi); gsi_next (&gsi))
11431 stmt_worklist.safe_push (gsi_stmt (gsi));
11434 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11435 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11437 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11438 stmt_worklist.safe_push (stmt);
11439 /* Set BB such that the assert in
11440 'get_initial_def_for_reduction' is able to determine that
11441 the BB of the related stmt is inside this loop. */
11442 gimple_set_bb (stmt,
11443 gimple_bb (new_stmt));
11444 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11445 gcc_assert (related_vinfo == NULL
11446 || related_vinfo == stmt_vinfo);
11451 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11452 using the original main loop and thus need to be updated to refer to the
11453 cloned variables used in the epilogue. */
11454 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11456 gimple *stmt = stmt_worklist[i];
11457 tree *new_op;
11459 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11461 tree op = gimple_op (stmt, j);
11462 if ((new_op = mapping.get(op)))
11463 gimple_set_op (stmt, j, *new_op);
11464 else
11466 /* PR92429: The last argument of simplify_replace_tree disables
11467 folding when replacing arguments. This is required as
11468 otherwise you might end up with different statements than the
11469 ones analyzed in vect_loop_analyze, leading to different
11470 vectorization. */
11471 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11472 &find_in_mapping, &mapping, false);
11473 gimple_set_op (stmt, j, op);
11478 struct data_reference *dr;
11479 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11480 FOR_EACH_VEC_ELT (datarefs, i, dr)
11482 orig_stmt = DR_STMT (dr);
11483 gcc_assert (gimple_uid (orig_stmt) > 0);
11484 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11485 /* Data references for gather loads and scatter stores do not use the
11486 updated offset we set using ADVANCE. Instead we have to make sure the
11487 reference in the data references points to the corresponding copy of
11488 the original in the epilogue. Make sure to update both
11489 gather/scatters recognized by dataref analysis and also other
11490 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11491 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11492 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11493 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11495 DR_REF (dr)
11496 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11497 &find_in_mapping, &mapping);
11498 DR_BASE_ADDRESS (dr)
11499 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11500 &find_in_mapping, &mapping);
11502 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11503 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11504 /* The vector size of the epilogue is smaller than that of the main loop
11505 so the alignment is either the same or lower. This means the dr will
11506 by definition be aligned. */
11507 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11510 epilogue_vinfo->shared->datarefs_copy.release ();
11511 epilogue_vinfo->shared->save_datarefs ();
11514 /* Function vect_transform_loop.
11516 The analysis phase has determined that the loop is vectorizable.
11517 Vectorize the loop - created vectorized stmts to replace the scalar
11518 stmts in the loop, and update the loop exit condition.
11519 Returns scalar epilogue loop if any. */
11521 class loop *
11522 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11524 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11525 class loop *epilogue = NULL;
11526 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11527 int nbbs = loop->num_nodes;
11528 int i;
11529 tree niters_vector = NULL_TREE;
11530 tree step_vector = NULL_TREE;
11531 tree niters_vector_mult_vf = NULL_TREE;
11532 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11533 unsigned int lowest_vf = constant_lower_bound (vf);
11534 gimple *stmt;
11535 bool check_profitability = false;
11536 unsigned int th;
11537 bool flat = maybe_flat_loop_profile (loop);
11539 DUMP_VECT_SCOPE ("vec_transform_loop");
11541 loop_vinfo->shared->check_datarefs ();
11543 /* Use the more conservative vectorization threshold. If the number
11544 of iterations is constant assume the cost check has been performed
11545 by our caller. If the threshold makes all loops profitable that
11546 run at least the (estimated) vectorization factor number of times
11547 checking is pointless, too. */
11548 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11549 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11551 if (dump_enabled_p ())
11552 dump_printf_loc (MSG_NOTE, vect_location,
11553 "Profitability threshold is %d loop iterations.\n",
11554 th);
11555 check_profitability = true;
11558 /* Make sure there exists a single-predecessor exit bb. Do this before
11559 versioning. */
11560 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11561 if (! single_pred_p (e->dest))
11563 split_loop_exit_edge (e, true);
11564 if (dump_enabled_p ())
11565 dump_printf (MSG_NOTE, "split exit edge\n");
11568 /* Version the loop first, if required, so the profitability check
11569 comes first. */
11571 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11573 class loop *sloop
11574 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11575 sloop->force_vectorize = false;
11576 check_profitability = false;
11579 /* Make sure there exists a single-predecessor exit bb also on the
11580 scalar loop copy. Do this after versioning but before peeling
11581 so CFG structure is fine for both scalar and if-converted loop
11582 to make slpeel_duplicate_current_defs_from_edges face matched
11583 loop closed PHI nodes on the exit. */
11584 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11586 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11587 if (! single_pred_p (e->dest))
11589 split_loop_exit_edge (e, true);
11590 if (dump_enabled_p ())
11591 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11595 tree niters = vect_build_loop_niters (loop_vinfo);
11596 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11597 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11598 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11599 tree advance;
11600 drs_init_vec orig_drs_init;
11602 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11603 &step_vector, &niters_vector_mult_vf, th,
11604 check_profitability, niters_no_overflow,
11605 &advance);
11606 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11607 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11609 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11610 block after loop exit. We need to scale all that. */
11611 basic_block preheader
11612 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11613 preheader->count
11614 = preheader->count.apply_probability
11615 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11616 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11617 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11618 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11619 = preheader->count;
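/* For illustration (made-up numbers): if the recorded scaling probability
   is 33%, a preheader count of 300 becomes 100, every block of the scalar
   loop copy is scaled by the same ratio, and the block after its single
   exit inherits the scaled preheader count.  */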
11622 if (niters_vector == NULL_TREE)
11624 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11625 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11626 && known_eq (lowest_vf, vf))
11628 niters_vector
11629 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11630 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11631 step_vector = build_one_cst (TREE_TYPE (niters));
11633 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11634 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11635 &step_vector, niters_no_overflow);
11636 else
11637 /* vect_do_peeling subtracted the number of peeled prologue
11638 iterations from LOOP_VINFO_NITERS. */
11639 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11640 &niters_vector, &step_vector,
11641 niters_no_overflow);
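/* Worked example (illustrative numbers): with a known NITERS of 128, no
   partial vectors and a constant VF of 8, the first branch above simply
   materializes
     niters_vector = 128 / 8 = 16, step_vector = 1,
   i.e. the vector loop runs 16 iterations with a unit IV step.  */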
11644 /* 1) Make sure the loop header has exactly two entries
11645 2) Make sure we have a preheader basic block. */
11647 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11649 split_edge (loop_preheader_edge (loop));
11651 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11652 /* This will deal with any possible peeling. */
11653 vect_prepare_for_masked_peels (loop_vinfo);
11655 /* Schedule the SLP instances first, then handle loop vectorization
11656 below. */
11657 if (!loop_vinfo->slp_instances.is_empty ())
11659 DUMP_VECT_SCOPE ("scheduling SLP instances");
11660 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11663 /* FORNOW: the vectorizer supports only loops whose body consists
11664 of one basic block (header + empty latch). Once the vectorizer
11665 supports more involved loop forms, the order in which the BBs are
11666 traversed will need to be reconsidered. */
11668 for (i = 0; i < nbbs; i++)
11670 basic_block bb = bbs[i];
11671 stmt_vec_info stmt_info;
11673 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11674 gsi_next (&si))
11676 gphi *phi = si.phi ();
11677 if (dump_enabled_p ())
11678 dump_printf_loc (MSG_NOTE, vect_location,
11679 "------>vectorizing phi: %G", (gimple *) phi);
11680 stmt_info = loop_vinfo->lookup_stmt (phi);
11681 if (!stmt_info)
11682 continue;
11684 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11685 vect_loop_kill_debug_uses (loop, stmt_info);
11687 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11688 && !STMT_VINFO_LIVE_P (stmt_info))
11689 continue;
11691 if (STMT_VINFO_VECTYPE (stmt_info)
11692 && (maybe_ne
11693 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11694 && dump_enabled_p ())
11695 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11697 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11698 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11699 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11700 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11701 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11702 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11703 && ! PURE_SLP_STMT (stmt_info))
11705 if (dump_enabled_p ())
11706 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11707 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11711 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11712 gsi_next (&si))
11714 gphi *phi = si.phi ();
11715 stmt_info = loop_vinfo->lookup_stmt (phi);
11716 if (!stmt_info)
11717 continue;
11719 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11720 && !STMT_VINFO_LIVE_P (stmt_info))
11721 continue;
11723 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11724 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11725 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11727 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11728 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11729 && ! PURE_SLP_STMT (stmt_info))
11730 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11733 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11734 !gsi_end_p (si);)
11736 stmt = gsi_stmt (si);
11737 /* During vectorization remove existing clobber stmts. */
11738 if (gimple_clobber_p (stmt))
11740 unlink_stmt_vdef (stmt);
11741 gsi_remove (&si, true);
11742 release_defs (stmt);
11744 else
11746 /* Ignore vector stmts created in the outer loop. */
11747 stmt_info = loop_vinfo->lookup_stmt (stmt);
11749 /* vector stmts created in the outer-loop during vectorization of
11750 stmts in an inner-loop may not have a stmt_info, and do not
11751 need to be vectorized. */
11752 stmt_vec_info seen_store = NULL;
11753 if (stmt_info)
11755 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11757 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11758 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11759 !gsi_end_p (subsi); gsi_next (&subsi))
11761 stmt_vec_info pat_stmt_info
11762 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11763 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11764 &si, &seen_store);
11766 stmt_vec_info pat_stmt_info
11767 = STMT_VINFO_RELATED_STMT (stmt_info);
11768 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11769 &si, &seen_store))
11770 maybe_set_vectorized_backedge_value (loop_vinfo,
11771 pat_stmt_info);
11773 else
11775 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11776 &seen_store))
11777 maybe_set_vectorized_backedge_value (loop_vinfo,
11778 stmt_info);
11781 gsi_next (&si);
11782 if (seen_store)
11784 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11785 /* Interleaving. Since SEEN_STORE is set, the
11786 vectorization of the interleaving chain was
11787 completed - free all the stores in the chain. */
11788 vect_remove_stores (loop_vinfo,
11789 DR_GROUP_FIRST_ELEMENT (seen_store));
11790 else
11791 /* Free the attached stmt_vec_info and remove the stmt. */
11792 loop_vinfo->remove_stmt (stmt_info);
11797 /* Stub out scalar statements that must not survive vectorization.
11798 Doing this here helps with grouped statements, or statements that
11799 are involved in patterns. */
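/* For instance (illustrative GIMPLE, not taken from a real dump), a
   leftover scalar
     _5 = .MASK_LOAD (p_2, 32B, mask_3);
   is replaced below by
     _5 = 0;
   and a leftover scalar conditional call such as
     _7 = .COND_ADD (mask_3, a_4, b_6, else_1);
   is replaced by
     _7 = else_1;
   since the "else" value is the last call argument.  */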
11800 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11801 !gsi_end_p (gsi); gsi_next (&gsi))
11803 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11804 if (!call || !gimple_call_internal_p (call))
11805 continue;
11806 internal_fn ifn = gimple_call_internal_fn (call);
11807 if (ifn == IFN_MASK_LOAD)
11809 tree lhs = gimple_get_lhs (call);
11810 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11812 tree zero = build_zero_cst (TREE_TYPE (lhs));
11813 gimple *new_stmt = gimple_build_assign (lhs, zero);
11814 gsi_replace (&gsi, new_stmt, true);
11817 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11819 tree lhs = gimple_get_lhs (call);
11820 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11822 tree else_arg
11823 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11824 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11825 gsi_replace (&gsi, new_stmt, true);
11829 } /* BBs in loop */
11831 /* The vectorization factor is always > 1, so if we use an IV increment
11832 of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
11833 if (integer_onep (step_vector))
11834 niters_no_overflow = true;
11835 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11836 niters_vector, step_vector, niters_vector_mult_vf,
11837 !niters_no_overflow);
11839 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11841 /* True if the final iteration might not handle a full vector's
11842 worth of scalar iterations. */
11843 bool final_iter_may_be_partial
11844 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11845 /* The minimum number of iterations performed by the epilogue. This
11846 is 1 when peeling for gaps because we always need a final scalar
11847 iteration. */
11848 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11849 /* +1 to convert latch counts to loop iteration counts,
11850 -min_epilogue_iters to remove iterations that cannot be performed
11851 by the vector code. */
11852 int bias_for_lowest = 1 - min_epilogue_iters;
11853 int bias_for_assumed = bias_for_lowest;
11854 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11855 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11857 /* When the amount of peeling is known at compile time, the first
11858 iteration will have exactly alignment_npeels active elements.
11859 In the worst case it will have at least one. */
11860 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11861 bias_for_lowest += lowest_vf - min_first_active;
11862 bias_for_assumed += assumed_vf - min_first_active;
11864 /* In these calculations the "- 1" converts loop iteration counts
11865 back to latch counts. */
11866 if (loop->any_upper_bound)
11868 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11869 loop->nb_iterations_upper_bound
11870 = (final_iter_may_be_partial
11871 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11872 lowest_vf) - 1
11873 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11874 lowest_vf) - 1);
11875 if (main_vinfo
11876 /* Both peeling for alignment and peeling for gaps can end up
11877 with the scalar epilogue running for more than VF-1 iterations. */
11878 && !main_vinfo->peeling_for_alignment
11879 && !main_vinfo->peeling_for_gaps)
11881 unsigned int bound;
11882 poly_uint64 main_iters
11883 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11884 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11885 main_iters
11886 = upper_bound (main_iters,
11887 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11888 if (can_div_away_from_zero_p (main_iters,
11889 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11890 &bound))
11891 loop->nb_iterations_upper_bound
11892 = wi::umin ((bound_wide_int) (bound - 1),
11893 loop->nb_iterations_upper_bound);
11896 if (loop->any_likely_upper_bound)
11897 loop->nb_iterations_likely_upper_bound
11898 = (final_iter_may_be_partial
11899 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11900 + bias_for_lowest, lowest_vf) - 1
11901 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11902 + bias_for_lowest, lowest_vf) - 1);
11903 if (loop->any_estimate)
11904 loop->nb_iterations_estimate
11905 = (final_iter_may_be_partial
11906 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11907 assumed_vf) - 1
11908 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11909 assumed_vf) - 1);
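/* Worked example (made-up numbers): with no peeling for gaps or alignment,
   lowest_vf == 4 and a scalar latch-count upper bound of 17 (at most 18
   iterations), bias_for_lowest == 1.  Without partial vectors the new
   bound is udiv_floor (17 + 1, 4) - 1 == 3 latch iterations, i.e. 4 vector
   iterations covering 16 scalar ones, with the rest left to the epilogue.
   With partial vectors the final iteration may be partial, so the bound is
   udiv_ceil (17 + 1, 4) - 1 == 4 instead.  */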
11910 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11911 assumed_vf, flat);
11913 if (dump_enabled_p ())
11915 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11917 dump_printf_loc (MSG_NOTE, vect_location,
11918 "LOOP VECTORIZED\n");
11919 if (loop->inner)
11920 dump_printf_loc (MSG_NOTE, vect_location,
11921 "OUTER LOOP VECTORIZED\n");
11922 dump_printf (MSG_NOTE, "\n");
11924 else
11925 dump_printf_loc (MSG_NOTE, vect_location,
11926 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11927 GET_MODE_NAME (loop_vinfo->vector_mode));
11930 /* Loops vectorized with a variable factor won't benefit from
11931 unrolling/peeling. */
11932 if (!vf.is_constant ())
11934 loop->unroll = 1;
11935 if (dump_enabled_p ())
11936 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11937 " variable-length vectorization factor\n");
11939 /* Free SLP instances here because otherwise stmt reference counting
11940 won't work. */
11941 slp_instance instance;
11942 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11943 vect_free_slp_instance (instance);
11944 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11945 /* Clear the safelen field since its value is invalid after vectorization:
11946 the vectorized loop can now have loop-carried dependencies. */
11947 loop->safelen = 0;
11949 if (epilogue)
11951 update_epilogue_loop_vinfo (epilogue, advance);
11953 epilogue->simduid = loop->simduid;
11954 epilogue->force_vectorize = loop->force_vectorize;
11955 epilogue->dont_vectorize = false;
11958 return epilogue;
11961 /* The code below performs a simple optimization: it reverts
11962 if-conversion for masked stores, i.e. if the mask of a store is all
11963 zeros, the store is not performed, and neither are the producers of
11964 the stored values where possible. For example,
11965 for (i=0; i<n; i++)
11966 if (c[i])
11968 p1[i] += 1;
11969 p2[i] = p3[i] +2;
11971 this transformation will produce the following semi-hammock:
11973 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
11975 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11976 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11977 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11978 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11979 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11980 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
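   In CFG terms the function below builds, for each such group of masked
   stores (an illustrative sketch of the structure created, not a literal
   dump):

     if_bb:    if (mask__ifc__42.18_165 == { 0, ... }) goto join_bb;
     store_bb: the MASK_STOREs and, where possible, the producers of the
               stored values, reached on the false edge;
     join_bb:  .MEM_2 = PHI <.MEM_1 (if_bb), .MEM_3 (store_bb)>
               merging the virtual operands.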
11984 void
11985 optimize_mask_stores (class loop *loop)
11987 basic_block *bbs = get_loop_body (loop);
11988 unsigned nbbs = loop->num_nodes;
11989 unsigned i;
11990 basic_block bb;
11991 class loop *bb_loop;
11992 gimple_stmt_iterator gsi;
11993 gimple *stmt;
11994 auto_vec<gimple *> worklist;
11995 auto_purge_vect_location sentinel;
11997 vect_location = find_loop_location (loop);
11998 /* Pick up all masked stores in the loop, if any. */
11999 for (i = 0; i < nbbs; i++)
12001 bb = bbs[i];
12002 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12003 gsi_next (&gsi))
12005 stmt = gsi_stmt (gsi);
12006 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12007 worklist.safe_push (stmt);
12011 free (bbs);
12012 if (worklist.is_empty ())
12013 return;
12015 /* Loop has masked stores. */
12016 while (!worklist.is_empty ())
12018 gimple *last, *last_store;
12019 edge e, efalse;
12020 tree mask;
12021 basic_block store_bb, join_bb;
12022 gimple_stmt_iterator gsi_to;
12023 tree vdef, new_vdef;
12024 gphi *phi;
12025 tree vectype;
12026 tree zero;
12028 last = worklist.pop ();
12029 mask = gimple_call_arg (last, 2);
12030 bb = gimple_bb (last);
12031 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
12032 to the same loop as if_bb. It can differ from LOOP when a two-level
12033 loop nest is vectorized and the mask_store belongs to the inner
12034 loop. */
12035 e = split_block (bb, last);
12036 bb_loop = bb->loop_father;
12037 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12038 join_bb = e->dest;
12039 store_bb = create_empty_bb (bb);
12040 add_bb_to_loop (store_bb, bb_loop);
12041 e->flags = EDGE_TRUE_VALUE;
12042 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12043 /* Put STORE_BB on the likely path. */
12044 efalse->probability = profile_probability::likely ();
12045 e->probability = efalse->probability.invert ();
12046 store_bb->count = efalse->count ();
12047 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12048 if (dom_info_available_p (CDI_DOMINATORS))
12049 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12050 if (dump_enabled_p ())
12051 dump_printf_loc (MSG_NOTE, vect_location,
12052 "Create new block %d to sink mask stores.",
12053 store_bb->index);
12054 /* Create vector comparison with boolean result. */
12055 vectype = TREE_TYPE (mask);
12056 zero = build_zero_cst (vectype);
12057 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12058 gsi = gsi_last_bb (bb);
12059 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12060 /* Create a new PHI node for the vdef of the last masked store:
12061 .MEM_2 = VDEF <.MEM_1>
12062 will be converted to
12063 .MEM_3 = VDEF <.MEM_1>
12064 and a new PHI node will be created in the join bb:
12065 .MEM_2 = PHI <.MEM_1, .MEM_3>
12067 vdef = gimple_vdef (last);
12068 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12069 gimple_set_vdef (last, new_vdef);
12070 phi = create_phi_node (vdef, join_bb);
12071 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12073 /* Move all masked stores with the same mask into STORE_BB if possible. */
12074 while (true)
12076 gimple_stmt_iterator gsi_from;
12077 gimple *stmt1 = NULL;
12079 /* Move masked store to STORE_BB. */
12080 last_store = last;
12081 gsi = gsi_for_stmt (last);
12082 gsi_from = gsi;
12083 /* Shift GSI to the previous stmt for further traversal. */
12084 gsi_prev (&gsi);
12085 gsi_to = gsi_start_bb (store_bb);
12086 gsi_move_before (&gsi_from, &gsi_to);
12087 /* Set GSI_TO to the start of the now non-empty block. */
12088 gsi_to = gsi_start_bb (store_bb);
12089 if (dump_enabled_p ())
12090 dump_printf_loc (MSG_NOTE, vect_location,
12091 "Move stmt to created bb\n%G", last);
12092 /* Move all stored value producers if possible. */
12093 while (!gsi_end_p (gsi))
12095 tree lhs;
12096 imm_use_iterator imm_iter;
12097 use_operand_p use_p;
12098 bool res;
12100 /* Skip debug statements. */
12101 if (is_gimple_debug (gsi_stmt (gsi)))
12103 gsi_prev (&gsi);
12104 continue;
12106 stmt1 = gsi_stmt (gsi);
12107 /* Do not consider statements writing to memory or having a
12108 volatile operand. */
12109 if (gimple_vdef (stmt1)
12110 || gimple_has_volatile_ops (stmt1))
12111 break;
12112 gsi_from = gsi;
12113 gsi_prev (&gsi);
12114 lhs = gimple_get_lhs (stmt1);
12115 if (!lhs)
12116 break;
12118 /* LHS of vectorized stmt must be SSA_NAME. */
12119 if (TREE_CODE (lhs) != SSA_NAME)
12120 break;
12122 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12124 /* Remove dead scalar statement. */
12125 if (has_zero_uses (lhs))
12127 gsi_remove (&gsi_from, true);
12128 continue;
12132 /* Check that LHS does not have uses outside of STORE_BB. */
12133 res = true;
12134 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12136 gimple *use_stmt;
12137 use_stmt = USE_STMT (use_p);
12138 if (is_gimple_debug (use_stmt))
12139 continue;
12140 if (gimple_bb (use_stmt) != store_bb)
12142 res = false;
12143 break;
12146 if (!res)
12147 break;
12149 if (gimple_vuse (stmt1)
12150 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12151 break;
12153 /* Can move STMT1 to STORE_BB. */
12154 if (dump_enabled_p ())
12155 dump_printf_loc (MSG_NOTE, vect_location,
12156 "Move stmt to created bb\n%G", stmt1);
12157 gsi_move_before (&gsi_from, &gsi_to);
12158 /* Shift GSI_TO for further insertion. */
12159 gsi_prev (&gsi_to);
12161 /* Move other masked stores with the same mask into STORE_BB. */
12162 if (worklist.is_empty ()
12163 || gimple_call_arg (worklist.last (), 2) != mask
12164 || worklist.last () != stmt1)
12165 break;
12166 last = worklist.pop ();
12168 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12172 /* Decide whether it is possible to use a zero-based induction variable
12173 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12174 the value that the induction variable must be able to hold in order
12175 to ensure that the rgroups eventually have no active vector elements.
12176 Return -1 otherwise. */
12178 widest_int
12179 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12181 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12182 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12183 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12185 /* Calculate the value that the induction variable must be able
12186 to hit in order to ensure that we end the loop with an all-false mask.
12187 This involves adding the maximum number of inactive trailing scalar
12188 iterations. */
12189 widest_int iv_limit = -1;
12190 if (max_loop_iterations (loop, &iv_limit))
12192 if (niters_skip)
12194 /* Add the maximum number of skipped iterations to the
12195 maximum iteration count. */
12196 if (TREE_CODE (niters_skip) == INTEGER_CST)
12197 iv_limit += wi::to_widest (niters_skip);
12198 else
12199 iv_limit += max_vf - 1;
12201 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12202 /* Make a conservatively-correct assumption. */
12203 iv_limit += max_vf - 1;
12205 /* IV_LIMIT is the maximum number of latch iterations, which is also
12206 the maximum in-range IV value. Round this value down to the previous
12207 vector alignment boundary and then add an extra full iteration. */
12208 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12209 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
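/* Worked example (made-up numbers): for a constant VF of 16, no mask
   skipping and no peeling for alignment, a loop with at most 100 latch
   iterations gives iv_limit == 100; rounding down to the previous multiple
   of 16 and adding one full iteration yields
     (100 & -16) + 16 == 96 + 16 == 112,
   so a zero-based IV only ever needs to hold values up to 112.  */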
12211 return iv_limit;
12214 /* For the given rgroup_controls RGC, check whether an induction variable
12215 would ever hit a value that produces a set of all-false masks or zero
12216 lengths before wrapping around. Return true if it's possible to wrap
12217 around before hitting the desired value, otherwise return false. */
12219 bool
12220 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12222 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12224 if (iv_limit == -1)
12225 return true;
12227 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12228 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12229 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
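/* Illustrative check (made-up numbers): with iv_limit == 112 and an rgroup
   where max_nscalars_per_iter == 2 and factor == 1, nitems == 2 and the IV
   must reach 112 * 2 == 224.  That needs 8 bits unsigned, so any compare
   type with a precision of at least 8 (e.g. a 32-bit type) cannot wrap and
   the function returns false.  */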
12231 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12232 return true;
12234 return false;