gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target-specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different vector sizes will, for now, need
146 to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
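/* For illustration: with 16-byte vectors, a loop operating only on 4-byte
   ints gets VF = 4, while a loop operating only on 2-byte shorts gets
   VF = 8.  */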
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
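/* For illustration: for a counted loop such as
       for (i = k; i < n; i += 4)
   the scalar evolution of I is the chrec {k, +, 4}, so *INIT is K and
   *STEP is 4.  */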
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only nonlinear inductions of integer type are supported:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
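/* For illustration, each of the following loop-carried updates of an
   integral X is a nonlinear induction handled here:
       x = -x;         neg:                 the recorded step is -1
       x = x * 3;      mul by constant:     step is 3
       x = x << 1;     lshift by constant:  step is 1
       x = x >> 2;     rshift by constant:  step is 2  */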
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
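/* For example, the scalar cycle of T below is a first-order recurrence:
       t = t_init;
       for (i = 0; i < n; i++)
         {
           b[i] = a[i] - t;
           t = a[i];
         }
   each iteration uses the value of T defined in the previous one.  */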
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs a vector shuffle. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be
585 subsequent SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
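/* For example, for a simple counted loop
       for (i = 0; i < n; i++) ...
   with N known to be non-zero, NUMBER_OF_ITERATIONS is N (the number of
   header executions) and NUMBER_OF_ITERATIONSM1 is N - 1 (the number of
   latch executions).  */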
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment.
968 Analyze all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1052 /* CHECKME: We want to visit all BBs before their successors (except for
1053 latch blocks, for which this assertion wouldn't hold). In the simple
1054 case of the loop forms we allow, a dfs order of the BBs would be the same
1055 as reversed postorder traversal, so we are safe. */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition: when it is 0,
1082 the loop shouldn't be vectorized; when it is a non-zero constant, it
1083 should be vectorized normally; otherwise the loop is versioned, with the
1084 vectorized copy taken when the condition is non-zero at runtime. */
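/* For illustration, for a loop annotated with
     #pragma omp simd if (c)
   the condition C is that third argument; we record it in
   simd_if_cond so the loop can be versioned on it.  */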
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1133 /* When we release an epilogue vinfo that we do not intend to use
1134 avoid clearing AUX of the main loop which should continue to
1135 point to the main loop vinfo since otherwise we'll leak that. */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
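/* For illustration: if the scalar loop runs at most 1000 iterations and
   FACTOR is 4, the product 4000 fits in 12 bits, so 12 is returned.  */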
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
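/* For illustration: with WHILE_ULT-style masking each mask is computed as
   WHILE_ULT (iv, limit), i.e. lane L is active iff IV + L < LIMIT, so the
   final, partially filled vector iteration needs no scalar epilogue.  */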
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
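/* For illustration: in the AVX512 style each mask is produced by an
   ordinary vector comparison of a step vector against the broadcast
   count of remaining scalar iterations, roughly
       mask = { 0, 1, ..., n-1 } < { remain, ..., remain };
   which is why we look for a supported LT_EXPR vector compare into the
   mask type below rather than for WHILE_ULT.  */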
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce a differently organized rgc_vec and check differently whether
1389 we can produce the masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the number of elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 /* For now vect_get_loop_mask only supports integer mode masks
1466 when we need to split it. */
1467 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1468 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1470 ok = false;
1471 break;
1474 /* If iv_type is usable as compare type use that - we can elide the
1475 saturation in that case. */
1476 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1478 tree cmp_vectype
1479 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1480 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1481 rgc.compare_type = cmp_vectype;
1483 if (!rgc.compare_type)
1484 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1486 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1487 if (cmp_bits >= min_ni_width
1488 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1490 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1491 if (!cmp_type)
1492 continue;
1494 /* Check whether we can produce the mask with cmp_type. */
1495 tree cmp_vectype
1496 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1497 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1499 rgc.compare_type = cmp_vectype;
1500 break;
1504 if (!rgc.compare_type)
1506 ok = false;
1507 break;
1510 if (!ok)
1512 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1513 return false;
1516 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1517 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1518 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1519 return true;
1522 /* Check whether we can use vector access with length based on a precision
1523 comparison. So far, to keep it simple, we only allow the case that the
1524 precision of the target supported length is larger than the precision
1525 required by loop niters. */
1527 static bool
1528 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1530 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1531 return false;
1533 machine_mode len_load_mode, len_store_mode;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1535 .exists (&len_load_mode))
1536 return false;
1537 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1538 .exists (&len_store_mode))
1539 return false;
1541 signed char partial_load_bias = internal_len_load_store_bias
1542 (IFN_LEN_LOAD, len_load_mode);
1544 signed char partial_store_bias = internal_len_load_store_bias
1545 (IFN_LEN_STORE, len_store_mode);
1547 gcc_assert (partial_load_bias == partial_store_bias);
1549 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1550 return false;
1552 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1553 len_loads with a length of zero. In order to avoid that we prohibit
1554 more than one loop length here. */
1555 if (partial_load_bias == -1
1556 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1557 return false;
1559 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1561 unsigned int max_nitems_per_iter = 1;
1562 unsigned int i;
1563 rgroup_controls *rgl;
1564 /* Find the maximum number of items per iteration for every rgroup. */
1565 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1567 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1568 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1571 /* Work out how many bits we need to represent the length limit. */
1572 unsigned int min_ni_prec
1573 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1575 /* Now use the maximum of the precisions below for one suitable IV type:
1576 - the IV's natural precision
1577 - the precision needed to hold: the maximum number of scalar
1578 iterations multiplied by the scale factor (min_ni_prec above)
1579 - the Pmode precision
1581 If min_ni_prec is less than the precision of the current niters,
1582 we prefer to still use the niters type. Prefer to use Pmode and
1583 wider IVs to avoid narrow conversions. */
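/* For illustration: on a typical 64-bit target (64-bit Pmode and
   BITS_PER_WORD), the MAX below yields min_ni_prec = 64 and the mode walk
   below then picks a 64-bit unsigned IV type.  */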
1585 unsigned int ni_prec
1586 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1587 min_ni_prec = MAX (min_ni_prec, ni_prec);
1588 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1590 tree iv_type = NULL_TREE;
1591 opt_scalar_int_mode tmode_iter;
1592 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1594 scalar_mode tmode = tmode_iter.require ();
1595 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1597 /* ??? Do we really want to construct one IV whose precision exceeds
1598 BITS_PER_WORD? */
1599 if (tbits > BITS_PER_WORD)
1600 break;
1602 /* Find the first available standard integral type. */
1603 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1605 iv_type = build_nonstandard_integer_type (tbits, true);
1606 break;
1610 if (!iv_type)
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614 "can't vectorize with length-based partial vectors"
1615 " because there is no suitable iv type.\n");
1616 return false;
1619 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1620 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1621 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1623 return true;
1626 /* Calculate the cost of one scalar iteration of the loop. */
1627 static void
1628 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1630 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes, factor;
1633 int innerloop_iters, i;
1635 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1637 /* Gather costs for statements in the scalar loop. */
1639 /* FORNOW. */
1640 innerloop_iters = 1;
1641 if (loop->inner)
1642 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1644 for (i = 0; i < nbbs; i++)
1646 gimple_stmt_iterator si;
1647 basic_block bb = bbs[i];
1649 if (bb->loop_father == loop->inner)
1650 factor = innerloop_iters;
1651 else
1652 factor = 1;
1654 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1656 gimple *stmt = gsi_stmt (si);
1657 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1659 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1660 continue;
1662 /* Skip stmts that are not vectorized inside the loop. */
1663 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1664 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1665 && (!STMT_VINFO_LIVE_P (vstmt_info)
1666 || !VECTORIZABLE_CYCLE_DEF
1667 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1668 continue;
1670 vect_cost_for_stmt kind;
1671 if (STMT_VINFO_DATA_REF (stmt_info))
1673 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1674 kind = scalar_load;
1675 else
1676 kind = scalar_store;
1678 else if (vect_nop_conversion_p (stmt_info))
1679 continue;
1680 else
1681 kind = scalar_stmt;
1683 /* We are using vect_prologue here to avoid scaling twice
1684 by the inner loop factor. */
1685 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1686 factor, kind, stmt_info, 0, vect_prologue);
1690 /* Now accumulate cost. */
1691 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1692 add_stmt_costs (loop_vinfo->scalar_costs,
1693 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1694 loop_vinfo->scalar_costs->finish_cost (nullptr);
1698 /* Function vect_analyze_loop_form.
1700 Verify that certain CFG restrictions hold, including:
1701 - the loop has a pre-header
1702 - the loop has a single entry and exit
1703 - the loop exit condition is simple enough
1704 - the number of iterations can be analyzed, i.e., a countable loop. The
1705 niter could be analyzed under some assumptions. */
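/* For illustration: a counted loop such as
       for (i = 0; i < n; i++) ...
   satisfies these restrictions, whereas a pointer-chasing loop like
       while (p) p = p->next;
   is rejected because its number of iterations cannot be computed.  */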
1707 opt_result
1708 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1710 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1712 edge exit_e = vec_init_loop_exit_info (loop);
1713 if (!exit_e)
1714 return opt_result::failure_at (vect_location,
1715 "not vectorized:"
1716 " could not determine main exit from"
1717 " loop with multiple exits.\n");
1718 info->loop_exit = exit_e;
1719 if (dump_enabled_p ())
1720 dump_printf_loc (MSG_NOTE, vect_location,
1721 "using as main loop exit: %d -> %d [AUX: %p]\n",
1722 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1724 /* Different restrictions apply when we are considering an inner-most loop,
1725 vs. an outer (nested) loop.
1726 (FORNOW. May want to relax some of these restrictions in the future). */
1728 info->inner_loop_cond = NULL;
1729 if (!loop->inner)
1731 /* Inner-most loop. We currently require that the number of BBs is
1732 exactly 2 (the header and latch). Vectorizable inner-most loops
1733 look like this:
1735 (pre-header)
1737 header <--------+
1738 | | |
1739 | +--> latch --+
1741 (exit-bb) */
1743 if (loop->num_nodes != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " control flow in loop.\n");
1748 if (empty_block_p (loop->header))
1749 return opt_result::failure_at (vect_location,
1750 "not vectorized: empty loop.\n");
1752 else
1754 class loop *innerloop = loop->inner;
1755 edge entryedge;
1757 /* Nested loop. We currently require that the loop is doubly-nested,
1758 contains a single inner loop, and the number of BBs is exactly 5.
1759 Vectorizable outer-loops look like this:
1761 (pre-header)
1763 header <---+
1765 inner-loop |
1767 tail ------+
1769 (exit-bb)
1771 The inner-loop has the properties expected of inner-most loops
1772 as described above. */
1774 if ((loop->inner)->inner || (loop->inner)->next)
1775 return opt_result::failure_at (vect_location,
1776 "not vectorized:"
1777 " multiple nested loops.\n");
1779 if (loop->num_nodes != 5)
1780 return opt_result::failure_at (vect_location,
1781 "not vectorized:"
1782 " control flow in loop.\n");
1784 entryedge = loop_preheader_edge (innerloop);
1785 if (entryedge->src != loop->header
1786 || !single_exit (innerloop)
1787 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1788 return opt_result::failure_at (vect_location,
1789 "not vectorized:"
1790 " unsupported outerloop form.\n");
1792 /* Analyze the inner-loop. */
1793 vect_loop_form_info inner;
1794 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1795 if (!res)
1797 if (dump_enabled_p ())
1798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1799 "not vectorized: Bad inner loop.\n");
1800 return res;
1803 /* We don't support analyzing the niter under assumptions for the inner
1804 loop. */
1805 if (!integer_onep (inner.assumptions))
1806 return opt_result::failure_at (vect_location,
1807 "not vectorized: Bad inner loop.\n");
1809 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1810 return opt_result::failure_at (vect_location,
1811 "not vectorized: inner-loop count not"
1812 " invariant.\n");
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location,
1816 "Considering outer-loop vectorization.\n");
1817 info->inner_loop_cond = inner.conds[0];
1820 if (!single_exit (loop))
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized: multiple exits.\n");
1823 if (EDGE_COUNT (loop->header->preds) != 2)
1824 return opt_result::failure_at (vect_location,
1825 "not vectorized:"
1826 " too many incoming edges.\n");
1828 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1829 that the loop is represented as a do-while (with a proper if-guard
1830 before the loop if needed), where the loop header contains all the
1831 executable statements, and the latch is empty. */
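/* For illustration (a source-level sketch only, not a statement about
   the exact GIMPLE that is produced), a loop such as

     i = 0;
     do
       {
         a[i] = b[i] + c[i];
         i++;
       }
     while (i < n);

   has the expected shape: the header holds the body together with the
   exit comparison, and the latch block contains no statements.  */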
1832 if (!empty_block_p (loop->latch)
1833 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1834 return opt_result::failure_at (vect_location,
1835 "not vectorized: latch block not empty.\n");
1837 /* Make sure the exit is not abnormal. */
1838 if (exit_e->flags & EDGE_ABNORMAL)
1839 return opt_result::failure_at (vect_location,
1840 "not vectorized:"
1841 " abnormal loop exit edge.\n");
1843 info->conds
1844 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1845 &info->number_of_iterations,
1846 &info->number_of_iterationsm1);
1848 if (info->conds.is_empty ())
1849 return opt_result::failure_at
1850 (vect_location,
1851 "not vectorized: complicated exit condition.\n");
1853 /* Determine what the primary and alternate exit conds are. */
1854 for (unsigned i = 0; i < info->conds.length (); i++)
1856 gcond *cond = info->conds[i];
1857 if (exit_e->src == gimple_bb (cond))
1858 std::swap (info->conds[0], info->conds[i]);
1861 if (integer_zerop (info->assumptions)
1862 || !info->number_of_iterations
1863 || chrec_contains_undetermined (info->number_of_iterations))
1864 return opt_result::failure_at
1865 (info->conds[0],
1866 "not vectorized: number of iterations cannot be computed.\n");
1868 if (integer_zerop (info->number_of_iterations))
1869 return opt_result::failure_at
1870 (info->conds[0],
1871 "not vectorized: number of iterations = 0.\n");
1873 if (!(tree_fits_shwi_p (info->number_of_iterations)
1874 && tree_to_shwi (info->number_of_iterations) > 0))
1876 if (dump_enabled_p ())
1878 dump_printf_loc (MSG_NOTE, vect_location,
1879 "Symbolic number of iterations is ");
1880 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1881 dump_printf (MSG_NOTE, "\n");
1885 return opt_result::success ();
1888 /* Create a loop_vec_info for LOOP with SHARED and the
1889 vect_analyze_loop_form result. */
1891 loop_vec_info
1892 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1893 const vect_loop_form_info *info,
1894 loop_vec_info main_loop_info)
1896 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1897 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1898 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1899 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1900 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1901 /* Also record the assumptions for versioning. */
1902 if (!integer_onep (info->assumptions) && !main_loop_info)
1903 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1905 for (gcond *cond : info->conds)
1907 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1908 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1911 for (unsigned i = 1; i < info->conds.length (); i ++)
1912 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1913 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1915 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1917 if (info->inner_loop_cond)
1919 stmt_vec_info inner_loop_cond_info
1920 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1921 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1922 /* If we have an estimate on the number of iterations of the inner
1923 loop, use that to limit the scale for costing; otherwise use
1924 --param vect-inner-loop-cost-factor literally. */
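/* A hedged example: if the inner loop is estimated to execute 10 times
   and --param vect-inner-loop-cost-factor is, say, 50, the code below
   uses MIN (10, 50) = 10 as the scaling factor; without an estimate
   the parameter value is used as-is.  */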
1925 widest_int nit;
1926 if (estimated_stmt_executions (loop->inner, &nit))
1927 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1928 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1931 return loop_vinfo;
1936 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1937 statements, update the vectorization factor. */
1939 static void
1940 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1942 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1943 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1944 int nbbs = loop->num_nodes;
1945 poly_uint64 vectorization_factor;
1946 int i;
1948 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1950 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1951 gcc_assert (known_ne (vectorization_factor, 0U));
1953 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1954 vectorization factor of the loop is the unrolling factor required by
1955 the SLP instances. If that unrolling factor is 1, we say that we
1956 perform pure SLP on the loop - cross-iteration parallelism is not
1957 exploited. */
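/* A hedged illustration: if every statement in the loop belongs to SLP
   instances, e.g. a group of four stores x[4*i+0] ... x[4*i+3] whose
   vector type also holds four elements, the required SLP unrolling
   factor is 1 and the loop is pure SLP: each vector statement covers a
   single scalar iteration and no cross-iteration parallelism is used.  */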
1958 bool only_slp_in_loop = true;
1959 for (i = 0; i < nbbs; i++)
1961 basic_block bb = bbs[i];
1962 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1963 gsi_next (&si))
1965 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1966 if (!stmt_info)
1967 continue;
1968 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1969 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1970 && !PURE_SLP_STMT (stmt_info))
1971 /* STMT needs both SLP and loop-based vectorization. */
1972 only_slp_in_loop = false;
1974 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1975 gsi_next (&si))
1977 if (is_gimple_debug (gsi_stmt (si)))
1978 continue;
1979 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1980 stmt_info = vect_stmt_to_vectorize (stmt_info);
1981 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1982 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1983 && !PURE_SLP_STMT (stmt_info))
1984 /* STMT needs both SLP and loop-based vectorization. */
1985 only_slp_in_loop = false;
1989 if (only_slp_in_loop)
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_NOTE, vect_location,
1993 "Loop contains only SLP stmts\n");
1994 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1996 else
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "Loop contains SLP and non-SLP stmts\n");
2001 /* Both the vectorization factor and unroll factor have the form
2002 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2003 so they must have a common multiple. */
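/* A hedged illustration: if the loop-based vectorization factor is 4
   and the SLP unrolling factor is 2, force_common_multiple yields 4;
   with factors 4 and 8 it yields 8.  */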
2004 vectorization_factor
2005 = force_common_multiple (vectorization_factor,
2006 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2009 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2010 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "Updating vectorization factor to ");
2014 dump_dec (MSG_NOTE, vectorization_factor);
2015 dump_printf (MSG_NOTE, ".\n");
2019 /* Return true if STMT_INFO describes a double reduction phi and if
2020 the other phi in the reduction is also relevant for vectorization.
2021 This rejects cases such as:
2023 outer1:
2024 x_1 = PHI <x_3(outer2), ...>;
2027 inner:
2028 x_2 = ...;
2031 outer2:
2032 x_3 = PHI <x_2(inner)>;
2034 if nothing in x_2 or elsewhere makes x_1 relevant. */
2036 static bool
2037 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2039 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2040 return false;
2042 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2045 /* Function vect_analyze_loop_operations.
2047 Scan the loop stmts and make sure they are all vectorizable. */
2049 static opt_result
2050 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2052 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2053 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054 int nbbs = loop->num_nodes;
2055 int i;
2056 stmt_vec_info stmt_info;
2057 bool need_to_vectorize = false;
2058 bool ok;
2060 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2062 auto_vec<stmt_info_for_cost> cost_vec;
2064 for (i = 0; i < nbbs; i++)
2066 basic_block bb = bbs[i];
2068 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2069 gsi_next (&si))
2071 gphi *phi = si.phi ();
2072 ok = true;
2074 stmt_info = loop_vinfo->lookup_stmt (phi);
2075 if (dump_enabled_p ())
2076 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2077 (gimple *) phi);
2078 if (virtual_operand_p (gimple_phi_result (phi)))
2079 continue;
2081 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2082 (i.e., a phi in the tail of the outer-loop). */
2083 if (! is_loop_header_bb_p (bb))
2085 /* FORNOW: we currently don't support the case in which these phis
2086 are not used in the outer loop (unless it is a double reduction,
2087 i.e., this phi is vect_reduction_def), because this case
2088 would require us to actually do something here. */
2089 if (STMT_VINFO_LIVE_P (stmt_info)
2090 && !vect_active_double_reduction_p (stmt_info))
2091 return opt_result::failure_at (phi,
2092 "Unsupported loop-closed phi"
2093 " in outer-loop.\n");
2095 /* If PHI is used in the outer loop, we check that its operand
2096 is defined in the inner loop. */
2097 if (STMT_VINFO_RELEVANT_P (stmt_info))
2099 tree phi_op;
2101 if (gimple_phi_num_args (phi) != 1)
2102 return opt_result::failure_at (phi, "unsupported phi");
2104 phi_op = PHI_ARG_DEF (phi, 0);
2105 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2106 if (!op_def_info)
2107 return opt_result::failure_at (phi, "unsupported phi\n");
2109 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2110 && (STMT_VINFO_RELEVANT (op_def_info)
2111 != vect_used_in_outer_by_reduction))
2112 return opt_result::failure_at (phi, "unsupported phi\n");
2114 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2115 || (STMT_VINFO_DEF_TYPE (stmt_info)
2116 == vect_double_reduction_def))
2117 && !vectorizable_lc_phi (loop_vinfo,
2118 stmt_info, NULL, NULL))
2119 return opt_result::failure_at (phi, "unsupported phi\n");
2122 continue;
2125 gcc_assert (stmt_info);
2127 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2128 || STMT_VINFO_LIVE_P (stmt_info))
2129 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2130 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2131 /* A scalar-dependence cycle that we don't support. */
2132 return opt_result::failure_at (phi,
2133 "not vectorized:"
2134 " scalar dependence cycle.\n");
2136 if (STMT_VINFO_RELEVANT_P (stmt_info))
2138 need_to_vectorize = true;
2139 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2140 && ! PURE_SLP_STMT (stmt_info))
2141 ok = vectorizable_induction (loop_vinfo,
2142 stmt_info, NULL, NULL,
2143 &cost_vec);
2144 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2145 || (STMT_VINFO_DEF_TYPE (stmt_info)
2146 == vect_double_reduction_def)
2147 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2148 && ! PURE_SLP_STMT (stmt_info))
2149 ok = vectorizable_reduction (loop_vinfo,
2150 stmt_info, NULL, NULL, &cost_vec);
2151 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2152 == vect_first_order_recurrence)
2153 && ! PURE_SLP_STMT (stmt_info))
2154 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2155 &cost_vec);
2158 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2159 if (ok
2160 && STMT_VINFO_LIVE_P (stmt_info)
2161 && !PURE_SLP_STMT (stmt_info))
2162 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2163 -1, false, &cost_vec);
2165 if (!ok)
2166 return opt_result::failure_at (phi,
2167 "not vectorized: relevant phi not "
2168 "supported: %G",
2169 static_cast <gimple *> (phi));
2172 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2173 gsi_next (&si))
2175 gimple *stmt = gsi_stmt (si);
2176 if (!gimple_clobber_p (stmt)
2177 && !is_gimple_debug (stmt))
2179 opt_result res
2180 = vect_analyze_stmt (loop_vinfo,
2181 loop_vinfo->lookup_stmt (stmt),
2182 &need_to_vectorize,
2183 NULL, NULL, &cost_vec);
2184 if (!res)
2185 return res;
2188 } /* bbs */
2190 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2192 /* All operations in the loop are either irrelevant (they deal with loop
2193 control, or are dead), or are only used outside the loop and can be moved
2194 out of the loop (e.g. invariants, inductions). The loop can be
2195 optimized away by scalar optimizations. We're better off not
2196 touching this loop. */
2197 if (!need_to_vectorize)
2199 if (dump_enabled_p ())
2200 dump_printf_loc (MSG_NOTE, vect_location,
2201 "All the computation can be taken out of the loop.\n");
2202 return opt_result::failure_at
2203 (vect_location,
2204 "not vectorized: redundant loop. no profit to vectorize.\n");
2207 return opt_result::success ();
2210 /* Return true if we know that the iteration count is smaller than the
2211 vectorization factor. Return false if it isn't, or if we can't be sure
2212 either way. */
2214 static bool
2215 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2217 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2219 HOST_WIDE_INT max_niter;
2220 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2221 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2222 else
2223 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2225 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2226 return true;
2228 return false;
2231 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2232 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2233 definitely no, or -1 if it's worth retrying. */
2235 static int
2236 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2237 unsigned *suggested_unroll_factor)
2239 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2242 /* Only loops that can handle partially-populated vectors can have iteration
2243 counts less than the vectorization factor. */
2244 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2245 && vect_known_niters_smaller_than_vf (loop_vinfo))
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: iteration count smaller than "
2250 "vectorization factor.\n");
2251 return 0;
2254 /* If we know the number of iterations we can do better: for the
2255 epilogue we can also decide whether the main loop leaves us
2256 with enough iterations, preferring a smaller vector epilogue that is
2257 then also possibly used for the case in which we skip the vector loop. */
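/* A hedged example: with 103 scalar iterations, a main-loop VF of 16
   and no peeling for alignment or gaps, the computation below leaves
   103 % 16 = 7 scalar iterations for the epilogue to cover.  */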
2258 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2260 widest_int scalar_niters
2261 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2262 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2264 loop_vec_info orig_loop_vinfo
2265 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2266 unsigned lowest_vf
2267 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2268 int prolog_peeling = 0;
2269 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2270 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2271 if (prolog_peeling >= 0
2272 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2273 lowest_vf))
2275 unsigned gap
2276 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2277 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2278 % lowest_vf + gap);
2281 /* Reject vectorizing for a single scalar iteration, even if
2282 we could in principle implement that using partial vectors. */
2283 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2284 if (scalar_niters <= peeling_gap + 1)
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2288 "not vectorized: loop only has a single "
2289 "scalar iteration.\n");
2290 return 0;
2293 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2295 /* Check that the loop processes at least one full vector. */
2296 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2297 if (known_lt (scalar_niters, vf))
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "loop does not have enough iterations "
2302 "to support vectorization.\n");
2303 return 0;
2306 /* If we need to peel an extra epilogue iteration to handle data
2307 accesses with gaps, check that there are enough scalar iterations
2308 available.
2310 The check above is redundant with this one when peeling for gaps,
2311 but the distinction is useful for diagnostics. */
2312 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2313 && known_le (scalar_niters, vf))
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "loop does not have enough iterations "
2318 "to support peeling for gaps.\n");
2319 return 0;
2324 /* If using the "very cheap" model, reject cases in which we'd keep
2325 a copy of the scalar code (even if we might be able to vectorize it). */
2326 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2327 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2328 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2329 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "some scalar iterations would need to be peeled\n");
2334 return 0;
2337 int min_profitable_iters, min_profitable_estimate;
2338 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2339 &min_profitable_estimate,
2340 suggested_unroll_factor);
2342 if (min_profitable_iters < 0)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vectorization not profitable.\n");
2347 if (dump_enabled_p ())
2348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2349 "not vectorized: vector version will never be "
2350 "profitable.\n");
2351 return -1;
2354 int min_scalar_loop_bound = (param_min_vect_loop_bound
2355 * assumed_vf);
2357 /* Use the cost model only if it is more conservative than the
2358 user-specified threshold. */
2359 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2360 min_profitable_iters);
2362 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2364 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2365 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "not vectorized: vectorization not profitable.\n");
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_NOTE, vect_location,
2372 "not vectorized: iteration count smaller than user "
2373 "specified loop bound parameter or minimum profitable "
2374 "iterations (whichever is more conservative).\n");
2375 return 0;
2378 /* The static profitability threshold min_profitable_estimate includes
2379 the cost of having to check at runtime whether the scalar loop
2380 should be used instead. If it turns out that we don't need or want
2381 such a check, the threshold we should use for the static estimate
2382 is simply the point at which the vector loop becomes more profitable
2383 than the scalar loop. */
2384 if (min_profitable_estimate > min_profitable_iters
2385 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2386 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2387 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2388 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2392 " choice between the scalar and vector loops\n");
2393 min_profitable_estimate = min_profitable_iters;
2396 /* If the vector loop needs multiple iterations to be beneficial then
2397 things are probably too close to call, and the conservative thing
2398 would be to stick with the scalar code. */
2399 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2400 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2402 if (dump_enabled_p ())
2403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2404 "one iteration of the vector loop would be"
2405 " more expensive than the equivalent number of"
2406 " iterations of the scalar loop\n");
2407 return 0;
2410 HOST_WIDE_INT estimated_niter;
2412 /* If we are vectorizing an epilogue then we know the maximum number of
2413 scalar iterations it will cover is at least one lower than the
2414 vectorization factor of the main loop. */
2415 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2416 estimated_niter
2417 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2418 else
2420 estimated_niter = estimated_stmt_executions_int (loop);
2421 if (estimated_niter == -1)
2422 estimated_niter = likely_max_stmt_executions_int (loop);
2424 if (estimated_niter != -1
2425 && ((unsigned HOST_WIDE_INT) estimated_niter
2426 < MAX (th, (unsigned) min_profitable_estimate)))
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "not vectorized: estimated iteration count too "
2431 "small.\n");
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_NOTE, vect_location,
2434 "not vectorized: estimated iteration count smaller "
2435 "than specified loop bound parameter or minimum "
2436 "profitable iterations (whichever is more "
2437 "conservative).\n");
2438 return -1;
2441 return 1;
2444 static opt_result
2445 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2446 vec<data_reference_p> *datarefs,
2447 unsigned int *n_stmts)
2449 *n_stmts = 0;
2450 for (unsigned i = 0; i < loop->num_nodes; i++)
2451 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2452 !gsi_end_p (gsi); gsi_next (&gsi))
2454 gimple *stmt = gsi_stmt (gsi);
2455 if (is_gimple_debug (stmt))
2456 continue;
2457 ++(*n_stmts);
2458 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2459 NULL, 0);
2460 if (!res)
2462 if (is_gimple_call (stmt) && loop->safelen)
2464 tree fndecl = gimple_call_fndecl (stmt), op;
2465 if (fndecl == NULL_TREE
2466 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2468 fndecl = gimple_call_arg (stmt, 0);
2469 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2470 fndecl = TREE_OPERAND (fndecl, 0);
2471 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2473 if (fndecl != NULL_TREE)
2475 cgraph_node *node = cgraph_node::get (fndecl);
2476 if (node != NULL && node->simd_clones != NULL)
2478 unsigned int j, n = gimple_call_num_args (stmt);
2479 for (j = 0; j < n; j++)
2481 op = gimple_call_arg (stmt, j);
2482 if (DECL_P (op)
2483 || (REFERENCE_CLASS_P (op)
2484 && get_base_address (op)))
2485 break;
2487 op = gimple_call_lhs (stmt);
2488 /* Ignore #pragma omp declare simd functions
2489 if they don't have data references in the
2490 call stmt itself. */
2491 if (j == n
2492 && !(op
2493 && (DECL_P (op)
2494 || (REFERENCE_CLASS_P (op)
2495 && get_base_address (op)))))
2496 continue;
2500 return res;
2502 /* If dependence analysis will give up due to the limit on the
2503 number of datarefs, stop here and fail fatally. */
2504 if (datarefs->length ()
2505 > (unsigned)param_loop_max_datarefs_for_datadeps)
2506 return opt_result::failure_at (stmt, "exceeded param "
2507 "loop-max-datarefs-for-datadeps\n");
2509 return opt_result::success ();
2512 /* Look for SLP-only access groups and turn each individual access into its own
2513 group. */
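/* For instance (illustrative only): a group of four interleaved loads
   that is usable only via SLP is dissolved when SLP is not used; each
   access then becomes its own group of size 1, with a gap of
   group_size - 1 (here 3) recorded for loads, while stores are forced
   to strided handling since stores with gaps cannot be handled.  */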
2514 static void
2515 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2517 unsigned int i;
2518 struct data_reference *dr;
2520 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2522 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2523 FOR_EACH_VEC_ELT (datarefs, i, dr)
2525 gcc_assert (DR_REF (dr));
2526 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2528 /* Check if the load is a part of an interleaving chain. */
2529 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2531 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2532 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2533 unsigned int group_size = DR_GROUP_SIZE (first_element);
2535 /* Check whether this is an SLP-only group. */
2536 if (!STMT_SLP_TYPE (stmt_info)
2537 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2539 /* Dissolve the group. */
2540 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2542 stmt_vec_info vinfo = first_element;
2543 while (vinfo)
2545 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2546 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2547 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2548 DR_GROUP_SIZE (vinfo) = 1;
2549 if (STMT_VINFO_STRIDED_P (first_element)
2550 /* We cannot handle stores with gaps. */
2551 || DR_IS_WRITE (dr_info->dr))
2553 STMT_VINFO_STRIDED_P (vinfo) = true;
2554 DR_GROUP_GAP (vinfo) = 0;
2556 else
2557 DR_GROUP_GAP (vinfo) = group_size - 1;
2558 /* Duplicate and adjust the alignment info; it needs to
2559 be present on each group leader, see dr_misalignment. */
2560 if (vinfo != first_element)
2562 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2563 dr_info2->target_alignment = dr_info->target_alignment;
2564 int misalignment = dr_info->misalignment;
2565 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2567 HOST_WIDE_INT diff
2568 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2569 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2570 unsigned HOST_WIDE_INT align_c
2571 = dr_info->target_alignment.to_constant ();
2572 misalignment = (misalignment + diff) % align_c;
2574 dr_info2->misalignment = misalignment;
2576 vinfo = next;
2583 /* Determine if operating on full vectors for LOOP_VINFO might leave
2584 some scalar iterations still to do. If so, decide how we should
2585 handle those scalar iterations. The possibilities are:
2587 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2588 In this case:
2590 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2591 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2592 LOOP_VINFO_PEELING_FOR_NITER == false
2594 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2595 to handle the remaining scalar iterations. In this case:
2597 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2598 LOOP_VINFO_PEELING_FOR_NITER == true
2600 There are two choices:
2602 (2a) Consider vectorizing the epilogue loop at the same VF as the
2603 main loop, but using partial vectors instead of full vectors.
2604 In this case:
2606 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2608 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2609 In this case:
2611 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
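/* As an illustrative sketch (not generated code): for a loop with 100
   scalar iterations and VF = 16, option (1) runs 7 vector iterations
   with the last one only partially populated, whereas option (2) runs
   6 full vector iterations and leaves 100 - 96 = 4 scalar iterations
   to an epilogue loop, which (2a) may itself be a single
   partially-populated vector iteration at the same VF or (2b) may be
   vectorized at a lower VF.  */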
2614 opt_result
2615 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2617 /* Determine whether there would be any scalar iterations left over. */
2618 bool need_peeling_or_partial_vectors_p
2619 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2621 /* Decide whether to vectorize the loop with partial vectors. */
2622 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2623 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2624 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625 && need_peeling_or_partial_vectors_p)
2627 /* For partial-vector-usage=1, try to push the handling of partial
2628 vectors to the epilogue, with the main loop continuing to operate
2629 on full vectors.
2631 If we are unrolling we also do not want to use partial vectors. This
2632 is to avoid the overhead of generating multiple masks and also to
2633 avoid having to execute entire iterations of FALSE masked instructions
2634 when dealing with one or fewer full iterations.
2636 ??? We could then end up failing to use partial vectors if we
2637 decide to peel iterations into a prologue, and if the main loop
2638 then ends up processing fewer than VF iterations. */
2639 if ((param_vect_partial_vector_usage == 1
2640 || loop_vinfo->suggested_unroll_factor > 1)
2641 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2642 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2643 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2644 else
2645 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "operating on %s vectors%s.\n",
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2652 ? "partial" : "full",
2653 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2654 ? " for epilogue loop" : "");
2656 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2657 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2658 && need_peeling_or_partial_vectors_p);
2660 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before the loop vectorization
2661 analysis, when we don't yet know whether the loop is vectorized with partial
2662 vectors (see tree-vect-loop-manip.cc for more details).
2664 However, the SELECT_VL vectorization style should only be applied to partial
2665 vectorization, since SELECT_VL is the GIMPLE IR that calculates the
2666 number of elements to be processed in each iteration.
2668 After the loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2669 if the loop is not vectorized with partial vectors. */
2670 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2671 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2673 return opt_result::success ();
2676 /* Function vect_analyze_loop_2.
2678 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2679 analyses record information in some members of LOOP_VINFO. FATAL
2680 indicates whether some analysis hits a fatal error. If a non-NULL pointer
2681 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with a
2682 suggested unroll factor worked out here, whereas a NULL pointer means the
2683 previously suggested unroll factor is being applied. SLP_DONE_FOR_SUGGESTED_UF
2684 holds the SLP decision made when the suggested unroll factor was worked
2685 out. */
2686 static opt_result
2687 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2688 unsigned *suggested_unroll_factor,
2689 bool& slp_done_for_suggested_uf)
2691 opt_result ok = opt_result::success ();
2692 int res;
2693 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2694 poly_uint64 min_vf = 2;
2695 loop_vec_info orig_loop_vinfo = NULL;
2697 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2698 loop_vec_info of the first vectorized loop. */
2699 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2700 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2701 else
2702 orig_loop_vinfo = loop_vinfo;
2703 gcc_assert (orig_loop_vinfo);
2705 /* The first group of checks is independent of the vector size. */
2706 fatal = true;
2708 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2709 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2710 return opt_result::failure_at (vect_location,
2711 "not vectorized: simd if(0)\n");
2713 /* Find all data references in the loop (which correspond to vdefs/vuses)
2714 and analyze their evolution in the loop. */
2716 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2718 /* Gather the data references and count stmts in the loop. */
2719 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2721 opt_result res
2722 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2723 &LOOP_VINFO_DATAREFS (loop_vinfo),
2724 &LOOP_VINFO_N_STMTS (loop_vinfo));
2725 if (!res)
2727 if (dump_enabled_p ())
2728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2729 "not vectorized: loop contains function "
2730 "calls or data references that cannot "
2731 "be analyzed\n");
2732 return res;
2734 loop_vinfo->shared->save_datarefs ();
2736 else
2737 loop_vinfo->shared->check_datarefs ();
2739 /* Analyze the data references and also adjust the minimal
2740 vectorization factor according to the loads and stores. */
2742 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2743 if (!ok)
2745 if (dump_enabled_p ())
2746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2747 "bad data references.\n");
2748 return ok;
2752 /* Check if we are applying the unroll factor now. */
2752 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2753 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2755 /* If the SLP decision was false when the suggested unroll factor was worked
2756 out, and we are now applying that suggested unroll factor, we can simply skip
2757 all SLP-related analyses this time. */
2758 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2760 /* Classify all cross-iteration scalar data-flow cycles.
2761 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2762 vect_analyze_scalar_cycles (loop_vinfo, slp);
2764 vect_pattern_recog (loop_vinfo);
2766 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2768 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2769 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2771 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2772 if (!ok)
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "bad data access.\n");
2777 return ok;
2780 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2782 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2783 if (!ok)
2785 if (dump_enabled_p ())
2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2787 "unexpected pattern.\n");
2788 return ok;
2791 /* While the rest of the analysis below depends on it in some way. */
2792 fatal = false;
2794 /* Analyze data dependences between the data-refs in the loop
2795 and adjust the maximum vectorization factor according to
2796 the dependences.
2797 FORNOW: fail at the first data dependence that we encounter. */
2799 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2800 if (!ok)
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804 "bad data dependence.\n");
2805 return ok;
2807 if (max_vf != MAX_VECTORIZATION_FACTOR
2808 && maybe_lt (max_vf, min_vf))
2809 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2810 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2812 ok = vect_determine_vectorization_factor (loop_vinfo);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "can't determine vectorization factor.\n");
2818 return ok;
2821 /* Compute the scalar iteration cost. */
2822 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2824 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2826 if (slp)
2828 /* Check the SLP opportunities in the loop, analyze and build
2829 SLP trees. */
2830 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2831 if (!ok)
2832 return ok;
2834 /* If there are any SLP instances mark them as pure_slp. */
2835 slp = vect_make_slp_decision (loop_vinfo);
2836 if (slp)
2838 /* Find stmts that need to be both vectorized and SLPed. */
2839 vect_detect_hybrid_slp (loop_vinfo);
2841 /* Update the vectorization factor based on the SLP decision. */
2842 vect_update_vf_for_slp (loop_vinfo);
2844 /* Optimize the SLP graph with the vectorization factor fixed. */
2845 vect_optimize_slp (loop_vinfo);
2847 /* Gather the loads reachable from the SLP graph entries. */
2848 vect_gather_slp_loads (loop_vinfo);
2852 bool saved_can_use_partial_vectors_p
2853 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2855 /* We don't expect to have to roll back to anything other than an empty
2856 set of rgroups. */
2857 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2859 /* This is the point where we can re-start analysis with SLP forced off. */
2860 start_over:
2862 /* Apply the suggested unrolling factor; this was determined by the backend
2863 during finish_cost the first time we ran the analysis for this
2864 vector mode. */
2865 if (applying_suggested_uf)
2866 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2868 /* Now the vectorization factor is final. */
2869 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2870 gcc_assert (known_ne (vectorization_factor, 0U));
2872 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2874 dump_printf_loc (MSG_NOTE, vect_location,
2875 "vectorization_factor = ");
2876 dump_dec (MSG_NOTE, vectorization_factor);
2877 dump_printf (MSG_NOTE, ", niters = %wd\n",
2878 LOOP_VINFO_INT_NITERS (loop_vinfo));
2881 if (max_vf != MAX_VECTORIZATION_FACTOR
2882 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2883 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2885 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2887 /* Analyze the alignment of the data-refs in the loop.
2888 Fail if a data reference is found that cannot be vectorized. */
2890 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2891 if (!ok)
2893 if (dump_enabled_p ())
2894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2895 "bad data alignment.\n");
2896 return ok;
2899 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2900 It is important to call pruning after vect_analyze_data_ref_accesses,
2901 since we use grouping information gathered by interleaving analysis. */
2902 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2903 if (!ok)
2904 return ok;
2906 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2907 vectorization, since we do not want to add extra peeling or
2908 add versioning for alignment. */
2909 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2910 /* This pass will decide on using loop versioning and/or loop peeling in
2911 order to enhance the alignment of data references in the loop. */
2912 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2913 if (!ok)
2914 return ok;
2916 if (slp)
2918 /* Analyze operations in the SLP instances. Note this may
2919 remove unsupported SLP instances which makes the above
2920 SLP kind detection invalid. */
2921 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2922 vect_slp_analyze_operations (loop_vinfo);
2923 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2925 ok = opt_result::failure_at (vect_location,
2926 "unsupported SLP instances\n");
2927 goto again;
2930 /* Check whether any load in ALL SLP instances is possibly permuted. */
2931 slp_tree load_node, slp_root;
2932 unsigned i, x;
2933 slp_instance instance;
2934 bool can_use_lanes = true;
2935 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2937 slp_root = SLP_INSTANCE_TREE (instance);
2938 int group_size = SLP_TREE_LANES (slp_root);
2939 tree vectype = SLP_TREE_VECTYPE (slp_root);
2940 bool loads_permuted = false;
2941 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2943 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2944 continue;
2945 unsigned j;
2946 stmt_vec_info load_info;
2947 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2948 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2950 loads_permuted = true;
2951 break;
2955 /* If the loads and stores can be handled with load/store-lane
2956 instructions record it and move on to the next instance. */
2957 if (loads_permuted
2958 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2959 && vect_store_lanes_supported (vectype, group_size, false)
2960 != IFN_LAST)
2962 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2963 if (STMT_VINFO_GROUPED_ACCESS
2964 (SLP_TREE_REPRESENTATIVE (load_node)))
2966 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2967 (SLP_TREE_REPRESENTATIVE (load_node));
2968 /* Use SLP for strided accesses (or if we can't
2969 load-lanes). */
2970 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2971 || vect_load_lanes_supported
2972 (STMT_VINFO_VECTYPE (stmt_vinfo),
2973 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2974 break;
2977 can_use_lanes
2978 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2980 if (can_use_lanes && dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "SLP instance %p can use load/store-lanes\n",
2983 (void *) instance);
2985 else
2987 can_use_lanes = false;
2988 break;
2992 /* If all SLP instances can use load/store-lanes abort SLP and try again
2993 with SLP disabled. */
2994 if (can_use_lanes)
2996 ok = opt_result::failure_at (vect_location,
2997 "Built SLP cancelled: can use "
2998 "load/store-lanes\n");
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "Built SLP cancelled: all SLP instances support "
3002 "load/store-lanes\n");
3003 goto again;
3007 /* Dissolve SLP-only groups. */
3008 vect_dissolve_slp_only_groups (loop_vinfo);
3010 /* Scan all the remaining operations in the loop that are not subject
3011 to SLP and make sure they are vectorizable. */
3012 ok = vect_analyze_loop_operations (loop_vinfo);
3013 if (!ok)
3015 if (dump_enabled_p ())
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "bad operation or unsupported loop bound.\n");
3018 return ok;
3021 /* For now, we don't expect to mix both masking and length approaches for one
3022 loop; disable partial vectors if both are recorded. */
3023 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3024 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3025 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "can't vectorize a loop with partial vectors"
3030 " because we don't expect to mix different"
3031 " approaches with partial vectors for the"
3032 " same loop.\n");
3033 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3036 /* If we still have the option of using partial vectors,
3037 check whether we can generate the necessary loop controls. */
3038 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3040 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3042 if (!vect_verify_full_masking (loop_vinfo)
3043 && !vect_verify_full_masking_avx512 (loop_vinfo))
3044 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3046 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3047 if (!vect_verify_loop_lens (loop_vinfo))
3048 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3051 /* If we're vectorizing a loop that uses length "controls" and
3052 can iterate more than once, we apply the decrementing IV approach
3053 to the loop control. */
3054 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3055 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3056 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3057 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3058 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3059 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3060 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3062 /* If a loop uses length controls and has a decrementing loop control IV,
3063 we will normally pass that IV through a MIN_EXPR to calculate the
3064 basis for the length controls. E.g. in a loop that processes one
3065 element per scalar iteration, the number of elements would be
3066 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3068 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3069 step, since only the final iteration of the vector loop can have
3070 inactive lanes.
3072 However, some targets have a dedicated instruction for calculating the
3073 preferred length, given the total number of elements that still need to
3074 be processed. This is encapsulated in the SELECT_VL internal function.
3076 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3077 to determine the basis for the length controls. However, unlike the
3078 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3079 lanes inactive in any iteration of the vector loop, not just the last
3080 iteration. This SELECT_VL approach therefore requires us to use pointer
3081 IVs with variable steps.
3083 Once we've decided how many elements should be processed by one
3084 iteration of the vector loop, we need to populate the rgroup controls.
3085 If a loop has multiple rgroups, we need to make sure that those rgroups
3086 "line up" (that is, they must be consistent about which elements are
3087 active and which aren't). This is done by vect_adjust_loop_lens_control.
3089 In principle, it would be possible to use vect_adjust_loop_lens_control
3090 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3091 However:
3093 (1) In practice, it only makes sense to use SELECT_VL when a vector
3094 operation will be controlled directly by the result. It is not
3095 worth using SELECT_VL if it would only be the input to other
3096 calculations.
3098 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3099 pointer IV will need N updates by a variable amount (N-1 updates
3100 within the iteration and 1 update to move to the next iteration).
3102 Because of this, we prefer to use the MIN_EXPR approach whenever there
3103 is more than one length control.
3105 In addition, SELECT_VL always operates to a granularity of 1 unit.
3106 If we wanted to use it to control an SLP operation on N consecutive
3107 elements, we would need to make the SELECT_VL inputs measure scalar
3108 iterations (rather than elements) and then multiply the SELECT_VL
3109 result by N. But using SELECT_VL this way is inefficient because
3110 of (1) above.
3112 Finally, we don't apply SELECT_VL on a single-rgroup loop when both
3113 (1) and (2) are satisfied:
3115 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3116 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3118 Since SELECT_VL (a variable step) would make SCEV analysis fail and we
3119 would then lose the benefit of subsequent unrolling optimizations, we prefer
3120 using the MIN_EXPR approach in this situation. */
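/* An illustrative sketch (not the exact IR that is generated): with
   VF = 4 and N elements left to process, the MIN_EXPR approach computes

     len = MIN_EXPR <N, 4>;

   and steps the pointer IVs by the invariant amount of 4 elements,
   while the SELECT_VL approach computes

     len = .SELECT_VL (N, 4);

   which may pick fewer than 4 active lanes even before the final
   iteration, so the pointer IVs must be stepped by the variable LEN.  */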
3121 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3123 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3124 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3125 OPTIMIZE_FOR_SPEED)
3126 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3127 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3128 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3129 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3130 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3133 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3134 assuming that the loop will be used as a main loop. We will redo
3135 this analysis later if we instead decide to use the loop as an
3136 epilogue loop. */
3137 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3138 if (!ok)
3139 return ok;
3141 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3142 to be able to handle fewer than VF scalars, or needs to have a lower VF
3143 than the main loop. */
3144 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3145 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3147 poly_uint64 unscaled_vf
3148 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3149 orig_loop_vinfo->suggested_unroll_factor);
3150 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3151 return opt_result::failure_at (vect_location,
3152 "Vectorization factor too high for"
3153 " epilogue loop.\n");
3156 /* Check the costings of the loop make vectorizing worthwhile. */
3157 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3158 if (res < 0)
3160 ok = opt_result::failure_at (vect_location,
3161 "Loop costings may not be worthwhile.\n");
3162 goto again;
3164 if (!res)
3165 return opt_result::failure_at (vect_location,
3166 "Loop costings not worthwhile.\n");
3168 /* If an epilogue loop is required make sure we can create one. */
3169 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3170 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3172 if (dump_enabled_p ())
3173 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3174 if (!vect_can_advance_ivs_p (loop_vinfo)
3175 || !slpeel_can_duplicate_loop_p (loop,
3176 LOOP_VINFO_IV_EXIT (loop_vinfo),
3177 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3179 ok = opt_result::failure_at (vect_location,
3180 "not vectorized: can't create required "
3181 "epilog loop\n");
3182 goto again;
3186 /* During peeling, we need to check if the number of loop iterations is
3187 enough for both the peeled prolog loop and the vector loop. This check
3188 can be merged with the threshold check of loop versioning, so
3189 increase the threshold for this case if necessary.
3191 If we are analyzing an epilogue we still want to check what its
3192 versioning threshold would be. If we decide to vectorize the epilogues we
3193 will want to use the lowest versioning threshold of all epilogues and main
3194 loop. This will enable us to enter a vectorized epilogue even when
3195 versioning the loop. We can't simply check whether the epilogue requires
3196 versioning though since we may have skipped some versioning checks when
3197 analyzing the epilogue. For instance, checks for alias versioning will be
3198 skipped when dealing with epilogues as we assume we already checked them
3199 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
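/* A hedged numeric example: with a prologue peel of 3 iterations, a VF
   of 8 (and no partial vectors) and peeling for gaps, the code below
   computes niters_th = 3 + 8 + 1 = 12, and then raises it to the
   runtime profitability threshold TH if that is larger.  */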
3200 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3202 poly_uint64 niters_th = 0;
3203 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3205 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3207 /* Niters for peeled prolog loop. */
3208 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3210 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3211 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3212 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3214 else
3215 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3218 /* Niters for at least one iteration of vectorized loop. */
3219 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3220 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3221 /* One additional iteration because of peeling for gap. */
3222 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3223 niters_th += 1;
3225 /* Use the same condition as vect_transform_loop to decide when to use
3226 the cost to determine a versioning threshold. */
3227 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3228 && ordered_p (th, niters_th))
3229 niters_th = ordered_max (poly_uint64 (th), niters_th);
3231 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3234 gcc_assert (known_eq (vectorization_factor,
3235 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3237 slp_done_for_suggested_uf = slp;
3239 /* Ok to vectorize! */
3240 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3241 return opt_result::success ();
3243 again:
3244 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3245 gcc_assert (!ok);
3247 /* Try again with SLP forced off, but if we didn't do any SLP there is
3248 no point in re-trying. */
3249 if (!slp)
3250 return ok;
3252 /* If the SLP decision was true when the suggested unroll factor was worked
3253 out, and we are now applying that suggested unroll factor, we don't need to
3254 re-try any more. */
3255 if (applying_suggested_uf && slp_done_for_suggested_uf)
3256 return ok;
3258 /* If there are reduction chains re-trying will fail anyway. */
3259 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3260 return ok;
3262 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3263 via interleaving or lane instructions. */
3264 slp_instance instance;
3265 slp_tree node;
3266 unsigned i, j;
3267 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3269 stmt_vec_info vinfo;
3270 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3271 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3272 continue;
3273 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3274 unsigned int size = DR_GROUP_SIZE (vinfo);
3275 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3276 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3277 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3278 && ! vect_grouped_store_supported (vectype, size))
3279 return opt_result::failure_at (vinfo->stmt,
3280 "unsupported grouped store\n");
3281 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3283 vinfo = SLP_TREE_REPRESENTATIVE (node);
3284 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3286 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3287 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3288 size = DR_GROUP_SIZE (vinfo);
3289 vectype = STMT_VINFO_VECTYPE (vinfo);
3290 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3291 && ! vect_grouped_load_supported (vectype, single_element_p,
3292 size))
3293 return opt_result::failure_at (vinfo->stmt,
3294 "unsupported grouped load\n");
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "re-trying with SLP disabled\n");
3303 /* Roll back state appropriately. No SLP this time. */
3304 slp = false;
3305 /* Restore the vectorization factor as it was without SLP. */
3306 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3307 /* Free the SLP instances. */
3308 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3309 vect_free_slp_instance (instance);
3310 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3311 /* Reset SLP type to loop_vect on all stmts. */
3312 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3314 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3315 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3316 !gsi_end_p (si); gsi_next (&si))
3318 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3319 STMT_SLP_TYPE (stmt_info) = loop_vect;
3320 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3321 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3323 /* vectorizable_reduction adjusts reduction stmt def-types;
3324 restore them to that of the PHI. */
3325 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3326 = STMT_VINFO_DEF_TYPE (stmt_info);
3327 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3328 (STMT_VINFO_REDUC_DEF (stmt_info)))
3329 = STMT_VINFO_DEF_TYPE (stmt_info);
3332 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3333 !gsi_end_p (si); gsi_next (&si))
3335 if (is_gimple_debug (gsi_stmt (si)))
3336 continue;
3337 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3338 STMT_SLP_TYPE (stmt_info) = loop_vect;
3339 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3341 stmt_vec_info pattern_stmt_info
3342 = STMT_VINFO_RELATED_STMT (stmt_info);
3343 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3344 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3346 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3347 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3348 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3349 !gsi_end_p (pi); gsi_next (&pi))
3350 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3351 = loop_vect;
3355 /* Free optimized alias test DDRS. */
3356 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3357 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3358 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3359 /* Reset target cost data. */
3360 delete loop_vinfo->vector_costs;
3361 loop_vinfo->vector_costs = nullptr;
3362 /* Reset accumulated rgroup information. */
3363 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3364 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3365 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3366 /* Reset assorted flags. */
3367 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3368 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3369 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3370 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3371 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3372 = saved_can_use_partial_vectors_p;
3374 goto start_over;
3377 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3378 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3379 OLD_LOOP_VINFO is better unless something specifically indicates
3380 otherwise.
3382 Note that this deliberately isn't a partial order. */
3384 static bool
3385 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3386 loop_vec_info old_loop_vinfo)
3388 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3389 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3391 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3392 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3394 /* Always prefer a VF of loop->simdlen over any other VF. */
3395 if (loop->simdlen)
3397 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3398 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3399 if (new_simdlen_p != old_simdlen_p)
3400 return new_simdlen_p;
3403 const auto *old_costs = old_loop_vinfo->vector_costs;
3404 const auto *new_costs = new_loop_vinfo->vector_costs;
3405 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3406 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3408 return new_costs->better_main_loop_than_p (old_costs);
3411 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3412 true if we should. */
3414 static bool
3415 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3416 loop_vec_info old_loop_vinfo)
3418 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3419 return false;
3421 if (dump_enabled_p ())
3422 dump_printf_loc (MSG_NOTE, vect_location,
3423 "***** Preferring vector mode %s to vector mode %s\n",
3424 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3425 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3426 return true;
3429 /* Analyze LOOP with VECTOR_MODES[MODE_I], and as an epilogue loop if
3430 MAIN_LOOP_VINFO is not NULL. Set AUTODETECTED_VECTOR_MODE when the mode
3431 analyzed is VOIDmode and advance MODE_I to the next mode useful to analyze.
3432 Return the loop_vinfo on success and wrapped null on failure. */
3434 static opt_loop_vec_info
3435 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3436 const vect_loop_form_info *loop_form_info,
3437 loop_vec_info main_loop_vinfo,
3438 const vector_modes &vector_modes, unsigned &mode_i,
3439 machine_mode &autodetected_vector_mode,
3440 bool &fatal)
3442 loop_vec_info loop_vinfo
3443 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3445 machine_mode vector_mode = vector_modes[mode_i];
3446 loop_vinfo->vector_mode = vector_mode;
3447 unsigned int suggested_unroll_factor = 1;
3448 bool slp_done_for_suggested_uf = false;
3450 /* Run the main analysis. */
3451 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3452 &suggested_unroll_factor,
3453 slp_done_for_suggested_uf);
3454 if (dump_enabled_p ())
3455 dump_printf_loc (MSG_NOTE, vect_location,
3456 "***** Analysis %s with vector mode %s\n",
3457 res ? "succeeded" : "failed",
3458 GET_MODE_NAME (loop_vinfo->vector_mode));
3460 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3462 if (dump_enabled_p ())
3463 dump_printf_loc (MSG_NOTE, vect_location,
3464 "***** Re-trying analysis for unrolling"
3465 " with unroll factor %d and slp %s.\n",
3466 suggested_unroll_factor,
3467 slp_done_for_suggested_uf ? "on" : "off");
3468 loop_vec_info unroll_vinfo
3469 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3470 unroll_vinfo->vector_mode = vector_mode;
3471 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3472 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3473 slp_done_for_suggested_uf);
3474 if (new_res)
3476 delete loop_vinfo;
3477 loop_vinfo = unroll_vinfo;
3479 else
3480 delete unroll_vinfo;
3483 /* Remember the autodetected vector mode. */
3484 if (vector_mode == VOIDmode)
3485 autodetected_vector_mode = loop_vinfo->vector_mode;
3487 /* Advance mode_i, first skipping modes that would result in the
3488 same analysis result. */
3489 while (mode_i + 1 < vector_modes.length ()
3490 && vect_chooses_same_modes_p (loop_vinfo,
3491 vector_modes[mode_i + 1]))
3493 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_NOTE, vect_location,
3495 "***** The result for vector mode %s would"
3496 " be the same\n",
3497 GET_MODE_NAME (vector_modes[mode_i + 1]));
3498 mode_i += 1;
3500 if (mode_i + 1 < vector_modes.length ()
3501 && VECTOR_MODE_P (autodetected_vector_mode)
3502 && (related_vector_mode (vector_modes[mode_i + 1],
3503 GET_MODE_INNER (autodetected_vector_mode))
3504 == autodetected_vector_mode)
3505 && (related_vector_mode (autodetected_vector_mode,
3506 GET_MODE_INNER (vector_modes[mode_i + 1]))
3507 == vector_modes[mode_i + 1]))
3509 if (dump_enabled_p ())
3510 dump_printf_loc (MSG_NOTE, vect_location,
3511 "***** Skipping vector mode %s, which would"
3512 " repeat the analysis for %s\n",
3513 GET_MODE_NAME (vector_modes[mode_i + 1]),
3514 GET_MODE_NAME (autodetected_vector_mode));
3515 mode_i += 1;
3517 mode_i++;
3519 if (!res)
3521 delete loop_vinfo;
3522 if (fatal)
3523 gcc_checking_assert (main_loop_vinfo == NULL);
3524 return opt_loop_vec_info::propagate_failure (res);
3527 return opt_loop_vec_info::success (loop_vinfo);
3530 /* Function vect_analyze_loop.
3532 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3533 for it. The different analyses will record information in the
3534 loop_vec_info struct. */
3535 opt_loop_vec_info
3536 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3538 DUMP_VECT_SCOPE ("analyze_loop_nest");
3540 if (loop_outer (loop)
3541 && loop_vec_info_for_loop (loop_outer (loop))
3542 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3543 return opt_loop_vec_info::failure_at (vect_location,
3544 "outer-loop already vectorized.\n");
3546 if (!find_loop_nest (loop, &shared->loop_nest))
3547 return opt_loop_vec_info::failure_at
3548 (vect_location,
3549 "not vectorized: loop nest containing two or more consecutive inner"
3550 " loops cannot be vectorized\n");
3552 /* Analyze the loop form. */
3553 vect_loop_form_info loop_form_info;
3554 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3555 if (!res)
3557 if (dump_enabled_p ())
3558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3559 "bad loop form.\n");
3560 return opt_loop_vec_info::propagate_failure (res);
3562 if (!integer_onep (loop_form_info.assumptions))
3564 /* We consider vectorizing this loop by versioning it under
3565 some assumptions. In order to do this, we need to clear
3566 existing information computed by the scev and niter analyzers. */
3567 scev_reset_htab ();
3568 free_numbers_of_iterations_estimates (loop);
3569 /* Also set a flag for this loop so that the following scev and niter
3570 analyses are done under the assumptions. */
3571 loop_constraint_set (loop, LOOP_C_FINITE);
3573 else
3574 /* Clear the existing niter information to make sure the nonwrapping flag
3575 will be calculated and set appropriately. */
3576 free_numbers_of_iterations_estimates (loop);
3578 auto_vector_modes vector_modes;
3579 /* Autodetect first vector size we try. */
3580 vector_modes.safe_push (VOIDmode);
3581 unsigned int autovec_flags
3582 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3583 loop->simdlen != 0);
3584 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3585 && !unlimited_cost_model (loop));
3586 machine_mode autodetected_vector_mode = VOIDmode;
3587 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3588 unsigned int mode_i = 0;
3589 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3591 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3592 a mode has not been analyzed. */
3593 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3594 for (unsigned i = 0; i < vector_modes.length (); ++i)
3595 cached_vf_per_mode.safe_push (0);
3597 /* First determine the main loop vectorization mode, either the first
3598 one that works, starting with auto-detecting the vector mode and then
3599 following the targets order of preference, or the one with the
3600 lowest cost if pick_lowest_cost_p. */
3601 while (1)
3603 bool fatal;
3604 unsigned int last_mode_i = mode_i;
3605 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3606 failed. */
3607 cached_vf_per_mode[last_mode_i] = -1;
3608 opt_loop_vec_info loop_vinfo
3609 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3610 NULL, vector_modes, mode_i,
3611 autodetected_vector_mode, fatal);
3612 if (fatal)
3613 break;
3615 if (loop_vinfo)
3617 /* Analysis has been successful so update the VF value. The
3618 VF should always be a multiple of unroll_factor and we want to
3619 capture the original VF here. */
3620 cached_vf_per_mode[last_mode_i]
3621 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3622 loop_vinfo->suggested_unroll_factor);
3623 /* Once we hit the desired simdlen for the first time,
3624 discard any previous attempts. */
3625 if (simdlen
3626 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3628 delete first_loop_vinfo;
3629 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3630 simdlen = 0;
3632 else if (pick_lowest_cost_p
3633 && first_loop_vinfo
3634 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3636 /* Pick loop_vinfo over first_loop_vinfo. */
3637 delete first_loop_vinfo;
3638 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3640 if (first_loop_vinfo == NULL)
3641 first_loop_vinfo = loop_vinfo;
3642 else
3644 delete loop_vinfo;
3645 loop_vinfo = opt_loop_vec_info::success (NULL);
3648 /* Commit to first_loop_vinfo if we have no reason to try
3649 alternatives. */
3650 if (!simdlen && !pick_lowest_cost_p)
3651 break;
3653 if (mode_i == vector_modes.length ()
3654 || autodetected_vector_mode == VOIDmode)
3655 break;
3657 /* Try the next biggest vector size. */
3658 if (dump_enabled_p ())
3659 dump_printf_loc (MSG_NOTE, vect_location,
3660 "***** Re-trying analysis with vector mode %s\n",
3661 GET_MODE_NAME (vector_modes[mode_i]));
3663 if (!first_loop_vinfo)
3664 return opt_loop_vec_info::propagate_failure (res);
3666 if (dump_enabled_p ())
3667 dump_printf_loc (MSG_NOTE, vect_location,
3668 "***** Choosing vector mode %s\n",
3669 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3671 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3672 enabled, SIMDUID is not set, it is the innermost loop and we have
3673 either already found the loop's SIMDLEN or there was no SIMDLEN to
3674 begin with.
3675 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3676 bool vect_epilogues = (!simdlen
3677 && loop->inner == NULL
3678 && param_vect_epilogues_nomask
3679 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3680 && !loop->simduid);
3681 if (!vect_epilogues)
3682 return first_loop_vinfo;
3684 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3685 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3687 /* For epilogues start the analysis from the first mode. The motivation
3688 behind starting from the beginning comes from cases where the VECTOR_MODES
3689 array may contain length-agnostic and length-specific modes. Their
3690 ordering is not guaranteed, so we could end up picking a mode for the main
3691 loop that is after the epilogue's optimal mode. */
3692 vector_modes[0] = autodetected_vector_mode;
3693 mode_i = 0;
3695 bool supports_partial_vectors =
3696 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3697 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3699 while (1)
3701 /* If the target does not support partial vectors we can shorten the
3702 number of modes to analyze for the epilogue as we know we can't pick a
3703 mode that would lead to a VF at least as big as the
3704 FIRST_VINFO_VF. */
3705 if (!supports_partial_vectors
3706 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3708 mode_i++;
3709 if (mode_i == vector_modes.length ())
3710 break;
3711 continue;
3714 if (dump_enabled_p ())
3715 dump_printf_loc (MSG_NOTE, vect_location,
3716 "***** Re-trying epilogue analysis with vector "
3717 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3719 bool fatal;
3720 opt_loop_vec_info loop_vinfo
3721 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3722 first_loop_vinfo,
3723 vector_modes, mode_i,
3724 autodetected_vector_mode, fatal);
3725 if (fatal)
3726 break;
3728 if (loop_vinfo)
3730 if (pick_lowest_cost_p)
3732 /* Keep trying to roll back vectorization attempts while the
3733 loop_vec_infos they produced were worse than this one. */
3734 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3735 while (!vinfos.is_empty ()
3736 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3738 gcc_assert (vect_epilogues);
3739 delete vinfos.pop ();
3742 /* For now only allow one epilogue loop. */
3743 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3745 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3746 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3747 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3748 || maybe_ne (lowest_th, 0U));
3749 /* Keep track of the known smallest versioning
3750 threshold. */
3751 if (ordered_p (lowest_th, th))
3752 lowest_th = ordered_min (lowest_th, th);
3754 else
3756 delete loop_vinfo;
3757 loop_vinfo = opt_loop_vec_info::success (NULL);
3760 /* For now only allow one epilogue loop, but allow
3761 pick_lowest_cost_p to replace it, so commit to the
3762 first epilogue if we have no reason to try alternatives. */
3763 if (!pick_lowest_cost_p)
3764 break;
3767 if (mode_i == vector_modes.length ())
3768 break;
3772 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3774 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3775 if (dump_enabled_p ())
3776 dump_printf_loc (MSG_NOTE, vect_location,
3777 "***** Choosing epilogue vector mode %s\n",
3778 GET_MODE_NAME
3779 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3782 return first_loop_vinfo;
3785 /* Return true if there is an in-order reduction function for CODE, storing
3786 it in *REDUC_FN if so. */
3788 static bool
3789 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3791 /* We support MINUS_EXPR by negating the operand. This also preserves an
3792 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3793 (-0.0) = -0.0. */
3794 if (code == PLUS_EXPR || code == MINUS_EXPR)
3796 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3797 return true;
3799 return false;
3802 /* Function reduction_fn_for_scalar_code
3804 Input:
3805 CODE - tree_code of a reduction operation.
3807 Output:
3808 REDUC_FN - the corresponding internal function to be used to reduce the
3809 vector of partial results into a single scalar result, or IFN_LAST
3810 if the operation is a supported reduction operation, but does not have
3811 such an internal function.
3813 Return FALSE if CODE currently cannot be vectorized as a reduction. */
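/* As a quick illustration of the mapping implemented below: for
   CODE == PLUS_EXPR this sets *REDUC_FN to IFN_REDUC_PLUS, which sums
   all lanes of the vector of partial results (e.g. {1, 2, 3, 4} -> 10),
   whereas MULT_EXPR is accepted as a reduction but has no such internal
   function, so *REDUC_FN is set to IFN_LAST for it.  */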
3815 bool
3816 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3818 if (code.is_tree_code ())
3819 switch (tree_code (code))
3821 case MAX_EXPR:
3822 *reduc_fn = IFN_REDUC_MAX;
3823 return true;
3825 case MIN_EXPR:
3826 *reduc_fn = IFN_REDUC_MIN;
3827 return true;
3829 case PLUS_EXPR:
3830 *reduc_fn = IFN_REDUC_PLUS;
3831 return true;
3833 case BIT_AND_EXPR:
3834 *reduc_fn = IFN_REDUC_AND;
3835 return true;
3837 case BIT_IOR_EXPR:
3838 *reduc_fn = IFN_REDUC_IOR;
3839 return true;
3841 case BIT_XOR_EXPR:
3842 *reduc_fn = IFN_REDUC_XOR;
3843 return true;
3845 case MULT_EXPR:
3846 case MINUS_EXPR:
3847 *reduc_fn = IFN_LAST;
3848 return true;
3850 default:
3851 return false;
3853 else
3854 switch (combined_fn (code))
3856 CASE_CFN_FMAX:
3857 *reduc_fn = IFN_REDUC_FMAX;
3858 return true;
3860 CASE_CFN_FMIN:
3861 *reduc_fn = IFN_REDUC_FMIN;
3862 return true;
3864 default:
3865 return false;
3869 /* If there is a neutral value X such that a reduction would not be affected
3870 by the introduction of additional X elements, return that X, otherwise
3871 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3872 of the scalar elements. If the reduction has just a single initial value
3873 then INITIAL_VALUE is that value, otherwise it is null.
3874 If AS_INITIAL is TRUE the value is supposed to be used as the initial value.
3875 In that case no signed zero is returned. */
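/* For illustration: for a PLUS_EXPR reduction over a float type that
   honors signed zeros, the neutral element used mid-chain is -0.0,
   because x + (-0.0) == x for every x including x == -0.0, whereas
   padding with +0.0 would turn a -0.0 result into +0.0.  As an initial
   value (AS_INITIAL) plain 0.0 is used instead.  */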
3877 tree
3878 neutral_op_for_reduction (tree scalar_type, code_helper code,
3879 tree initial_value, bool as_initial)
3881 if (code.is_tree_code ())
3882 switch (tree_code (code))
3884 case DOT_PROD_EXPR:
3885 case SAD_EXPR:
3886 case MINUS_EXPR:
3887 case BIT_IOR_EXPR:
3888 case BIT_XOR_EXPR:
3889 return build_zero_cst (scalar_type);
3890 case WIDEN_SUM_EXPR:
3891 case PLUS_EXPR:
3892 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3893 return build_real (scalar_type, dconstm0);
3894 else
3895 return build_zero_cst (scalar_type);
3897 case MULT_EXPR:
3898 return build_one_cst (scalar_type);
3900 case BIT_AND_EXPR:
3901 return build_all_ones_cst (scalar_type);
3903 case MAX_EXPR:
3904 case MIN_EXPR:
3905 return initial_value;
3907 default:
3908 return NULL_TREE;
3910 else
3911 switch (combined_fn (code))
3913 CASE_CFN_FMIN:
3914 CASE_CFN_FMAX:
3915 return initial_value;
3917 default:
3918 return NULL_TREE;
3922 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3923 STMT is printed with a message MSG. */
3925 static void
3926 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3928 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3931 /* Return true if we need an in-order reduction for operation CODE
3932 on type TYPE. */
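/* A concrete reminder of why this matters for floating point: in IEEE
   double arithmetic (0.1 + 0.2) + 0.3 evaluates to 0.6000000000000001
   while 0.1 + (0.2 + 0.3) evaluates to 0.6, so without
   -fassociative-math an FP add reduction has to be kept in order.  */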
3935 bool
3936 needs_fold_left_reduction_p (tree type, code_helper code)
3938 /* CHECKME: check for !flag_finite_math_only too? */
3939 if (SCALAR_FLOAT_TYPE_P (type))
3941 if (code.is_tree_code ())
3942 switch (tree_code (code))
3944 case MIN_EXPR:
3945 case MAX_EXPR:
3946 return false;
3948 default:
3949 return !flag_associative_math;
3951 else
3952 switch (combined_fn (code))
3954 CASE_CFN_FMIN:
3955 CASE_CFN_FMAX:
3956 return false;
3958 default:
3959 return !flag_associative_math;
3963 if (INTEGRAL_TYPE_P (type))
3964 return (!code.is_tree_code ()
3965 || !operation_no_trapping_overflow (type, tree_code (code)));
3967 if (SAT_FIXED_POINT_TYPE_P (type))
3968 return true;
3970 return false;
3973 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3974 has a handled computation expression. Store the main reduction
3975 operation in *CODE. */
3977 static bool
3978 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3979 tree loop_arg, code_helper *code,
3980 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3982 auto_bitmap visited;
3983 tree lookfor = PHI_RESULT (phi);
3984 ssa_op_iter curri;
3985 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3986 while (USE_FROM_PTR (curr) != loop_arg)
3987 curr = op_iter_next_use (&curri);
3988 curri.i = curri.numops;
3991 path.safe_push (std::make_pair (curri, curr));
3992 tree use = USE_FROM_PTR (curr);
3993 if (use == lookfor)
3994 break;
3995 gimple *def = SSA_NAME_DEF_STMT (use);
3996 if (gimple_nop_p (def)
3997 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3999 pop:
4002 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4003 curri = x.first;
4004 curr = x.second;
4006 curr = op_iter_next_use (&curri);
4007 /* Skip already visited or non-SSA operands (from iterating
4008 over PHI args). */
4009 while (curr != NULL_USE_OPERAND_P
4010 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4011 || ! bitmap_set_bit (visited,
4012 SSA_NAME_VERSION
4013 (USE_FROM_PTR (curr)))));
4015 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4016 if (curr == NULL_USE_OPERAND_P)
4017 break;
4019 else
4021 if (gimple_code (def) == GIMPLE_PHI)
4022 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4023 else
4024 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4025 while (curr != NULL_USE_OPERAND_P
4026 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4027 || ! bitmap_set_bit (visited,
4028 SSA_NAME_VERSION
4029 (USE_FROM_PTR (curr)))))
4030 curr = op_iter_next_use (&curri);
4031 if (curr == NULL_USE_OPERAND_P)
4032 goto pop;
4035 while (1);
4036 if (dump_file && (dump_flags & TDF_DETAILS))
4038 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4039 unsigned i;
4040 std::pair<ssa_op_iter, use_operand_p> *x;
4041 FOR_EACH_VEC_ELT (path, i, x)
4042 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4043 dump_printf (MSG_NOTE, "\n");
4046 /* Check whether the reduction path detected is valid. */
4047 bool fail = path.length () == 0;
4048 bool neg = false;
4049 int sign = -1;
4050 *code = ERROR_MARK;
4051 for (unsigned i = 1; i < path.length (); ++i)
4053 gimple *use_stmt = USE_STMT (path[i].second);
4054 gimple_match_op op;
4055 if (!gimple_extract_op (use_stmt, &op))
4057 fail = true;
4058 break;
4060 unsigned int opi = op.num_ops;
4061 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4063 /* The following makes sure we can compute the operand index
4064 easily, plus it mostly disallows chaining via COND_EXPR condition
4065 operands. */
4066 for (opi = 0; opi < op.num_ops; ++opi)
4067 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4068 break;
4070 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4072 for (opi = 0; opi < op.num_ops; ++opi)
4073 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4074 break;
4076 if (opi == op.num_ops)
4078 fail = true;
4079 break;
4081 op.code = canonicalize_code (op.code, op.type);
4082 if (op.code == MINUS_EXPR)
4084 op.code = PLUS_EXPR;
4085 /* Track whether we negate the reduction value each iteration. */
4086 if (op.ops[1] == op.ops[opi])
4087 neg = ! neg;
4089 if (CONVERT_EXPR_CODE_P (op.code)
4090 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4092 else if (*code == ERROR_MARK)
4094 *code = op.code;
4095 sign = TYPE_SIGN (op.type);
4097 else if (op.code != *code)
4099 fail = true;
4100 break;
4102 else if ((op.code == MIN_EXPR
4103 || op.code == MAX_EXPR)
4104 && sign != TYPE_SIGN (op.type))
4106 fail = true;
4107 break;
4109 /* Check there's only a single stmt the op is used on. For the
4110 non-value-changing tail and the last stmt, allow out-of-loop uses.
4111 ??? We could relax this and handle arbitrary live stmts by
4112 forcing a scalar epilogue for example. */
4113 imm_use_iterator imm_iter;
4114 use_operand_p use_p;
4115 gimple *op_use_stmt;
4116 unsigned cnt = 0;
4117 bool cond_fn_p = op.code.is_internal_fn ()
4118 && (conditional_internal_fn_code (internal_fn (op.code))
4119 != ERROR_MARK);
4121 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4123 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4124 op1 twice (once as definition, once as else) in the same operation.
4125 Allow this. */
4126 if (cond_fn_p && op_use_stmt == use_stmt)
4128 gcall *call = as_a<gcall *> (use_stmt);
4129 unsigned else_pos
4130 = internal_fn_else_index (internal_fn (op.code));
4132 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4134 if (j == else_pos)
4135 continue;
4136 if (gimple_call_arg (call, j) == op.ops[opi])
4137 cnt++;
4140 else if (!is_gimple_debug (op_use_stmt)
4141 && (*code != ERROR_MARK
4142 || flow_bb_inside_loop_p (loop,
4143 gimple_bb (op_use_stmt))))
4144 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4145 cnt++;
4148 if (cnt != 1)
4150 fail = true;
4151 break;
4154 return ! fail && ! neg && *code != ERROR_MARK;
4157 bool
4158 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4159 tree loop_arg, enum tree_code code)
4161 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4162 code_helper code_;
4163 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4164 && code_ == code);
4169 /* Function vect_is_simple_reduction
4171 (1) Detect a cross-iteration def-use cycle that represents a simple
4172 reduction computation. We look for the following pattern:
4174 loop_header:
4175 a1 = phi < a0, a2 >
4176 a3 = ...
4177 a2 = operation (a3, a1)
4181 a3 = ...
4182 loop_header:
4183 a1 = phi < a0, a2 >
4184 a2 = operation (a3, a1)
4186 such that:
4187 1. operation is commutative and associative and it is safe to
4188 change the order of the computation
4189 2. no uses for a2 in the loop (a2 is used out of the loop)
4190 3. no uses of a1 in the loop besides the reduction operation
4191 4. no uses of a1 outside the loop.
4193 Conditions 1,4 are tested here.
4194 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4196 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4197 nested cycles.
4199 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4200 reductions:
4202 a1 = phi < a0, a2 >
4203 inner loop (def of a3)
4204 a2 = phi < a3 >
4206 (4) Detect condition expressions, i.e.:
4207 for (int i = 0; i < N; i++)
4208 if (a[i] < val)
4209 ret_val = a[i];
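     As a minimal concrete instance of pattern (1):

       for (i = 0; i < N; i++)
         sum += a[i];

     here a1 is the loop PHI of sum, the operation is PLUS_EXPR, a3 is
     the load of a[i] and a2 = a1 + a3 feeds back into the PHI.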
4213 static stmt_vec_info
4214 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4215 bool *double_reduc, bool *reduc_chain_p, bool slp)
4217 gphi *phi = as_a <gphi *> (phi_info->stmt);
4218 gimple *phi_use_stmt = NULL;
4219 imm_use_iterator imm_iter;
4220 use_operand_p use_p;
4222 *double_reduc = false;
4223 *reduc_chain_p = false;
4224 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4226 tree phi_name = PHI_RESULT (phi);
4227 /* ??? If there are no uses of the PHI result the inner loop reduction
4228 won't be detected as possibly double-reduction by vectorizable_reduction
4229 because that tries to walk the PHI arg from the preheader edge which
4230 can be constant. See PR60382. */
4231 if (has_zero_uses (phi_name))
4232 return NULL;
4233 class loop *loop = (gimple_bb (phi))->loop_father;
4234 unsigned nphi_def_loop_uses = 0;
4235 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4237 gimple *use_stmt = USE_STMT (use_p);
4238 if (is_gimple_debug (use_stmt))
4239 continue;
4241 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4243 if (dump_enabled_p ())
4244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4245 "intermediate value used outside loop.\n");
4247 return NULL;
4250 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4251 op1 twice (once as definition, once as else) in the same operation.
4252 Only count it as one. */
4253 if (use_stmt != phi_use_stmt)
4255 nphi_def_loop_uses++;
4256 phi_use_stmt = use_stmt;
4260 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4261 if (TREE_CODE (latch_def) != SSA_NAME)
4263 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265 "reduction: not ssa_name: %T\n", latch_def);
4266 return NULL;
4269 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4270 if (!def_stmt_info
4271 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4272 return NULL;
4274 bool nested_in_vect_loop
4275 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4276 unsigned nlatch_def_loop_uses = 0;
4277 auto_vec<gphi *, 3> lcphis;
4278 bool inner_loop_of_double_reduc = false;
4279 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4281 gimple *use_stmt = USE_STMT (use_p);
4282 if (is_gimple_debug (use_stmt))
4283 continue;
4284 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4285 nlatch_def_loop_uses++;
4286 else
4288 /* We can have more than one loop-closed PHI. */
4289 lcphis.safe_push (as_a <gphi *> (use_stmt));
4290 if (nested_in_vect_loop
4291 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4292 == vect_double_reduction_def))
4293 inner_loop_of_double_reduc = true;
4297 /* If we are vectorizing an inner reduction, we execute it in the
4298 original order only when we are not dealing with a double
4299 reduction. */
4300 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4302 if (dump_enabled_p ())
4303 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4304 "detected nested cycle: ");
4305 return def_stmt_info;
4308 /* When the inner loop of a double reduction ends up with more than
4309 one loop-closed PHI we have failed to classify alternate such
4310 PHIs as double reduction, leading to wrong code. See PR103237. */
4311 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4313 if (dump_enabled_p ())
4314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4315 "unhandle double reduction\n");
4316 return NULL;
4319 /* If this isn't a nested cycle or if the nested cycle reduction value
4320 is used outside of the inner loop we cannot handle uses of the reduction
4321 value. */
4322 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4324 if (dump_enabled_p ())
4325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4326 "reduction used in loop.\n");
4327 return NULL;
4330 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4331 defined in the inner loop. */
4332 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4334 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4335 if (gimple_phi_num_args (def_stmt) != 1
4336 || TREE_CODE (op1) != SSA_NAME)
4338 if (dump_enabled_p ())
4339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4340 "unsupported phi node definition.\n");
4342 return NULL;
4345 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4346 and the latch definition op1. */
4347 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4348 if (gimple_bb (def1)
4349 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4350 && loop->inner
4351 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4352 && (is_gimple_assign (def1) || is_gimple_call (def1))
4353 && is_a <gphi *> (phi_use_stmt)
4354 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4355 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4356 loop_latch_edge (loop->inner))))
4358 if (dump_enabled_p ())
4359 report_vect_op (MSG_NOTE, def_stmt,
4360 "detected double reduction: ");
4362 *double_reduc = true;
4363 return def_stmt_info;
4366 return NULL;
4369 /* Look for the expression computing latch_def from the loop PHI result. */
4370 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4371 code_helper code;
4372 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4373 path))
4375 STMT_VINFO_REDUC_CODE (phi_info) = code;
4376 if (code == COND_EXPR && !nested_in_vect_loop)
4377 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4379 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4380 reduction chain for which the additional restriction is that
4381 all operations in the chain are the same. */
4382 auto_vec<stmt_vec_info, 8> reduc_chain;
4383 unsigned i;
4384 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4385 for (i = path.length () - 1; i >= 1; --i)
4387 gimple *stmt = USE_STMT (path[i].second);
4388 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4389 gimple_match_op op;
4390 if (!gimple_extract_op (stmt, &op))
4391 gcc_unreachable ();
4392 if (gassign *assign = dyn_cast<gassign *> (stmt))
4393 STMT_VINFO_REDUC_IDX (stmt_info)
4394 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4395 else
4397 gcall *call = as_a<gcall *> (stmt);
4398 STMT_VINFO_REDUC_IDX (stmt_info)
4399 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4401 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4402 && (i == 1 || i == path.length () - 1));
4403 if ((op.code != code && !leading_conversion)
4404 /* We can only handle the final value in epilogue
4405 generation for reduction chains. */
4406 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4407 is_slp_reduc = false;
4408 /* For reduction chains we support trailing/leading
4409 conversions. We do not store those in the actual chain. */
4410 if (leading_conversion)
4411 continue;
4412 reduc_chain.safe_push (stmt_info);
4414 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4416 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4418 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4419 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4421 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4422 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4424 /* Save the chain for further analysis in SLP detection. */
4425 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4426 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4428 *reduc_chain_p = true;
4429 if (dump_enabled_p ())
4430 dump_printf_loc (MSG_NOTE, vect_location,
4431 "reduction: detected reduction chain\n");
4433 else if (dump_enabled_p ())
4434 dump_printf_loc (MSG_NOTE, vect_location,
4435 "reduction: detected reduction\n");
4437 return def_stmt_info;
4440 if (dump_enabled_p ())
4441 dump_printf_loc (MSG_NOTE, vect_location,
4442 "reduction: unknown pattern\n");
4444 return NULL;
4447 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4448 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4449 or -1 if not known. */
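/* Rough illustration of the computation below: with NITERS == 23,
   an assumed VF of 8 and PEEL_ITERS_PROLOGUE == 3 the epilogue gets
   (23 - 3) % 8 == 4 iterations; with an unknown iteration count we
   conservatively assume VF/2 == 4 epilogue iterations instead.  */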
4451 static int
4452 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4454 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4455 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4457 if (dump_enabled_p ())
4458 dump_printf_loc (MSG_NOTE, vect_location,
4459 "cost model: epilogue peel iters set to vf/2 "
4460 "because loop iterations are unknown .\n");
4461 return assumed_vf / 2;
4463 else
4465 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4466 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4467 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4468 /* If we need to peel for gaps but the epilogue peel count computed above
4469 is zero, we have to peel VF iterations. */
4470 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4471 peel_iters_epilogue = assumed_vf;
4472 return peel_iters_epilogue;
4476 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4478 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4479 int *peel_iters_epilogue,
4480 stmt_vector_for_cost *scalar_cost_vec,
4481 stmt_vector_for_cost *prologue_cost_vec,
4482 stmt_vector_for_cost *epilogue_cost_vec)
4484 int retval = 0;
4486 *peel_iters_epilogue
4487 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4489 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4491 /* If peeled iterations are known but the number of scalar loop
4492 iterations is unknown, count a taken branch per peeled loop. */
4493 if (peel_iters_prologue > 0)
4494 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4495 vect_prologue);
4496 if (*peel_iters_epilogue > 0)
4497 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4498 vect_epilogue);
4501 stmt_info_for_cost *si;
4502 int j;
4503 if (peel_iters_prologue)
4504 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4505 retval += record_stmt_cost (prologue_cost_vec,
4506 si->count * peel_iters_prologue,
4507 si->kind, si->stmt_info, si->misalign,
4508 vect_prologue);
4509 if (*peel_iters_epilogue)
4510 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4511 retval += record_stmt_cost (epilogue_cost_vec,
4512 si->count * *peel_iters_epilogue,
4513 si->kind, si->stmt_info, si->misalign,
4514 vect_epilogue);
4516 return retval;
4519 /* Function vect_estimate_min_profitable_iters
4521 Return the number of iterations required for the vector version of the
4522 loop to be profitable relative to the cost of the scalar version of the
4523 loop.
4525 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4526 of iterations for vectorization. A value of -1 means loop vectorization
4527 is not profitable. This returned value may be used for a dynamic
4528 profitability check.
4530 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4531 for static check against estimated number of iterations. */
4533 static void
4534 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4535 int *ret_min_profitable_niters,
4536 int *ret_min_profitable_estimate,
4537 unsigned *suggested_unroll_factor)
4539 int min_profitable_iters;
4540 int min_profitable_estimate;
4541 int peel_iters_prologue;
4542 int peel_iters_epilogue;
4543 unsigned vec_inside_cost = 0;
4544 int vec_outside_cost = 0;
4545 unsigned vec_prologue_cost = 0;
4546 unsigned vec_epilogue_cost = 0;
4547 int scalar_single_iter_cost = 0;
4548 int scalar_outside_cost = 0;
4549 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4550 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4551 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4553 /* Cost model disabled. */
4554 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4556 if (dump_enabled_p ())
4557 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4558 *ret_min_profitable_niters = 0;
4559 *ret_min_profitable_estimate = 0;
4560 return;
4563 /* Requires loop versioning tests to handle misalignment. */
4564 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4566 /* FIXME: Make cost depend on complexity of individual check. */
4567 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4568 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4569 if (dump_enabled_p ())
4570 dump_printf (MSG_NOTE,
4571 "cost model: Adding cost of checks for loop "
4572 "versioning to treat misalignment.\n");
4575 /* Requires loop versioning with alias checks. */
4576 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4578 /* FIXME: Make cost depend on complexity of individual check. */
4579 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4580 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4581 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4582 if (len)
4583 /* Count LEN - 1 ANDs and LEN comparisons. */
4584 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4585 scalar_stmt, vect_prologue);
4586 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4587 if (len)
4589 /* Count LEN - 1 ANDs and LEN comparisons. */
4590 unsigned int nstmts = len * 2 - 1;
4591 /* +1 for each bias that needs adding. */
4592 for (unsigned int i = 0; i < len; ++i)
4593 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4594 nstmts += 1;
4595 (void) add_stmt_cost (target_cost_data, nstmts,
4596 scalar_stmt, vect_prologue);
4598 if (dump_enabled_p ())
4599 dump_printf (MSG_NOTE,
4600 "cost model: Adding cost of checks for loop "
4601 "versioning aliasing.\n");
4604 /* Requires loop versioning with niter checks. */
4605 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4607 /* FIXME: Make cost depend on complexity of individual check. */
4608 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4609 NULL, NULL, NULL_TREE, 0, vect_prologue);
4610 if (dump_enabled_p ())
4611 dump_printf (MSG_NOTE,
4612 "cost model: Adding cost of checks for loop "
4613 "versioning niters.\n");
4616 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4617 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4618 vect_prologue);
4620 /* Count statements in scalar loop. Using this as scalar cost for a single
4621 iteration for now.
4623 TODO: Add outer loop support.
4625 TODO: Consider assigning different costs to different scalar
4626 statements. */
4628 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4630 /* Add additional cost for the peeled instructions in prologue and epilogue
4631 loop. (For fully-masked loops there will be no peeling.)
4633 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4634 at compile time, we assume it's vf/2 (the worst case would be vf-1).
4636 TODO: Build an expression that represents peel_iters for prologue and
4637 epilogue to be used in a run-time test. */
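/* Rough illustration of the assumption above: with an assumed VF of 8
   and unknown prologue/epilogue peel counts, the cost model charges 4
   peeled scalar iterations to the prologue and 4 to the epilogue.  */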
4639 bool prologue_need_br_taken_cost = false;
4640 bool prologue_need_br_not_taken_cost = false;
4642 /* Calculate peel_iters_prologue. */
4643 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4644 peel_iters_prologue = 0;
4645 else if (npeel < 0)
4647 peel_iters_prologue = assumed_vf / 2;
4648 if (dump_enabled_p ())
4649 dump_printf (MSG_NOTE, "cost model: "
4650 "prologue peel iters set to vf/2.\n");
4652 /* If peeled iterations are unknown, count a taken branch and a not taken
4653 branch per peeled loop. Even if scalar loop iterations are known,
4654 vector iterations are not known since peeled prologue iterations are
4655 not known. Hence guards remain the same. */
4656 prologue_need_br_taken_cost = true;
4657 prologue_need_br_not_taken_cost = true;
4659 else
4661 peel_iters_prologue = npeel;
4662 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4663 /* If peeled iterations are known but the number of scalar loop
4664 iterations is unknown, count a taken branch per peeled loop. */
4665 prologue_need_br_taken_cost = true;
4668 bool epilogue_need_br_taken_cost = false;
4669 bool epilogue_need_br_not_taken_cost = false;
4671 /* Calculate peel_iters_epilogue. */
4672 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4673 /* We need to peel exactly one iteration for gaps. */
4674 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4675 else if (npeel < 0)
4677 /* If peeling for alignment is unknown, the loop bound of the main loop
4678 becomes unknown. */
4679 peel_iters_epilogue = assumed_vf / 2;
4680 if (dump_enabled_p ())
4681 dump_printf (MSG_NOTE, "cost model: "
4682 "epilogue peel iters set to vf/2 because "
4683 "peeling for alignment is unknown.\n");
4685 /* See the same reason above in peel_iters_prologue calculation. */
4686 epilogue_need_br_taken_cost = true;
4687 epilogue_need_br_not_taken_cost = true;
4689 else
4691 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4692 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4693 /* If peeled iterations are known but the number of scalar loop
4694 iterations is unknown, count a taken branch per peeled loop. */
4695 epilogue_need_br_taken_cost = true;
4698 stmt_info_for_cost *si;
4699 int j;
4700 /* Add costs associated with peel_iters_prologue. */
4701 if (peel_iters_prologue)
4702 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4704 (void) add_stmt_cost (target_cost_data,
4705 si->count * peel_iters_prologue, si->kind,
4706 si->stmt_info, si->node, si->vectype,
4707 si->misalign, vect_prologue);
4710 /* Add costs associated with peel_iters_epilogue. */
4711 if (peel_iters_epilogue)
4712 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4714 (void) add_stmt_cost (target_cost_data,
4715 si->count * peel_iters_epilogue, si->kind,
4716 si->stmt_info, si->node, si->vectype,
4717 si->misalign, vect_epilogue);
4720 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4722 if (prologue_need_br_taken_cost)
4723 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4724 vect_prologue);
4726 if (prologue_need_br_not_taken_cost)
4727 (void) add_stmt_cost (target_cost_data, 1,
4728 cond_branch_not_taken, vect_prologue);
4730 if (epilogue_need_br_taken_cost)
4731 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4732 vect_epilogue);
4734 if (epilogue_need_br_not_taken_cost)
4735 (void) add_stmt_cost (target_cost_data, 1,
4736 cond_branch_not_taken, vect_epilogue);
4738 /* Take care of special costs for rgroup controls of partial vectors. */
4739 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4740 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4741 == vect_partial_vectors_avx512))
4743 /* Calculate how many masks we need to generate. */
4744 unsigned int num_masks = 0;
4745 bool need_saturation = false;
4746 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4747 if (rgm.type)
4749 unsigned nvectors = rgm.factor;
4750 num_masks += nvectors;
4751 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4752 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4753 need_saturation = true;
4756 /* ??? The target isn't able to identify the costs below as
4757 producing masks so it cannot penalize cases where we'd run
4758 out of mask registers for example. */
4760 /* ??? We are also failing to account for smaller vector masks
4761 we generate by splitting larger masks in vect_get_loop_mask. */
4763 /* In the worst case, we need to generate each mask in the prologue
4764 and in the loop body. We need one splat per group and one
4765 compare per mask.
4767 Sometimes the prologue mask will fold to a constant,
4768 so the actual prologue cost might be smaller. However, it's
4769 simpler and safer to use the worst-case cost; if this ends up
4770 being the tie-breaker between vectorizing or not, then it's
4771 probably better not to vectorize. */
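/* Illustrative count, assuming two mask rgroups with a set type and
   nvectors of 2 and 1: num_masks == 3, so 3 compares plus 2 splats,
   i.e. 5 vector statements are charged to the prologue and another
   5 to the loop body below.  */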
4772 (void) add_stmt_cost (target_cost_data,
4773 num_masks
4774 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4775 vector_stmt, NULL, NULL, NULL_TREE, 0,
4776 vect_prologue);
4777 (void) add_stmt_cost (target_cost_data,
4778 num_masks
4779 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4780 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4782 /* When we need saturation we need it both in the prologue and
4783 the epilogue. */
4784 if (need_saturation)
4786 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4787 NULL, NULL, NULL_TREE, 0, vect_prologue);
4788 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4789 NULL, NULL, NULL_TREE, 0, vect_body);
4792 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4793 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4794 == vect_partial_vectors_while_ult))
4796 /* Calculate how many masks we need to generate. */
4797 unsigned int num_masks = 0;
4798 rgroup_controls *rgm;
4799 unsigned int num_vectors_m1;
4800 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4801 num_vectors_m1, rgm)
4802 if (rgm->type)
4803 num_masks += num_vectors_m1 + 1;
4804 gcc_assert (num_masks > 0);
4806 /* In the worst case, we need to generate each mask in the prologue
4807 and in the loop body. One of the loop body mask instructions
4808 replaces the comparison in the scalar loop, and since we don't
4809 count the scalar comparison against the scalar body, we shouldn't
4810 count that vector instruction against the vector body either.
4812 Sometimes we can use unpacks instead of generating prologue
4813 masks and sometimes the prologue mask will fold to a constant,
4814 so the actual prologue cost might be smaller. However, it's
4815 simpler and safer to use the worst-case cost; if this ends up
4816 being the tie-breaker between vectorizing or not, then it's
4817 probably better not to vectorize. */
4818 (void) add_stmt_cost (target_cost_data, num_masks,
4819 vector_stmt, NULL, NULL, NULL_TREE, 0,
4820 vect_prologue);
4821 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4822 vector_stmt, NULL, NULL, NULL_TREE, 0,
4823 vect_body);
4825 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4827 /* Referring to the functions vect_set_loop_condition_partial_vectors
4828 and vect_set_loop_controls_directly, we need to generate each
4829 length in the prologue and in the loop body if required. Although
4830 there are some possible optimizations, we consider the worst case
4831 here. */
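/* A small worked case for the counting below, assuming a single length
   rgroup with a set type, nitems == 4, an unknown iteration count, no
   wrap-around risk, a zero load/store bias and need_iterate_p: the
   prologue gets 1 SHIFT + 1 MIN == 2 statements and the body gets
   3 statements to update the length for the next iteration.  */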
4833 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4834 signed char partial_load_store_bias
4835 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4836 bool need_iterate_p
4837 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4838 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4840 /* Calculate how many statements to be added. */
4841 unsigned int prologue_stmts = 0;
4842 unsigned int body_stmts = 0;
4844 rgroup_controls *rgc;
4845 unsigned int num_vectors_m1;
4846 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4847 if (rgc->type)
4849 /* May need one SHIFT for nitems_total computation. */
4850 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4851 if (nitems != 1 && !niters_known_p)
4852 prologue_stmts += 1;
4854 /* May need one MAX and one MINUS for wrap around. */
4855 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4856 prologue_stmts += 2;
4858 /* Need one MAX and one MINUS for each batch limit except for
4859 the first one. */
4860 prologue_stmts += num_vectors_m1 * 2;
4862 unsigned int num_vectors = num_vectors_m1 + 1;
4864 /* Need to set up lengths in prologue, only one MIN required
4865 for each since start index is zero. */
4866 prologue_stmts += num_vectors;
4868 /* If we have a non-zero partial load bias, we need one PLUS
4869 to adjust the load length. */
4870 if (partial_load_store_bias != 0)
4871 body_stmts += 1;
4873 /* Each may need two MINs and one MINUS to update lengths in body
4874 for next iteration. */
4875 if (need_iterate_p)
4876 body_stmts += 3 * num_vectors;
4879 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4880 scalar_stmt, vect_prologue);
4881 (void) add_stmt_cost (target_cost_data, body_stmts,
4882 scalar_stmt, vect_body);
4885 /* FORNOW: The scalar outside cost is incremented in one of the
4886 following ways:
4888 1. The vectorizer checks for alignment and aliasing and generates
4889 a condition that allows dynamic vectorization. A cost model
4890 check is ANDED with the versioning condition. Hence scalar code
4891 path now has the added cost of the versioning check.
4893 if (cost > th & versioning_check)
4894 jmp to vector code
4896 Hence run-time scalar is incremented by not-taken branch cost.
4898 2. The vectorizer then checks if a prologue is required. If the
4899 cost model check was not done before during versioning, it has to
4900 be done before the prologue check.
4902 if (cost <= th)
4903 prologue = scalar_iters
4904 if (prologue == 0)
4905 jmp to vector code
4906 else
4907 execute prologue
4908 if (prologue == num_iters)
4909 go to exit
4911 Hence the run-time scalar cost is incremented by a taken branch,
4912 plus a not-taken branch, plus a taken branch cost.
4914 3. The vectorizer then checks if an epilogue is required. If the
4915 cost model check was not done before during prologue check, it
4916 has to be done with the epilogue check.
4918 if (prologue == 0)
4919 jmp to vector code
4920 else
4921 execute prologue
4922 if (prologue == num_iters)
4923 go to exit
4924 vector code:
4925 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4926 jmp to epilogue
4928 Hence the run-time scalar cost should be incremented by 2 taken
4929 branches.
4931 TODO: The back end may reorder the BBs differently and reverse
4932 conditions/branch directions. Change the estimates below to
4933 something more reasonable. */
4935 /* If the number of iterations is known and we do not do versioning, we can
4936 decide whether to vectorize at compile time. Hence the scalar version
4937 does not carry cost model guard costs. */
4938 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4939 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4941 /* Cost model check occurs at versioning. */
4942 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4943 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4944 else
4946 /* Cost model check occurs at prologue generation. */
4947 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4948 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4949 + vect_get_stmt_cost (cond_branch_not_taken);
4950 /* Cost model check occurs at epilogue generation. */
4951 else
4952 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4956 /* Complete the target-specific cost calculations. */
4957 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4958 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4959 suggested_unroll_factor);
4961 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4962 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4963 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4964 *suggested_unroll_factor,
4965 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4967 if (dump_enabled_p ())
4968 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4969 "can't unroll as unrolled vectorization factor larger"
4970 " than maximum vectorization factor: "
4971 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4972 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4973 *suggested_unroll_factor = 1;
4976 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4978 if (dump_enabled_p ())
4980 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4981 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4982 vec_inside_cost);
4983 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4984 vec_prologue_cost);
4985 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4986 vec_epilogue_cost);
4987 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4988 scalar_single_iter_cost);
4989 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4990 scalar_outside_cost);
4991 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4992 vec_outside_cost);
4993 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4994 peel_iters_prologue);
4995 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4996 peel_iters_epilogue);
4999 /* Calculate number of iterations required to make the vector version
5000 profitable, relative to the loop bodies only. The following condition
5001 must hold true:
5002 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5003 where
5004 SIC = scalar iteration cost, VIC = vector iteration cost,
5005 VOC = vector outside cost, VF = vectorization factor,
5006 NPEEL = prologue iterations + epilogue iterations,
5007 SOC = scalar outside cost for run time cost model check. */
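/* A rough numerical illustration of this condition: with SIC = 4,
   VIC = 6, VF = 4, NPEEL = 0, VOC = 10 and SOC = 0 it reads
   4 * niters > 1.5 * niters + 10, i.e. 2.5 * niters > 10, which first
   holds for niters >= 5.  */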
5009 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5010 - vec_inside_cost);
5011 if (saving_per_viter <= 0)
5013 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5014 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5015 "vectorization did not happen for a simd loop");
5017 if (dump_enabled_p ())
5018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5019 "cost model: the vector iteration cost = %d "
5020 "divided by the scalar iteration cost = %d "
5021 "is greater or equal to the vectorization factor = %d"
5022 ".\n",
5023 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5024 *ret_min_profitable_niters = -1;
5025 *ret_min_profitable_estimate = -1;
5026 return;
5029 /* ??? The "if" arm is written to handle all cases; see below for what
5030 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5031 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5033 /* Rewriting the condition above in terms of the number of
5034 vector iterations (vniters) rather than the number of
5035 scalar iterations (niters) gives:
5037 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5039 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5041 For integer N, X and Y when X > 0:
5043 N * X > Y <==> N >= (Y /[floor] X) + 1. */
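/* Quick sanity check of the identity above: with X = 3 and Y = 7,
   floor (7 / 3) + 1 == 3, and indeed N * 3 > 7 first holds at N = 3.  */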
5044 int outside_overhead = (vec_outside_cost
5045 - scalar_single_iter_cost * peel_iters_prologue
5046 - scalar_single_iter_cost * peel_iters_epilogue
5047 - scalar_outside_cost);
5048 /* We're only interested in cases that require at least one
5049 vector iteration. */
5050 int min_vec_niters = 1;
5051 if (outside_overhead > 0)
5052 min_vec_niters = outside_overhead / saving_per_viter + 1;
5054 if (dump_enabled_p ())
5055 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5056 min_vec_niters);
5058 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5060 /* Now that we know the minimum number of vector iterations,
5061 find the minimum niters for which the scalar cost is larger:
5063 SIC * niters > VIC * vniters + VOC - SOC
5065 We know that the minimum niters is no more than
5066 vniters * VF + NPEEL, but it might be (and often is) less
5067 than that if a partial vector iteration is cheaper than the
5068 equivalent scalar code. */
5069 int threshold = (vec_inside_cost * min_vec_niters
5070 + vec_outside_cost
5071 - scalar_outside_cost);
5072 if (threshold <= 0)
5073 min_profitable_iters = 1;
5074 else
5075 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5077 else
5078 /* Convert the number of vector iterations into a number of
5079 scalar iterations. */
5080 min_profitable_iters = (min_vec_niters * assumed_vf
5081 + peel_iters_prologue
5082 + peel_iters_epilogue);
5084 else
5086 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5087 * assumed_vf
5088 - vec_inside_cost * peel_iters_prologue
5089 - vec_inside_cost * peel_iters_epilogue);
5090 if (min_profitable_iters <= 0)
5091 min_profitable_iters = 0;
5092 else
5094 min_profitable_iters /= saving_per_viter;
5096 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5097 <= (((int) vec_inside_cost * min_profitable_iters)
5098 + (((int) vec_outside_cost - scalar_outside_cost)
5099 * assumed_vf)))
5100 min_profitable_iters++;
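/* With the same hypothetical numbers, this branch computes
   ((20 - 2) * 4 - 10 - 10) / 6 = 52 / 6 = 8, and the rounding check
   above then bumps the result to 9 scalar iterations. */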
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_NOTE,
5106 " Calculated minimum iters for profitability: %d\n",
5107 min_profitable_iters);
5109 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5110 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5111 /* We want the vectorized loop to execute at least once. */
5112 min_profitable_iters = assumed_vf + peel_iters_prologue;
5113 else if (min_profitable_iters < peel_iters_prologue)
5114 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5115 vectorized loop executes at least once. */
5116 min_profitable_iters = peel_iters_prologue;
5118 if (dump_enabled_p ())
5119 dump_printf_loc (MSG_NOTE, vect_location,
5120 " Runtime profitability threshold = %d\n",
5121 min_profitable_iters);
5123 *ret_min_profitable_niters = min_profitable_iters;
5125 /* Calculate number of iterations required to make the vector version
5126 profitable, relative to the loop bodies only.
5128 The non-vectorized variant costs SIC * niters; for the vector variant
5129 to win at the expected loop trip count, the following condition must hold true:
5130 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5132 if (vec_outside_cost <= 0)
5133 min_profitable_estimate = 0;
5134 /* ??? This "else if" arm is written to handle all cases; see below for
5135 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5136 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5138 /* This is a repeat of the code above, but with + SOC rather
5139 than - SOC. */
5140 int outside_overhead = (vec_outside_cost
5141 - scalar_single_iter_cost * peel_iters_prologue
5142 - scalar_single_iter_cost * peel_iters_epilogue
5143 + scalar_outside_cost);
5144 int min_vec_niters = 1;
5145 if (outside_overhead > 0)
5146 min_vec_niters = outside_overhead / saving_per_viter + 1;
5148 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5150 int threshold = (vec_inside_cost * min_vec_niters
5151 + vec_outside_cost
5152 + scalar_outside_cost);
5153 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5155 else
5156 min_profitable_estimate = (min_vec_niters * assumed_vf
5157 + peel_iters_prologue
5158 + peel_iters_epilogue);
5160 else
5162 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5163 * assumed_vf
5164 - vec_inside_cost * peel_iters_prologue
5165 - vec_inside_cost * peel_iters_epilogue)
5166 / ((scalar_single_iter_cost * assumed_vf)
5167 - vec_inside_cost);
5169 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5170 if (dump_enabled_p ())
5171 dump_printf_loc (MSG_NOTE, vect_location,
5172 " Static estimate profitability threshold = %d\n",
5173 min_profitable_estimate);
5175 *ret_min_profitable_estimate = min_profitable_estimate;
5178 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5179 vector elements (not bits) for a vector with NELT elements. */
5180 static void
5181 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5182 vec_perm_builder *sel)
5184 /* The encoding is a single stepped pattern. Any wrap-around is handled
5185 by vec_perm_indices. */
5186 sel->new_vector (nelt, 1, 3);
5187 for (unsigned int i = 0; i < 3; i++)
5188 sel->quick_push (i + offset);
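/* For instance, a shift by OFFSET == 2 of an 8-element vector is encoded
   as the stepped series { 2, 3, 4, ... }, which vec_perm_indices expands
   to the selector { 2, 3, 4, 5, 6, 7, 8, 9 }; indices of NELT and above
   select from the second permute operand (a zero vector in the shift
   uses further below). */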
5191 /* Checks whether the target supports whole-vector shifts for vectors of mode
5192 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5193 it supports vec_perm_const with masks for all necessary shift amounts. */
5194 static bool
5195 have_whole_vector_shift (machine_mode mode)
5197 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5198 return true;
5200 /* Variable-length vectors should be handled via the optab. */
5201 unsigned int nelt;
5202 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5203 return false;
5205 vec_perm_builder sel;
5206 vec_perm_indices indices;
5207 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5209 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5210 indices.new_vector (sel, 2, nelt);
5211 if (!can_vec_perm_const_p (mode, mode, indices, false))
5212 return false;
5214 return true;
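/* For example, for an 8-element vector the loop above checks shifts by
   4, 2 and 1 elements, exactly the offsets used by the shift-based
   reduction epilogue in vect_create_epilog_for_reduction. */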
5217 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5218 multiplication operands have differing signs and (b) we intend
5219 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5220 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5222 static bool
5223 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5224 stmt_vec_info stmt_info)
5226 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5227 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5228 return false;
5230 tree rhs1 = gimple_assign_rhs1 (assign);
5231 tree rhs2 = gimple_assign_rhs2 (assign);
5232 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5233 return false;
5235 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5236 gcc_assert (reduc_info->is_reduc_info);
5237 return !directly_supported_p (DOT_PROD_EXPR,
5238 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5239 optab_vector_mixed_sign);
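/* A typical example is a dot product whose multiplication mixes a signed
   and an unsigned narrow input (e.g. signed char by unsigned char,
   accumulated into int) on a target that lacks a mixed-sign dot-product
   instruction. */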
5242 /* TODO: There is a close dependency between the vect_model_*_cost and
5243 vectorizable_* functions. Rework the design to avoid maintenance issues. */
5245 /* Function vect_model_reduction_cost.
5247 Models cost for a reduction operation, including the vector ops
5248 generated within the strip-mine loop in some cases, the initial
5249 definition before the loop, and the epilogue code that must be generated. */
5251 static void
5252 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5253 stmt_vec_info stmt_info, internal_fn reduc_fn,
5254 vect_reduction_type reduction_type,
5255 int ncopies, stmt_vector_for_cost *cost_vec)
5257 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5258 tree vectype;
5259 machine_mode mode;
5260 class loop *loop = NULL;
5262 if (loop_vinfo)
5263 loop = LOOP_VINFO_LOOP (loop_vinfo);
5265 /* Condition reductions generate two reductions in the loop. */
5266 if (reduction_type == COND_REDUCTION)
5267 ncopies *= 2;
5269 vectype = STMT_VINFO_VECTYPE (stmt_info);
5270 mode = TYPE_MODE (vectype);
5271 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5273 gimple_match_op op;
5274 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5275 gcc_unreachable ();
5277 bool emulated_mixed_dot_prod
5278 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5279 if (reduction_type == EXTRACT_LAST_REDUCTION)
5280 /* No extra instructions are needed in the prologue. The loop body
5281 operations are costed in vectorizable_condition. */
5282 inside_cost = 0;
5283 else if (reduction_type == FOLD_LEFT_REDUCTION)
5285 /* No extra instructions needed in the prologue. */
5286 prologue_cost = 0;
5288 if (reduc_fn != IFN_LAST)
5289 /* Count one reduction-like operation per vector. */
5290 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5291 stmt_info, 0, vect_body);
5292 else
5294 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5295 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5296 inside_cost = record_stmt_cost (cost_vec, nelements,
5297 vec_to_scalar, stmt_info, 0,
5298 vect_body);
5299 inside_cost += record_stmt_cost (cost_vec, nelements,
5300 scalar_stmt, stmt_info, 0,
5301 vect_body);
5304 else
5306 /* Add in the cost of the initial definitions. */
5307 int prologue_stmts;
5308 if (reduction_type == COND_REDUCTION)
5309 /* For cond reductions we have four vectors: initial index, step,
5310 initial result of the data reduction, initial value of the index
5311 reduction. */
5312 prologue_stmts = 4;
5313 else if (emulated_mixed_dot_prod)
5314 /* We need the initial reduction value and two invariants:
5315 one that contains the minimum signed value and one that
5316 contains half of its negative. */
5317 prologue_stmts = 3;
5318 else
5319 prologue_stmts = 1;
5320 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5321 scalar_to_vec, stmt_info, 0,
5322 vect_prologue);
5325 /* Determine cost of epilogue code.
5327 We have a reduction operator that will reduce the vector in one statement.
5328 Also requires scalar extract. */
5330 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5332 if (reduc_fn != IFN_LAST)
5334 if (reduction_type == COND_REDUCTION)
5336 /* An EQ stmt and a COND_EXPR stmt. */
5337 epilogue_cost += record_stmt_cost (cost_vec, 2,
5338 vector_stmt, stmt_info, 0,
5339 vect_epilogue);
5340 /* Reduction of the max index and a reduction of the found
5341 values. */
5342 epilogue_cost += record_stmt_cost (cost_vec, 2,
5343 vec_to_scalar, stmt_info, 0,
5344 vect_epilogue);
5345 /* A broadcast of the max value. */
5346 epilogue_cost += record_stmt_cost (cost_vec, 1,
5347 scalar_to_vec, stmt_info, 0,
5348 vect_epilogue);
5350 else
5352 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5353 stmt_info, 0, vect_epilogue);
5354 epilogue_cost += record_stmt_cost (cost_vec, 1,
5355 vec_to_scalar, stmt_info, 0,
5356 vect_epilogue);
5359 else if (reduction_type == COND_REDUCTION)
5361 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5362 /* Extraction of scalar elements. */
5363 epilogue_cost += record_stmt_cost (cost_vec,
5364 2 * estimated_nunits,
5365 vec_to_scalar, stmt_info, 0,
5366 vect_epilogue);
5367 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5368 epilogue_cost += record_stmt_cost (cost_vec,
5369 2 * estimated_nunits - 3,
5370 scalar_stmt, stmt_info, 0,
5371 vect_epilogue);
5373 else if (reduction_type == EXTRACT_LAST_REDUCTION
5374 || reduction_type == FOLD_LEFT_REDUCTION)
5375 /* No extra instructions are needed in the epilogue. */
5377 else
5379 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5380 tree bitsize = TYPE_SIZE (op.type);
5381 int element_bitsize = tree_to_uhwi (bitsize);
5382 int nelements = vec_size_in_bits / element_bitsize;
5384 if (op.code == COND_EXPR)
5385 op.code = MAX_EXPR;
5387 /* We have a whole vector shift available. */
5388 if (VECTOR_MODE_P (mode)
5389 && directly_supported_p (op.code, vectype)
5390 && have_whole_vector_shift (mode))
5392 /* Final reduction via vector shifts and the reduction operator.
5393 Also requires scalar extract. */
5394 epilogue_cost += record_stmt_cost (cost_vec,
5395 exact_log2 (nelements) * 2,
5396 vector_stmt, stmt_info, 0,
5397 vect_epilogue);
5398 epilogue_cost += record_stmt_cost (cost_vec, 1,
5399 vec_to_scalar, stmt_info, 0,
5400 vect_epilogue);
5402 else
5403 /* Use extracts and reduction op for final reduction. For N
5404 elements, we have N extracts and N-1 reduction ops. */
5405 epilogue_cost += record_stmt_cost (cost_vec,
5406 nelements + nelements - 1,
5407 vector_stmt, stmt_info, 0,
5408 vect_epilogue);
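/* E.g. for a four-element vector this records 4 extracts plus 3
   reduction ops, i.e. 7 epilogue statements. */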
5412 if (dump_enabled_p ())
5413 dump_printf (MSG_NOTE,
5414 "vect_model_reduction_cost: inside_cost = %d, "
5415 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5416 prologue_cost, epilogue_cost);
5419 /* SEQ is a sequence of instructions that initialize the reduction
5420 described by REDUC_INFO. Emit them in the appropriate place. */
5422 static void
5423 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5424 stmt_vec_info reduc_info, gimple *seq)
5426 if (reduc_info->reused_accumulator)
5428 /* When reusing an accumulator from the main loop, we only need
5429 initialization instructions if the main loop can be skipped.
5430 In that case, emit the initialization instructions at the end
5431 of the guard block that does the skip. */
5432 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5433 gcc_assert (skip_edge);
5434 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5435 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5437 else
5439 /* The normal case: emit the initialization instructions on the
5440 preheader edge. */
5441 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5442 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5446 /* Function get_initial_def_for_reduction
5448 Input:
5449 REDUC_INFO - the info_for_reduction
5450 INIT_VAL - the initial value of the reduction variable
5451 NEUTRAL_OP - a value that has no effect on the reduction, as per
5452 neutral_op_for_reduction
5454 Output:
5455 Return a vector variable, initialized according to the reduction
5456 operation described by REDUC_INFO. This vector will be used as the initial value
5457 of the vector of partial results.
5459 The value we need is a vector in which element 0 has value INIT_VAL
5460 and every other element has value NEUTRAL_OP. */
5462 static tree
5463 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5464 stmt_vec_info reduc_info,
5465 tree init_val, tree neutral_op)
5467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5468 tree scalar_type = TREE_TYPE (init_val);
5469 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5470 tree init_def;
5471 gimple_seq stmts = NULL;
5473 gcc_assert (vectype);
5475 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5476 || SCALAR_FLOAT_TYPE_P (scalar_type));
5478 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5479 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5481 if (operand_equal_p (init_val, neutral_op))
5483 /* If both elements are equal then the vector described above is
5484 just a splat. */
5485 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5486 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5488 else
5490 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5491 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5492 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5494 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5495 element 0. */
5496 init_def = gimple_build_vector_from_val (&stmts, vectype,
5497 neutral_op);
5498 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5499 vectype, init_def, init_val);
5501 else
5503 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5504 tree_vector_builder elts (vectype, 1, 2);
5505 elts.quick_push (init_val);
5506 elts.quick_push (neutral_op);
5507 init_def = gimple_build_vector (&stmts, &elts);
5511 if (stmts)
5512 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5513 return init_def;
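/* For example, for a four-lane sum reduction with INIT_VAL 5 and
   NEUTRAL_OP 0 the function above builds { 5, 0, 0, 0 }, whereas for a
   MIN reduction the initial value is its own neutral value and the
   result is simply the splat { 5, 5, 5, 5 }. */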
5516 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5517 which performs a reduction involving GROUP_SIZE scalar statements.
5518 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5519 is nonnull, introducing extra elements of that value will not change the
5520 result. */
5522 static void
5523 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5524 stmt_vec_info reduc_info,
5525 vec<tree> *vec_oprnds,
5526 unsigned int number_of_vectors,
5527 unsigned int group_size, tree neutral_op)
5529 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5530 unsigned HOST_WIDE_INT nunits;
5531 unsigned j, number_of_places_left_in_vector;
5532 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5533 unsigned int i;
5535 gcc_assert (group_size == initial_values.length () || neutral_op);
5537 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5538 created vectors. It is greater than 1 if unrolling is performed.
5540 For example, we have two scalar operands, s1 and s2 (e.g., group of
5541 strided accesses of size two), while NUNITS is four (i.e., four scalars
5542 of this type can be packed in a vector). The output vector will contain
5543 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5544 will be 2).
5546 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5547 vectors containing the operands.
5549 For example, NUNITS is four as before, and the group size is 8
5550 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5551 {s5, s6, s7, s8}. */
5553 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5554 nunits = group_size;
5556 number_of_places_left_in_vector = nunits;
5557 bool constant_p = true;
5558 tree_vector_builder elts (vector_type, nunits, 1);
5559 elts.quick_grow (nunits);
5560 gimple_seq ctor_seq = NULL;
5561 for (j = 0; j < nunits * number_of_vectors; ++j)
5563 tree op;
5564 i = j % group_size;
5566 /* Get the def before the loop. In a reduction chain we have only
5567 one initial value; otherwise we have as many as there are PHIs in the group. */
5568 if (i >= initial_values.length () || (j > i && neutral_op))
5569 op = neutral_op;
5570 else
5571 op = initial_values[i];
5573 /* Create 'vect_ = {op0,op1,...,opn}'. */
5574 number_of_places_left_in_vector--;
5575 elts[nunits - number_of_places_left_in_vector - 1] = op;
5576 if (!CONSTANT_CLASS_P (op))
5577 constant_p = false;
5579 if (number_of_places_left_in_vector == 0)
5581 tree init;
5582 if (constant_p && !neutral_op
5583 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5584 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5585 /* Build the vector directly from ELTS. */
5586 init = gimple_build_vector (&ctor_seq, &elts);
5587 else if (neutral_op)
5589 /* Build a vector of the neutral value and shift the
5590 other elements into place. */
5591 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5592 neutral_op);
5593 int k = nunits;
5594 while (k > 0 && elts[k - 1] == neutral_op)
5595 k -= 1;
5596 while (k > 0)
5598 k -= 1;
5599 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5600 vector_type, init, elts[k]);
5603 else
5605 /* First time round, duplicate ELTS to fill the
5606 required number of vectors. */
5607 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5608 elts, number_of_vectors, *vec_oprnds);
5609 break;
5611 vec_oprnds->quick_push (init);
5613 number_of_places_left_in_vector = nunits;
5614 elts.new_vector (vector_type, nunits, 1);
5615 elts.quick_grow (nunits);
5616 constant_p = true;
5619 if (ctor_seq != NULL)
5620 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5623 /* For a statement STMT_INFO taking part in a reduction operation return
5624 the stmt_vec_info the meta information is stored on. */
5626 stmt_vec_info
5627 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5629 stmt_info = vect_orig_stmt (stmt_info);
5630 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5631 if (!is_a <gphi *> (stmt_info->stmt)
5632 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5633 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5634 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5635 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5637 if (gimple_phi_num_args (phi) == 1)
5638 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5640 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5642 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5643 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5644 stmt_info = info;
5646 return stmt_info;
5649 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5650 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5651 return false. */
5653 static bool
5654 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5655 stmt_vec_info reduc_info)
5657 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5658 if (!main_loop_vinfo)
5659 return false;
5661 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5662 return false;
5664 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5665 auto_vec<tree, 16> main_loop_results (num_phis);
5666 auto_vec<tree, 16> initial_values (num_phis);
5667 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5669 /* The epilogue loop can be entered either from the main loop or
5670 from an earlier guard block. */
5671 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5672 for (tree incoming_value : reduc_info->reduc_initial_values)
5674 /* Look for:
5676 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5677 INITIAL_VALUE(guard block)>. */
5678 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5680 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5681 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5683 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5684 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5686 main_loop_results.quick_push (from_main_loop);
5687 initial_values.quick_push (from_skip);
5690 else
5691 /* The main loop dominates the epilogue loop. */
5692 main_loop_results.splice (reduc_info->reduc_initial_values);
5694 /* See if the main loop has the kind of accumulator we need. */
5695 vect_reusable_accumulator *accumulator
5696 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5697 if (!accumulator
5698 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5699 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5700 accumulator->reduc_info->reduc_scalar_results.begin ()))
5701 return false;
5703 /* Handle the case where we can reduce wider vectors to narrower ones. */
5704 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5705 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5706 unsigned HOST_WIDE_INT m;
5707 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5708 TYPE_VECTOR_SUBPARTS (vectype), &m))
5709 return false;
5710 /* Check the intermediate vector types and operations are available. */
5711 tree prev_vectype = old_vectype;
5712 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5713 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5715 intermediate_nunits = exact_div (intermediate_nunits, 2);
5716 tree intermediate_vectype = get_related_vectype_for_scalar_type
5717 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5718 if (!intermediate_vectype
5719 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5720 intermediate_vectype)
5721 || !can_vec_extract (TYPE_MODE (prev_vectype),
5722 TYPE_MODE (intermediate_vectype)))
5723 return false;
5724 prev_vectype = intermediate_vectype;
5727 /* Non-SLP reductions might apply an adjustment after the reduction
5728 operation, in order to simplify the initialization of the accumulator.
5729 If the epilogue loop carries on from where the main loop left off,
5730 it should apply the same adjustment to the final reduction result.
5732 If the epilogue loop can also be entered directly (rather than via
5733 the main loop), we need to be able to handle that case in the same way,
5734 with the same adjustment. (In principle we could add a PHI node
5735 to select the correct adjustment, but in practice that shouldn't be
5736 necessary.) */
5737 tree main_adjustment
5738 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5739 if (loop_vinfo->main_loop_edge && main_adjustment)
5741 gcc_assert (num_phis == 1);
5742 tree initial_value = initial_values[0];
5743 /* Check that we can use INITIAL_VALUE as the adjustment and
5744 initialize the accumulator with a neutral value instead. */
5745 if (!operand_equal_p (initial_value, main_adjustment))
5746 return false;
5747 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5748 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5749 code, initial_value);
5751 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5752 reduc_info->reduc_initial_values.truncate (0);
5753 reduc_info->reduc_initial_values.splice (initial_values);
5754 reduc_info->reused_accumulator = accumulator;
5755 return true;
5758 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5759 CODE, emitting the stmts to SEQ. Returns a vector def of VECTYPE. */
5761 static tree
5762 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5763 gimple_seq *seq)
5765 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5766 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5767 tree stype = TREE_TYPE (vectype);
5768 tree new_temp = vec_def;
5769 while (nunits > nunits1)
5771 nunits /= 2;
5772 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5773 stype, nunits);
5774 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5776 /* The target has to make sure we support lowpart/highpart
5777 extraction, either via direct vector extract or through
5778 an integer mode punning. */
5779 tree dst1, dst2;
5780 gimple *epilog_stmt;
5781 if (convert_optab_handler (vec_extract_optab,
5782 TYPE_MODE (TREE_TYPE (new_temp)),
5783 TYPE_MODE (vectype1))
5784 != CODE_FOR_nothing)
5786 /* Extract sub-vectors directly once vec_extract becomes
5787 a conversion optab. */
5788 dst1 = make_ssa_name (vectype1);
5789 epilog_stmt
5790 = gimple_build_assign (dst1, BIT_FIELD_REF,
5791 build3 (BIT_FIELD_REF, vectype1,
5792 new_temp, TYPE_SIZE (vectype1),
5793 bitsize_int (0)));
5794 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5795 dst2 = make_ssa_name (vectype1);
5796 epilog_stmt
5797 = gimple_build_assign (dst2, BIT_FIELD_REF,
5798 build3 (BIT_FIELD_REF, vectype1,
5799 new_temp, TYPE_SIZE (vectype1),
5800 bitsize_int (bitsize)));
5801 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5803 else
5805 /* Extract via punning to appropriately sized integer mode
5806 vector. */
5807 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5808 tree etype = build_vector_type (eltype, 2);
5809 gcc_assert (convert_optab_handler (vec_extract_optab,
5810 TYPE_MODE (etype),
5811 TYPE_MODE (eltype))
5812 != CODE_FOR_nothing);
5813 tree tem = make_ssa_name (etype);
5814 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5815 build1 (VIEW_CONVERT_EXPR,
5816 etype, new_temp));
5817 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5818 new_temp = tem;
5819 tem = make_ssa_name (eltype);
5820 epilog_stmt
5821 = gimple_build_assign (tem, BIT_FIELD_REF,
5822 build3 (BIT_FIELD_REF, eltype,
5823 new_temp, TYPE_SIZE (eltype),
5824 bitsize_int (0)));
5825 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5826 dst1 = make_ssa_name (vectype1);
5827 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5828 build1 (VIEW_CONVERT_EXPR,
5829 vectype1, tem));
5830 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5831 tem = make_ssa_name (eltype);
5832 epilog_stmt
5833 = gimple_build_assign (tem, BIT_FIELD_REF,
5834 build3 (BIT_FIELD_REF, eltype,
5835 new_temp, TYPE_SIZE (eltype),
5836 bitsize_int (bitsize)));
5837 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5838 dst2 = make_ssa_name (vectype1);
5839 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5840 build1 (VIEW_CONVERT_EXPR,
5841 vectype1, tem));
5842 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5845 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5848 return new_temp;
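/* For example, reducing a 16-element accumulator down to a four-element
   VECTYPE takes two halving steps (16 -> 8 -> 4), each extracting the
   low and high halves and combining them with CODE. */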
5851 /* Function vect_create_epilog_for_reduction
5853 Create code at the loop-epilog to finalize the result of a reduction
5854 computation.
5856 STMT_INFO is the scalar reduction stmt that is being vectorized.
5857 SLP_NODE is an SLP node containing a group of reduction statements. The
5858 first one in this group is STMT_INFO.
5859 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5860 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5861 (counting from 0)
5863 This function:
5864 1. Completes the reduction def-use cycles.
5865 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5866 by calling the function specified by REDUC_FN if available, or by
5867 other means (whole-vector shifts or a scalar loop).
5868 The function also creates a new phi node at the loop exit to preserve
5869 loop-closed form, as illustrated below.
5871 The flow at the entry to this function:
5873 loop:
5874 vec_def = phi <vec_init, null> # REDUCTION_PHI
5875 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5876 s_loop = scalar_stmt # (scalar) STMT_INFO
5877 loop_exit:
5878 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5879 use <s_out0>
5880 use <s_out0>
5882 The above is transformed by this function into:
5884 loop:
5885 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5886 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5887 s_loop = scalar_stmt # (scalar) STMT_INFO
5888 loop_exit:
5889 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5890 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5891 v_out2 = reduce <v_out1>
5892 s_out3 = extract_field <v_out2, 0>
5893 s_out4 = adjust_result <s_out3>
5894 use <s_out4>
5895 use <s_out4>
5898 static void
5899 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5900 stmt_vec_info stmt_info,
5901 slp_tree slp_node,
5902 slp_instance slp_node_instance)
5904 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5905 gcc_assert (reduc_info->is_reduc_info);
5906 /* For double reductions we need to get at the inner loop reduction
5907 stmt which has the meta info attached. Our stmt_info is that of the
5908 loop-closed PHI of the inner loop which we remember as
5909 def for the reduction PHI generation. */
5910 bool double_reduc = false;
5911 stmt_vec_info rdef_info = stmt_info;
5912 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5914 gcc_assert (!slp_node);
5915 double_reduc = true;
5916 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5917 (stmt_info->stmt, 0));
5918 stmt_info = vect_stmt_to_vectorize (stmt_info);
5920 gphi *reduc_def_stmt
5921 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5922 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5923 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5924 tree vectype;
5925 machine_mode mode;
5926 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5927 basic_block exit_bb;
5928 tree scalar_dest;
5929 tree scalar_type;
5930 gimple *new_phi = NULL, *phi = NULL;
5931 gimple_stmt_iterator exit_gsi;
5932 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5933 gimple *epilog_stmt = NULL;
5934 gimple *exit_phi;
5935 tree bitsize;
5936 tree def;
5937 tree orig_name, scalar_result;
5938 imm_use_iterator imm_iter, phi_imm_iter;
5939 use_operand_p use_p, phi_use_p;
5940 gimple *use_stmt;
5941 auto_vec<tree> reduc_inputs;
5942 int j, i;
5943 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5944 unsigned int group_size = 1, k;
5945 auto_vec<gimple *> phis;
5946 /* SLP reduction without reduction chain, e.g.,
5947 # a1 = phi <a2, a0>
5948 # b1 = phi <b2, b0>
5949 a2 = operation (a1)
5950 b2 = operation (b1) */
5951 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5952 bool direct_slp_reduc;
5953 tree induction_index = NULL_TREE;
5955 if (slp_node)
5956 group_size = SLP_TREE_LANES (slp_node);
5958 if (nested_in_vect_loop_p (loop, stmt_info))
5960 outer_loop = loop;
5961 loop = loop->inner;
5962 gcc_assert (!slp_node && double_reduc);
5965 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5966 gcc_assert (vectype);
5967 mode = TYPE_MODE (vectype);
5969 tree induc_val = NULL_TREE;
5970 tree adjustment_def = NULL;
5971 if (slp_node)
5973 else
5975 /* Optimize: for induction condition reduction, if we can't use zero
5976 for induc_val, use initial_def. */
5977 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5978 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5979 else if (double_reduc)
5981 else
5982 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5985 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5986 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5987 if (slp_reduc)
5988 /* All statements produce live-out values. */
5989 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5990 else if (slp_node)
5992 /* The last statement in the reduction chain produces the live-out
5993 value. Note SLP optimization can shuffle scalar stmts to
5994 optimize permutations, so we have to search for the last stmt.
5995 for (k = 0; k < group_size; ++k)
5996 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5998 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5999 break;
6003 unsigned vec_num;
6004 int ncopies;
6005 if (slp_node)
6007 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6008 ncopies = 1;
6010 else
6012 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6013 vec_num = 1;
6014 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6017 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6018 which is updated with the current index of the loop for every match of
6019 the original loop's cond_expr (VEC_STMT). This results in a vector
6020 containing, for each vector lane, the last iteration in which that lane's condition passed.
6021 The first match will be a 1 to allow 0 to be used for non-matching
6022 indexes. If there are no matches at all then the vector will be all
6023 zeroes.
6025 PR92772: This algorithm is broken for architectures that support
6026 masked vectors, but do not provide fold_extract_last. */
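/* As an illustration, with four lanes and matches only in scalar
   iterations 2 and 7 (counting from 1), the final index vector would be
   { 0, 2, 7, 0 }: lanes that never matched keep 0, the other lanes keep
   the 1-based number of their last matching iteration. */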
6027 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6029 auto_vec<std::pair<tree, bool>, 2> ccompares;
6030 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6031 cond_info = vect_stmt_to_vectorize (cond_info);
6032 while (cond_info != reduc_info)
6034 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6036 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6037 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6038 ccompares.safe_push
6039 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6040 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6042 cond_info
6043 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6044 1 + STMT_VINFO_REDUC_IDX
6045 (cond_info)));
6046 cond_info = vect_stmt_to_vectorize (cond_info);
6048 gcc_assert (ccompares.length () != 0);
6050 tree indx_before_incr, indx_after_incr;
6051 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6052 int scalar_precision
6053 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6054 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6055 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6056 (TYPE_MODE (vectype), cr_index_scalar_type,
6057 TYPE_VECTOR_SUBPARTS (vectype));
6059 /* First we create a simple vector induction variable which starts
6060 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6061 vector size (STEP). */
6063 /* Create a {1,2,3,...} vector. */
6064 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6066 /* Create a vector of the step value. */
6067 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6068 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6070 /* Create an induction variable. */
6071 gimple_stmt_iterator incr_gsi;
6072 bool insert_after;
6073 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6074 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6075 insert_after, &indx_before_incr, &indx_after_incr);
6077 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6078 filled with zeros (VEC_ZERO). */
6080 /* Create a vector of 0s. */
6081 tree zero = build_zero_cst (cr_index_scalar_type);
6082 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6084 /* Create a vector phi node. */
6085 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6086 new_phi = create_phi_node (new_phi_tree, loop->header);
6087 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6088 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6090 /* Now take the condition from the loop's original cond_exprs
6091 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6092 every match uses values from the induction variable
6093 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6094 (NEW_PHI_TREE).
6095 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6096 the new cond_expr (INDEX_COND_EXPR). */
6097 gimple_seq stmts = NULL;
6098 for (int i = ccompares.length () - 1; i != -1; --i)
6100 tree ccompare = ccompares[i].first;
6101 if (ccompares[i].second)
6102 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6103 cr_index_vector_type,
6104 ccompare,
6105 indx_before_incr, new_phi_tree);
6106 else
6107 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6108 cr_index_vector_type,
6109 ccompare,
6110 new_phi_tree, indx_before_incr);
6112 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6114 /* Update the phi with the vec cond. */
6115 induction_index = new_phi_tree;
6116 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6117 loop_latch_edge (loop), UNKNOWN_LOCATION);
6120 /* 2. Create epilog code.
6121 The reduction epilog code operates across the elements of the vector
6122 of partial results computed by the vectorized loop.
6123 The reduction epilog code consists of:
6125 step 1: compute the scalar result in a vector (v_out2)
6126 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6127 step 3: adjust the scalar result (s_out3) if needed.
6129 Step 1 can be accomplished using one of the following three schemes:
6130 (scheme 1) using reduc_fn, if available.
6131 (scheme 2) using whole-vector shifts, if available.
6132 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6133 combined.
6135 The overall epilog code looks like this:
6137 s_out0 = phi <s_loop> # original EXIT_PHI
6138 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6139 v_out2 = reduce <v_out1> # step 1
6140 s_out3 = extract_field <v_out2, 0> # step 2
6141 s_out4 = adjust_result <s_out3> # step 3
6143 (step 3 is optional, and steps 1 and 2 may be combined).
6144 Lastly, the uses of s_out0 are replaced by s_out4. */
6147 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6148 v_out1 = phi <VECT_DEF>
6149 Store their (possibly converted) results in REDUC_INPUTS. */
6150 if (double_reduc)
6151 loop = outer_loop;
6152 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6153 exit_gsi = gsi_after_labels (exit_bb);
6154 reduc_inputs.create (slp_node ? vec_num : ncopies);
6155 for (unsigned i = 0; i < vec_num; i++)
6157 gimple_seq stmts = NULL;
6158 if (slp_node)
6159 def = vect_get_slp_vect_def (slp_node, i);
6160 else
6161 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6162 for (j = 0; j < ncopies; j++)
6164 tree new_def = copy_ssa_name (def);
6165 phi = create_phi_node (new_def, exit_bb);
6166 if (j)
6167 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6168 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6169 new_def = gimple_convert (&stmts, vectype, new_def);
6170 reduc_inputs.quick_push (new_def);
6172 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6175 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6176 (i.e. when reduc_fn is not available) and in the final adjustment
6177 code (if needed). Also get the original scalar reduction variable as
6178 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6179 represents a reduction pattern), the tree-code and scalar-def are
6180 taken from the original stmt that the pattern-stmt (STMT) replaces.
6181 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6182 are taken from STMT. */
6184 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6185 if (orig_stmt_info != stmt_info)
6187 /* Reduction pattern */
6188 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6189 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6192 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6193 scalar_type = TREE_TYPE (scalar_dest);
6194 scalar_results.truncate (0);
6195 scalar_results.reserve_exact (group_size);
6196 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6197 bitsize = TYPE_SIZE (scalar_type);
6199 /* True if we should implement SLP_REDUC using native reduction operations
6200 instead of scalar operations. */
6201 direct_slp_reduc = (reduc_fn != IFN_LAST
6202 && slp_reduc
6203 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6205 /* In case of reduction chain, e.g.,
6206 # a1 = phi <a3, a0>
6207 a2 = operation (a1)
6208 a3 = operation (a2),
6210 we may end up with more than one vector result. Here we reduce them
6211 to one vector.
6213 The same is true for a SLP reduction, e.g.,
6214 # a1 = phi <a2, a0>
6215 # b1 = phi <b2, b0>
6216 a2 = operation (a1)
6217 b2 = operation (b1),
6219 where we can end up with more than one vector as well. We can
6220 easily accumulate vectors when the number of vector elements is
6221 a multiple of the SLP group size.
6223 The same is true if we couldn't use a single def-use cycle. */
6224 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6225 || direct_slp_reduc
6226 || (slp_reduc
6227 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6228 || ncopies > 1)
6230 gimple_seq stmts = NULL;
6231 tree single_input = reduc_inputs[0];
6232 for (k = 1; k < reduc_inputs.length (); k++)
6233 single_input = gimple_build (&stmts, code, vectype,
6234 single_input, reduc_inputs[k]);
6235 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6237 reduc_inputs.truncate (0);
6238 reduc_inputs.safe_push (single_input);
6241 tree orig_reduc_input = reduc_inputs[0];
6243 /* If this loop is an epilogue loop that can be skipped after the
6244 main loop, we can only share a reduction operation between the
6245 main loop and the epilogue if we put it at the target of the
6246 skip edge.
6248 We can still reuse accumulators if this check fails. Doing so has
6249 the minor(?) benefit of making the epilogue loop's scalar result
6250 independent of the main loop's scalar result. */
6251 bool unify_with_main_loop_p = false;
6252 if (reduc_info->reused_accumulator
6253 && loop_vinfo->skip_this_loop_edge
6254 && single_succ_p (exit_bb)
6255 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6257 unify_with_main_loop_p = true;
6259 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6260 reduc_inputs[0] = make_ssa_name (vectype);
6261 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6262 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6263 UNKNOWN_LOCATION);
6264 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6265 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6266 exit_gsi = gsi_after_labels (reduc_block);
6269 /* Shouldn't be used beyond this point. */
6270 exit_bb = nullptr;
6272 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6273 && reduc_fn != IFN_LAST)
6275 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6276 various data values where the condition matched and another vector
6277 (INDUCTION_INDEX) containing all the indexes of those matches. We
6278 need to extract the last matching index (which will be the index with
6279 highest value) and use this to index into the data vector.
6280 For the case where there were no matches, the data vector will contain
6281 all default values and the index vector will be all zeros. */
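/* Continuing the illustration above: with an index vector { 0, 2, 7, 0 }
   and a data vector { d0, d1, d2, d3 }, the max index is 7, the compare
   selects only lane 2, giving { 0, 0, d2, 0 }, and the final unsigned
   MAX reduction therefore yields d2. */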
6283 /* Get various versions of the type of the vector of indexes. */
6284 tree index_vec_type = TREE_TYPE (induction_index);
6285 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6286 tree index_scalar_type = TREE_TYPE (index_vec_type);
6287 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6289 /* Get an unsigned integer version of the type of the data vector. */
6290 int scalar_precision
6291 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6292 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6293 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6294 vectype);
6296 /* First we need to create a vector (ZERO_VEC) of zeros and another
6297 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6298 can create using a MAX reduction and then expanding.
6299 In the case where the loop never made any matches, the max index will
6300 be zero. */
6302 /* Vector of {0, 0, 0,...}. */
6303 tree zero_vec = build_zero_cst (vectype);
6305 /* Find maximum value from the vector of found indexes. */
6306 tree max_index = make_ssa_name (index_scalar_type);
6307 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6308 1, induction_index);
6309 gimple_call_set_lhs (max_index_stmt, max_index);
6310 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6312 /* Vector of {max_index, max_index, max_index,...}. */
6313 tree max_index_vec = make_ssa_name (index_vec_type);
6314 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6315 max_index);
6316 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6317 max_index_vec_rhs);
6318 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6320 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6321 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6322 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6323 otherwise. Only one value should match, resulting in a vector
6324 (VEC_COND) with one data value and the rest zeros.
6325 In the case where the loop never made any matches, every index will
6326 match, resulting in a vector with all data values (which will all be
6327 the default value). */
6329 /* Compare the max index vector to the vector of found indexes to find
6330 the position of the max value. */
6331 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6332 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6333 induction_index,
6334 max_index_vec);
6335 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6337 /* Use the compare to choose either values from the data vector or
6338 zero. */
6339 tree vec_cond = make_ssa_name (vectype);
6340 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6341 vec_compare,
6342 reduc_inputs[0],
6343 zero_vec);
6344 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6346 /* Finally we need to extract the data value from the vector (VEC_COND)
6347 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6348 reduction, but because this doesn't exist, we can use a MAX reduction
6349 instead. The data value might be signed or a float so we need to cast
6350 it first.
6351 In the case where the loop never made any matches, the data values are
6352 all identical, and so will reduce down correctly. */
6354 /* Make the matched data values unsigned. */
6355 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6356 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6357 vec_cond);
6358 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6359 VIEW_CONVERT_EXPR,
6360 vec_cond_cast_rhs);
6361 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6363 /* Reduce down to a scalar value. */
6364 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6365 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6366 1, vec_cond_cast);
6367 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6368 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6370 /* Convert the reduced value back to the result type and set as the
6371 result. */
6372 gimple_seq stmts = NULL;
6373 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6374 data_reduc);
6375 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6376 scalar_results.safe_push (new_temp);
6378 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6379 && reduc_fn == IFN_LAST)
6381 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6382 idx = 0;
6383 idx_val = induction_index[0];
6384 val = data_reduc[0];
6385 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6386 if (induction_index[i] > idx_val)
6387 val = data_reduc[i], idx_val = induction_index[i];
6388 return val; */
6390 tree data_eltype = TREE_TYPE (vectype);
6391 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6392 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6393 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6394 /* Enforced by vectorizable_reduction, which ensures we have target
6395 support before allowing a conditional reduction on variable-length
6396 vectors. */
6397 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6398 tree idx_val = NULL_TREE, val = NULL_TREE;
6399 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6401 tree old_idx_val = idx_val;
6402 tree old_val = val;
6403 idx_val = make_ssa_name (idx_eltype);
6404 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6405 build3 (BIT_FIELD_REF, idx_eltype,
6406 induction_index,
6407 bitsize_int (el_size),
6408 bitsize_int (off)));
6409 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6410 val = make_ssa_name (data_eltype);
6411 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6412 build3 (BIT_FIELD_REF,
6413 data_eltype,
6414 reduc_inputs[0],
6415 bitsize_int (el_size),
6416 bitsize_int (off)));
6417 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6418 if (off != 0)
6420 tree new_idx_val = idx_val;
6421 if (off != v_size - el_size)
6423 new_idx_val = make_ssa_name (idx_eltype);
6424 epilog_stmt = gimple_build_assign (new_idx_val,
6425 MAX_EXPR, idx_val,
6426 old_idx_val);
6427 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6429 tree cond = make_ssa_name (boolean_type_node);
6430 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6431 idx_val, old_idx_val);
6432 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6433 tree new_val = make_ssa_name (data_eltype);
6434 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6435 cond, val, old_val);
6436 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6437 idx_val = new_idx_val;
6438 val = new_val;
6441 /* Convert the reduced value back to the result type and set as the
6442 result. */
6443 gimple_seq stmts = NULL;
6444 val = gimple_convert (&stmts, scalar_type, val);
6445 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6446 scalar_results.safe_push (val);
6449 /* 2.3 Create the reduction code, using one of the three schemes described
6450 above. In SLP we simply need to extract all the elements from the
6451 vector (without reducing them), so we use scalar shifts. */
6452 else if (reduc_fn != IFN_LAST && !slp_reduc)
6454 tree tmp;
6455 tree vec_elem_type;
6457 /* Case 1: Create:
6458 v_out2 = reduc_expr <v_out1> */
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_NOTE, vect_location,
6462 "Reduce using direct vector reduction.\n");
6464 gimple_seq stmts = NULL;
6465 vec_elem_type = TREE_TYPE (vectype);
6466 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6467 vec_elem_type, reduc_inputs[0]);
6468 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6469 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6471 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6472 && induc_val)
6474 /* Earlier we set the initial value to be a vector of induc_val
6475 values. Check the result and if it is induc_val then replace
6476 it with the original initial value, unless induc_val is
6477 the same as initial_def already. */
6478 tree zcompare = make_ssa_name (boolean_type_node);
6479 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6480 new_temp, induc_val);
6481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6482 tree initial_def = reduc_info->reduc_initial_values[0];
6483 tmp = make_ssa_name (new_scalar_dest);
6484 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6485 initial_def, new_temp);
6486 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6487 new_temp = tmp;
6490 scalar_results.safe_push (new_temp);
6492 else if (direct_slp_reduc)
6494 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6495 with the elements for other SLP statements replaced with the
6496 neutral value. We can then do a normal reduction on each vector. */
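/* For instance, with a group size of 2 and a vector { a0, b0, a1, b1 },
   iteration i == 0 selects { a0, neutral, a1, neutral } and reduces it
   for the first SLP statement, while i == 1 selects
   { neutral, b0, neutral, b1 } for the second. */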
6498 /* Enforced by vectorizable_reduction. */
6499 gcc_assert (reduc_inputs.length () == 1);
6500 gcc_assert (pow2p_hwi (group_size));
6502 gimple_seq seq = NULL;
6504 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6505 and the same element size as VECTYPE. */
6506 tree index = build_index_vector (vectype, 0, 1);
6507 tree index_type = TREE_TYPE (index);
6508 tree index_elt_type = TREE_TYPE (index_type);
6509 tree mask_type = truth_type_for (index_type);
6511 /* Create a vector that, for each element, identifies which of
6512 the REDUC_GROUP_SIZE results should use it. */
6513 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6514 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6515 build_vector_from_val (index_type, index_mask));
6517 /* Get a neutral vector value. This is simply a splat of the neutral
6518 scalar value if we have one, otherwise the initial scalar value
6519 is itself a neutral value. */
6520 tree vector_identity = NULL_TREE;
6521 tree neutral_op = NULL_TREE;
6522 if (slp_node)
6524 tree initial_value = NULL_TREE;
6525 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6526 initial_value = reduc_info->reduc_initial_values[0];
6527 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6528 initial_value, false);
6530 if (neutral_op)
6531 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6532 neutral_op);
6533 for (unsigned int i = 0; i < group_size; ++i)
6535 /* If there's no universal neutral value, we can use the
6536 initial scalar value from the original PHI. This is used
6537 for MIN and MAX reduction, for example. */
6538 if (!neutral_op)
6540 tree scalar_value = reduc_info->reduc_initial_values[i];
6541 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6542 scalar_value);
6543 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6544 scalar_value);
6547 /* Calculate the equivalent of:
6549 sel[j] = (index[j] == i);
6551 which selects the elements of REDUC_INPUTS[0] that should
6552 be included in the result. */
6553 tree compare_val = build_int_cst (index_elt_type, i);
6554 compare_val = build_vector_from_val (index_type, compare_val);
6555 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6556 index, compare_val);
6558 /* Calculate the equivalent of:
6560 vec = sel ? reduc_inputs[0] : vector_identity;
6562 VEC is now suitable for a full vector reduction. */
6563 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6564 sel, reduc_inputs[0], vector_identity);
6566 /* Do the reduction and convert it to the appropriate type. */
6567 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6568 TREE_TYPE (vectype), vec);
6569 scalar = gimple_convert (&seq, scalar_type, scalar);
6570 scalar_results.safe_push (scalar);
6572 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6574 else
6576 bool reduce_with_shift;
6577 tree vec_temp;
6579 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6581 /* See if the target wants to do the final (shift) reduction
6582 in a vector mode of smaller size and first reduce upper/lower
6583 halves against each other. */
6584 enum machine_mode mode1 = mode;
6585 tree stype = TREE_TYPE (vectype);
6586 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6587 unsigned nunits1 = nunits;
6588 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6589 && reduc_inputs.length () == 1)
6591 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6592 /* For SLP reductions we have to make sure lanes match up, but
6593 since we're doing an individual-element final reduction, reducing
6594 the vector width here is even more important.
6595 ??? We could also separate lanes with permutes; for the common
6596 case of a power-of-two group size, odd/even extracts would work. */
6597 if (slp_reduc && nunits != nunits1)
6599 nunits1 = least_common_multiple (nunits1, group_size);
6600 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6603 if (!slp_reduc
6604 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6605 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6607 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6608 stype, nunits1);
6609 reduce_with_shift = have_whole_vector_shift (mode1);
6610 if (!VECTOR_MODE_P (mode1)
6611 || !directly_supported_p (code, vectype1))
6612 reduce_with_shift = false;
6614 /* First reduce the vector to the vector size we should do the
6615 shift reduction on, by combining upper and lower halves. */
6616 gimple_seq stmts = NULL;
6617 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6618 code, &stmts);
6619 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6620 reduc_inputs[0] = new_temp;
6622 if (reduce_with_shift && !slp_reduc)
6624 int element_bitsize = tree_to_uhwi (bitsize);
6625 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6626 for variable-length vectors and also requires direct target support
6627 for loop reductions. */
6628 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6629 int nelements = vec_size_in_bits / element_bitsize;
6630 vec_perm_builder sel;
6631 vec_perm_indices indices;
6633 int elt_offset;
6635 tree zero_vec = build_zero_cst (vectype1);
6636 /* Case 2: Create:
6637 for (offset = nelements/2; offset >= 1; offset/=2)
6639 Create: va' = vec_shift <va, offset>
6640 Create: va = vop <va, va'>
6641 } */
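/* For example (a sketch, assuming a 4-element vector <a0,a1,a2,a3>, a PLUS
   reduction and don't-care lanes written as '.'; the actual permute masks
   and element order are target-dependent):

     t = vec_shift <v, 2>   -> <a2, a3, 0, 0>
     v = v + t              -> <a0+a2, a1+a3, ., .>
     t = vec_shift <v, 1>   -> <a1+a3, ., ., .>
     v = v + t              -> <a0+a1+a2+a3, ., ., .>

   after which step 2.4 below extracts the scalar result from element 0.  */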
6643 tree rhs;
6645 if (dump_enabled_p ())
6646 dump_printf_loc (MSG_NOTE, vect_location,
6647 "Reduce using vector shifts\n");
6649 gimple_seq stmts = NULL;
6650 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6651 for (elt_offset = nelements / 2;
6652 elt_offset >= 1;
6653 elt_offset /= 2)
6655 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6656 indices.new_vector (sel, 2, nelements);
6657 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6658 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6659 new_temp, zero_vec, mask);
6660 new_temp = gimple_build (&stmts, code,
6661 vectype1, new_name, new_temp);
6663 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6665 /* 2.4 Extract the final scalar result. Create:
6666 s_out3 = extract_field <v_out2, bitpos> */
6668 if (dump_enabled_p ())
6669 dump_printf_loc (MSG_NOTE, vect_location,
6670 "extract scalar result\n");
6672 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6673 bitsize, bitsize_zero_node);
6674 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6675 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6676 gimple_assign_set_lhs (epilog_stmt, new_temp);
6677 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6678 scalar_results.safe_push (new_temp);
6680 else
6682 /* Case 3: Create:
6683 s = extract_field <v_out2, 0>
6684 for (offset = element_size;
6685 offset < vector_size;
6686 offset += element_size)
6688 Create: s' = extract_field <v_out2, offset>
6689 Create: s = op <s, s'> // For non-SLP cases
6690 } */
6692 if (dump_enabled_p ())
6693 dump_printf_loc (MSG_NOTE, vect_location,
6694 "Reduce using scalar code.\n");
6696 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6697 int element_bitsize = tree_to_uhwi (bitsize);
6698 tree compute_type = TREE_TYPE (vectype);
6699 gimple_seq stmts = NULL;
6700 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6702 int bit_offset;
6703 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6704 vec_temp, bitsize, bitsize_zero_node);
6706 /* In SLP we don't need to apply the reduction operation, so we just
6707 collect s' values in SCALAR_RESULTS. */
6708 if (slp_reduc)
6709 scalar_results.safe_push (new_temp);
6711 for (bit_offset = element_bitsize;
6712 bit_offset < vec_size_in_bits;
6713 bit_offset += element_bitsize)
6715 tree bitpos = bitsize_int (bit_offset);
6716 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6717 compute_type, vec_temp,
6718 bitsize, bitpos);
6719 if (slp_reduc)
6721 /* In SLP we don't need to apply the reduction operation, so
6722 we just collect s' values in SCALAR_RESULTS. */
6723 new_temp = new_name;
6724 scalar_results.safe_push (new_name);
6726 else
6727 new_temp = gimple_build (&stmts, code, compute_type,
6728 new_name, new_temp);
6732 /* The only case where we need to reduce scalar results in SLP is
6733 unrolling. If the size of SCALAR_RESULTS is greater than
6734 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6735 REDUC_GROUP_SIZE. */
6736 if (slp_reduc)
6738 tree res, first_res, new_res;
6740 /* Reduce multiple scalar results in case of SLP unrolling. */
6741 for (j = group_size; scalar_results.iterate (j, &res);
6742 j++)
6744 first_res = scalar_results[j % group_size];
6745 new_res = gimple_build (&stmts, code, compute_type,
6746 first_res, res);
6747 scalar_results[j % group_size] = new_res;
6749 scalar_results.truncate (group_size);
6750 for (k = 0; k < group_size; k++)
6751 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6752 scalar_results[k]);
6754 else
6756 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6757 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6758 scalar_results.safe_push (new_temp);
6761 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6764 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6765 && induc_val)
6767 /* Earlier we set the initial value to be a vector of induc_val
6768 values. Check the result and if it is induc_val then replace
6769 it with the original initial value, unless induc_val is
6770 the same as initial_def already. */
6771 tree zcompare = make_ssa_name (boolean_type_node);
6772 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6773 induc_val);
6774 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6775 tree initial_def = reduc_info->reduc_initial_values[0];
6776 tree tmp = make_ssa_name (new_scalar_dest);
6777 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6778 initial_def, new_temp);
6779 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6780 scalar_results[0] = tmp;
6784 /* 2.5 Adjust the final result by the initial value of the reduction
6785 variable. (When such adjustment is not needed, then
6786 'adjustment_def' is zero). For example, if code is PLUS we create:
6787 new_temp = loop_exit_def + adjustment_def */
6789 if (adjustment_def)
6791 gcc_assert (!slp_reduc);
6792 gimple_seq stmts = NULL;
6793 if (double_reduc)
6795 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6796 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6797 new_temp = gimple_build (&stmts, code, vectype,
6798 reduc_inputs[0], adjustment_def);
6800 else
6802 new_temp = scalar_results[0];
6803 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6804 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6805 adjustment_def);
6806 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6807 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6808 new_temp, adjustment_def);
6809 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6812 epilog_stmt = gimple_seq_last_stmt (stmts);
6813 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6814 scalar_results[0] = new_temp;
6817 /* Record this operation if it could be reused by the epilogue loop. */
6818 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6819 && reduc_inputs.length () == 1)
6820 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6821 { orig_reduc_input, reduc_info });
6823 if (double_reduc)
6824 loop = outer_loop;
6826 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6827 phis with new adjusted scalar results, i.e., replace use <s_out0>
6828 with use <s_out4>.
6830 Transform:
6831 loop_exit:
6832 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6833 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6834 v_out2 = reduce <v_out1>
6835 s_out3 = extract_field <v_out2, 0>
6836 s_out4 = adjust_result <s_out3>
6837 use <s_out0>
6838 use <s_out0>
6840 into:
6842 loop_exit:
6843 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6844 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6845 v_out2 = reduce <v_out1>
6846 s_out3 = extract_field <v_out2, 0>
6847 s_out4 = adjust_result <s_out3>
6848 use <s_out4>
6849 use <s_out4> */
6851 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6852 for (k = 0; k < live_out_stmts.size (); k++)
6854 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6855 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6857 phis.create (3);
6858 /* Find the loop-closed-use at the loop exit of the original scalar
6859 result. (The reduction result is expected to have two immediate uses,
6860 one at the latch block, and one at the loop exit). For double
6861 reductions we are looking for exit phis of the outer loop. */
6862 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6864 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6866 if (!is_gimple_debug (USE_STMT (use_p)))
6867 phis.safe_push (USE_STMT (use_p));
6869 else
6871 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6873 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6875 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6877 if (!flow_bb_inside_loop_p (loop,
6878 gimple_bb (USE_STMT (phi_use_p)))
6879 && !is_gimple_debug (USE_STMT (phi_use_p)))
6880 phis.safe_push (USE_STMT (phi_use_p));
6886 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6888 /* Replace the uses: */
6889 orig_name = PHI_RESULT (exit_phi);
6891 /* Look for a single use at the target of the skip edge. */
6892 if (unify_with_main_loop_p)
6894 use_operand_p use_p;
6895 gimple *user;
6896 if (!single_imm_use (orig_name, &use_p, &user))
6897 gcc_unreachable ();
6898 orig_name = gimple_get_lhs (user);
6901 scalar_result = scalar_results[k];
6902 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6904 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6905 SET_USE (use_p, scalar_result);
6906 update_stmt (use_stmt);
6910 phis.release ();
6914 /* Return a vector of type VECTYPE that is equal to the vector select
6915 operation "MASK ? VEC : IDENTITY". Insert the select statements
6916 before GSI. */
6918 static tree
6919 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6920 tree vec, tree identity)
6922 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6923 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6924 mask, vec, identity);
6925 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6926 return cond;
6929 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6930 order, starting with LHS. Insert the extraction statements before GSI and
6931 associate the new scalar SSA names with variable SCALAR_DEST.
6932 If MASK is nonzero mask the input and then operate on it unconditionally.
6933 Return the SSA name for the result. */
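/* For example (a sketch), with a 4-element VECTOR_RHS v and CODE PLUS_EXPR
   the expansion emitted below is equivalent to:

     s0 = lhs + v[0];
     s1 = s0 + v[1];
     s2 = s1 + v[2];
     s3 = s2 + v[3];

   with S3 returned, so the original left-to-right (in-order) association
   is preserved.  */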
6935 static tree
6936 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6937 tree_code code, tree lhs, tree vector_rhs,
6938 tree mask)
6940 tree vectype = TREE_TYPE (vector_rhs);
6941 tree scalar_type = TREE_TYPE (vectype);
6942 tree bitsize = TYPE_SIZE (scalar_type);
6943 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6944 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6946 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6947 to perform an unconditional element-wise reduction of it. */
6948 if (mask)
6950 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6951 "masked_vector_rhs");
6952 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6953 false);
6954 tree vector_identity = build_vector_from_val (vectype, neutral_op);
6955 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6956 mask, vector_rhs, vector_identity);
6957 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6958 vector_rhs = masked_vector_rhs;
6961 for (unsigned HOST_WIDE_INT bit_offset = 0;
6962 bit_offset < vec_size_in_bits;
6963 bit_offset += element_bitsize)
6965 tree bitpos = bitsize_int (bit_offset);
6966 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6967 bitsize, bitpos);
6969 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6970 rhs = make_ssa_name (scalar_dest, stmt);
6971 gimple_assign_set_lhs (stmt, rhs);
6972 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6974 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6975 tree new_name = make_ssa_name (scalar_dest, stmt);
6976 gimple_assign_set_lhs (stmt, new_name);
6977 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6978 lhs = new_name;
6980 return lhs;
6983 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6984 type of the vector input. */
6986 static internal_fn
6987 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6989 internal_fn mask_reduc_fn;
6990 internal_fn mask_len_reduc_fn;
6992 switch (reduc_fn)
6994 case IFN_FOLD_LEFT_PLUS:
6995 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6996 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6997 break;
6999 default:
7000 return IFN_LAST;
7003 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7004 OPTIMIZE_FOR_SPEED))
7005 return mask_reduc_fn;
7006 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7007 OPTIMIZE_FOR_SPEED))
7008 return mask_len_reduc_fn;
7009 return IFN_LAST;
7012 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7013 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7014 statement. CODE is the operation performed by STMT_INFO and OPS are
7015 its scalar operands. REDUC_INDEX is the index of the operand in
7016 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7017 implements in-order reduction, or IFN_LAST if we should open-code it.
7018 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7019 that should be used to control the operation in a fully-masked loop. */
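/* As an illustration (a sketch; the exact statements depend on the target
   and on whether the loop is masked or uses lengths), a scalar in-order
   reduction such as

     double res = init;
     for (int i = 0; i < n; ++i)
       res = res + a[i];

   keeps RES scalar and, in each vectorized iteration, folds one vector of
   A into it in element order:

     res_1 = IFN_FOLD_LEFT_PLUS (res_0, va);

   or, if REDUC_FN is IFN_LAST, open-codes the per-element folds via
   vect_expand_fold_left.  */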
7021 static bool
7022 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7023 stmt_vec_info stmt_info,
7024 gimple_stmt_iterator *gsi,
7025 gimple **vec_stmt, slp_tree slp_node,
7026 gimple *reduc_def_stmt,
7027 code_helper code, internal_fn reduc_fn,
7028 tree *ops, int num_ops, tree vectype_in,
7029 int reduc_index, vec_loop_masks *masks,
7030 vec_loop_lens *lens)
7032 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7033 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7034 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7036 int ncopies;
7037 if (slp_node)
7038 ncopies = 1;
7039 else
7040 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7042 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7043 gcc_assert (ncopies == 1);
7045 bool is_cond_op = false;
7046 if (!code.is_tree_code ())
7048 code = conditional_internal_fn_code (internal_fn (code));
7049 gcc_assert (code != ERROR_MARK);
7050 is_cond_op = true;
7053 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7055 if (slp_node)
7057 if (is_cond_op)
7059 if (dump_enabled_p ())
7060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7061 "fold-left reduction on SLP not supported.\n");
7062 return false;
7065 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7066 TYPE_VECTOR_SUBPARTS (vectype_in)));
7069 /* The operands either come from a binary operation or an IFN_COND operation.
7070 The former is a gimple assign with binary rhs and the latter is a
7071 gimple call with four arguments. */
7072 gcc_assert (num_ops == 2 || num_ops == 4);
7073 tree op0, opmask;
7074 if (!is_cond_op)
7075 op0 = ops[1 - reduc_index];
7076 else
7078 op0 = ops[2 + (1 - reduc_index)];
7079 opmask = ops[0];
7080 gcc_assert (!slp_node);
7083 int group_size = 1;
7084 stmt_vec_info scalar_dest_def_info;
7085 auto_vec<tree> vec_oprnds0, vec_opmask;
7086 if (slp_node)
7088 auto_vec<vec<tree> > vec_defs (2);
7089 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7090 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7091 vec_defs[0].release ();
7092 vec_defs[1].release ();
7093 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7094 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7096 else
7098 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7099 op0, &vec_oprnds0);
7100 scalar_dest_def_info = stmt_info;
7102 /* For an IFN_COND_OP we also need the vector mask operand. */
7103 if (is_cond_op)
7104 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7105 opmask, &vec_opmask);
7108 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7109 tree scalar_dest = gimple_get_lhs (sdef);
7110 tree scalar_type = TREE_TYPE (scalar_dest);
7111 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7113 int vec_num = vec_oprnds0.length ();
7114 gcc_assert (vec_num == 1 || slp_node);
7115 tree vec_elem_type = TREE_TYPE (vectype_out);
7116 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7118 tree vector_identity = NULL_TREE;
7119 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7121 vector_identity = build_zero_cst (vectype_out);
7122 if (!HONOR_SIGNED_ZEROS (vectype_out))
7124 else
7126 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7127 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7128 vector_identity);
7132 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7133 int i;
7134 tree def0;
7135 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7137 gimple *new_stmt;
7138 tree mask = NULL_TREE;
7139 tree len = NULL_TREE;
7140 tree bias = NULL_TREE;
7141 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7142 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7143 else if (is_cond_op)
7144 mask = vec_opmask[0];
7145 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7147 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7148 i, 1);
7149 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7150 bias = build_int_cst (intQI_type_node, biasval);
7151 if (!is_cond_op)
7152 mask = build_minus_one_cst (truth_type_for (vectype_in));
7155 /* Handle MINUS by adding the negative. */
7156 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7158 tree negated = make_ssa_name (vectype_out);
7159 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7160 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7161 def0 = negated;
7164 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7165 && mask && mask_reduc_fn == IFN_LAST)
7166 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7167 vector_identity);
7169 /* On the first iteration the input is simply the scalar phi
7170 result, and for subsequent iterations it is the output of
7171 the preceding operation. */
7172 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7174 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7175 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7176 def0, mask, len, bias);
7177 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7178 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7179 def0, mask);
7180 else
7181 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7182 def0);
7183 /* For chained SLP reductions the output of the previous reduction
7184 operation serves as the input of the next. For the final statement
7185 the output cannot be a temporary - we reuse the original
7186 scalar destination of the last statement. */
7187 if (i != vec_num - 1)
7189 gimple_set_lhs (new_stmt, scalar_dest_var);
7190 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7191 gimple_set_lhs (new_stmt, reduc_var);
7194 else
7196 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7197 tree_code (code), reduc_var, def0,
7198 mask);
7199 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7200 /* Remove the statement, so that we can use the same code paths
7201 as for statements that we've just created. */
7202 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7203 gsi_remove (&tmp_gsi, true);
7206 if (i == vec_num - 1)
7208 gimple_set_lhs (new_stmt, scalar_dest);
7209 vect_finish_replace_stmt (loop_vinfo,
7210 scalar_dest_def_info,
7211 new_stmt);
7213 else
7214 vect_finish_stmt_generation (loop_vinfo,
7215 scalar_dest_def_info,
7216 new_stmt, gsi);
7218 if (slp_node)
7219 slp_node->push_vec_def (new_stmt);
7220 else
7222 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7223 *vec_stmt = new_stmt;
7227 return true;
7230 /* Function is_nonwrapping_integer_induction.
7232 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7233 does not cause overflow. */
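/* For example (a sketch), for an unsigned 32-bit IV, where wrapping is
   well-defined and therefore checked explicitly below, base 16 and step 4
   in a loop executing at most 2^20 times reach at most 16 + 4 * 2^20,
   which fits in 32 bits, so the induction cannot wrap; with step 2^12 the
   maximum becomes 2^32 + 16, which does not fit, and false is returned.  */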
7235 static bool
7236 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7238 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7239 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7240 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7241 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7242 widest_int ni, max_loop_value, lhs_max;
7243 wi::overflow_type overflow = wi::OVF_NONE;
7245 /* Make sure the loop is integer based. */
7246 if (TREE_CODE (base) != INTEGER_CST
7247 || TREE_CODE (step) != INTEGER_CST)
7248 return false;
7250 /* Check that the max size of the loop will not wrap. */
7252 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7253 return true;
7255 if (! max_stmt_executions (loop, &ni))
7256 return false;
7258 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7259 &overflow);
7260 if (overflow)
7261 return false;
7263 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7264 TYPE_SIGN (lhs_type), &overflow);
7265 if (overflow)
7266 return false;
7268 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7269 <= TYPE_PRECISION (lhs_type));
7272 /* Check if masking can be supported by inserting a conditional expression.
7273 CODE is the code for the operation. COND_FN is the conditional internal
7274 function, if it exists. VECTYPE_IN is the type of the vector input. */
7275 static bool
7276 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7277 tree vectype_in)
7279 if (cond_fn != IFN_LAST
7280 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7281 OPTIMIZE_FOR_SPEED))
7282 return false;
7284 if (code.is_tree_code ())
7285 switch (tree_code (code))
7287 case DOT_PROD_EXPR:
7288 case SAD_EXPR:
7289 return true;
7291 default:
7292 break;
7294 return false;
7297 /* Insert a conditional expression to enable masked vectorization. CODE is the
7298 code for the operation. VOP is the array of operands. MASK is the loop
7299 mask. GSI is a statement iterator used to place the new conditional
7300 expression. */
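/* For instance (a sketch of the generated selects), for a masked
   DOT_PROD_EXPR the second multiplication operand becomes

     masked_op1 = mask ? op1 : 0;

   so inactive lanes add 0 to the accumulator, while for SAD_EXPR it becomes

     masked_op1 = mask ? op1 : op0;

   so the absolute difference in inactive lanes is 0.  */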
7301 static void
7302 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7303 gimple_stmt_iterator *gsi)
7305 switch (tree_code (code))
7307 case DOT_PROD_EXPR:
7309 tree vectype = TREE_TYPE (vop[1]);
7310 tree zero = build_zero_cst (vectype);
7311 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7312 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7313 mask, vop[1], zero);
7314 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7315 vop[1] = masked_op1;
7316 break;
7319 case SAD_EXPR:
7321 tree vectype = TREE_TYPE (vop[1]);
7322 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7323 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7324 mask, vop[1], vop[0]);
7325 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7326 vop[1] = masked_op1;
7327 break;
7330 default:
7331 gcc_unreachable ();
7335 /* Function vectorizable_reduction.
7337 Check if STMT_INFO performs a reduction operation that can be vectorized.
7338 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7339 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7340 Return true if STMT_INFO is vectorizable in this way.
7342 This function also handles reduction idioms (patterns) that have been
7343 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7344 may be of this form:
7345 X = pattern_expr (arg0, arg1, ..., X)
7346 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7347 sequence that had been detected and replaced by the pattern-stmt
7348 (STMT_INFO).
7350 This function also handles reduction of condition expressions, for example:
7351 for (int i = 0; i < N; i++)
7352 if (a[i] < value)
7353 last = a[i];
7354 This is handled by vectorising the loop and creating an additional vector
7355 containing the loop indexes for which "a[i] < value" was true. In the
7356 function epilogue this is reduced to a single max value and then used to
7357 index into the vector of results.
7359 In some cases of reduction patterns, the type of the reduction variable X is
7360 different than the type of the other arguments of STMT_INFO.
7361 In such cases, the vectype that is used when transforming STMT_INFO into
7362 a vector stmt is different than the vectype that is used to determine the
7363 vectorization factor, because it consists of a different number of elements
7364 than the actual number of elements that are being operated upon in parallel.
7366 For example, consider an accumulation of shorts into an int accumulator.
7367 On some targets it's possible to vectorize this pattern operating on 8
7368 shorts at a time (hence, the vectype for purposes of determining the
7369 vectorization factor should be V8HI); on the other hand, the vectype that
7370 is used to create the vector form is actually V4SI (the type of the result).
7372 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7373 indicates what is the actual level of parallelism (V8HI in the example), so
7374 that the right vectorization factor would be derived. This vectype
7375 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7376 be used to create the vectorized stmt. The right vectype for the vectorized
7377 stmt is obtained from the type of the result X:
7378 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7380 This means that, contrary to "regular" reductions (or "regular" stmts in
7381 general), the following equation:
7382 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7383 does *NOT* necessarily hold for reduction patterns. */
7385 bool
7386 vectorizable_reduction (loop_vec_info loop_vinfo,
7387 stmt_vec_info stmt_info, slp_tree slp_node,
7388 slp_instance slp_node_instance,
7389 stmt_vector_for_cost *cost_vec)
7391 tree vectype_in = NULL_TREE;
7392 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7393 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7394 stmt_vec_info cond_stmt_vinfo = NULL;
7395 int i;
7396 int ncopies;
7397 bool single_defuse_cycle = false;
7398 bool nested_cycle = false;
7399 bool double_reduc = false;
7400 int vec_num;
7401 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7402 tree cond_reduc_val = NULL_TREE;
7404 /* Make sure it was already recognized as a reduction computation. */
7405 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7406 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7407 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7408 return false;
7410 /* The stmt we store reduction analysis meta on. */
7411 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7412 reduc_info->is_reduc_info = true;
7414 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7416 if (is_a <gphi *> (stmt_info->stmt))
7418 if (slp_node)
7420 /* We eventually need to set a vector type on invariant
7421 arguments. */
7422 unsigned j;
7423 slp_tree child;
7424 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7425 if (!vect_maybe_update_slp_op_vectype
7426 (child, SLP_TREE_VECTYPE (slp_node)))
7428 if (dump_enabled_p ())
7429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7430 "incompatible vector types for "
7431 "invariants\n");
7432 return false;
7435 /* Analysis for double-reduction is done on the outer
7436 loop PHI, nested cycles have no further restrictions. */
7437 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7439 else
7440 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7441 return true;
7444 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7445 stmt_vec_info phi_info = stmt_info;
7446 if (!is_a <gphi *> (stmt_info->stmt))
7448 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7449 return true;
7451 if (slp_node)
7453 slp_node_instance->reduc_phis = slp_node;
7454 /* ??? We're leaving slp_node to point to the PHIs; we only
7455 need it to get at the number of vector stmts which wasn't
7456 yet initialized for the instance root. */
7458 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7460 use_operand_p use_p;
7461 gimple *use_stmt;
7462 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7463 &use_p, &use_stmt);
7464 gcc_assert (res);
7465 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7468 /* PHIs should not participate in patterns. */
7469 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7470 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7472 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7473 and compute the reduction chain length. Discover the real
7474 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7475 tree reduc_def
7476 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7477 loop_latch_edge
7478 (gimple_bb (reduc_def_phi)->loop_father));
7479 unsigned reduc_chain_length = 0;
7480 bool only_slp_reduc_chain = true;
7481 stmt_info = NULL;
7482 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7483 while (reduc_def != PHI_RESULT (reduc_def_phi))
7485 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7486 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7487 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7489 if (dump_enabled_p ())
7490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7491 "reduction chain broken by patterns.\n");
7492 return false;
7494 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7495 only_slp_reduc_chain = false;
7496 /* For epilogue generation live members of the chain need
7497 to point back to the PHI via their original stmt for
7498 info_for_reduction to work. For SLP we need to look at
7499 all lanes here - even though we only will vectorize from
7500 the SLP node with live lane zero the other live lanes also
7501 need to be identified as part of a reduction to be able
7502 to skip code generation for them. */
7503 if (slp_for_stmt_info)
7505 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7506 if (STMT_VINFO_LIVE_P (s))
7507 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7509 else if (STMT_VINFO_LIVE_P (vdef))
7510 STMT_VINFO_REDUC_DEF (def) = phi_info;
7511 gimple_match_op op;
7512 if (!gimple_extract_op (vdef->stmt, &op))
7514 if (dump_enabled_p ())
7515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7516 "reduction chain includes unsupported"
7517 " statement type.\n");
7518 return false;
7520 if (CONVERT_EXPR_CODE_P (op.code))
7522 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7526 "conversion in the reduction chain.\n");
7527 return false;
7530 else if (!stmt_info)
7531 /* First non-conversion stmt. */
7532 stmt_info = vdef;
7533 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7534 reduc_chain_length++;
7535 if (!stmt_info && slp_node)
7536 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7538 /* PHIs should not participate in patterns. */
7539 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7541 if (nested_in_vect_loop_p (loop, stmt_info))
7543 loop = loop->inner;
7544 nested_cycle = true;
7547 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7548 element. */
7549 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7551 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7552 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7554 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7555 gcc_assert (slp_node
7556 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7558 /* 1. Is vectorizable reduction? */
7559 /* Not supportable if the reduction variable is used in the loop, unless
7560 it's a reduction chain. */
7561 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7562 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7563 return false;
7565 /* Reductions that are not used even in an enclosing outer-loop,
7566 are expected to be "live" (used out of the loop). */
7567 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7568 && !STMT_VINFO_LIVE_P (stmt_info))
7569 return false;
7571 /* 2. Has this been recognized as a reduction pattern?
7573 Check if STMT represents a pattern that has been recognized
7574 in earlier analysis stages. For stmts that represent a pattern,
7575 the STMT_VINFO_RELATED_STMT field records the last stmt in
7576 the original sequence that constitutes the pattern. */
7578 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7579 if (orig_stmt_info)
7581 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7582 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7585 /* 3. Check the operands of the operation. The first operands are defined
7586 inside the loop body. The last operand is the reduction variable,
7587 which is defined by the loop-header-phi. */
7589 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7590 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7591 gimple_match_op op;
7592 if (!gimple_extract_op (stmt_info->stmt, &op))
7593 gcc_unreachable ();
7594 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7595 || op.code == WIDEN_SUM_EXPR
7596 || op.code == SAD_EXPR);
7598 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7599 && !SCALAR_FLOAT_TYPE_P (op.type))
7600 return false;
7602 /* Do not try to vectorize bit-precision reductions. */
7603 if (!type_has_mode_precision_p (op.type))
7604 return false;
7606 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7607 which means the only use of the PHI may be in the lane-reducing operation. */
7608 if (lane_reduc_code_p
7609 && reduc_chain_length != 1
7610 && !only_slp_reduc_chain)
7612 if (dump_enabled_p ())
7613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7614 "lane-reducing reduction with extra stmts.\n");
7615 return false;
7618 /* All uses but the last are expected to be defined in the loop.
7619 The last use is the reduction variable. In case of nested cycle this
7620 assumption is not true: we use reduc_index to record the index of the
7621 reduction variable. */
7622 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7623 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7624 /* We need to skip an extra operand for COND_EXPRs with embedded
7625 comparison. */
7626 unsigned opno_adjust = 0;
7627 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7628 opno_adjust = 1;
7629 for (i = 0; i < (int) op.num_ops; i++)
7631 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7632 if (i == 0 && op.code == COND_EXPR)
7633 continue;
7635 stmt_vec_info def_stmt_info;
7636 enum vect_def_type dt;
7637 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7638 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7639 &vectype_op[i], &def_stmt_info))
7641 if (dump_enabled_p ())
7642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7643 "use not simple.\n");
7644 return false;
7646 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7647 continue;
7649 /* For an IFN_COND_OP we might hit the reduction definition operand
7650 twice (once as definition, once as else). */
7651 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7652 continue;
7654 /* There should be only one cycle def in the stmt, the one
7655 leading to reduc_def. */
7656 if (VECTORIZABLE_CYCLE_DEF (dt))
7657 return false;
7659 if (!vectype_op[i])
7660 vectype_op[i]
7661 = get_vectype_for_scalar_type (loop_vinfo,
7662 TREE_TYPE (op.ops[i]), slp_op[i]);
7664 /* To properly compute ncopies we are interested in the widest
7665 non-reduction input type in case we're looking at a widening
7666 accumulation that we later handle in vect_transform_reduction. */
7667 if (lane_reduc_code_p
7668 && vectype_op[i]
7669 && (!vectype_in
7670 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7671 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7672 vectype_in = vectype_op[i];
7674 if (op.code == COND_EXPR)
7676 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7677 if (dt == vect_constant_def)
7679 cond_reduc_dt = dt;
7680 cond_reduc_val = op.ops[i];
7682 if (dt == vect_induction_def
7683 && def_stmt_info
7684 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7686 cond_reduc_dt = dt;
7687 cond_stmt_vinfo = def_stmt_info;
7691 if (!vectype_in)
7692 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7693 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7695 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7696 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7697 /* If we have a condition reduction, see if we can simplify it further. */
7698 if (v_reduc_type == COND_REDUCTION)
7700 if (slp_node)
7701 return false;
7703 /* When the condition uses the reduction value in the condition, fail. */
7704 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7706 if (dump_enabled_p ())
7707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7708 "condition depends on previous iteration\n");
7709 return false;
7712 if (reduc_chain_length == 1
7713 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7714 OPTIMIZE_FOR_SPEED)
7715 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7716 vectype_in,
7717 OPTIMIZE_FOR_SPEED)))
7719 if (dump_enabled_p ())
7720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7721 "optimizing condition reduction with"
7722 " FOLD_EXTRACT_LAST.\n");
7723 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7725 else if (cond_reduc_dt == vect_induction_def)
7727 tree base
7728 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7729 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7731 gcc_assert (TREE_CODE (base) == INTEGER_CST
7732 && TREE_CODE (step) == INTEGER_CST);
7733 cond_reduc_val = NULL_TREE;
7734 enum tree_code cond_reduc_op_code = ERROR_MARK;
7735 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7736 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7738 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7739 above base; punt if base is the minimum value of the type for
7740 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7741 else if (tree_int_cst_sgn (step) == -1)
7743 cond_reduc_op_code = MIN_EXPR;
7744 if (tree_int_cst_sgn (base) == -1)
7745 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7746 else if (tree_int_cst_lt (base,
7747 TYPE_MAX_VALUE (TREE_TYPE (base))))
7748 cond_reduc_val
7749 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7751 else
7753 cond_reduc_op_code = MAX_EXPR;
7754 if (tree_int_cst_sgn (base) == 1)
7755 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7756 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7757 base))
7758 cond_reduc_val
7759 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7761 if (cond_reduc_val)
7763 if (dump_enabled_p ())
7764 dump_printf_loc (MSG_NOTE, vect_location,
7765 "condition expression based on "
7766 "integer induction.\n");
7767 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7768 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7769 = cond_reduc_val;
7770 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7773 else if (cond_reduc_dt == vect_constant_def)
7775 enum vect_def_type cond_initial_dt;
7776 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7777 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7778 if (cond_initial_dt == vect_constant_def
7779 && types_compatible_p (TREE_TYPE (cond_initial_val),
7780 TREE_TYPE (cond_reduc_val)))
7782 tree e = fold_binary (LE_EXPR, boolean_type_node,
7783 cond_initial_val, cond_reduc_val);
7784 if (e && (integer_onep (e) || integer_zerop (e)))
7786 if (dump_enabled_p ())
7787 dump_printf_loc (MSG_NOTE, vect_location,
7788 "condition expression based on "
7789 "compile time constant.\n");
7790 /* Record reduction code at analysis stage. */
7791 STMT_VINFO_REDUC_CODE (reduc_info)
7792 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7793 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7799 if (STMT_VINFO_LIVE_P (phi_info))
7800 return false;
7802 if (slp_node)
7803 ncopies = 1;
7804 else
7805 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7807 gcc_assert (ncopies >= 1);
7809 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7811 if (nested_cycle)
7813 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7814 == vect_double_reduction_def);
7815 double_reduc = true;
7818 /* 4.2. Check support for the epilog operation.
7820 If STMT represents a reduction pattern, then the type of the
7821 reduction variable may be different than the type of the rest
7822 of the arguments. For example, consider the case of accumulation
7823 of shorts into an int accumulator; The original code:
7824 S1: int_a = (int) short_a;
7825 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7827 was replaced with:
7828 STMT: int_acc = widen_sum <short_a, int_acc>
7830 This means that:
7831 1. The tree-code that is used to create the vector operation in the
7832 epilog code (that reduces the partial results) is not the
7833 tree-code of STMT, but is rather the tree-code of the original
7834 stmt from the pattern that STMT is replacing. I.e, in the example
7835 above we want to use 'widen_sum' in the loop, but 'plus' in the
7836 epilog.
7837 2. The type (mode) we use to check available target support
7838 for the vector operation to be created in the *epilog*, is
7839 determined by the type of the reduction variable (in the example
7840 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7841 However the type (mode) we use to check available target support
7842 for the vector operation to be created *inside the loop*, is
7843 determined by the type of the other arguments to STMT (in the
7844 example we'd check this: optab_handler (widen_sum_optab,
7845 vect_short_mode)).
7847 This is contrary to "regular" reductions, in which the types of all
7848 the arguments are the same as the type of the reduction variable.
7849 For "regular" reductions we can therefore use the same vector type
7850 (and also the same tree-code) when generating the epilog code and
7851 when generating the code inside the loop. */
7853 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7855 /* If-conversion might already have created a conditional operation like
7856 IFN_COND_ADD. Use the wrapped tree code for the following checks. */
7857 if (orig_code.is_internal_fn ())
7859 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7860 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7863 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7865 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7866 if (reduction_type == TREE_CODE_REDUCTION)
7868 /* Check whether it's ok to change the order of the computation.
7869 Generally, when vectorizing a reduction we change the order of the
7870 computation. This may change the behavior of the program in some
7871 cases, so we need to check that this is ok. One exception is when
7872 vectorizing an outer-loop: the inner-loop is executed sequentially,
7873 and therefore vectorizing reductions in the inner-loop during
7874 outer-loop vectorization is safe. Likewise, when we are vectorizing
7875 a series of reductions using SLP and the VF is one, the reductions
7876 are performed in scalar order. */
7877 if (slp_node
7878 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7879 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7881 else if (needs_fold_left_reduction_p (op.type, orig_code))
7883 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7884 is not directly used in stmt. */
7885 if (!only_slp_reduc_chain
7886 && reduc_chain_length != 1)
7888 if (dump_enabled_p ())
7889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7890 "in-order reduction chain without SLP.\n");
7891 return false;
7893 STMT_VINFO_REDUC_TYPE (reduc_info)
7894 = reduction_type = FOLD_LEFT_REDUCTION;
7896 else if (!commutative_binary_op_p (orig_code, op.type)
7897 || !associative_binary_op_p (orig_code, op.type))
7899 if (dump_enabled_p ())
7900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7901 "reduction: not commutative/associative\n");
7902 return false;
7906 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7907 && ncopies > 1)
7909 if (dump_enabled_p ())
7910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7911 "multiple types in double reduction or condition "
7912 "reduction or fold-left reduction.\n");
7913 return false;
7916 internal_fn reduc_fn = IFN_LAST;
7917 if (reduction_type == TREE_CODE_REDUCTION
7918 || reduction_type == FOLD_LEFT_REDUCTION
7919 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7920 || reduction_type == CONST_COND_REDUCTION)
7922 if (reduction_type == FOLD_LEFT_REDUCTION
7923 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7924 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7926 if (reduc_fn != IFN_LAST
7927 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7928 OPTIMIZE_FOR_SPEED))
7930 if (dump_enabled_p ())
7931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7932 "reduc op not supported by target.\n");
7934 reduc_fn = IFN_LAST;
7937 else
7939 if (!nested_cycle || double_reduc)
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7943 "no reduc code for scalar code.\n");
7945 return false;
7949 else if (reduction_type == COND_REDUCTION)
7951 int scalar_precision
7952 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7953 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7954 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7955 vectype_out);
7957 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7958 OPTIMIZE_FOR_SPEED))
7959 reduc_fn = IFN_REDUC_MAX;
7961 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7963 if (reduction_type != EXTRACT_LAST_REDUCTION
7964 && (!nested_cycle || double_reduc)
7965 && reduc_fn == IFN_LAST
7966 && !nunits_out.is_constant ())
7968 if (dump_enabled_p ())
7969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7970 "missing target support for reduction on"
7971 " variable-length vectors.\n");
7972 return false;
7975 /* For SLP reductions, see if there is a neutral value we can use. */
7976 tree neutral_op = NULL_TREE;
7977 if (slp_node)
7979 tree initial_value = NULL_TREE;
7980 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7981 initial_value = vect_phi_initial_value (reduc_def_phi);
7982 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7983 orig_code, initial_value);
7986 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7988 /* We can't support in-order reductions of code such as this:
7990 for (int i = 0; i < n1; ++i)
7991 for (int j = 0; j < n2; ++j)
7992 l += a[j];
7994 since GCC effectively transforms the loop when vectorizing:
7996 for (int i = 0; i < n1 / VF; ++i)
7997 for (int j = 0; j < n2; ++j)
7998 for (int k = 0; k < VF; ++k)
7999 l += a[j];
8001 which is a reassociation of the original operation. */
8002 if (dump_enabled_p ())
8003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8004 "in-order double reduction not supported.\n");
8006 return false;
8009 if (reduction_type == FOLD_LEFT_REDUCTION
8010 && slp_node
8011 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8013 /* We cannot use in-order reductions in this case because there is
8014 an implicit reassociation of the operations involved. */
8015 if (dump_enabled_p ())
8016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8017 "in-order unchained SLP reductions not supported.\n");
8018 return false;
8021 /* For double reductions, and for SLP reductions with a neutral value,
8022 we construct a variable-length initial vector by loading a vector
8023 full of the neutral value and then shift-and-inserting the start
8024 values into the low-numbered elements. */
8025 if ((double_reduc || neutral_op)
8026 && !nunits_out.is_constant ()
8027 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8028 vectype_out, OPTIMIZE_FOR_SPEED))
8030 if (dump_enabled_p ())
8031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8032 "reduction on variable-length vectors requires"
8033 " target support for a vector-shift-and-insert"
8034 " operation.\n");
8035 return false;
8038 /* Check extra constraints for variable-length unchained SLP reductions. */
8039 if (slp_node
8040 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8041 && !nunits_out.is_constant ())
8043 /* We checked above that we could build the initial vector when
8044 there's a neutral element value. Check here for the case in
8045 which each SLP statement has its own initial value and in which
8046 that value needs to be repeated for every instance of the
8047 statement within the initial vector. */
8048 unsigned int group_size = SLP_TREE_LANES (slp_node);
8049 if (!neutral_op
8050 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8051 TREE_TYPE (vectype_out)))
8053 if (dump_enabled_p ())
8054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8055 "unsupported form of SLP reduction for"
8056 " variable-length vectors: cannot build"
8057 " initial vector.\n");
8058 return false;
8060 /* The epilogue code relies on the number of elements being a multiple
8061 of the group size. The duplicate-and-interleave approach to setting
8062 up the initial vector does too. */
8063 if (!multiple_p (nunits_out, group_size))
8065 if (dump_enabled_p ())
8066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8067 "unsupported form of SLP reduction for"
8068 " variable-length vectors: the vector size"
8069 " is not a multiple of the number of results.\n");
8070 return false;
8074 if (reduction_type == COND_REDUCTION)
8076 widest_int ni;
8078 if (! max_loop_iterations (loop, &ni))
8080 if (dump_enabled_p ())
8081 dump_printf_loc (MSG_NOTE, vect_location,
8082 "loop count not known, cannot create cond "
8083 "reduction.\n");
8084 return false;
8086 /* Convert backedges to iterations. */
8087 ni += 1;
8089 /* The additional index will be the same type as the condition. Check
8090 that the loop iteration count fits into this type less one (because
8091 we'll use up the zero slot for when there are no matches). */
8092 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8093 if (wi::geu_p (ni, wi::to_widest (max_index)))
8095 if (dump_enabled_p ())
8096 dump_printf_loc (MSG_NOTE, vect_location,
8097 "loop size is greater than data size.\n");
8098 return false;
8102 /* In case the vectorization factor (VF) is bigger than the number
8103 of elements that we can fit in a vectype (nunits), we have to generate
8104 more than one vector stmt - i.e - we need to "unroll" the
8105 vector stmt by a factor VF/nunits. For more details see documentation
8106 in vectorizable_operation. */
8108 /* If the reduction is used in an outer loop we need to generate
8109 VF intermediate results, like so (e.g. for ncopies=2):
8110 r0 = phi (init, r0)
8111 r1 = phi (init, r1)
8112 r0 = x0 + r0;
8113 r1 = x1 + r1;
8114 (i.e. we generate VF results in 2 registers).
8115 In this case we have a separate def-use cycle for each copy, and therefore
8116 for each copy we get the vector def for the reduction variable from the
8117 respective phi node created for this copy.
8119 Otherwise (the reduction is unused in the loop nest), we can combine
8120 together intermediate results, like so (e.g. for ncopies=2):
8121 r = phi (init, r)
8122 r = x0 + r;
8123 r = x1 + r;
8124 (i.e. we generate VF/2 results in a single register).
8125 In this case for each copy we get the vector def for the reduction variable
8126 from the vectorized reduction operation generated in the previous iteration.
8128 This only works when we see both the reduction PHI and its only consumer
8129 in vectorizable_reduction and there are no intermediate stmts
8130 participating. When unrolling we want each unrolled iteration to have its
8131 own reduction accumulator since one of the main goals of unrolling a
8132 reduction is to reduce the aggregate loop-carried latency. */
8133 if (ncopies > 1
8134 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8135 && reduc_chain_length == 1
8136 && loop_vinfo->suggested_unroll_factor == 1)
8137 single_defuse_cycle = true;
8139 if (single_defuse_cycle || lane_reduc_code_p)
8141 gcc_assert (op.code != COND_EXPR);
8143 /* 4. Supportable by target? */
8144 bool ok = true;
8146 /* 4.1. check support for the operation in the loop
8148 This isn't necessary for the lane reduction codes, since they
8149 can only be produced by pattern matching, and it's up to the
8150 pattern matcher to test for support. The main reason for
8151 specifically skipping this step is to avoid rechecking whether
8152 mixed-sign dot-products can be implemented using signed
8153 dot-products. */
8154 machine_mode vec_mode = TYPE_MODE (vectype_in);
8155 if (!lane_reduc_code_p
8156 && !directly_supported_p (op.code, vectype_in, optab_vector))
8158 if (dump_enabled_p ())
8159 dump_printf (MSG_NOTE, "op not supported by target.\n");
8160 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8161 || !vect_can_vectorize_without_simd_p (op.code))
8162 ok = false;
8163 else
8164 if (dump_enabled_p ())
8165 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8168 if (vect_emulated_vector_p (vectype_in)
8169 && !vect_can_vectorize_without_simd_p (op.code))
8171 if (dump_enabled_p ())
8172 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8173 return false;
8176 /* Lane-reducing operations have to go through vect_transform_reduction.
8177 For the other cases try without the single-cycle optimization. */
8178 if (!ok)
8180 if (lane_reduc_code_p)
8181 return false;
8182 else
8183 single_defuse_cycle = false;
8186 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8188 /* If the reduction stmt is one of the patterns that have lane
8189 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8190 if ((ncopies > 1 && ! single_defuse_cycle)
8191 && lane_reduc_code_p)
8193 if (dump_enabled_p ())
8194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8195 "multi def-use cycle not possible for lane-reducing "
8196 "reduction operation\n");
8197 return false;
8200 if (slp_node
8201 && !(!single_defuse_cycle
8202 && !lane_reduc_code_p
8203 && reduction_type != FOLD_LEFT_REDUCTION))
8204 for (i = 0; i < (int) op.num_ops; i++)
8205 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8207 if (dump_enabled_p ())
8208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8209 "incompatible vector types for invariants\n");
8210 return false;
8213 if (slp_node)
8214 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8215 else
8216 vec_num = 1;
8218 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8219 reduction_type, ncopies, cost_vec);
8220 /* Cost the reduction op inside the loop if transformed via
8221 vect_transform_reduction. Otherwise this is costed by the
8222 separate vectorizable_* routines. */
8223 if (single_defuse_cycle || lane_reduc_code_p)
8225 int factor = 1;
8226 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8227 /* Three dot-products and a subtraction. */
8228 factor = 4;
8229 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8230 stmt_info, 0, vect_body);
8233 if (dump_enabled_p ()
8234 && reduction_type == FOLD_LEFT_REDUCTION)
8235 dump_printf_loc (MSG_NOTE, vect_location,
8236 "using an in-order (fold-left) reduction.\n");
8237 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8238 /* All but single-defuse-cycle optimized, lane-reducing and fold-left
8239 reductions go through their own vectorizable_* routines. */
8240 if (!single_defuse_cycle
8241 && !lane_reduc_code_p
8242 && reduction_type != FOLD_LEFT_REDUCTION)
8244 stmt_vec_info tem
8245 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8246 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8248 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8249 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8251 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8252 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8254 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8256 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8257 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8258 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8260 if (reduction_type != FOLD_LEFT_REDUCTION
8261 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8262 && (cond_fn == IFN_LAST
8263 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8264 OPTIMIZE_FOR_SPEED)))
8266 if (dump_enabled_p ())
8267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8268 "can't operate on partial vectors because"
8269 " no conditional operation is available.\n");
8270 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8272 else if (reduction_type == FOLD_LEFT_REDUCTION
8273 && reduc_fn == IFN_LAST
8274 && !expand_vec_cond_expr_p (vectype_in,
8275 truth_type_for (vectype_in),
8276 SSA_NAME))
8278 if (dump_enabled_p ())
8279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8280 "can't operate on partial vectors because"
8281 " no conditional operation is available.\n");
8282 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8284 else if (reduction_type == FOLD_LEFT_REDUCTION
8285 && internal_fn_mask_index (reduc_fn) == -1
8286 && FLOAT_TYPE_P (vectype_in)
8287 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8289 if (dump_enabled_p ())
8290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8291 "can't operate on partial vectors because"
8292 " signed zeros cannot be preserved.\n");
8293 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8295 else
8297 internal_fn mask_reduc_fn
8298 = get_masked_reduction_fn (reduc_fn, vectype_in);
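/* MASK_LEN_ style reductions are length-controlled, so record a loop
   length for them; all other partially-vectorized reductions are
   controlled by a loop mask.  */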
8300 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8301 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8302 vectype_in, 1);
8303 else
8304 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8305 vectype_in, NULL);
8308 return true;
8311 /* STMT_INFO is a dot-product reduction whose multiplication operands
8312 have different signs. Emit a sequence to emulate the operation
8313 using a series of signed DOT_PROD_EXPRs and return the last
8314 statement generated. VEC_DEST is the result of the vector operation
8315 and VOP lists its inputs. */
8317 static gassign *
8318 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8319 gimple_stmt_iterator *gsi, tree vec_dest,
8320 tree vop[3])
8322 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8323 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8324 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8325 gimple *new_stmt;
8327 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8328 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8329 std::swap (vop[0], vop[1]);
8331 /* Convert all inputs to signed types. */
8332 for (int i = 0; i < 3; ++i)
8333 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8335 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8336 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8337 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8338 vop[i] = tmp;
8341 /* In the comments below we assume 8-bit inputs for simplicity,
8342 but the approach works for any full integer type. */
8344 /* Create a vector of -128. */
8345 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8346 tree min_narrow = build_vector_from_val (narrow_vectype,
8347 min_narrow_elttype);
8349 /* Create a vector of 64. */
8350 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8351 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8352 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8354 /* Emit: SUB_RES = VOP[0] - 128. */
8355 tree sub_res = make_ssa_name (narrow_vectype);
8356 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8357 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8359 /* Emit:
8361 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8362 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8363 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8365 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8366 Doing the two 64 * y steps first allows more time to compute x. */
8367 tree stage1 = make_ssa_name (wide_vectype);
8368 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8369 vop[1], half_narrow, vop[2]);
8370 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8372 tree stage2 = make_ssa_name (wide_vectype);
8373 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8374 vop[1], half_narrow, stage1);
8375 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8377 tree stage3 = make_ssa_name (wide_vectype);
8378 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8379 sub_res, vop[1], stage2);
8380 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8382 /* Convert STAGE3 to the reduction type. */
8383 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
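/* A worked instance of the emulation above, for illustration only
   (arbitrary 8-bit inputs, not taken from the code): let x = 200
   (unsigned) and y = -3 (signed), with a zero accumulator.

     SUB_RES = 200 - 128          =   72
     STAGE1  = 64 * -3 + 0        = -192
     STAGE2  = 64 * -3 + STAGE1   = -384
     STAGE3  = 72 * -3 + STAGE2   = -600

   which equals 200 * -3, so the unsigned-by-signed product has been
   accumulated using signed 8-bit multiplications only.  */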
8386 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8387 value. */
8389 bool
8390 vect_transform_reduction (loop_vec_info loop_vinfo,
8391 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8392 gimple **vec_stmt, slp_tree slp_node)
8394 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8395 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8396 int i;
8397 int ncopies;
8398 int vec_num;
8400 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8401 gcc_assert (reduc_info->is_reduc_info);
8403 if (nested_in_vect_loop_p (loop, stmt_info))
8405 loop = loop->inner;
8406 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8409 gimple_match_op op;
8410 if (!gimple_extract_op (stmt_info->stmt, &op))
8411 gcc_unreachable ();
8413 /* All uses but the last are expected to be defined in the loop.
8414 The last use is the reduction variable. In case of nested cycle this
8415 assumption is not true: we use reduc_index to record the index of the
8416 reduction variable. */
8417 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8418 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8419 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8420 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8422 if (slp_node)
8424 ncopies = 1;
8425 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8427 else
8429 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8430 vec_num = 1;
8433 code_helper code = canonicalize_code (op.code, op.type);
8434 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8436 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8437 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8438 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8440 /* Transform. */
8441 tree new_temp = NULL_TREE;
8442 auto_vec<tree> vec_oprnds0;
8443 auto_vec<tree> vec_oprnds1;
8444 auto_vec<tree> vec_oprnds2;
8445 tree def0;
8447 if (dump_enabled_p ())
8448 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8450 /* FORNOW: Multiple types are not supported for condition. */
8451 if (code == COND_EXPR)
8452 gcc_assert (ncopies == 1);
8454 /* A binary COND_OP reduction must have the same definition and else
8455 value. */
8456 bool cond_fn_p = code.is_internal_fn ()
8457 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8458 if (cond_fn_p)
8460 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8461 || code == IFN_COND_MUL || code == IFN_COND_AND
8462 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8463 gcc_assert (op.num_ops == 4
8464 && (op.ops[reduc_index]
8465 == op.ops[internal_fn_else_index ((internal_fn) code)]));
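/* E.g. a reduction via IFN_COND_ADD (MASK, A, B, ELSE) with accumulator
   A must use ELSE == A, so that lanes where MASK is false simply carry
   the accumulator through unchanged.  */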
8468 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8470 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8471 if (reduction_type == FOLD_LEFT_REDUCTION)
8473 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8474 gcc_assert (code.is_tree_code () || cond_fn_p);
8475 return vectorize_fold_left_reduction
8476 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8477 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8478 reduc_index, masks, lens);
8481 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8482 gcc_assert (single_defuse_cycle
8483 || code == DOT_PROD_EXPR
8484 || code == WIDEN_SUM_EXPR
8485 || code == SAD_EXPR);
8487 /* Create the destination vector */
8488 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8489 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8491 /* Get NCOPIES vector definitions for all operands except the reduction
8492 definition. */
8493 if (!cond_fn_p)
8495 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8496 single_defuse_cycle && reduc_index == 0
8497 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8498 single_defuse_cycle && reduc_index == 1
8499 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8500 op.num_ops == 3
8501 && !(single_defuse_cycle && reduc_index == 2)
8502 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8504 else
8506 /* For a conditional operation pass the truth type as mask
8507 vectype. */
8508 gcc_assert (single_defuse_cycle
8509 && (reduc_index == 1 || reduc_index == 2));
8510 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8511 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8512 reduc_index == 1 ? NULL_TREE : op.ops[1],
8513 NULL_TREE, &vec_oprnds1,
8514 reduc_index == 2 ? NULL_TREE : op.ops[2],
8515 NULL_TREE, &vec_oprnds2);
8518 /* For single def-use cycles get one copy of the vectorized reduction
8519 definition. */
8520 if (single_defuse_cycle)
8522 gcc_assert (!slp_node);
8523 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8524 op.ops[reduc_index],
8525 reduc_index == 0 ? &vec_oprnds0
8526 : (reduc_index == 1 ? &vec_oprnds1
8527 : &vec_oprnds2));
8530 bool emulated_mixed_dot_prod
8531 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8532 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8534 gimple *new_stmt;
8535 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8536 if (masked_loop_p && !mask_by_cond_expr)
8538 /* No conditional ifns have been defined for dot-product yet. */
8539 gcc_assert (code != DOT_PROD_EXPR);
8541 /* Make sure that the reduction accumulator is vop[0]. */
8542 if (reduc_index == 1)
8544 gcc_assert (commutative_binary_op_p (code, op.type));
8545 std::swap (vop[0], vop[1]);
8547 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8548 vec_num * ncopies, vectype_in, i);
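/* Emit COND_FN (MASK, VOP[0], VOP[1], VOP[0]): active lanes compute
   VOP[0] OP VOP[1], inactive lanes pass the accumulator VOP[0]
   through unchanged.  */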
8549 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8550 vop[0], vop[1], vop[0]);
8551 new_temp = make_ssa_name (vec_dest, call);
8552 gimple_call_set_lhs (call, new_temp);
8553 gimple_call_set_nothrow (call, true);
8554 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8555 new_stmt = call;
8557 else
8559 if (op.num_ops >= 3)
8560 vop[2] = vec_oprnds2[i];
8562 if (masked_loop_p && mask_by_cond_expr)
8564 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8565 vec_num * ncopies, vectype_in, i);
8566 build_vect_cond_expr (code, vop, mask, gsi);
8569 if (emulated_mixed_dot_prod)
8570 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8571 vec_dest, vop);
8573 else if (code.is_internal_fn () && !cond_fn_p)
8574 new_stmt = gimple_build_call_internal (internal_fn (code),
8575 op.num_ops,
8576 vop[0], vop[1], vop[2]);
8577 else if (code.is_internal_fn () && cond_fn_p)
8578 new_stmt = gimple_build_call_internal (internal_fn (code),
8579 op.num_ops,
8580 vop[0], vop[1], vop[2],
8581 vop[1]);
8582 else
8583 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8584 vop[0], vop[1], vop[2]);
8585 new_temp = make_ssa_name (vec_dest, new_stmt);
8586 gimple_set_lhs (new_stmt, new_temp);
8587 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8590 if (slp_node)
8591 slp_node->push_vec_def (new_stmt);
8592 else if (single_defuse_cycle
8593 && i < ncopies - 1)
8595 if (reduc_index == 0)
8596 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8597 else if (reduc_index == 1)
8598 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8599 else if (reduc_index == 2)
8600 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8602 else
8603 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8606 if (!slp_node)
8607 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8609 return true;
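/* For a non-SLP single def-use cycle with ncopies > 1 the loop above
   chains the copies through the accumulator.  An illustrative sketch
   for ncopies == 2 and a PLUS reduction (the acc* names are made up
   for the example):

     acc1 = vec_oprnds0[0] + acc0;   <- copy 0, acc0 comes from the PHI
     acc2 = vec_oprnds0[1] + acc1;   <- copy 1, fed back via safe_push

   i.e. each copy's result is pushed back as the reduction operand of
   the next copy and the last one becomes the value carried around the
   loop.  */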
8612 /* Transform phase of a cycle PHI. */
8614 bool
8615 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8616 stmt_vec_info stmt_info, gimple **vec_stmt,
8617 slp_tree slp_node, slp_instance slp_node_instance)
8619 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8621 int i;
8622 int ncopies;
8623 int j;
8624 bool nested_cycle = false;
8625 int vec_num;
8627 if (nested_in_vect_loop_p (loop, stmt_info))
8629 loop = loop->inner;
8630 nested_cycle = true;
8633 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8634 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8635 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8636 gcc_assert (reduc_info->is_reduc_info);
8638 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8639 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8640 /* Leave the scalar phi in place. */
8641 return true;
8643 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8644 /* For a nested cycle we do not fill the above. */
8645 if (!vectype_in)
8646 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8647 gcc_assert (vectype_in);
8649 if (slp_node)
8651 /* The size vect_schedule_slp_instance computes is off for us. */
8652 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8653 * SLP_TREE_LANES (slp_node), vectype_in);
8654 ncopies = 1;
8656 else
8658 vec_num = 1;
8659 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8662 /* Check whether we should use a single PHI node and accumulate
8663 vectors to one before the backedge. */
8664 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8665 ncopies = 1;
8667 /* Create the destination vector */
8668 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8669 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8670 vectype_out);
8672 /* Get the loop-entry arguments. */
8673 tree vec_initial_def = NULL_TREE;
8674 auto_vec<tree> vec_initial_defs;
8675 if (slp_node)
8677 vec_initial_defs.reserve (vec_num);
8678 if (nested_cycle)
8680 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8681 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8682 &vec_initial_defs);
8684 else
8686 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8687 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8688 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8690 unsigned int num_phis = stmts.length ();
8691 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8692 num_phis = 1;
8693 initial_values.reserve (num_phis);
8694 for (unsigned int i = 0; i < num_phis; ++i)
8696 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8697 initial_values.quick_push (vect_phi_initial_value (this_phi));
8699 if (vec_num == 1)
8700 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8701 if (!initial_values.is_empty ())
8703 tree initial_value
8704 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8705 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8706 tree neutral_op
8707 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8708 code, initial_value);
8709 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8710 &vec_initial_defs, vec_num,
8711 stmts.length (), neutral_op);
8715 else
8717 /* Get at the scalar def before the loop, that defines the initial
8718 value of the reduction variable. */
8719 tree initial_def = vect_phi_initial_value (phi);
8720 reduc_info->reduc_initial_values.safe_push (initial_def);
8721 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8722 and we can't use zero for induc_val, use initial_def. Similarly
8723 for REDUC_MIN and initial_def larger than the base. */
8724 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8726 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8727 if (TREE_CODE (initial_def) == INTEGER_CST
8728 && !integer_zerop (induc_val)
8729 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8730 && tree_int_cst_lt (initial_def, induc_val))
8731 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8732 && tree_int_cst_lt (induc_val, initial_def))))
8734 induc_val = initial_def;
8735 /* Communicate we used the initial_def to epilogue
8736 generation. */
8737 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8739 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8741 else if (nested_cycle)
8743 /* Do not use an adjustment def as that case is not supported
8744 correctly if ncopies is not one. */
8745 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8746 ncopies, initial_def,
8747 &vec_initial_defs);
8749 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8750 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8751 /* Fill the initial vector with the initial scalar value. */
8752 vec_initial_def
8753 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8754 initial_def, initial_def);
8755 else
8757 if (ncopies == 1)
8758 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8759 if (!reduc_info->reduc_initial_values.is_empty ())
8761 initial_def = reduc_info->reduc_initial_values[0];
8762 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8763 tree neutral_op
8764 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8765 code, initial_def);
8766 gcc_assert (neutral_op);
8767 /* Try to simplify the vector initialization by applying an
8768 adjustment after the reduction has been performed; e.g. for a PLUS reduction with initial value 5, reduce from the neutral value 0 and add 5 back in the epilogue. */
8769 if (!reduc_info->reused_accumulator
8770 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8771 && !operand_equal_p (neutral_op, initial_def))
8773 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8774 = initial_def;
8775 initial_def = neutral_op;
8777 vec_initial_def
8778 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8779 initial_def, neutral_op);
8784 if (vec_initial_def)
8786 vec_initial_defs.create (ncopies);
8787 for (i = 0; i < ncopies; ++i)
8788 vec_initial_defs.quick_push (vec_initial_def);
8791 if (auto *accumulator = reduc_info->reused_accumulator)
8793 tree def = accumulator->reduc_input;
8794 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8796 unsigned int nreduc;
8797 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8798 (TREE_TYPE (def)),
8799 TYPE_VECTOR_SUBPARTS (vectype_out),
8800 &nreduc);
8801 gcc_assert (res);
8802 gimple_seq stmts = NULL;
8803 /* Reduce the single vector to a smaller one. */
8804 if (nreduc != 1)
8806 /* Perform the reduction in the appropriate type. */
8807 tree rvectype = vectype_out;
8808 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8809 TREE_TYPE (TREE_TYPE (def))))
8810 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8811 TYPE_VECTOR_SUBPARTS
8812 (vectype_out));
8813 def = vect_create_partial_epilog (def, rvectype,
8814 STMT_VINFO_REDUC_CODE
8815 (reduc_info),
8816 &stmts);
8818 /* The epilogue loop might use a different vector mode, like
8819 VNx2DI vs. V2DI. */
8820 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8822 tree reduc_type = build_vector_type_for_mode
8823 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8824 def = gimple_convert (&stmts, reduc_type, def);
8826 /* Adjust the input so we pick up the partially reduced value
8827 for the skip edge in vect_create_epilog_for_reduction. */
8828 accumulator->reduc_input = def;
8829 /* And the reduction could be carried out using a different sign. */
8830 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8831 def = gimple_convert (&stmts, vectype_out, def);
8832 if (loop_vinfo->main_loop_edge)
8834 /* While we'd like to insert on the edge, this would split
8835 blocks and disturb bookkeeping; we will also eventually
8836 need this on the skip edge. Rely on sinking to
8837 fix up optimal placement and insert in the predecessor. */
8838 gimple_stmt_iterator gsi
8839 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8840 /* Insert before a cond that eventually skips the
8841 epilogue. */
8842 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8843 gsi_prev (&gsi);
8844 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8846 else
8847 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8848 stmts);
8850 if (loop_vinfo->main_loop_edge)
8851 vec_initial_defs[0]
8852 = vect_get_main_loop_result (loop_vinfo, def,
8853 vec_initial_defs[0]);
8854 else
8855 vec_initial_defs.safe_push (def);
8858 /* Generate the reduction PHIs upfront. */
8859 for (i = 0; i < vec_num; i++)
8861 tree vec_init_def = vec_initial_defs[i];
8862 for (j = 0; j < ncopies; j++)
8864 /* Create the reduction-phi that defines the reduction
8865 operand. */
8866 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8868 /* Set the loop-entry arg of the reduction-phi. */
8869 if (j != 0 && nested_cycle)
8870 vec_init_def = vec_initial_defs[j];
8871 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8872 UNKNOWN_LOCATION);
8874 /* The loop-latch arg is set in epilogue processing. */
8876 if (slp_node)
8877 slp_node->push_vec_def (new_phi);
8878 else
8880 if (j == 0)
8881 *vec_stmt = new_phi;
8882 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8887 return true;
8890 /* Vectorizes LC PHIs. */
8892 bool
8893 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8894 stmt_vec_info stmt_info, gimple **vec_stmt,
8895 slp_tree slp_node)
8897 if (!loop_vinfo
8898 || !is_a <gphi *> (stmt_info->stmt)
8899 || gimple_phi_num_args (stmt_info->stmt) != 1)
8900 return false;
8902 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8903 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8904 return false;
8906 if (!vec_stmt) /* transformation not required. */
8908 /* Deal with copies from externs or constants that are disguised as
8909 loop-closed PHI nodes (PR97886). */
8910 if (slp_node
8911 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8912 SLP_TREE_VECTYPE (slp_node)))
8914 if (dump_enabled_p ())
8915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8916 "incompatible vector types for invariants\n");
8917 return false;
8919 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8920 return true;
8923 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8924 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8925 basic_block bb = gimple_bb (stmt_info->stmt);
8926 edge e = single_pred_edge (bb);
8927 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8928 auto_vec<tree> vec_oprnds;
8929 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8930 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8931 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8932 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8934 /* Create the vectorized LC PHI node. */
8935 gphi *new_phi = create_phi_node (vec_dest, bb);
8936 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8937 if (slp_node)
8938 slp_node->push_vec_def (new_phi);
8939 else
8940 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8942 if (!slp_node)
8943 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8945 return true;
8948 /* Vectorizes PHIs. */
8950 bool
8951 vectorizable_phi (vec_info *,
8952 stmt_vec_info stmt_info, gimple **vec_stmt,
8953 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8955 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8956 return false;
8958 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8959 return false;
8961 tree vectype = SLP_TREE_VECTYPE (slp_node);
8963 if (!vec_stmt) /* transformation not required. */
8965 slp_tree child;
8966 unsigned i;
8967 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8968 if (!child)
8970 if (dump_enabled_p ())
8971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8972 "PHI node with unvectorized backedge def\n");
8973 return false;
8975 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8977 if (dump_enabled_p ())
8978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8979 "incompatible vector types for invariants\n");
8980 return false;
8982 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8983 && !useless_type_conversion_p (vectype,
8984 SLP_TREE_VECTYPE (child)))
8986 /* With bools we can have mask and non-mask precision vectors
8987 or different non-mask precisions. While pattern recog is
8988 supposed to guarantee consistency here, bugs in it can cause
8989 mismatches (PR103489 and PR103800 for example).
8990 Deal with them here instead of ICEing later. */
8991 if (dump_enabled_p ())
8992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8993 "incompatible vector type setup from "
8994 "bool pattern detection\n");
8995 return false;
8998 /* For single-argument PHIs assume coalescing which means zero cost
8999 for the scalar and the vector PHIs. This avoids artificially
9000 favoring the vector path (but may pessimize it in some cases). */
9001 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9002 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9003 vector_stmt, stmt_info, vectype, 0, vect_body);
9004 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9005 return true;
9008 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9009 basic_block bb = gimple_bb (stmt_info->stmt);
9010 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9011 auto_vec<gphi *> new_phis;
9012 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9014 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9016 /* Skip not yet vectorized defs. */
9017 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9018 && SLP_TREE_VEC_DEFS (child).is_empty ())
9019 continue;
9021 auto_vec<tree> vec_oprnds;
9022 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9023 if (!new_phis.exists ())
9025 new_phis.create (vec_oprnds.length ());
9026 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9029 /* Create the vectorized PHI node. */
9029 new_phis.quick_push (create_phi_node (vec_dest, bb));
9030 slp_node->push_vec_def (new_phis[j]);
9033 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9034 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9035 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9037 /* We should have at least one already vectorized child. */
9038 gcc_assert (new_phis.exists ());
9040 return true;
9043 /* Vectorizes first order recurrences. An overview of the transformation
9044 is described below. Suppose we have the following loop.
9046 int t = 0;
9047 for (int i = 0; i < n; ++i)
9049 b[i] = a[i] - t;
9050 t = a[i];
9053 There is a first-order recurrence on 't' (it carries the previous iteration's a[i]). For this loop, the scalar IR
9054 looks (simplified) like:
9056 scalar.preheader:
9057 init = 0;
9059 scalar.body:
9060 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9061 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9062 _1 = a[i]
9063 b[i] = _1 - _2
9064 if (i < n) goto scalar.body
9066 In this example, _2 is a recurrence because its value depends on the
9067 previous iteration. We vectorize this as (VF = 4)
9069 vector.preheader:
9070 vect_init = vect_cst(..., ..., ..., 0)
9072 vector.body
9073 i = PHI <0(vector.preheader), i+4(vector.body)>
9074 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9075 vect_2 = a[i, i+1, i+2, i+3];
9076 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9077 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9078 if (..) goto vector.body
9080 In this function, vectorizable_recurr, we code generate both the
9081 vector PHI node and the permute since those together compute the
9082 vectorized value of the scalar PHI. We do not yet have the
9083 backedge value to fill in there nor into the vec_perm. Those
9084 are filled in maybe_set_vectorized_backedge_value and
9085 vect_schedule_scc.
9087 TODO: Since the scalar loop does not have a use of the recurrence
9088 outside of the loop, the natural way to implement peeling via
9089 vectorizing the live value doesn't work. For now peeling of loops
9090 with a recurrence is not implemented. For SLP the supported cases
9091 are restricted to those requiring a single vector recurrence PHI. */
9093 bool
9094 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9095 gimple **vec_stmt, slp_tree slp_node,
9096 stmt_vector_for_cost *cost_vec)
9098 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9099 return false;
9101 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9103 /* So far we only support first-order recurrence auto-vectorization. */
9104 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9105 return false;
9107 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9108 unsigned ncopies;
9109 if (slp_node)
9110 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9111 else
9112 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9113 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9114 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9115 /* We need to be able to make progress with a single vector. */
9116 if (maybe_gt (dist * 2, nunits))
9118 if (dump_enabled_p ())
9119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9120 "first order recurrence exceeds half of "
9121 "a vector\n");
9122 return false;
9125 /* First-order recurrence autovectorization needs to handle permutation
9126 with indices = [nunits-1, nunits, nunits+1, ...]. */
9127 vec_perm_builder sel (nunits, 1, 3);
9128 for (int i = 0; i < 3; ++i)
9129 sel.quick_push (nunits - dist + i);
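/* E.g. for nunits == 4 and dist == 1 this encodes the { 3, 4, 5, 6 }
   permutation shown in the example above.  */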
9130 vec_perm_indices indices (sel, 2, nunits);
9132 if (!vec_stmt) /* transformation not required. */
9134 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9135 indices))
9136 return false;
9138 if (slp_node)
9140 /* We eventually need to set a vector type on invariant
9141 arguments. */
9142 unsigned j;
9143 slp_tree child;
9144 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9145 if (!vect_maybe_update_slp_op_vectype
9146 (child, SLP_TREE_VECTYPE (slp_node)))
9148 if (dump_enabled_p ())
9149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9150 "incompatible vector types for "
9151 "invariants\n");
9152 return false;
9155 /* The recurrence costs the initialization vector and one permute
9156 for each copy. */
9157 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9158 stmt_info, 0, vect_prologue);
9159 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9160 stmt_info, 0, vect_body);
9161 if (dump_enabled_p ())
9162 dump_printf_loc (MSG_NOTE, vect_location,
9163 "vectorizable_recurr: inside_cost = %d, "
9164 "prologue_cost = %d .\n", inside_cost,
9165 prologue_cost);
9167 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9168 return true;
9171 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9172 basic_block bb = gimple_bb (phi);
9173 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9174 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9176 gimple_seq stmts = NULL;
9177 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9178 gsi_insert_seq_on_edge_immediate (pe, stmts);
9180 tree vec_init = build_vector_from_val (vectype, preheader);
9181 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9183 /* Create the vectorized first-order PHI node. */
9184 tree vec_dest = vect_get_new_vect_var (vectype,
9185 vect_simple_var, "vec_recur_");
9186 gphi *new_phi = create_phi_node (vec_dest, bb);
9187 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9189 /* Insert the shuffles for the first-order recurrence autovectorization:
9190 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9191 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9193 /* Insert the required permute after the latch definition. The
9194 second and later operands are tentative and will be updated when we have
9195 vectorized the latch definition. */
9196 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9197 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9198 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9199 gsi_next (&gsi2);
9201 for (unsigned i = 0; i < ncopies; ++i)
9203 vec_dest = make_ssa_name (vectype);
9204 gassign *vperm
9205 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9206 i == 0 ? gimple_phi_result (new_phi) : NULL,
9207 NULL, perm);
9208 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9210 if (slp_node)
9211 slp_node->push_vec_def (vperm);
9212 else
9213 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9216 if (!slp_node)
9217 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9218 return true;
9221 /* Return true if VECTYPE represents a vector that requires lowering
9222 by the vector lowering pass. */
9224 bool
9225 vect_emulated_vector_p (tree vectype)
9227 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9228 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9229 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9232 /* Return true if we can emulate CODE on an integer mode representation
9233 of a vector. */
9235 bool
9236 vect_can_vectorize_without_simd_p (tree_code code)
9238 switch (code)
9240 case PLUS_EXPR:
9241 case MINUS_EXPR:
9242 case NEGATE_EXPR:
9243 case BIT_AND_EXPR:
9244 case BIT_IOR_EXPR:
9245 case BIT_XOR_EXPR:
9246 case BIT_NOT_EXPR:
9247 return true;
9249 default:
9250 return false;
9254 /* Likewise, but taking a code_helper. */
9256 bool
9257 vect_can_vectorize_without_simd_p (code_helper code)
9259 return (code.is_tree_code ()
9260 && vect_can_vectorize_without_simd_p (tree_code (code)));
9263 /* Create vector init for vectorized iv. */
9264 static tree
9265 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9266 tree step_expr, poly_uint64 nunits,
9267 tree vectype,
9268 enum vect_induction_op_type induction_type)
9270 unsigned HOST_WIDE_INT const_nunits;
9271 tree vec_shift, vec_init, new_name;
9272 unsigned i;
9273 tree itype = TREE_TYPE (vectype);
9275 /* iv_loop is the loop to be vectorized. Create the vector of initial
9276 values, e.g. [X, X op S, X op 2*S, X op 3*S] where op depends on induction_type (S = step_expr, X = init_expr). */
9277 new_name = gimple_convert (stmts, itype, init_expr);
9278 switch (induction_type)
9280 case vect_step_op_shr:
9281 case vect_step_op_shl:
9282 /* Build the initial vector and shift it by the series [0, S, 2*S, ...]. */
9283 vec_init = gimple_build_vector_from_val (stmts,
9284 vectype,
9285 new_name);
9286 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9287 build_zero_cst (itype), step_expr);
9288 vec_init = gimple_build (stmts,
9289 (induction_type == vect_step_op_shr
9290 ? RSHIFT_EXPR : LSHIFT_EXPR),
9291 vectype, vec_init, vec_shift);
9292 break;
9294 case vect_step_op_neg:
9296 vec_init = gimple_build_vector_from_val (stmts,
9297 vectype,
9298 new_name);
9299 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9300 vectype, vec_init);
9301 /* The encoding has 2 interleaved stepped patterns. */
9302 vec_perm_builder sel (nunits, 2, 3);
9303 sel.quick_grow (6);
9304 for (i = 0; i < 3; i++)
9306 sel[2 * i] = i;
9307 sel[2 * i + 1] = i + nunits;
9309 vec_perm_indices indices (sel, 2, nunits);
9310 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9311 fail when vec_init is const vector. In that situation vec_perm is not
9312 really needed. */
9313 tree perm_mask_even
9314 = vect_gen_perm_mask_any (vectype, indices);
9315 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9316 vectype,
9317 vec_init, vec_neg,
9318 perm_mask_even);
9320 break;
9322 case vect_step_op_mul:
9324 /* Use an unsigned multiplication to avoid undefined behavior from signed integer overflow. */
9325 gcc_assert (nunits.is_constant (&const_nunits));
9326 tree utype = unsigned_type_for (itype);
9327 tree uvectype = build_vector_type (utype,
9328 TYPE_VECTOR_SUBPARTS (vectype));
9329 new_name = gimple_convert (stmts, utype, new_name);
9330 vec_init = gimple_build_vector_from_val (stmts,
9331 uvectype,
9332 new_name);
9333 tree_vector_builder elts (uvectype, const_nunits, 1);
9334 tree elt_step = build_one_cst (utype);
9336 elts.quick_push (elt_step);
9337 for (i = 1; i < const_nunits; i++)
9339 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. step_expr**i. */
9340 elt_step = gimple_build (stmts, MULT_EXPR,
9341 utype, elt_step, step_expr);
9342 elts.quick_push (elt_step);
9344 /* Create a vector from [new_name_0, new_name_1, ...,
9345 new_name_nunits-1]. */
9346 tree vec_mul = gimple_build_vector (stmts, &elts);
9347 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9348 vec_init, vec_mul);
9349 vec_init = gimple_convert (stmts, vectype, vec_init);
9351 break;
9353 default:
9354 gcc_unreachable ();
9357 return vec_init;
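/* For illustration, with a four-lane vector, X = init_expr and
   S = step_expr, the initial vectors built above are:

     shr:  [ X, X >> S, X >> 2*S, X >> 3*S ]
     shl:  [ X, X << S, X << 2*S, X << 3*S ]
     neg:  [ X, -X, X, -X ]
     mul:  [ X, X*S, X*S*S, X*S*S*S ]   (computed in the unsigned type)  */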
9360 /* Peel init_expr by skip_niter for induction_type. */
9361 tree
9362 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9363 tree skip_niters, tree step_expr,
9364 enum vect_induction_op_type induction_type)
9366 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9367 tree type = TREE_TYPE (init_expr);
9368 unsigned prec = TYPE_PRECISION (type);
9369 switch (induction_type)
9371 case vect_step_op_neg:
9372 if (TREE_INT_CST_LOW (skip_niters) % 2)
9373 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9374 /* else no change. */
9375 break;
9377 case vect_step_op_shr:
9378 case vect_step_op_shl:
9379 skip_niters = gimple_convert (stmts, type, skip_niters);
9380 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9381 When the shift amount >= precision, we need to avoid undefined behavior.
9382 In the original loop there is no undefined behavior, and according to the semantics
9383 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9384 if (!tree_fits_uhwi_p (step_expr)
9385 || tree_to_uhwi (step_expr) >= prec)
9387 if (induction_type == vect_step_op_shl
9388 || TYPE_UNSIGNED (type))
9389 init_expr = build_zero_cst (type);
9390 else
9391 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9392 init_expr,
9393 wide_int_to_tree (type, prec - 1));
9395 else
9396 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9397 ? RSHIFT_EXPR : LSHIFT_EXPR),
9398 type, init_expr, step_expr);
9399 break;
9401 case vect_step_op_mul:
9403 tree utype = unsigned_type_for (type);
9404 init_expr = gimple_convert (stmts, utype, init_expr);
9405 wide_int skipn = wi::to_wide (skip_niters);
9406 wide_int begin = wi::to_wide (step_expr);
9407 auto_mpz base, exp, mod, res;
9408 wi::to_mpz (begin, base, TYPE_SIGN (type));
9409 wi::to_mpz (skipn, exp, UNSIGNED);
9410 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9411 mpz_powm (res, base, exp, mod);
9412 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9413 tree mult_expr = wide_int_to_tree (utype, begin);
9414 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9415 init_expr, mult_expr);
9416 init_expr = gimple_convert (stmts, type, init_expr);
9418 break;
9420 default:
9421 gcc_unreachable ();
9424 return init_expr;
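/* A worked instance of the peeling above, for illustration only
   (arbitrary values): skipping skip_niters == 3 iterations of a mul IV
   with init X and step S turns the init into X * (S*S*S mod 2^prec),
   computed with mpz_powm; an odd skip count of a neg IV just negates X;
   and a shift IV is shifted by 3*S, or forced to 0 (lshr/shl) resp.
   X >> (prec - 1) (ashr) when 3*S >= prec.  */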
9427 /* Create vector step for vectorized iv. */
9428 static tree
9429 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9430 poly_uint64 vf,
9431 enum vect_induction_op_type induction_type)
9433 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9434 tree new_name = NULL;
9435 /* Step should be pow (step, vf) for mult induction. */
9436 if (induction_type == vect_step_op_mul)
9438 gcc_assert (vf.is_constant ());
9439 wide_int begin = wi::to_wide (step_expr);
9441 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9442 begin = wi::mul (begin, wi::to_wide (step_expr));
9444 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9446 else if (induction_type == vect_step_op_neg)
9447 /* Do nothing. */
9449 else
9450 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9451 expr, step_expr);
9452 return new_name;
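/* For illustration, with VF == 4 the per-vector-iteration step computed
   here is S*S*S*S for a mul IV, 4*S for a shift IV, and nothing for a
   neg IV (each lane is negated an even number of times per vector
   iteration, assuming an even number of lanes, so the vector is reused
   as-is).  */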
9455 static tree
9456 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9457 stmt_vec_info stmt_info,
9458 tree new_name, tree vectype,
9459 enum vect_induction_op_type induction_type)
9461 /* No step is needed for neg induction. */
9462 if (induction_type == vect_step_op_neg)
9463 return NULL;
9465 tree t = unshare_expr (new_name);
9466 gcc_assert (CONSTANT_CLASS_P (new_name)
9467 || TREE_CODE (new_name) == SSA_NAME);
9468 tree new_vec = build_vector_from_val (vectype, t);
9469 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9470 new_vec, vectype, NULL);
9471 return vec_step;
9474 /* Update the vectorized iv with vec_step; induc_def is the current iv value (the PHI result). */
9475 static tree
9476 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9477 tree induc_def, tree vec_step,
9478 enum vect_induction_op_type induction_type)
9480 tree vec_def = induc_def;
9481 switch (induction_type)
9483 case vect_step_op_mul:
9485 /* Use an unsigned multiplication to avoid undefined behavior from signed integer overflow. */
9486 tree uvectype
9487 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9488 TYPE_VECTOR_SUBPARTS (vectype));
9489 vec_def = gimple_convert (stmts, uvectype, vec_def);
9490 vec_step = gimple_convert (stmts, uvectype, vec_step);
9491 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9492 vec_def, vec_step);
9493 vec_def = gimple_convert (stmts, vectype, vec_def);
9495 break;
9497 case vect_step_op_shr:
9498 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9499 vec_def, vec_step);
9500 break;
9502 case vect_step_op_shl:
9503 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9504 vec_def, vec_step);
9505 break;
9506 case vect_step_op_neg:
9507 vec_def = induc_def;
9508 /* Do nothing. */
9509 break;
9510 default:
9511 gcc_unreachable ();
9514 return vec_def;
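/* Putting the three helpers together for a right-shift IV X >>= S with
   VF == 4 (an illustrative trace, not taken from the code): the
   preheader builds [ X, X >> S, X >> 2*S, X >> 3*S ] and a vector step
   of 4*S, and every vector iteration then shifts the whole vector right
   by 4*S, so lane j holds X >> (4*i + j)*S in iteration i, matching the
   scalar IV.  */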
9518 /* Function vectorizable_nonlinear_induction
9520 Check if STMT_INFO performs a nonlinear induction computation that can be
9521 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9522 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9523 basic block.
9524 Return true if STMT_INFO is vectorizable in this way. */
9526 static bool
9527 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9528 stmt_vec_info stmt_info,
9529 gimple **vec_stmt, slp_tree slp_node,
9530 stmt_vector_for_cost *cost_vec)
9532 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9533 unsigned ncopies;
9534 bool nested_in_vect_loop = false;
9535 class loop *iv_loop;
9536 tree vec_def;
9537 edge pe = loop_preheader_edge (loop);
9538 basic_block new_bb;
9539 tree vec_init, vec_step;
9540 tree new_name;
9541 gimple *new_stmt;
9542 gphi *induction_phi;
9543 tree induc_def, vec_dest;
9544 tree init_expr, step_expr;
9545 tree niters_skip;
9546 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9547 unsigned i;
9548 gimple_stmt_iterator si;
9550 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9552 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9553 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9554 enum vect_induction_op_type induction_type
9555 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9557 gcc_assert (induction_type > vect_step_op_add);
9559 if (slp_node)
9560 ncopies = 1;
9561 else
9562 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9563 gcc_assert (ncopies >= 1);
9565 /* FORNOW. Only handle nonlinear induction in the same loop. */
9566 if (nested_in_vect_loop_p (loop, stmt_info))
9568 if (dump_enabled_p ())
9569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9570 "nonlinear induction in nested loop.\n");
9571 return false;
9574 iv_loop = loop;
9575 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9577 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9578 update for each iv and a permutation to generate wanted vector iv. */
9579 if (slp_node)
9581 if (dump_enabled_p ())
9582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9583 "SLP induction not supported for nonlinear"
9584 " induction.\n");
9585 return false;
9588 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9590 if (dump_enabled_p ())
9591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9592 "floating point nonlinear induction vectorization"
9593 " not supported.\n");
9594 return false;
9597 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9598 init_expr = vect_phi_initial_value (phi);
9599 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9600 && TREE_CODE (step_expr) == INTEGER_CST);
9601 /* step_expr should be aligned with init_expr,
9602 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9603 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9605 if (TREE_CODE (init_expr) == INTEGER_CST)
9606 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9607 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9610 /* INIT_EXPR could be a bit_field, bail out in that case. */
9610 if (dump_enabled_p ())
9611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9612 "nonlinear induction vectorization failed:"
9613 " component type of vectype is not a nop conversion"
9614 " from type of init_expr.\n");
9615 return false;
9618 switch (induction_type)
9620 case vect_step_op_neg:
9621 if (TREE_CODE (init_expr) != INTEGER_CST
9622 && TREE_CODE (init_expr) != REAL_CST)
9624 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9625 if (!directly_supported_p (NEGATE_EXPR, vectype))
9626 return false;
9628 /* The encoding has 2 interleaved stepped patterns. */
9629 vec_perm_builder sel (nunits, 2, 3);
9630 machine_mode mode = TYPE_MODE (vectype);
9631 sel.quick_grow (6);
9632 for (i = 0; i < 3; i++)
9634 sel[i * 2] = i;
9635 sel[i * 2 + 1] = i + nunits;
9637 vec_perm_indices indices (sel, 2, nunits);
9638 if (!can_vec_perm_const_p (mode, mode, indices))
9639 return false;
9641 break;
9643 case vect_step_op_mul:
9645 /* Check for backend support of MULT_EXPR. */
9646 if (!directly_supported_p (MULT_EXPR, vectype))
9647 return false;
9649 /* ??? How to construct the vector step for variable-length vectors:
9650 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9651 if (!vf.is_constant ())
9652 return false;
9654 break;
9656 case vect_step_op_shr:
9657 /* Check for backend support of RSHIFT_EXPR. */
9658 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9659 return false;
9661 /* Don't shift more than type precision to avoid UD. */
9662 if (!tree_fits_uhwi_p (step_expr)
9663 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9664 TYPE_PRECISION (TREE_TYPE (init_expr))))
9665 return false;
9666 break;
9668 case vect_step_op_shl:
9669 /* Check for backend support of LSHIFT_EXPR. */
9670 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9671 return false;
9673 /* Don't shift more than type precision to avoid UD. */
9674 if (!tree_fits_uhwi_p (step_expr)
9675 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9676 TYPE_PRECISION (TREE_TYPE (init_expr))))
9677 return false;
9679 break;
9681 default:
9682 gcc_unreachable ();
9685 if (!vec_stmt) /* transformation not required. */
9687 unsigned inside_cost = 0, prologue_cost = 0;
9688 /* loop cost for vec_loop. */
9690 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9691 stmt_info, 0, vect_body);
9693 /* Neg induction doesn't have any inside_cost. */
9695 if (induction_type == vect_step_op_neg)
9696 inside_cost = 0;
9698 /* prologue cost for vec_init and vec_step. */
9699 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9700 stmt_info, 0, vect_prologue);
9702 if (dump_enabled_p ())
9703 dump_printf_loc (MSG_NOTE, vect_location,
9704 "vect_model_induction_cost: inside_cost = %d, "
9705 "prologue_cost = %d. \n", inside_cost,
9706 prologue_cost);
9708 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9709 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9710 return true;
9713 /* Transform. */
9715 /* Compute a vector variable, initialized with the first VF values of
9716 the induction variable. E.g., for an iv with IV_PHI='X' and
9717 evolution S, for a vector of 4 units, we want to compute:
9718 [X, X + S, X + 2*S, X + 3*S]. */
9720 if (dump_enabled_p ())
9721 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9723 pe = loop_preheader_edge (iv_loop);
9724 /* Find the first insertion point in the BB. */
9725 basic_block bb = gimple_bb (phi);
9726 si = gsi_after_labels (bb);
9728 gimple_seq stmts = NULL;
9730 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9731 /* If we are using the loop mask to "peel" for alignment then we need
9732 to adjust the start value here. */
9733 if (niters_skip != NULL_TREE)
9734 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9735 step_expr, induction_type);
9737 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9738 step_expr, nunits, vectype,
9739 induction_type);
9740 if (stmts)
9742 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9743 gcc_assert (!new_bb);
9746 stmts = NULL;
9747 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9748 vf, induction_type);
9749 if (stmts)
9751 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9752 gcc_assert (!new_bb);
9755 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9756 new_name, vectype,
9757 induction_type);
9758 /* Create the following def-use cycle:
9759 loop prolog:
9760 vec_init = ...
9761 vec_step = ...
9762 loop:
9763 vec_iv = PHI <vec_init, vec_loop>
9765 STMT
9767 vec_loop = vec_iv + vec_step; */
9769 /* Create the induction-phi that defines the induction-operand. */
9770 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9771 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9772 induc_def = PHI_RESULT (induction_phi);
9774 /* Create the iv update inside the loop. */
9775 stmts = NULL;
9776 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9777 induc_def, vec_step,
9778 induction_type);
9780 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9781 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9783 /* Set the arguments of the phi node: */
9784 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9785 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9786 UNKNOWN_LOCATION);
9788 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9789 *vec_stmt = induction_phi;
9791 /* In case the vectorization factor (VF) is bigger than the number
9792 of elements that we can fit in a vectype (nunits), we have to generate
9793 more than one vector stmt, i.e. we need to "unroll" the
9794 vector stmt by a factor of VF/nunits. For more details see the documentation
9795 in vectorizable_operation. */
9797 if (ncopies > 1)
9799 stmts = NULL;
9800 /* FORNOW. This restriction should be relaxed. */
9801 gcc_assert (!nested_in_vect_loop);
9803 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9804 nunits, induction_type);
9806 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9807 new_name, vectype,
9808 induction_type);
9809 vec_def = induc_def;
9810 for (i = 1; i < ncopies; i++)
9812 /* vec_i = vec_prev + vec_step. */
9813 stmts = NULL;
9814 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9815 vec_def, vec_step,
9816 induction_type);
9817 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9818 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9819 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9823 if (dump_enabled_p ())
9824 dump_printf_loc (MSG_NOTE, vect_location,
9825 "transform induction: created def-use cycle: %G%G",
9826 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9828 return true;
9831 /* Function vectorizable_induction
9833 Check if STMT_INFO performs an induction computation that can be vectorized.
9834 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9835 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9836 Return true if STMT_INFO is vectorizable in this way. */
9838 bool
9839 vectorizable_induction (loop_vec_info loop_vinfo,
9840 stmt_vec_info stmt_info,
9841 gimple **vec_stmt, slp_tree slp_node,
9842 stmt_vector_for_cost *cost_vec)
9844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9845 unsigned ncopies;
9846 bool nested_in_vect_loop = false;
9847 class loop *iv_loop;
9848 tree vec_def;
9849 edge pe = loop_preheader_edge (loop);
9850 basic_block new_bb;
9851 tree new_vec, vec_init, vec_step, t;
9852 tree new_name;
9853 gimple *new_stmt;
9854 gphi *induction_phi;
9855 tree induc_def, vec_dest;
9856 tree init_expr, step_expr;
9857 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9858 unsigned i;
9859 tree expr;
9860 gimple_stmt_iterator si;
9861 enum vect_induction_op_type induction_type
9862 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9864 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9865 if (!phi)
9866 return false;
9868 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9869 return false;
9871 /* Make sure it was recognized as an induction computation. */
9872 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9873 return false;
9875 /* Handle nonlinear induction in a separate place. */
9876 if (induction_type != vect_step_op_add)
9877 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9878 vec_stmt, slp_node, cost_vec);
9880 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9881 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9883 if (slp_node)
9884 ncopies = 1;
9885 else
9886 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9887 gcc_assert (ncopies >= 1);
9889 /* FORNOW. These restrictions should be relaxed. */
9890 if (nested_in_vect_loop_p (loop, stmt_info))
9892 imm_use_iterator imm_iter;
9893 use_operand_p use_p;
9894 gimple *exit_phi;
9895 edge latch_e;
9896 tree loop_arg;
9898 if (ncopies > 1)
9900 if (dump_enabled_p ())
9901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9902 "multiple types in nested loop.\n");
9903 return false;
9906 exit_phi = NULL;
9907 latch_e = loop_latch_edge (loop->inner);
9908 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9909 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9911 gimple *use_stmt = USE_STMT (use_p);
9912 if (is_gimple_debug (use_stmt))
9913 continue;
9915 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9917 exit_phi = use_stmt;
9918 break;
9921 if (exit_phi)
9923 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9924 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9925 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9927 if (dump_enabled_p ())
9928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9929 "inner-loop induction only used outside "
9930 "of the outer vectorized loop.\n");
9931 return false;
9935 nested_in_vect_loop = true;
9936 iv_loop = loop->inner;
9938 else
9939 iv_loop = loop;
9940 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9942 if (slp_node && !nunits.is_constant ())
9944 /* The current SLP code creates the step value element-by-element. */
9945 if (dump_enabled_p ())
9946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9947 "SLP induction not supported for variable-length"
9948 " vectors.\n");
9949 return false;
9952 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9954 if (dump_enabled_p ())
9955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9956 "floating point induction vectorization disabled\n");
9957 return false;
9960 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9961 gcc_assert (step_expr != NULL_TREE);
9962 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9964 /* Check for backend support of PLUS/MINUS_EXPR. */
9965 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9966 || !directly_supported_p (MINUS_EXPR, step_vectype))
9967 return false;
9969 if (!vec_stmt) /* transformation not required. */
9971 unsigned inside_cost = 0, prologue_cost = 0;
9972 if (slp_node)
9974 /* We eventually need to set a vector type on invariant
9975 arguments. */
9976 unsigned j;
9977 slp_tree child;
9978 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9979 if (!vect_maybe_update_slp_op_vectype
9980 (child, SLP_TREE_VECTYPE (slp_node)))
9982 if (dump_enabled_p ())
9983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9984 "incompatible vector types for "
9985 "invariants\n");
9986 return false;
9988 /* loop cost for vec_loop. */
9989 inside_cost
9990 = record_stmt_cost (cost_vec,
9991 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9992 vector_stmt, stmt_info, 0, vect_body);
9993 /* prologue cost for vec_init (if not nested) and step. */
9994 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9995 scalar_to_vec,
9996 stmt_info, 0, vect_prologue);
9998 else /* if (!slp_node) */
10000 /* loop cost for vec_loop. */
10001 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10002 stmt_info, 0, vect_body);
10003 /* prologue cost for vec_init and vec_step. */
10004 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10005 stmt_info, 0, vect_prologue);
10007 if (dump_enabled_p ())
10008 dump_printf_loc (MSG_NOTE, vect_location,
10009 "vect_model_induction_cost: inside_cost = %d, "
10010 "prologue_cost = %d .\n", inside_cost,
10011 prologue_cost);
10013 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10014 DUMP_VECT_SCOPE ("vectorizable_induction");
10015 return true;
10018 /* Transform. */
10020 /* Compute a vector variable, initialized with the first VF values of
10021 the induction variable. E.g., for an iv with IV_PHI='X' and
10022 evolution S, for a vector of 4 units, we want to compute:
10023 [X, X + S, X + 2*S, X + 3*S]. */
10025 if (dump_enabled_p ())
10026 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10028 pe = loop_preheader_edge (iv_loop);
10029 /* Find the first insertion point in the BB. */
10030 basic_block bb = gimple_bb (phi);
10031 si = gsi_after_labels (bb);
10033 /* For SLP induction we have to generate several IVs as for example
10034 with group size 3 we need
10035 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10036 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10037 if (slp_node)
10039 /* Enforced above. */
10040 unsigned int const_nunits = nunits.to_constant ();
10042 /* The initial values are vectorized, but any lanes > group_size
10043 need adjustment. */
10044 slp_tree init_node
10045 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10047 /* Gather steps. Since we do not vectorize inductions as
10048 cycles we have to reconstruct the step from SCEV data. */
10049 unsigned group_size = SLP_TREE_LANES (slp_node);
10050 tree *steps = XALLOCAVEC (tree, group_size);
10051 tree *inits = XALLOCAVEC (tree, group_size);
10052 stmt_vec_info phi_info;
10053 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10055 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10056 if (!init_node)
10057 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10058 pe->dest_idx);
10061 /* Now generate the IVs. */
10062 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10063 gcc_assert ((const_nunits * nvects) % group_size == 0);
10064 unsigned nivs;
10065 if (nested_in_vect_loop)
10066 nivs = nvects;
10067 else
10069 /* Compute the number of distinct IVs we need. First reduce
10070 group_size if it is a multiple of const_nunits so we get
10071 one IV for a group_size of 4 but const_nunits 2. */
10072 unsigned group_sizep = group_size;
10073 if (group_sizep % const_nunits == 0)
10074 group_sizep = group_sizep / const_nunits;
10075 nivs = least_common_multiple (group_sizep,
10076 const_nunits) / const_nunits;
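/* For example, with group_size 4 and const_nunits 2 this gives
   group_sizep 2 and nivs 1, while group_size 6 and const_nunits 4
   leave group_sizep at 6 and give nivs = lcm(6,4)/4 = 3.  */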
10078 tree stept = TREE_TYPE (step_vectype);
10079 tree lupdate_mul = NULL_TREE;
10080 if (!nested_in_vect_loop)
10082 /* The number of iterations covered in one vector iteration. */
10083 unsigned lup_mul = (nvects * const_nunits) / group_size;
10084 lupdate_mul
10085 = build_vector_from_val (step_vectype,
10086 SCALAR_FLOAT_TYPE_P (stept)
10087 ? build_real_from_wide (stept, lup_mul,
10088 UNSIGNED)
10089 : build_int_cstu (stept, lup_mul));
10091 tree peel_mul = NULL_TREE;
10092 gimple_seq init_stmts = NULL;
10093 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10095 if (SCALAR_FLOAT_TYPE_P (stept))
10096 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10097 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10098 else
10099 peel_mul = gimple_convert (&init_stmts, stept,
10100 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10101 peel_mul = gimple_build_vector_from_val (&init_stmts,
10102 step_vectype, peel_mul);
10104 unsigned ivn;
10105 auto_vec<tree> vec_steps;
10106 for (ivn = 0; ivn < nivs; ++ivn)
10108 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10109 tree_vector_builder init_elts (vectype, const_nunits, 1);
10110 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10111 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10113 /* The scalar steps of the IVs. */
10114 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10115 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10116 step_elts.quick_push (elt);
10117 if (!init_node)
10119 /* The scalar inits of the IVs if not vectorized. */
10120 elt = inits[(ivn*const_nunits + eltn) % group_size];
10121 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10122 TREE_TYPE (elt)))
10123 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10124 TREE_TYPE (vectype), elt);
10125 init_elts.quick_push (elt);
10127 /* The number of steps to add to the initial values. */
10128 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10129 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10130 ? build_real_from_wide (stept,
10131 mul_elt, UNSIGNED)
10132 : build_int_cstu (stept, mul_elt));
10134 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10135 vec_steps.safe_push (vec_step);
10136 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10137 if (peel_mul)
10138 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10139 step_mul, peel_mul);
10140 if (!init_node)
10141 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10143 /* Create the induction-phi that defines the induction-operand. */
10144 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10145 "vec_iv_");
10146 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10147 induc_def = PHI_RESULT (induction_phi);
10149 /* Create the iv update inside the loop */
10150 tree up = vec_step;
10151 if (lupdate_mul)
10152 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10153 vec_step, lupdate_mul);
10154 gimple_seq stmts = NULL;
10155 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10156 vec_def = gimple_build (&stmts,
10157 PLUS_EXPR, step_vectype, vec_def, up);
10158 vec_def = gimple_convert (&stmts, vectype, vec_def);
10159 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10160 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10161 UNKNOWN_LOCATION);
10163 if (init_node)
10164 vec_init = vect_get_slp_vect_def (init_node, ivn);
10165 if (!nested_in_vect_loop
10166 && !integer_zerop (step_mul))
10168 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10169 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10170 vec_step, step_mul);
10171 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10172 vec_def, up);
10173 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10176 /* Set the arguments of the phi node: */
10177 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10179 slp_node->push_vec_def (induction_phi);
10181 if (!nested_in_vect_loop)
10183 /* Fill up to the number of vectors we need for the whole group. */
10184 nivs = least_common_multiple (group_size,
10185 const_nunits) / const_nunits;
10186 vec_steps.reserve (nivs-ivn);
10187 for (; ivn < nivs; ++ivn)
10189 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10190 vec_steps.quick_push (vec_steps[0]);
10194 /* Re-use IVs when we can. We are generating further vector
10195 stmts by adding VF' * stride to the IVs generated above. */
10196 if (ivn < nvects)
10198 unsigned vfp
10199 = least_common_multiple (group_size, const_nunits) / group_size;
10200 tree lupdate_mul
10201 = build_vector_from_val (step_vectype,
10202 SCALAR_FLOAT_TYPE_P (stept)
10203 ? build_real_from_wide (stept,
10204 vfp, UNSIGNED)
10205 : build_int_cstu (stept, vfp));
10206 for (; ivn < nvects; ++ivn)
10208 gimple *iv
10209 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10210 tree def = gimple_get_lhs (iv);
10211 if (ivn < 2*nivs)
10212 vec_steps[ivn - nivs]
10213 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10214 vec_steps[ivn - nivs], lupdate_mul);
10215 gimple_seq stmts = NULL;
10216 def = gimple_convert (&stmts, step_vectype, def);
10217 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10218 def, vec_steps[ivn % nivs]);
10219 def = gimple_convert (&stmts, vectype, def);
10220 if (gimple_code (iv) == GIMPLE_PHI)
10221 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10222 else
10224 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10225 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10227 slp_node->push_vec_def (def);
10231 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10232 gcc_assert (!new_bb);
10234 return true;
10237 init_expr = vect_phi_initial_value (phi);
10239 gimple_seq stmts = NULL;
10240 if (!nested_in_vect_loop)
10242 /* Convert the initial value to the IV update type. */
10243 tree new_type = TREE_TYPE (step_expr);
10244 init_expr = gimple_convert (&stmts, new_type, init_expr);
10246 /* If we are using the loop mask to "peel" for alignment then we need
10247 to adjust the start value here. */
10248 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10249 if (skip_niters != NULL_TREE)
10251 if (FLOAT_TYPE_P (vectype))
10252 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10253 skip_niters);
10254 else
10255 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10256 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10257 skip_niters, step_expr);
10258 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10259 init_expr, skip_step);
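/* I.e. lane L of the first vector then holds
   INIT - SKIP * STEP + L * STEP, so lane SKIP (the first unmasked
   lane) still starts at the original initial value.  */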
10263 if (stmts)
10265 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10266 gcc_assert (!new_bb);
10269 /* Create the vector that holds the initial_value of the induction. */
10270 if (nested_in_vect_loop)
10272 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10273 been created during vectorization of previous stmts. We obtain it
10274 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10275 auto_vec<tree> vec_inits;
10276 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10277 init_expr, &vec_inits);
10278 vec_init = vec_inits[0];
10279 /* If the initial value is not of proper type, convert it. */
10280 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10282 new_stmt
10283 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10284 vect_simple_var,
10285 "vec_iv_"),
10286 VIEW_CONVERT_EXPR,
10287 build1 (VIEW_CONVERT_EXPR, vectype,
10288 vec_init));
10289 vec_init = gimple_assign_lhs (new_stmt);
10290 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10291 new_stmt);
10292 gcc_assert (!new_bb);
10295 else
10297 /* iv_loop is the loop to be vectorized. Create:
10298 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10299 stmts = NULL;
10300 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10302 unsigned HOST_WIDE_INT const_nunits;
10303 if (nunits.is_constant (&const_nunits))
10305 tree_vector_builder elts (step_vectype, const_nunits, 1);
10306 elts.quick_push (new_name);
10307 for (i = 1; i < const_nunits; i++)
10309 /* Create: new_name_i = new_name + step_expr */
10310 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10311 new_name, step_expr);
10312 elts.quick_push (new_name);
10314 /* Create a vector from [new_name_0, new_name_1, ...,
10315 new_name_nunits-1] */
10316 vec_init = gimple_build_vector (&stmts, &elts);
10318 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10319 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10320 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10321 new_name, step_expr);
10322 else
10324 /* Build:
10325 [base, base, base, ...]
10326 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10327 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10328 gcc_assert (flag_associative_math);
10329 tree index = build_index_vector (step_vectype, 0, 1);
10330 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10331 new_name);
10332 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10333 step_expr);
10334 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10335 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10336 vec_init, step_vec);
10337 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10338 vec_init, base_vec);
10340 vec_init = gimple_convert (&stmts, vectype, vec_init);
10342 if (stmts)
10344 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10345 gcc_assert (!new_bb);
10350 /* Create the vector that holds the step of the induction. */
10351 gimple_stmt_iterator *step_iv_si = NULL;
10352 if (nested_in_vect_loop)
10353 /* iv_loop is nested in the loop to be vectorized. Generate:
10354 vec_step = [S, S, S, S] */
10355 new_name = step_expr;
10356 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10358 /* When we're using the loop_len produced by SELECT_VL, the non-final
10359 iterations are not always processing VF elements. So vectorize the
10360 induction variable instead of
10362 _21 = vect_vec_iv_.6_22 + { VF, ... };
10364 We should generate:
10366 _35 = .SELECT_VL (ivtmp_33, VF);
10367 vect_cst__22 = [vec_duplicate_expr] _35;
10368 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10369 gcc_assert (!slp_node);
10370 gimple_seq seq = NULL;
10371 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10372 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10373 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10374 unshare_expr (len)),
10375 &seq, true, NULL_TREE);
10376 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10377 step_expr);
10378 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10379 step_iv_si = &si;
10381 else
10383 /* iv_loop is the loop to be vectorized. Generate:
10384 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10385 gimple_seq seq = NULL;
10386 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10388 expr = build_int_cst (integer_type_node, vf);
10389 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10391 else
10392 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10393 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10394 expr, step_expr);
10395 if (seq)
10397 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10398 gcc_assert (!new_bb);
10402 t = unshare_expr (new_name);
10403 gcc_assert (CONSTANT_CLASS_P (new_name)
10404 || TREE_CODE (new_name) == SSA_NAME);
10405 new_vec = build_vector_from_val (step_vectype, t);
10406 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10407 new_vec, step_vectype, step_iv_si);
10410 /* Create the following def-use cycle:
10411 loop prolog:
10412 vec_init = ...
10413 vec_step = ...
10414 loop:
10415 vec_iv = PHI <vec_init, vec_loop>
10417 STMT
10419 vec_loop = vec_iv + vec_step; */
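/* For the non-nested case with X = 0, S = 1 and VF 4 this gives
   vec_init = { 0, 1, 2, 3 } and vec_step = { 4, 4, 4, 4 }, so each
   vector iteration advances the IV by four scalar steps.  */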
10421 /* Create the induction-phi that defines the induction-operand. */
10422 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10423 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10424 induc_def = PHI_RESULT (induction_phi);
10426 /* Create the iv update inside the loop */
10427 stmts = NULL;
10428 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10429 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10430 vec_def = gimple_convert (&stmts, vectype, vec_def);
10431 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10432 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10434 /* Set the arguments of the phi node: */
10435 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10436 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10437 UNKNOWN_LOCATION);
10439 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10440 *vec_stmt = induction_phi;
10442 /* In case that vectorization factor (VF) is bigger than the number
10443 of elements that we can fit in a vectype (nunits), we have to generate
10444 more than one vector stmt, i.e., we need to "unroll" the
10445 vector stmt by a factor VF/nunits. For more details see documentation
10446 in vectorizable_operation. */
10448 if (ncopies > 1)
10450 gimple_seq seq = NULL;
10451 /* FORNOW. This restriction should be relaxed. */
10452 gcc_assert (!nested_in_vect_loop);
10453 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10454 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10456 /* Create the vector that holds the step of the induction. */
10457 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10459 expr = build_int_cst (integer_type_node, nunits);
10460 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10462 else
10463 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10464 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10465 expr, step_expr);
10466 if (seq)
10468 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10469 gcc_assert (!new_bb);
10472 t = unshare_expr (new_name);
10473 gcc_assert (CONSTANT_CLASS_P (new_name)
10474 || TREE_CODE (new_name) == SSA_NAME);
10475 new_vec = build_vector_from_val (step_vectype, t);
10476 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10477 new_vec, step_vectype, NULL);
10479 vec_def = induc_def;
10480 for (i = 1; i < ncopies + 1; i++)
10482 /* vec_i = vec_prev + vec_step */
10483 gimple_seq stmts = NULL;
10484 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10485 vec_def = gimple_build (&stmts,
10486 PLUS_EXPR, step_vectype, vec_def, vec_step);
10487 vec_def = gimple_convert (&stmts, vectype, vec_def);
10489 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10490 if (i < ncopies)
10492 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10493 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10495 else
10497 /* vec_1 = vec_iv + (VF/n * S)
10498 vec_2 = vec_1 + (VF/n * S)
10500 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10502 vec_n is used as vec_loop to save the large step register and
10503 related operations. */
10504 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10505 UNKNOWN_LOCATION);
10510 if (dump_enabled_p ())
10511 dump_printf_loc (MSG_NOTE, vect_location,
10512 "transform induction: created def-use cycle: %G%G",
10513 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10515 return true;
10518 /* Function vectorizable_live_operation.
10520 STMT_INFO computes a value that is used outside the loop. Check if
10521 it can be supported. */
10523 bool
10524 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10525 slp_tree slp_node, slp_instance slp_node_instance,
10526 int slp_index, bool vec_stmt_p,
10527 stmt_vector_for_cost *cost_vec)
10529 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10530 imm_use_iterator imm_iter;
10531 tree lhs, lhs_type, bitsize;
10532 tree vectype = (slp_node
10533 ? SLP_TREE_VECTYPE (slp_node)
10534 : STMT_VINFO_VECTYPE (stmt_info));
10535 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10536 int ncopies;
10537 gimple *use_stmt;
10538 auto_vec<tree> vec_oprnds;
10539 int vec_entry = 0;
10540 poly_uint64 vec_index = 0;
10542 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10544 /* If a stmt of a reduction is live, vectorize it via
10545 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10546 validity so just trigger the transform here. */
10547 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10549 if (!vec_stmt_p)
10550 return true;
10551 if (slp_node)
10553 /* For reduction chains the meta-info is attached to
10554 the group leader. */
10555 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10556 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10557 /* For SLP reductions we vectorize the epilogue for
10558 all involved stmts together. */
10559 else if (slp_index != 0)
10560 return true;
10562 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10563 gcc_assert (reduc_info->is_reduc_info);
10564 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10565 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10566 return true;
10567 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10568 slp_node_instance);
10569 return true;
10572 /* If STMT is not relevant and it is a simple assignment and its inputs are
10573 invariant then it can remain in place, unvectorized. The original last
10574 scalar value that it computes will be used. */
10575 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10577 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10578 if (dump_enabled_p ())
10579 dump_printf_loc (MSG_NOTE, vect_location,
10580 "statement is simple and uses invariant. Leaving in "
10581 "place.\n");
10582 return true;
10585 if (slp_node)
10586 ncopies = 1;
10587 else
10588 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10590 if (slp_node)
10592 gcc_assert (slp_index >= 0);
10594 /* Get the last occurrence of the scalar index from the concatenation of
10595 all the slp vectors. Calculate which slp vector it is and the index
10596 within. */
10597 int num_scalar = SLP_TREE_LANES (slp_node);
10598 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10599 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
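/* For example, two vectors of four lanes holding three scalars with
   slp_index 2 give pos = 2*4 - 3 + 2 = 7, i.e. the requested lane is
   lane 3 of vector 1.  */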
10601 /* Calculate which vector contains the result, and which lane of
10602 that vector we need. */
10603 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10605 if (dump_enabled_p ())
10606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10607 "Cannot determine which vector holds the"
10608 " final result.\n");
10609 return false;
10613 if (!vec_stmt_p)
10615 /* No transformation required. */
10616 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10618 if (slp_node)
10620 if (dump_enabled_p ())
10621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10622 "can't operate on partial vectors "
10623 "because an SLP statement is live after "
10624 "the loop.\n");
10625 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10627 else if (ncopies > 1)
10629 if (dump_enabled_p ())
10630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10631 "can't operate on partial vectors "
10632 "because ncopies is greater than 1.\n");
10633 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10635 else
10637 gcc_assert (ncopies == 1 && !slp_node);
10638 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10639 OPTIMIZE_FOR_SPEED))
10640 vect_record_loop_mask (loop_vinfo,
10641 &LOOP_VINFO_MASKS (loop_vinfo),
10642 1, vectype, NULL);
10643 else if (can_vec_extract_var_idx_p (
10644 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10645 vect_record_loop_len (loop_vinfo,
10646 &LOOP_VINFO_LENS (loop_vinfo),
10647 1, vectype, 1);
10648 else
10650 if (dump_enabled_p ())
10651 dump_printf_loc (
10652 MSG_MISSED_OPTIMIZATION, vect_location,
10653 "can't operate on partial vectors "
10654 "because the target doesn't support extract "
10655 "last reduction.\n");
10656 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10660 /* ??? Enable for loop costing as well. */
10661 if (!loop_vinfo)
10662 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10663 0, vect_epilogue);
10664 return true;
10667 /* Use the lhs of the original scalar statement. */
10668 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10669 if (dump_enabled_p ())
10670 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10671 "stmt %G", stmt);
10673 lhs = gimple_get_lhs (stmt);
10674 lhs_type = TREE_TYPE (lhs);
10676 bitsize = vector_element_bits_tree (vectype);
10678 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10679 tree vec_lhs, bitstart;
10680 gimple *vec_stmt;
10681 if (slp_node)
10683 gcc_assert (!loop_vinfo
10684 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10685 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10687 /* Get the correct slp vectorized stmt. */
10688 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10689 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10691 /* Get entry to use. */
10692 bitstart = bitsize_int (vec_index);
10693 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10695 else
10697 /* For multiple copies, get the last copy. */
10698 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10699 vec_lhs = gimple_get_lhs (vec_stmt);
10701 /* Get the last lane in the vector. */
10702 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
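/* E.g. with 32-bit elements and four lanes this is bit position
   3 * 32 = 96, selecting the last lane.  */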
10705 if (loop_vinfo)
10707 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10708 PHI requirement, insert one phi node for it. It looks like:
10709 loop;
10711 # lhs' = PHI <lhs>
10713 loop;
10715 # vec_lhs' = PHI <vec_lhs>
10716 new_tree = lane_extract <vec_lhs', ...>;
10717 lhs' = new_tree; */
10719 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10720 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10721 gcc_assert (single_pred_p (exit_bb));
10723 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10724 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10725 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10727 gimple_seq stmts = NULL;
10728 tree new_tree;
10729 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10731 /* Emit:
10733 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10735 where VEC_LHS is the vectorized live-out result and LEN is
10736 the loop length for the final iteration. */
10737 gcc_assert (ncopies == 1 && !slp_node);
10738 gimple_seq tem = NULL;
10739 gimple_stmt_iterator gsi = gsi_last (tem);
10740 tree len
10741 = vect_get_loop_len (loop_vinfo, &gsi,
10742 &LOOP_VINFO_LENS (loop_vinfo),
10743 1, vectype, 0, 0);
10745 /* BIAS - 1. */
10746 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10747 tree bias_minus_one
10748 = int_const_binop (MINUS_EXPR,
10749 build_int_cst (TREE_TYPE (len), biasval),
10750 build_one_cst (TREE_TYPE (len)));
10752 /* LAST_INDEX = LEN + (BIAS - 1). */
10753 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10754 len, bias_minus_one);
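/* With a bias of zero this is simply LEN - 1, the index of the
   last active element.  */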
10756 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10757 tree scalar_res
10758 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10759 vec_lhs_phi, last_index);
10761 /* Convert the extracted vector element to the scalar type. */
10762 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10764 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10766 /* Emit:
10768 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10770 where VEC_LHS is the vectorized live-out result and MASK is
10771 the loop mask for the final iteration. */
10772 gcc_assert (ncopies == 1 && !slp_node);
10773 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10774 gimple_seq tem = NULL;
10775 gimple_stmt_iterator gsi = gsi_last (tem);
10776 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10777 &LOOP_VINFO_MASKS (loop_vinfo),
10778 1, vectype, 0);
10779 gimple_seq_add_seq (&stmts, tem);
10780 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10781 mask, vec_lhs_phi);
10783 /* Convert the extracted vector element to the scalar type. */
10784 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10786 else
10788 tree bftype = TREE_TYPE (vectype);
10789 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10790 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10791 new_tree = build3 (BIT_FIELD_REF, bftype,
10792 vec_lhs_phi, bitsize, bitstart);
10793 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10794 &stmts, true, NULL_TREE);
10797 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10798 if (stmts)
10799 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10801 /* Remove existing phis that copy from lhs and create copies
10802 from new_tree. */
10803 gimple_stmt_iterator gsi;
10804 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10806 gimple *phi = gsi_stmt (gsi);
10807 if ((gimple_phi_arg_def (phi, 0) == lhs))
10809 remove_phi_node (&gsi, false);
10810 tree lhs_phi = gimple_phi_result (phi);
10811 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10812 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10814 else
10815 gsi_next (&gsi);
10818 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10819 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10820 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10822 else
10824 /* For basic-block vectorization simply insert the lane-extraction. */
10825 tree bftype = TREE_TYPE (vectype);
10826 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10827 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10828 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10829 vec_lhs, bitsize, bitstart);
10830 gimple_seq stmts = NULL;
10831 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10832 &stmts, true, NULL_TREE);
10833 if (TREE_CODE (new_tree) == SSA_NAME
10834 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10835 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10836 if (is_a <gphi *> (vec_stmt))
10838 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10839 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10841 else
10843 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10844 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10847 /* Replace the use of lhs with the newly computed result. If the use stmt is
10848 a single-arg PHI, just replace all uses of the PHI result. This is necessary
10849 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
10850 use_operand_p use_p;
10851 stmt_vec_info use_stmt_info;
10852 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10853 if (!is_gimple_debug (use_stmt)
10854 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10855 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10857 /* ??? This can happen when the live lane ends up being
10858 rooted in a vector construction code-generated by an
10859 external SLP node (and code-generation for that already
10860 happened). See gcc.dg/vect/bb-slp-47.c.
10861 Doing this is what would happen if that vector CTOR
10862 were not code-generated yet so it is not too bad.
10863 ??? In fact we'd likely want to avoid this situation
10864 in the first place. */
10865 if (TREE_CODE (new_tree) == SSA_NAME
10866 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10867 && gimple_code (use_stmt) != GIMPLE_PHI
10868 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10869 use_stmt))
10871 if (dump_enabled_p ())
10872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10873 "Using original scalar computation for "
10874 "live lane because use preceeds vector "
10875 "def\n");
10876 continue;
10878 /* ??? It can also happen that we end up pulling a def into
10879 a loop where replacing out-of-loop uses would require
10880 a new LC SSA PHI node. Retain the original scalar in
10881 those cases as well. PR98064. */
10882 if (TREE_CODE (new_tree) == SSA_NAME
10883 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10884 && (gimple_bb (use_stmt)->loop_father
10885 != gimple_bb (vec_stmt)->loop_father)
10886 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10887 gimple_bb (use_stmt)->loop_father))
10889 if (dump_enabled_p ())
10890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10891 "Using original scalar computation for "
10892 "live lane because there is an out-of-loop "
10893 "definition for it\n");
10894 continue;
10896 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10897 SET_USE (use_p, new_tree);
10898 update_stmt (use_stmt);
10902 return true;
10905 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10907 static void
10908 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10910 ssa_op_iter op_iter;
10911 imm_use_iterator imm_iter;
10912 def_operand_p def_p;
10913 gimple *ustmt;
10915 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10917 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10919 basic_block bb;
10921 if (!is_gimple_debug (ustmt))
10922 continue;
10924 bb = gimple_bb (ustmt);
10926 if (!flow_bb_inside_loop_p (loop, bb))
10928 if (gimple_debug_bind_p (ustmt))
10930 if (dump_enabled_p ())
10931 dump_printf_loc (MSG_NOTE, vect_location,
10932 "killing debug use\n");
10934 gimple_debug_bind_reset_value (ustmt);
10935 update_stmt (ustmt);
10937 else
10938 gcc_unreachable ();
10944 /* Given loop represented by LOOP_VINFO, return true if computation of
10945 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10946 otherwise. */
10948 static bool
10949 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10951 /* Constant case. */
10952 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10954 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10955 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10957 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10958 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10959 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10960 return true;
10963 widest_int max;
10964 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10965 /* Check the upper bound of loop niters. */
10966 if (get_max_loop_iterations (loop, &max))
10968 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10969 signop sgn = TYPE_SIGN (type);
10970 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10971 if (max < type_max)
10972 return true;
10974 return false;
10977 /* Return a mask type with half the number of elements as OLD_TYPE,
10978 given that it should have mode NEW_MODE. */
10980 tree
10981 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10983 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10984 return build_truth_vector_type_for_mode (nunits, new_mode);
10987 /* Return a mask type with twice as many elements as OLD_TYPE,
10988 given that it should have mode NEW_MODE. */
10990 tree
10991 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10993 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10994 return build_truth_vector_type_for_mode (nunits, new_mode);
10997 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10998 contain a sequence of NVECTORS masks that each control a vector of type
10999 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11000 these vector masks with the vector version of SCALAR_MASK. */
11002 void
11003 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11004 unsigned int nvectors, tree vectype, tree scalar_mask)
11006 gcc_assert (nvectors != 0);
11008 if (scalar_mask)
11010 scalar_cond_masked_key cond (scalar_mask, nvectors);
11011 loop_vinfo->scalar_cond_masked_set.add (cond);
11014 masks->mask_set.add (std::make_pair (vectype, nvectors));
11017 /* Given a complete set of masks MASKS, extract mask number INDEX
11018 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11019 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11021 See the comment above vec_loop_masks for more details about the mask
11022 arrangement. */
11024 tree
11025 vect_get_loop_mask (loop_vec_info loop_vinfo,
11026 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11027 unsigned int nvectors, tree vectype, unsigned int index)
11029 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11030 == vect_partial_vectors_while_ult)
11032 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11033 tree mask_type = rgm->type;
11035 /* Populate the rgroup's mask array, if this is the first time we've
11036 used it. */
11037 if (rgm->controls.is_empty ())
11039 rgm->controls.safe_grow_cleared (nvectors, true);
11040 for (unsigned int i = 0; i < nvectors; ++i)
11042 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11043 /* Provide a dummy definition until the real one is available. */
11044 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11045 rgm->controls[i] = mask;
11049 tree mask = rgm->controls[index];
11050 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11051 TYPE_VECTOR_SUBPARTS (vectype)))
11053 /* A loop mask for data type X can be reused for data type Y
11054 if X has N times more elements than Y and if Y's elements
11055 are N times bigger than X's. In this case each sequence
11056 of N elements in the loop mask will be all-zero or all-one.
11057 We can then view-convert the mask so that each sequence of
11058 N elements is replaced by a single element. */
11059 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11060 TYPE_VECTOR_SUBPARTS (vectype)));
11061 gimple_seq seq = NULL;
11062 mask_type = truth_type_for (vectype);
11063 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11064 if (seq)
11065 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11067 return mask;
11069 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11070 == vect_partial_vectors_avx512)
11072 /* The number of scalars per iteration and the number of vectors are
11073 both compile-time constants. */
11074 unsigned int nscalars_per_iter
11075 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11076 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11078 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11080 /* The stored nV is dependent on the mask type produced. */
11081 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11082 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11083 == rgm->factor);
11084 nvectors = rgm->factor;
11086 /* Populate the rgroup's mask array, if this is the first time we've
11087 used it. */
11088 if (rgm->controls.is_empty ())
11090 rgm->controls.safe_grow_cleared (nvectors, true);
11091 for (unsigned int i = 0; i < nvectors; ++i)
11093 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11094 /* Provide a dummy definition until the real one is available. */
11095 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11096 rgm->controls[i] = mask;
11099 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11100 TYPE_VECTOR_SUBPARTS (vectype)))
11101 return rgm->controls[index];
11103 /* Split the vector if needed. Since we are dealing with integer mode
11104 masks with AVX512 we can operate on the integer representation
11105 performing the whole vector shifting. */
11106 unsigned HOST_WIDE_INT factor;
11107 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11108 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11109 gcc_assert (ok);
11110 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11111 tree mask_type = truth_type_for (vectype);
11112 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11113 unsigned vi = index / factor;
11114 unsigned vpart = index % factor;
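/* For example, a 16-element control reused for 4-element vectors has
   factor 4; index 5 then selects control word 1 shifted right by
   4 bits.  */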
11115 tree vec = rgm->controls[vi];
11116 gimple_seq seq = NULL;
11117 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11118 lang_hooks.types.type_for_mode
11119 (TYPE_MODE (rgm->type), 1), vec);
11120 /* For integer mode masks simply shift the right bits into position. */
11121 if (vpart != 0)
11122 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11123 build_int_cst (integer_type_node,
11124 (TYPE_VECTOR_SUBPARTS (vectype)
11125 * vpart)));
11126 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11127 (TYPE_MODE (mask_type), 1), vec);
11128 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11129 if (seq)
11130 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11131 return vec;
11133 else
11134 gcc_unreachable ();
11137 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11138 lengths for controlling an operation on VECTYPE. The operation splits
11139 each element of VECTYPE into FACTOR separate subelements, measuring the
11140 length as a number of these subelements. */
11142 void
11143 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11144 unsigned int nvectors, tree vectype, unsigned int factor)
11146 gcc_assert (nvectors != 0);
11147 if (lens->length () < nvectors)
11148 lens->safe_grow_cleared (nvectors, true);
11149 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11151 /* The number of scalars per iteration, the scalar occupied bytes and
11152 the number of vectors are all compile-time constants. */
11153 unsigned int nscalars_per_iter
11154 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11155 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
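/* For example, two V8HI vectors with a vectorization factor of 8
   give two scalars per iteration (e.g. an interleaved group of two).  */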
11157 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11159 /* For now, we only support cases in which all loads and stores fall back
11160 to VnQI or none do. */
11161 gcc_assert (!rgl->max_nscalars_per_iter
11162 || (rgl->factor == 1 && factor == 1)
11163 || (rgl->max_nscalars_per_iter * rgl->factor
11164 == nscalars_per_iter * factor));
11165 rgl->max_nscalars_per_iter = nscalars_per_iter;
11166 rgl->type = vectype;
11167 rgl->factor = factor;
11171 /* Given a complete set of lengths LENS, extract length number INDEX
11172 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11173 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11174 multiplied by the number of elements that should be processed.
11175 Insert any set-up statements before GSI. */
11177 tree
11178 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11179 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11180 unsigned int index, unsigned int factor)
11182 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11183 bool use_bias_adjusted_len =
11184 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11186 /* Populate the rgroup's len array, if this is the first time we've
11187 used it. */
11188 if (rgl->controls.is_empty ())
11190 rgl->controls.safe_grow_cleared (nvectors, true);
11191 for (unsigned int i = 0; i < nvectors; ++i)
11193 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11194 gcc_assert (len_type != NULL_TREE);
11196 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11198 /* Provide a dummy definition until the real one is available. */
11199 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11200 rgl->controls[i] = len;
11202 if (use_bias_adjusted_len)
11204 gcc_assert (i == 0);
11205 tree adjusted_len =
11206 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11207 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11208 rgl->bias_adjusted_ctrl = adjusted_len;
11213 if (use_bias_adjusted_len)
11214 return rgl->bias_adjusted_ctrl;
11216 tree loop_len = rgl->controls[index];
11217 if (rgl->factor == 1 && factor == 1)
11219 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11220 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11221 if (maybe_ne (nunits1, nunits2))
11223 /* A loop len for data type X can be reused for data type Y
11224 if X has N times more elements than Y and if Y's elements
11225 are N times bigger than X's. */
11226 gcc_assert (multiple_p (nunits1, nunits2));
11227 factor = exact_div (nunits1, nunits2).to_constant ();
11228 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11229 gimple_seq seq = NULL;
11230 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11231 build_int_cst (iv_type, factor));
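/* E.g. a length counted in 16 QImode elements is halved when reused
   for an 8-element HImode vector.  */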
11232 if (seq)
11233 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11236 return loop_len;
11239 /* Scale profiling counters by estimation for LOOP which is vectorized
11240 by factor VF.
11241 If FLAT is true, the loop we started with had unrealistically flat
11242 profile. */
11244 static void
11245 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11247 /* For flat profiles do not scale down proportionally by VF and only
11248 cap by known iteration count bounds. */
11249 if (flat)
11251 if (dump_file && (dump_flags & TDF_DETAILS))
11252 fprintf (dump_file,
11253 "Vectorized loop profile seems flat; not scaling iteration "
11254 "count down by the vectorization factor %i\n", vf);
11255 scale_loop_profile (loop, profile_probability::always (),
11256 get_likely_max_loop_iterations_int (loop));
11257 return;
11259 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
11260 profile_count entry_count = loop_preheader_edge (loop)->count ();
11262 /* If we have an unreliable loop profile avoid dropping the entry
11263 count below the header count. This can happen since the loop
11264 has an unrealistically low trip count. */
11265 while (vf > 1
11266 && loop->header->count > entry_count
11267 && loop->header->count < entry_count * vf)
11269 if (dump_file && (dump_flags & TDF_DETAILS))
11270 fprintf (dump_file,
11271 "Vectorization factor %i seems too large for profile "
11272 "prevoiusly believed to be consistent; reducing.\n", vf);
11273 vf /= 2;
11276 if (entry_count.nonzero_p ())
11277 set_edge_probability_and_rescale_others
11278 (exit_e,
11279 entry_count.probability_in (loop->header->count / vf));
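/* For example, an entry count of 1000 and a header count of 10000
   with VF 4 yield a scaled header count of 2500 and an exit
   probability of 40%.  */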
11280 /* Avoid producing a very large exit probability when we do not have
11281 a sensible profile. */
11282 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11283 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11284 loop->latch->count = single_pred_edge (loop->latch)->count ();
11286 scale_loop_profile (loop, profile_probability::always () / vf,
11287 get_likely_max_loop_iterations_int (loop));
11290 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11291 latch edge values originally defined by it. */
11293 static void
11294 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11295 stmt_vec_info def_stmt_info)
11297 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11298 if (!def || TREE_CODE (def) != SSA_NAME)
11299 return;
11300 stmt_vec_info phi_info;
11301 imm_use_iterator iter;
11302 use_operand_p use_p;
11303 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11305 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11306 if (!phi)
11307 continue;
11308 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11309 && (phi_info = loop_vinfo->lookup_stmt (phi))
11310 && STMT_VINFO_RELEVANT_P (phi_info)))
11311 continue;
11312 loop_p loop = gimple_bb (phi)->loop_father;
11313 edge e = loop_latch_edge (loop);
11314 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11315 continue;
11317 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11318 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11319 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11321 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11322 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11323 gcc_assert (phi_defs.length () == latch_defs.length ());
11324 for (unsigned i = 0; i < phi_defs.length (); ++i)
11325 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11326 gimple_get_lhs (latch_defs[i]), e,
11327 gimple_phi_arg_location (phi, e->dest_idx));
11329 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11331 /* For first order recurrences we have to update both uses of
11332 the latch definition, the one in the PHI node and the one
11333 in the generated VEC_PERM_EXPR. */
11334 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11335 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11336 gcc_assert (phi_defs.length () == latch_defs.length ());
11337 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11338 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11339 for (unsigned i = 0; i < phi_defs.length (); ++i)
11341 gassign *perm = as_a <gassign *> (phi_defs[i]);
11342 if (i > 0)
11343 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11344 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11345 update_stmt (perm);
11347 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11348 gimple_phi_arg_location (phi, e->dest_idx));
11353 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11354 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11355 stmt_vec_info. */
11357 static bool
11358 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11359 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11362 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11364 if (dump_enabled_p ())
11365 dump_printf_loc (MSG_NOTE, vect_location,
11366 "------>vectorizing statement: %G", stmt_info->stmt);
11368 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11369 vect_loop_kill_debug_uses (loop, stmt_info);
11371 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11372 && !STMT_VINFO_LIVE_P (stmt_info))
11374 if (is_gimple_call (stmt_info->stmt)
11375 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11377 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11378 *seen_store = stmt_info;
11379 return false;
11381 return false;
11384 if (STMT_VINFO_VECTYPE (stmt_info))
11386 poly_uint64 nunits
11387 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11388 if (!STMT_SLP_TYPE (stmt_info)
11389 && maybe_ne (nunits, vf)
11390 && dump_enabled_p ())
11391 /* For SLP VF is set according to unrolling factor, and not
11392 to vector size, hence for SLP this print is not valid. */
11393 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11396 /* Pure SLP statements have already been vectorized. We still need
11397 to apply loop vectorization to hybrid SLP statements. */
11398 if (PURE_SLP_STMT (stmt_info))
11399 return false;
11401 if (dump_enabled_p ())
11402 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11404 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11405 *seen_store = stmt_info;
11407 return true;
11410 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11411 in the hash_map with their corresponding values. */
11413 static tree
11414 find_in_mapping (tree t, void *context)
11416 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11418 tree *value = mapping->get (t);
11419 return value ? *value : t;
11422 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11423 original loop that has now been vectorized.
11425 The inits of the data_references need to be advanced with the number of
11426 iterations of the main loop. This has been computed in vect_do_peeling and
11427 is stored in parameter ADVANCE. We first restore the data_references'
11428 initial offsets with the values recorded in ORIG_DRS_INIT.
11430 Since the loop_vec_info of this EPILOGUE was constructed for the original
11431 loop, its stmt_vec_infos all point to the original statements. These need
11432 to be updated to point to their corresponding copies as well as the SSA_NAMES
11433 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11435 The data_reference's connections also need to be updated. Their
11436 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11437 stmt_vec_infos, their statements need to point to their corresponding copy,
11438 if they are gather loads or scatter stores then their reference needs to be
11439 updated to point to its corresponding copy and finally we set
11440 'base_misaligned' to false as we have already peeled for alignment in the
11441 prologue of the main loop. */
11443 static void
11444 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11446 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11447 auto_vec<gimple *> stmt_worklist;
11448 hash_map<tree,tree> mapping;
11449 gimple *orig_stmt, *new_stmt;
11450 gimple_stmt_iterator epilogue_gsi;
11451 gphi_iterator epilogue_phi_gsi;
11452 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11453 basic_block *epilogue_bbs = get_loop_body (epilogue);
11454 unsigned i;
11456 free (LOOP_VINFO_BBS (epilogue_vinfo));
11457 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11459 /* Advance data_references with the number of iterations of the previous
11460 loop and its prologue. */
11461 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11464 /* The EPILOGUE loop is a copy of the original loop so they share the same
11465 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11466 point to the copied statements. We also create a mapping of all LHS' in
11467 the original loop and all the LHS' in the EPILOGUE and create worklists to
11468 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11469 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11471 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11472 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11474 new_stmt = epilogue_phi_gsi.phi ();
11476 gcc_assert (gimple_uid (new_stmt) > 0);
11477 stmt_vinfo
11478 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11480 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11481 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11483 mapping.put (gimple_phi_result (orig_stmt),
11484 gimple_phi_result (new_stmt));
11485 /* PHI nodes can not have patterns or related statements. */
11486 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11487 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11490 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11491 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11493 new_stmt = gsi_stmt (epilogue_gsi);
11494 if (is_gimple_debug (new_stmt))
11495 continue;
11497 gcc_assert (gimple_uid (new_stmt) > 0);
11498 stmt_vinfo
11499 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11501 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11502 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11504 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11505 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11507 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11509 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11510 for (gimple_stmt_iterator gsi = gsi_start (seq);
11511 !gsi_end_p (gsi); gsi_next (&gsi))
11512 stmt_worklist.safe_push (gsi_stmt (gsi));
11515 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11516 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11518 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11519 stmt_worklist.safe_push (stmt);
11520 /* Set BB such that the assert in
11521 'get_initial_def_for_reduction' is able to determine that
11522 the BB of the related stmt is inside this loop. */
11523 gimple_set_bb (stmt,
11524 gimple_bb (new_stmt));
11525 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11526 gcc_assert (related_vinfo == NULL
11527 || related_vinfo == stmt_vinfo);
11532 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11533 using the original main loop and thus need to be updated to refer to the
11534 cloned variables used in the epilogue. */
11535 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11537 gimple *stmt = stmt_worklist[i];
11538 tree *new_op;
11540 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11542 tree op = gimple_op (stmt, j);
11543 if ((new_op = mapping.get(op)))
11544 gimple_set_op (stmt, j, *new_op);
11545 else
11547 /* PR92429: The last argument of simplify_replace_tree disables
11548 folding when replacing arguments. This is required as
11549 otherwise you might end up with different statements than the
11550 ones analyzed in vect_loop_analyze, leading to different
11551 vectorization. */
11552 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11553 &find_in_mapping, &mapping, false);
11554 gimple_set_op (stmt, j, op);
11559 struct data_reference *dr;
11560 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11561 FOR_EACH_VEC_ELT (datarefs, i, dr)
11563 orig_stmt = DR_STMT (dr);
11564 gcc_assert (gimple_uid (orig_stmt) > 0);
11565 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11566 /* Data references for gather loads and scatter stores do not use the
11567 updated offset we set using ADVANCE. Instead we have to make sure the
11568 reference in the data references points to the corresponding copy of
11569 the original in the epilogue. Make sure to update both
11570 gather/scatters recognized by dataref analysis and also other
11571 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11572 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11573 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11574 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11576 DR_REF (dr)
11577 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11578 &find_in_mapping, &mapping);
11579 DR_BASE_ADDRESS (dr)
11580 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11581 &find_in_mapping, &mapping);
11583 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11584 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11585 /* The vector size of the epilogue is smaller than that of the main loop
11586 so the alignment requirement is either the same or lower. This means
11587 the dr will by definition be aligned. */
11588 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11591 epilogue_vinfo->shared->datarefs_copy.release ();
11592 epilogue_vinfo->shared->save_datarefs ();
11595 /* Function vect_transform_loop.
11597 The analysis phase has determined that the loop is vectorizable.
11598 Vectorize the loop - created vectorized stmts to replace the scalar
11599 stmts in the loop, and update the loop exit condition.
11600 Returns scalar epilogue loop if any. */
11602 class loop *
11603 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11605 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11606 class loop *epilogue = NULL;
11607 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11608 int nbbs = loop->num_nodes;
11609 int i;
11610 tree niters_vector = NULL_TREE;
11611 tree step_vector = NULL_TREE;
11612 tree niters_vector_mult_vf = NULL_TREE;
11613 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11614 unsigned int lowest_vf = constant_lower_bound (vf);
11615 gimple *stmt;
11616 bool check_profitability = false;
11617 unsigned int th;
11618 bool flat = maybe_flat_loop_profile (loop);
11620 DUMP_VECT_SCOPE ("vec_transform_loop");
11622 loop_vinfo->shared->check_datarefs ();
11624 /* Use the more conservative vectorization threshold. If the number
11625 of iterations is constant, assume the cost check has been performed
11626 by our caller. If the threshold makes all loops profitable that
11627 run at least the (estimated) vectorization factor number of times,
11628 checking is pointless, too. */
11629 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11630 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11632 if (dump_enabled_p ())
11633 dump_printf_loc (MSG_NOTE, vect_location,
11634 "Profitability threshold is %d loop iterations.\n",
11635 th);
11636 check_profitability = true;
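/* Note: roughly speaking, when CHECK_PROFITABILITY is set the guards
   emitted below by vect_loop_versioning or vect_do_peeling compare the
   scalar iteration count against TH, so that loops running fewer than
   TH iterations keep using the scalar code.  */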
11639 /* Make sure there exists a single-predecessor exit bb. Do this before
11640 versioning. */
11641 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11642 if (! single_pred_p (e->dest))
11644 split_loop_exit_edge (e, true);
11645 if (dump_enabled_p ())
11646 dump_printf (MSG_NOTE, "split exit edge\n");
11649 /* Version the loop first, if required, so the profitability check
11650 comes first. */
11652 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11654 class loop *sloop
11655 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11656 sloop->force_vectorize = false;
11657 check_profitability = false;
11660 /* Make sure there exists a single-predecessor exit bb also on the
11661 scalar loop copy. Do this after versioning but before peeling
11662 so the CFG structure is fine for both the scalar and the if-converted
11663 loop and slpeel_duplicate_current_defs_from_edges sees matched
11664 loop-closed PHI nodes on the exit. */
11665 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11667 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11668 if (! single_pred_p (e->dest))
11670 split_loop_exit_edge (e, true);
11671 if (dump_enabled_p ())
11672 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11676 tree niters = vect_build_loop_niters (loop_vinfo);
11677 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11678 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11679 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11680 tree advance;
11681 drs_init_vec orig_drs_init;
11683 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11684 &step_vector, &niters_vector_mult_vf, th,
11685 check_profitability, niters_no_overflow,
11686 &advance);
11687 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11688 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11690 /* Ifcvt duplicates the loop preheader and loop body and produces a
11691 basic block after the loop exit. We need to scale all of that. */
11692 basic_block preheader
11693 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11694 preheader->count
11695 = preheader->count.apply_probability
11696 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11697 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11698 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11699 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11700 = preheader->count;
11703 if (niters_vector == NULL_TREE)
11705 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11706 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11707 && known_eq (lowest_vf, vf))
11709 niters_vector
11710 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11711 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11712 step_vector = build_one_cst (TREE_TYPE (niters));
11714 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11715 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11716 &step_vector, niters_no_overflow);
11717 else
11718 /* vect_do_peeling subtracted the number of peeled prologue
11719 iterations from LOOP_VINFO_NITERS. */
11720 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11721 &niters_vector, &step_vector,
11722 niters_no_overflow);
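/* As an illustration (assuming no prologue peeling): with a compile-time
   NITERS of 17 and a constant VF of 4, NITERS_VECTOR becomes 17 / 4 == 4
   and STEP_VECTOR becomes 1; the remaining scalar iteration is handled by
   the epilogue.  */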
11725 /* 1) Make sure the loop header has exactly two entries
11726 2) Make sure we have a preheader basic block. */
11728 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11730 split_edge (loop_preheader_edge (loop));
11732 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11733 /* This will deal with any possible peeling. */
11734 vect_prepare_for_masked_peels (loop_vinfo);
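/* For fully-masked loops the peeling for alignment is not done by a
   scalar prologue loop; instead the first vector iteration masks off the
   leading LOOP_VINFO_MASK_SKIP_NITERS elements, which the call above
   prepares.  */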
11736 /* Schedule the SLP instances first, then handle loop vectorization
11737 below. */
11738 if (!loop_vinfo->slp_instances.is_empty ())
11740 DUMP_VECT_SCOPE ("scheduling SLP instances");
11741 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11744 /* FORNOW: the vectorizer supports only loops whose body consists
11745 of one basic block (header + empty latch). When the vectorizer
11746 supports more involved loop forms, the order in which the BBs are
11747 traversed needs to be reconsidered. */
11749 for (i = 0; i < nbbs; i++)
11751 basic_block bb = bbs[i];
11752 stmt_vec_info stmt_info;
11754 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11755 gsi_next (&si))
11757 gphi *phi = si.phi ();
11758 if (dump_enabled_p ())
11759 dump_printf_loc (MSG_NOTE, vect_location,
11760 "------>vectorizing phi: %G", (gimple *) phi);
11761 stmt_info = loop_vinfo->lookup_stmt (phi);
11762 if (!stmt_info)
11763 continue;
11765 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11766 vect_loop_kill_debug_uses (loop, stmt_info);
11768 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11769 && !STMT_VINFO_LIVE_P (stmt_info))
11770 continue;
11772 if (STMT_VINFO_VECTYPE (stmt_info)
11773 && (maybe_ne
11774 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11775 && dump_enabled_p ())
11776 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11778 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11779 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11780 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11781 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11782 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11783 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11784 && ! PURE_SLP_STMT (stmt_info))
11786 if (dump_enabled_p ())
11787 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11788 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11792 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11793 gsi_next (&si))
11795 gphi *phi = si.phi ();
11796 stmt_info = loop_vinfo->lookup_stmt (phi);
11797 if (!stmt_info)
11798 continue;
11800 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11801 && !STMT_VINFO_LIVE_P (stmt_info))
11802 continue;
11804 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11805 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11806 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11807 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11808 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11809 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11810 && ! PURE_SLP_STMT (stmt_info))
11811 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11814 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11815 !gsi_end_p (si);)
11817 stmt = gsi_stmt (si);
11818 /* During vectorization remove existing clobber stmts. */
11819 if (gimple_clobber_p (stmt))
11821 unlink_stmt_vdef (stmt);
11822 gsi_remove (&si, true);
11823 release_defs (stmt);
11825 else
11827 /* Ignore vector stmts created in the outer loop. */
11828 stmt_info = loop_vinfo->lookup_stmt (stmt);
11830 /* vector stmts created in the outer-loop during vectorization of
11831 stmts in an inner-loop may not have a stmt_info, and do not
11832 need to be vectorized. */
11833 stmt_vec_info seen_store = NULL;
11834 if (stmt_info)
11836 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11838 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11839 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11840 !gsi_end_p (subsi); gsi_next (&subsi))
11842 stmt_vec_info pat_stmt_info
11843 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11844 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11845 &si, &seen_store);
11847 stmt_vec_info pat_stmt_info
11848 = STMT_VINFO_RELATED_STMT (stmt_info);
11849 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11850 &si, &seen_store))
11851 maybe_set_vectorized_backedge_value (loop_vinfo,
11852 pat_stmt_info);
11854 else
11856 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11857 &seen_store))
11858 maybe_set_vectorized_backedge_value (loop_vinfo,
11859 stmt_info);
11862 gsi_next (&si);
11863 if (seen_store)
11865 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11866 /* Interleaving. The vectorization of the
11867 interleaving chain was completed - free all
11868 the stores in the chain. */
11869 vect_remove_stores (loop_vinfo,
11870 DR_GROUP_FIRST_ELEMENT (seen_store));
11871 else
11872 /* Free the attached stmt_vec_info and remove the stmt. */
11873 loop_vinfo->remove_stmt (stmt_info);
11878 /* Stub out scalar statements that must not survive vectorization.
11879 Doing this here helps with grouped statements, or statements that
11880 are involved in patterns. */
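/* Concretely (see the loop below): a leftover scalar call such as
   lhs = .MASK_LOAD (ptr, align, mask) whose LHS is not a vector is
   replaced by lhs = 0, and a leftover scalar conditional call such as
   lhs = .COND_ADD (mask, a, b, else) is replaced by lhs = else.  */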
11881 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11882 !gsi_end_p (gsi); gsi_next (&gsi))
11884 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11885 if (!call || !gimple_call_internal_p (call))
11886 continue;
11887 internal_fn ifn = gimple_call_internal_fn (call);
11888 if (ifn == IFN_MASK_LOAD)
11890 tree lhs = gimple_get_lhs (call);
11891 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11893 tree zero = build_zero_cst (TREE_TYPE (lhs));
11894 gimple *new_stmt = gimple_build_assign (lhs, zero);
11895 gsi_replace (&gsi, new_stmt, true);
11898 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11900 tree lhs = gimple_get_lhs (call);
11901 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11903 tree else_arg
11904 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11905 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11906 gsi_replace (&gsi, new_stmt, true);
11910 } /* BBs in loop */
11912 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11913 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11914 if (integer_onep (step_vector))
11915 niters_no_overflow = true;
11916 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11917 niters_vector, step_vector, niters_vector_mult_vf,
11918 !niters_no_overflow);
11920 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11922 /* True if the final iteration might not handle a full vector's
11923 worth of scalar iterations. */
11924 bool final_iter_may_be_partial
11925 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11926 /* The minimum number of iterations performed by the epilogue. This
11927 is 1 when peeling for gaps because we always need a final scalar
11928 iteration. */
11929 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11930 /* +1 to convert latch counts to loop iteration counts,
11931 -min_epilogue_iters to remove iterations that cannot be performed
11932 by the vector code. */
11933 int bias_for_lowest = 1 - min_epilogue_iters;
11934 int bias_for_assumed = bias_for_lowest;
11935 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11936 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11938 /* When the amount of peeling is known at compile time, the first
11939 iteration will have exactly alignment_npeels active elements.
11940 In the worst case it will have at least one. */
11941 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11942 bias_for_lowest += lowest_vf - min_first_active;
11943 bias_for_assumed += assumed_vf - min_first_active;
11945 /* In these calculations the "- 1" converts loop iteration counts
11946 back to latch counts. */
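/* A worked example (no peeling for gaps, no partial vectors): with a
   scalar latch bound of 11 (at most 12 iterations) and LOWEST_VF == 4,
   BIAS_FOR_LOWEST is 1 and the new bound is (11 + 1) / 4 - 1 == 2,
   i.e. at most 3 vector iterations.  */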
11947 if (loop->any_upper_bound)
11949 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11950 loop->nb_iterations_upper_bound
11951 = (final_iter_may_be_partial
11952 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11953 lowest_vf) - 1
11954 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11955 lowest_vf) - 1);
11956 if (main_vinfo
11957 /* Both peeling for alignment and peeling for gaps can end up
11958 with the scalar epilogue running for more than VF-1 iterations. */
11959 && !main_vinfo->peeling_for_alignment
11960 && !main_vinfo->peeling_for_gaps)
11962 unsigned int bound;
11963 poly_uint64 main_iters
11964 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11965 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11966 main_iters
11967 = upper_bound (main_iters,
11968 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11969 if (can_div_away_from_zero_p (main_iters,
11970 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11971 &bound))
11972 loop->nb_iterations_upper_bound
11973 = wi::umin ((bound_wide_int) (bound - 1),
11974 loop->nb_iterations_upper_bound);
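/* For instance, if the main loop has a VF of 16 and both thresholds are
   at most 16, MAIN_ITERS is 16; with an epilogue VF of 8 this gives
   BOUND == 2, so the epilogue's latch bound is capped at 1, i.e. at
   most two epilogue iterations.  */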
11977 if (loop->any_likely_upper_bound)
11978 loop->nb_iterations_likely_upper_bound
11979 = (final_iter_may_be_partial
11980 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11981 + bias_for_lowest, lowest_vf) - 1
11982 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11983 + bias_for_lowest, lowest_vf) - 1);
11984 if (loop->any_estimate)
11985 loop->nb_iterations_estimate
11986 = (final_iter_may_be_partial
11987 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11988 assumed_vf) - 1
11989 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11990 assumed_vf) - 1);
11991 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11992 assumed_vf, flat);
11994 if (dump_enabled_p ())
11996 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11998 dump_printf_loc (MSG_NOTE, vect_location,
11999 "LOOP VECTORIZED\n");
12000 if (loop->inner)
12001 dump_printf_loc (MSG_NOTE, vect_location,
12002 "OUTER LOOP VECTORIZED\n");
12003 dump_printf (MSG_NOTE, "\n");
12005 else
12006 dump_printf_loc (MSG_NOTE, vect_location,
12007 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12008 GET_MODE_NAME (loop_vinfo->vector_mode));
12011 /* Loops vectorized with a variable factor won't benefit from
12012 unrolling/peeling. */
12013 if (!vf.is_constant ())
12015 loop->unroll = 1;
12016 if (dump_enabled_p ())
12017 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12018 " variable-length vectorization factor\n");
12020 /* Free SLP instances here because otherwise stmt reference counting
12021 won't work. */
12022 slp_instance instance;
12023 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12024 vect_free_slp_instance (instance);
12025 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12026 /* Clear the safelen field since its value is invalid after vectorization:
12027 the vectorized loop can have loop-carried dependencies. */
12028 loop->safelen = 0;
12030 if (epilogue)
12032 update_epilogue_loop_vinfo (epilogue, advance);
12034 epilogue->simduid = loop->simduid;
12035 epilogue->force_vectorize = loop->force_vectorize;
12036 epilogue->dont_vectorize = false;
12039 return epilogue;
12042 /* The code below performs a simple optimization - it reverts
12043 if-conversion for masked stores: if the mask of a store is zero, the
12044 store is not performed and, if possible, neither are the producers of the stored values.
12045 For example,
12046 for (i=0; i<n; i++)
12047 if (c[i])
12049 p1[i] += 1;
12050 p2[i] = p3[i] + 2;
12052 this transformation will produce the following semi-hammock:
12054 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12056 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12057 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12058 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12059 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12060 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12061 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12065 void
12066 optimize_mask_stores (class loop *loop)
12068 basic_block *bbs = get_loop_body (loop);
12069 unsigned nbbs = loop->num_nodes;
12070 unsigned i;
12071 basic_block bb;
12072 class loop *bb_loop;
12073 gimple_stmt_iterator gsi;
12074 gimple *stmt;
12075 auto_vec<gimple *> worklist;
12076 auto_purge_vect_location sentinel;
12078 vect_location = find_loop_location (loop);
12079 /* Pick up all masked stores in loop if any. */
12080 for (i = 0; i < nbbs; i++)
12082 bb = bbs[i];
12083 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12084 gsi_next (&gsi))
12086 stmt = gsi_stmt (gsi);
12087 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12088 worklist.safe_push (stmt);
12092 free (bbs);
12093 if (worklist.is_empty ())
12094 return;
12096 /* Loop has masked stores. */
12097 while (!worklist.is_empty ())
12099 gimple *last, *last_store;
12100 edge e, efalse;
12101 tree mask;
12102 basic_block store_bb, join_bb;
12103 gimple_stmt_iterator gsi_to;
12104 tree vdef, new_vdef;
12105 gphi *phi;
12106 tree vectype;
12107 tree zero;
12109 last = worklist.pop ();
12110 mask = gimple_call_arg (last, 2);
12111 bb = gimple_bb (last);
12112 /* Create then_bb and the if-then structure in the CFG; then_bb
12113 belongs to the same loop as if_bb. That loop can differ from LOOP
12114 when a two-level loop nest is vectorized and the mask_store belongs
12115 to the inner loop. */
12116 e = split_block (bb, last);
12117 bb_loop = bb->loop_father;
12118 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12119 join_bb = e->dest;
12120 store_bb = create_empty_bb (bb);
12121 add_bb_to_loop (store_bb, bb_loop);
12122 e->flags = EDGE_TRUE_VALUE;
12123 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12124 /* Put STORE_BB on the likely path. */
12125 efalse->probability = profile_probability::likely ();
12126 e->probability = efalse->probability.invert ();
12127 store_bb->count = efalse->count ();
12128 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12129 if (dom_info_available_p (CDI_DOMINATORS))
12130 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12131 if (dump_enabled_p ())
12132 dump_printf_loc (MSG_NOTE, vect_location,
12133 "Create new block %d to sink mask stores.",
12134 store_bb->index);
12135 /* Create vector comparison with boolean result. */
12136 vectype = TREE_TYPE (mask);
12137 zero = build_zero_cst (vectype);
12138 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12139 gsi = gsi_last_bb (bb);
12140 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12141 /* Create new PHI node for vdef of the last masked store:
12142 .MEM_2 = VDEF <.MEM_1>
12143 will be converted to
12144 .MEM_3 = VDEF <.MEM_1>
12145 and new PHI node will be created in join bb
12146 .MEM_2 = PHI <.MEM_1, .MEM_3>
12148 vdef = gimple_vdef (last);
12149 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12150 gimple_set_vdef (last, new_vdef);
12151 phi = create_phi_node (vdef, join_bb);
12152 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12154 /* Put all masked stores with the same mask to STORE_BB if possible. */
12155 while (true)
12157 gimple_stmt_iterator gsi_from;
12158 gimple *stmt1 = NULL;
12160 /* Move masked store to STORE_BB. */
12161 last_store = last;
12162 gsi = gsi_for_stmt (last);
12163 gsi_from = gsi;
12164 /* Shift GSI to the previous stmt for further traversal. */
12165 gsi_prev (&gsi);
12166 gsi_to = gsi_start_bb (store_bb);
12167 gsi_move_before (&gsi_from, &gsi_to);
12168 /* Setup GSI_TO to the non-empty block start. */
12169 gsi_to = gsi_start_bb (store_bb);
12170 if (dump_enabled_p ())
12171 dump_printf_loc (MSG_NOTE, vect_location,
12172 "Move stmt to created bb\n%G", last);
12173 /* Move all stored value producers if possible. */
12174 while (!gsi_end_p (gsi))
12176 tree lhs;
12177 imm_use_iterator imm_iter;
12178 use_operand_p use_p;
12179 bool res;
12181 /* Skip debug statements. */
12182 if (is_gimple_debug (gsi_stmt (gsi)))
12184 gsi_prev (&gsi);
12185 continue;
12187 stmt1 = gsi_stmt (gsi);
12188 /* Do not consider statements writing to memory or having
12189 a volatile operand. */
12190 if (gimple_vdef (stmt1)
12191 || gimple_has_volatile_ops (stmt1))
12192 break;
12193 gsi_from = gsi;
12194 gsi_prev (&gsi);
12195 lhs = gimple_get_lhs (stmt1);
12196 if (!lhs)
12197 break;
12199 /* LHS of vectorized stmt must be SSA_NAME. */
12200 if (TREE_CODE (lhs) != SSA_NAME)
12201 break;
12203 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12205 /* Remove dead scalar statement. */
12206 if (has_zero_uses (lhs))
12208 gsi_remove (&gsi_from, true);
12209 continue;
12213 /* Check that LHS does not have uses outside of STORE_BB. */
12214 res = true;
12215 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12217 gimple *use_stmt;
12218 use_stmt = USE_STMT (use_p);
12219 if (is_gimple_debug (use_stmt))
12220 continue;
12221 if (gimple_bb (use_stmt) != store_bb)
12223 res = false;
12224 break;
12227 if (!res)
12228 break;
12230 if (gimple_vuse (stmt1)
12231 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12232 break;
12234 /* Can move STMT1 to STORE_BB. */
12235 if (dump_enabled_p ())
12236 dump_printf_loc (MSG_NOTE, vect_location,
12237 "Move stmt to created bb\n%G", stmt1);
12238 gsi_move_before (&gsi_from, &gsi_to);
12239 /* Shift GSI_TO for further insertion. */
12240 gsi_prev (&gsi_to);
12242 /* Put other masked stores with the same mask to STORE_BB. */
12243 if (worklist.is_empty ()
12244 || gimple_call_arg (worklist.last (), 2) != mask
12245 || worklist.last () != stmt1)
12246 break;
12247 last = worklist.pop ();
12249 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12253 /* Decide whether it is possible to use a zero-based induction variable
12254 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12255 the value that the induction variable must be able to hold in order
12256 to ensure that the rgroups eventually have no active vector elements.
12257 Return -1 otherwise. */
12259 widest_int
12260 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12262 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12263 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12264 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12266 /* Calculate the value that the induction variable must be able
12267 to hit in order to ensure that we end the loop with an all-false mask.
12268 This involves adding the maximum number of inactive trailing scalar
12269 iterations. */
12270 widest_int iv_limit = -1;
12271 if (max_loop_iterations (loop, &iv_limit))
12273 if (niters_skip)
12275 /* Add the maximum number of skipped iterations to the
12276 maximum iteration count. */
12277 if (TREE_CODE (niters_skip) == INTEGER_CST)
12278 iv_limit += wi::to_widest (niters_skip);
12279 else
12280 iv_limit += max_vf - 1;
12282 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12283 /* Make a conservatively-correct assumption. */
12284 iv_limit += max_vf - 1;
12286 /* IV_LIMIT is the maximum number of latch iterations, which is also
12287 the maximum in-range IV value. Round this value down to the previous
12288 vector alignment boundary and then add an extra full iteration. */
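/* For instance, if IV_LIMIT is 22 at this point and the loop has a
   constant VF of 8, the result is (22 & -8) + 8 == 24, the value the
   IV must be able to reach.  */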
12289 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12290 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12292 return iv_limit;
12295 /* For the given rgroup_controls RGC, check whether an induction variable
12296 would ever hit a value that produces a set of all-false masks or zero
12297 lengths before wrapping around. Return true if it's possible to wrap
12298 around before hitting the desired value, otherwise return false. */
12300 bool
12301 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12303 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12305 if (iv_limit == -1)
12306 return true;
12308 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12309 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12310 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
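/* For example, with a 16-bit COMPARE_TYPE, IV_LIMIT == 5000 and
   NITEMS == 16 the IV would have to count up to 80000, which needs
   17 bits, so it can wrap and we return true.  */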
12312 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12313 return true;
12315 return false;