tree-optimization/113126 - vector extension compare optimization
[official-gcc.git] / gcc / tree-vect-loop.cc
blob1bdad0fbe0f4ac7bec5ad4891746d877f5900e92
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS which base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations which vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4byte elements,
264 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variables in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer type
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates there will be some subsequent
585 slp analyses or not. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also to its
732 inner-loop, if exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
945 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 unshare_expr (niter),
947 build_int_cst (TREE_TYPE (niter), 1));
948 if (TREE_CODE (niter) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, ensure back
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
954 PR113210. */
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 build_minus_one_cst (TREE_TYPE (niter)));
960 *number_of_iterations = niter;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
966 return conds;
969 /* Determine the main loop exit for the vectorizer. */
971 edge
972 vec_init_loop_exit_info (class loop *loop)
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxilary exits. */
976 auto_vec<edge> exits = get_loop_exit_edges (loop);
977 if (exits.length () == 1)
978 return exits[0];
980 /* If we have multiple exits we only support counting IV at the moment. Analyze
981 all exits and return one */
982 class tree_niter_desc niter_desc;
983 edge candidate = NULL;
984 for (edge exit : exits)
986 if (!get_loop_exit_condition (exit))
987 continue;
989 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 && !chrec_contains_undetermined (niter_desc.niter))
992 if (!niter_desc.may_be_zero || !candidate)
993 candidate = exit;
997 return candidate;
1000 /* Function bb_in_loop_p
1002 Used as predicate for dfs order traversal of the loop bbs. */
1004 static bool
1005 bb_in_loop_p (const_basic_block bb, const void *data)
1007 const class loop *const loop = (const class loop *)data;
1008 if (flow_bb_inside_loop_p (loop, bb))
1009 return true;
1010 return false;
1014 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1015 stmt_vec_info structs for all the stmts in LOOP_IN. */
1017 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1018 : vec_info (vec_info::loop, shared),
1019 loop (loop_in),
1020 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1021 num_itersm1 (NULL_TREE),
1022 num_iters (NULL_TREE),
1023 num_iters_unchanged (NULL_TREE),
1024 num_iters_assumptions (NULL_TREE),
1025 vector_costs (nullptr),
1026 scalar_costs (nullptr),
1027 th (0),
1028 versioning_threshold (0),
1029 vectorization_factor (0),
1030 main_loop_edge (nullptr),
1031 skip_main_loop_edge (nullptr),
1032 skip_this_loop_edge (nullptr),
1033 reusable_accumulators (),
1034 suggested_unroll_factor (1),
1035 max_vectorization_factor (0),
1036 mask_skip_niters (NULL_TREE),
1037 rgroup_compare_type (NULL_TREE),
1038 simd_if_cond (NULL_TREE),
1039 partial_vector_style (vect_partial_vectors_none),
1040 unaligned_dr (NULL),
1041 peeling_for_alignment (0),
1042 ptr_mask (0),
1043 ivexpr_map (NULL),
1044 scan_map (NULL),
1045 slp_unrolling_factor (1),
1046 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1047 vectorizable (false),
1048 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1049 using_partial_vectors_p (false),
1050 using_decrementing_iv_p (false),
1051 using_select_vl_p (false),
1052 epil_using_partial_vectors_p (false),
1053 partial_load_store_bias (0),
1054 peeling_for_gaps (false),
1055 peeling_for_niter (false),
1056 early_breaks (false),
1057 no_data_dependencies (false),
1058 has_mask_store (false),
1059 scalar_loop_scaling (profile_probability::uninitialized ()),
1060 scalar_loop (NULL),
1061 orig_loop_info (NULL),
1062 vec_loop_iv_exit (NULL),
1063 vec_epilogue_loop_iv_exit (NULL),
1064 scalar_loop_iv_exit (NULL)
1066 /* CHECKME: We want to visit all BBs before their successors (except for
1067 latch blocks, for which this assertion wouldn't hold). In the simple
1068 case of the loop forms we allow, a dfs order of the BBs would the same
1069 as reversed postorder traversal, so we are safe. */
1071 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1072 bbs, loop->num_nodes, loop);
1073 gcc_assert (nbbs == loop->num_nodes);
1075 for (unsigned int i = 0; i < nbbs; i++)
1077 basic_block bb = bbs[i];
1078 gimple_stmt_iterator si;
1080 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1082 gimple *phi = gsi_stmt (si);
1083 gimple_set_uid (phi, 0);
1084 add_stmt (phi);
1087 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1089 gimple *stmt = gsi_stmt (si);
1090 gimple_set_uid (stmt, 0);
1091 if (is_gimple_debug (stmt))
1092 continue;
1093 add_stmt (stmt);
1094 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1095 third argument is the #pragma omp simd if (x) condition, when 0,
1096 loop shouldn't be vectorized, when non-zero constant, it should
1097 be vectorized normally, otherwise versioned with vectorized loop
1098 done if the condition is non-zero at runtime. */
1099 if (loop_in->simduid
1100 && is_gimple_call (stmt)
1101 && gimple_call_internal_p (stmt)
1102 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1103 && gimple_call_num_args (stmt) >= 3
1104 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1105 && (loop_in->simduid
1106 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1108 tree arg = gimple_call_arg (stmt, 2);
1109 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1110 simd_if_cond = arg;
1111 else
1112 gcc_assert (integer_nonzerop (arg));
1117 epilogue_vinfos.create (6);
1120 /* Free all levels of rgroup CONTROLS. */
1122 void
1123 release_vec_loop_controls (vec<rgroup_controls> *controls)
1125 rgroup_controls *rgc;
1126 unsigned int i;
1127 FOR_EACH_VEC_ELT (*controls, i, rgc)
1128 rgc->controls.release ();
1129 controls->release ();
1132 /* Free all memory used by the _loop_vec_info, as well as all the
1133 stmt_vec_info structs of all the stmts in the loop. */
1135 _loop_vec_info::~_loop_vec_info ()
1137 free (bbs);
1139 release_vec_loop_controls (&masks.rgc_vec);
1140 release_vec_loop_controls (&lens);
1141 delete ivexpr_map;
1142 delete scan_map;
1143 epilogue_vinfos.release ();
1144 delete scalar_costs;
1145 delete vector_costs;
1147 /* When we release an epiloge vinfo that we do not intend to use
1148 avoid clearing AUX of the main loop which should continue to
1149 point to the main loop vinfo since otherwise we'll leak that. */
1150 if (loop->aux == this)
1151 loop->aux = NULL;
1154 /* Return an invariant or register for EXPR and emit necessary
1155 computations in the LOOP_VINFO loop preheader. */
1157 tree
1158 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1160 if (is_gimple_reg (expr)
1161 || is_gimple_min_invariant (expr))
1162 return expr;
1164 if (! loop_vinfo->ivexpr_map)
1165 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1166 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1167 if (! cached)
1169 gimple_seq stmts = NULL;
1170 cached = force_gimple_operand (unshare_expr (expr),
1171 &stmts, true, NULL_TREE);
1172 if (stmts)
1174 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1175 gsi_insert_seq_on_edge_immediate (e, stmts);
1178 return cached;
1181 /* Return true if we can use CMP_TYPE as the comparison type to produce
1182 all masks required to mask LOOP_VINFO. */
1184 static bool
1185 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1187 rgroup_controls *rgm;
1188 unsigned int i;
1189 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1190 if (rgm->type != NULL_TREE
1191 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1192 cmp_type, rgm->type,
1193 OPTIMIZE_FOR_SPEED))
1194 return false;
1195 return true;
1198 /* Calculate the maximum number of scalars per iteration for every
1199 rgroup in LOOP_VINFO. */
1201 static unsigned int
1202 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1204 unsigned int res = 1;
1205 unsigned int i;
1206 rgroup_controls *rgm;
1207 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1208 res = MAX (res, rgm->max_nscalars_per_iter);
1209 return res;
1212 /* Calculate the minimum precision necessary to represent:
1214 MAX_NITERS * FACTOR
1216 as an unsigned integer, where MAX_NITERS is the maximum number of
1217 loop header iterations for the original scalar form of LOOP_VINFO. */
1219 static unsigned
1220 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1222 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1224 /* Get the maximum number of iterations that is representable
1225 in the counter type. */
1226 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1227 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1229 /* Get a more refined estimate for the number of iterations. */
1230 widest_int max_back_edges;
1231 if (max_loop_iterations (loop, &max_back_edges))
1232 max_ni = wi::smin (max_ni, max_back_edges + 1);
1234 /* Work out how many bits we need to represent the limit. */
1235 return wi::min_precision (max_ni * factor, UNSIGNED);
1238 /* True if the loop needs peeling or partial vectors when vectorized. */
1240 static bool
1241 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1243 unsigned HOST_WIDE_INT const_vf;
1244 HOST_WIDE_INT max_niter
1245 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1247 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1248 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1249 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1250 (loop_vinfo));
1252 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1253 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1255 /* Work out the (constant) number of iterations that need to be
1256 peeled for reasons other than niters. */
1257 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1258 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1259 peel_niter += 1;
1260 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1261 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1262 return true;
1264 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1265 /* ??? When peeling for gaps but not alignment, we could
1266 try to check whether the (variable) niters is known to be
1267 VF * N + 1. That's something of a niche case though. */
1268 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1269 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1270 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1271 < (unsigned) exact_log2 (const_vf))
1272 /* In case of versioning, check if the maximum number of
1273 iterations is greater than th. If they are identical,
1274 the epilogue is unnecessary. */
1275 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1276 || ((unsigned HOST_WIDE_INT) max_niter
1277 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1278 but that's only computed later based on our result.
1279 The following is the most conservative approximation. */
1280 > (std::max ((unsigned HOST_WIDE_INT) th,
1281 const_vf) / const_vf) * const_vf))))
1282 return true;
1284 return false;
1287 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1288 whether we can actually generate the masks required. Return true if so,
1289 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1291 static bool
1292 vect_verify_full_masking (loop_vec_info loop_vinfo)
1294 unsigned int min_ni_width;
1296 /* Use a normal loop if there are no statements that need masking.
1297 This only happens in rare degenerate cases: it means that the loop
1298 has no loads, no stores, and no live-out values. */
1299 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1300 return false;
1302 /* Produce the rgroup controls. */
1303 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1305 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1306 tree vectype = mask.first;
1307 unsigned nvectors = mask.second;
1309 if (masks->rgc_vec.length () < nvectors)
1310 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1311 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1312 /* The number of scalars per iteration and the number of vectors are
1313 both compile-time constants. */
1314 unsigned int nscalars_per_iter
1315 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1316 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1318 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1320 rgm->max_nscalars_per_iter = nscalars_per_iter;
1321 rgm->type = truth_type_for (vectype);
1322 rgm->factor = 1;
1326 unsigned int max_nscalars_per_iter
1327 = vect_get_max_nscalars_per_iter (loop_vinfo);
1329 /* Work out how many bits we need to represent the limit. */
1330 min_ni_width
1331 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1333 /* Find a scalar mode for which WHILE_ULT is supported. */
1334 opt_scalar_int_mode cmp_mode_iter;
1335 tree cmp_type = NULL_TREE;
1336 tree iv_type = NULL_TREE;
1337 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1338 unsigned int iv_precision = UINT_MAX;
1340 if (iv_limit != -1)
1341 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1342 UNSIGNED);
1344 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1346 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1347 if (cmp_bits >= min_ni_width
1348 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1350 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1351 if (this_type
1352 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1354 /* Although we could stop as soon as we find a valid mode,
1355 there are at least two reasons why that's not always the
1356 best choice:
1358 - An IV that's Pmode or wider is more likely to be reusable
1359 in address calculations than an IV that's narrower than
1360 Pmode.
1362 - Doing the comparison in IV_PRECISION or wider allows
1363 a natural 0-based IV, whereas using a narrower comparison
1364 type requires mitigations against wrap-around.
1366 Conversely, if the IV limit is variable, doing the comparison
1367 in a wider type than the original type can introduce
1368 unnecessary extensions, so picking the widest valid mode
1369 is not always a good choice either.
1371 Here we prefer the first IV type that's Pmode or wider,
1372 and the first comparison type that's IV_PRECISION or wider.
1373 (The comparison type must be no wider than the IV type,
1374 to avoid extensions in the vector loop.)
1376 ??? We might want to try continuing beyond Pmode for ILP32
1377 targets if CMP_BITS < IV_PRECISION. */
1378 iv_type = this_type;
1379 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1380 cmp_type = this_type;
1381 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1382 break;
1387 if (!cmp_type)
1389 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1390 return false;
1393 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1394 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1395 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1396 return true;
1399 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1400 whether we can actually generate AVX512 style masks. Return true if so,
1401 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1403 static bool
1404 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1406 /* Produce differently organized rgc_vec and differently check
1407 we can produce masks. */
1409 /* Use a normal loop if there are no statements that need masking.
1410 This only happens in rare degenerate cases: it means that the loop
1411 has no loads, no stores, and no live-out values. */
1412 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1413 return false;
1415 /* For the decrementing IV we need to represent all values in
1416 [0, niter + niter_skip] where niter_skip is the elements we
1417 skip in the first iteration for prologue peeling. */
1418 tree iv_type = NULL_TREE;
1419 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1420 unsigned int iv_precision = UINT_MAX;
1421 if (iv_limit != -1)
1422 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1424 /* First compute the type for the IV we use to track the remaining
1425 scalar iterations. */
1426 opt_scalar_int_mode cmp_mode_iter;
1427 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1429 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1430 if (cmp_bits >= iv_precision
1431 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1433 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1434 if (iv_type)
1435 break;
1438 if (!iv_type)
1439 return false;
1441 /* Produce the rgroup controls. */
1442 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1444 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1445 tree vectype = mask.first;
1446 unsigned nvectors = mask.second;
1448 /* The number of scalars per iteration and the number of vectors are
1449 both compile-time constants. */
1450 unsigned int nscalars_per_iter
1451 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1452 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1454 /* We index the rgroup_controls vector with nscalars_per_iter
1455 which we keep constant and instead have a varying nvectors,
1456 remembering the vector mask with the fewest nV. */
1457 if (masks->rgc_vec.length () < nscalars_per_iter)
1458 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1459 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1461 if (!rgm->type || rgm->factor > nvectors)
1463 rgm->type = truth_type_for (vectype);
1464 rgm->compare_type = NULL_TREE;
1465 rgm->max_nscalars_per_iter = nscalars_per_iter;
1466 rgm->factor = nvectors;
1467 rgm->bias_adjusted_ctrl = NULL_TREE;
1471 /* There is no fixed compare type we are going to use but we have to
1472 be able to get at one for each mask group. */
1473 unsigned int min_ni_width
1474 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1476 bool ok = true;
1477 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1479 tree mask_type = rgc.type;
1480 if (!mask_type)
1481 continue;
1483 /* For now vect_get_loop_mask only supports integer mode masks
1484 when we need to split it. */
1485 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1486 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1488 ok = false;
1489 break;
1492 /* If iv_type is usable as compare type use that - we can elide the
1493 saturation in that case. */
1494 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1496 tree cmp_vectype
1497 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1498 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1499 rgc.compare_type = cmp_vectype;
1501 if (!rgc.compare_type)
1502 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1504 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1505 if (cmp_bits >= min_ni_width
1506 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1508 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1509 if (!cmp_type)
1510 continue;
1512 /* Check whether we can produce the mask with cmp_type. */
1513 tree cmp_vectype
1514 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1515 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1517 rgc.compare_type = cmp_vectype;
1518 break;
1522 if (!rgc.compare_type)
1524 ok = false;
1525 break;
1528 if (!ok)
1530 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1531 return false;
1534 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1535 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1536 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1537 return true;
1540 /* Check whether we can use vector access with length based on precison
1541 comparison. So far, to keep it simple, we only allow the case that the
1542 precision of the target supported length is larger than the precision
1543 required by loop niters. */
1545 static bool
1546 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1548 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1549 return false;
1551 machine_mode len_load_mode, len_store_mode;
1552 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1553 .exists (&len_load_mode))
1554 return false;
1555 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1556 .exists (&len_store_mode))
1557 return false;
1559 signed char partial_load_bias = internal_len_load_store_bias
1560 (IFN_LEN_LOAD, len_load_mode);
1562 signed char partial_store_bias = internal_len_load_store_bias
1563 (IFN_LEN_STORE, len_store_mode);
1565 gcc_assert (partial_load_bias == partial_store_bias);
1567 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1568 return false;
1570 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1571 len_loads with a length of zero. In order to avoid that we prohibit
1572 more than one loop length here. */
1573 if (partial_load_bias == -1
1574 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1575 return false;
1577 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1579 unsigned int max_nitems_per_iter = 1;
1580 unsigned int i;
1581 rgroup_controls *rgl;
1582 /* Find the maximum number of items per iteration for every rgroup. */
1583 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1585 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1586 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1589 /* Work out how many bits we need to represent the length limit. */
1590 unsigned int min_ni_prec
1591 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1593 /* Now use the maximum of below precisions for one suitable IV type:
1594 - the IV's natural precision
1595 - the precision needed to hold: the maximum number of scalar
1596 iterations multiplied by the scale factor (min_ni_prec above)
1597 - the Pmode precision
1599 If min_ni_prec is less than the precision of the current niters,
1600 we perfer to still use the niters type. Prefer to use Pmode and
1601 wider IV to avoid narrow conversions. */
1603 unsigned int ni_prec
1604 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1605 min_ni_prec = MAX (min_ni_prec, ni_prec);
1606 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1608 tree iv_type = NULL_TREE;
1609 opt_scalar_int_mode tmode_iter;
1610 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1612 scalar_mode tmode = tmode_iter.require ();
1613 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1615 /* ??? Do we really want to construct one IV whose precision exceeds
1616 BITS_PER_WORD? */
1617 if (tbits > BITS_PER_WORD)
1618 break;
1620 /* Find the first available standard integral type. */
1621 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1623 iv_type = build_nonstandard_integer_type (tbits, true);
1624 break;
1628 if (!iv_type)
1630 if (dump_enabled_p ())
1631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1632 "can't vectorize with length-based partial vectors"
1633 " because there is no suitable iv type.\n");
1634 return false;
1637 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1638 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1639 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1641 return true;
1644 /* Calculate the cost of one scalar iteration of the loop. */
1645 static void
1646 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1648 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1649 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1650 int nbbs = loop->num_nodes, factor;
1651 int innerloop_iters, i;
1653 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1655 /* Gather costs for statements in the scalar loop. */
1657 /* FORNOW. */
1658 innerloop_iters = 1;
1659 if (loop->inner)
1660 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1662 for (i = 0; i < nbbs; i++)
1664 gimple_stmt_iterator si;
1665 basic_block bb = bbs[i];
1667 if (bb->loop_father == loop->inner)
1668 factor = innerloop_iters;
1669 else
1670 factor = 1;
1672 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1674 gimple *stmt = gsi_stmt (si);
1675 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1677 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1678 continue;
1680 /* Skip stmts that are not vectorized inside the loop. */
1681 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1682 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1683 && (!STMT_VINFO_LIVE_P (vstmt_info)
1684 || !VECTORIZABLE_CYCLE_DEF
1685 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1686 continue;
1688 vect_cost_for_stmt kind;
1689 if (STMT_VINFO_DATA_REF (stmt_info))
1691 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1692 kind = scalar_load;
1693 else
1694 kind = scalar_store;
1696 else if (vect_nop_conversion_p (stmt_info))
1697 continue;
1698 else
1699 kind = scalar_stmt;
1701 /* We are using vect_prologue here to avoid scaling twice
1702 by the inner loop factor. */
1703 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1704 factor, kind, stmt_info, 0, vect_prologue);
1708 /* Now accumulate cost. */
1709 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1710 add_stmt_costs (loop_vinfo->scalar_costs,
1711 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1712 loop_vinfo->scalar_costs->finish_cost (nullptr);
1715 /* Function vect_analyze_loop_form.
1717 Verify that certain CFG restrictions hold, including:
1718 - the loop has a pre-header
1719 - the loop has a single entry
1720 - nested loops can have only a single exit.
1721 - the loop exit condition is simple enough
1722 - the number of iterations can be analyzed, i.e, a countable loop. The
1723 niter could be analyzed under some assumptions. */
1725 opt_result
1726 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1728 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1730 edge exit_e = vec_init_loop_exit_info (loop);
1731 if (!exit_e)
1732 return opt_result::failure_at (vect_location,
1733 "not vectorized:"
1734 " could not determine main exit from"
1735 " loop with multiple exits.\n");
1736 info->loop_exit = exit_e;
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "using as main loop exit: %d -> %d [AUX: %p]\n",
1740 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1742 /* Check if we have any control flow that doesn't leave the loop. */
1743 class loop *v_loop = loop->inner ? loop->inner : loop;
1744 basic_block *bbs= get_loop_body (v_loop);
1745 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1746 if (EDGE_COUNT (bbs[i]->succs) != 1
1747 && (EDGE_COUNT (bbs[i]->succs) != 2
1748 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1749 return opt_result::failure_at (vect_location,
1750 "not vectorized:"
1751 " unsupported control flow in loop.\n");
1753 /* Different restrictions apply when we are considering an inner-most loop,
1754 vs. an outer (nested) loop.
1755 (FORNOW. May want to relax some of these restrictions in the future). */
1757 info->inner_loop_cond = NULL;
1758 if (!loop->inner)
1760 /* Inner-most loop. We currently require that the number of BBs is
1761 exactly 2 (the header and latch). Vectorizable inner-most loops
1762 look like this:
1764 (pre-header)
1766 header <--------+
1767 | | |
1768 | +--> latch --+
1770 (exit-bb) */
1772 if (empty_block_p (loop->header))
1773 return opt_result::failure_at (vect_location,
1774 "not vectorized: empty loop.\n");
1776 else
1778 class loop *innerloop = loop->inner;
1779 edge entryedge;
1781 /* Nested loop. We currently require that the loop is doubly-nested,
1782 contains a single inner loop, and the number of BBs is exactly 5.
1783 Vectorizable outer-loops look like this:
1785 (pre-header)
1787 header <---+
1789 inner-loop |
1791 tail ------+
1793 (exit-bb)
1795 The inner-loop has the properties expected of inner-most loops
1796 as described above. */
1798 if ((loop->inner)->inner || (loop->inner)->next)
1799 return opt_result::failure_at (vect_location,
1800 "not vectorized:"
1801 " multiple nested loops.\n");
1803 entryedge = loop_preheader_edge (innerloop);
1804 if (entryedge->src != loop->header
1805 || !single_exit (innerloop)
1806 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1807 return opt_result::failure_at (vect_location,
1808 "not vectorized:"
1809 " unsupported outerloop form.\n");
1811 /* Analyze the inner-loop. */
1812 vect_loop_form_info inner;
1813 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1814 if (!res)
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1818 "not vectorized: Bad inner loop.\n");
1819 return res;
1822 /* Don't support analyzing niter under assumptions for inner
1823 loop. */
1824 if (!integer_onep (inner.assumptions))
1825 return opt_result::failure_at (vect_location,
1826 "not vectorized: Bad inner loop.\n");
1828 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1829 return opt_result::failure_at (vect_location,
1830 "not vectorized: inner-loop count not"
1831 " invariant.\n");
1833 if (dump_enabled_p ())
1834 dump_printf_loc (MSG_NOTE, vect_location,
1835 "Considering outer-loop vectorization.\n");
1836 info->inner_loop_cond = inner.conds[0];
1839 if (EDGE_COUNT (loop->header->preds) != 2)
1840 return opt_result::failure_at (vect_location,
1841 "not vectorized:"
1842 " too many incoming edges.\n");
1844 /* We assume that the loop exit condition is at the end of the loop. i.e,
1845 that the loop is represented as a do-while (with a proper if-guard
1846 before the loop if needed), where the loop header contains all the
1847 executable statements, and the latch is empty. */
1848 if (!empty_block_p (loop->latch)
1849 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1850 return opt_result::failure_at (vect_location,
1851 "not vectorized: latch block not empty.\n");
1853 /* Make sure the exit is not abnormal. */
1854 auto_vec<edge> exits = get_loop_exit_edges (loop);
1855 for (edge e : exits)
1857 if (e->flags & EDGE_ABNORMAL)
1858 return opt_result::failure_at (vect_location,
1859 "not vectorized:"
1860 " abnormal loop exit edge.\n");
1863 info->conds
1864 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1865 &info->number_of_iterations,
1866 &info->number_of_iterationsm1);
1868 if (info->conds.is_empty ())
1869 return opt_result::failure_at
1870 (vect_location,
1871 "not vectorized: complicated exit condition.\n");
1873 /* Determine what the primary and alternate exit conds are. */
1874 for (unsigned i = 0; i < info->conds.length (); i++)
1876 gcond *cond = info->conds[i];
1877 if (exit_e->src == gimple_bb (cond))
1878 std::swap (info->conds[0], info->conds[i]);
1881 if (integer_zerop (info->assumptions)
1882 || !info->number_of_iterations
1883 || chrec_contains_undetermined (info->number_of_iterations))
1884 return opt_result::failure_at
1885 (info->conds[0],
1886 "not vectorized: number of iterations cannot be computed.\n");
1888 if (integer_zerop (info->number_of_iterations))
1889 return opt_result::failure_at
1890 (info->conds[0],
1891 "not vectorized: number of iterations = 0.\n");
1893 if (!(tree_fits_shwi_p (info->number_of_iterations)
1894 && tree_to_shwi (info->number_of_iterations) > 0))
1896 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_NOTE, vect_location,
1899 "Symbolic number of iterations is ");
1900 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1901 dump_printf (MSG_NOTE, "\n");
1905 return opt_result::success ();
1908 /* Create a loop_vec_info for LOOP with SHARED and the
1909 vect_analyze_loop_form result. */
1911 loop_vec_info
1912 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1913 const vect_loop_form_info *info,
1914 loop_vec_info main_loop_info)
1916 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1917 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1918 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1919 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1920 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1921 /* Also record the assumptions for versioning. */
1922 if (!integer_onep (info->assumptions) && !main_loop_info)
1923 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1925 for (gcond *cond : info->conds)
1927 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1928 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1929 /* Mark the statement as a condition. */
1930 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1933 for (unsigned i = 1; i < info->conds.length (); i ++)
1934 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1935 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1937 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1939 /* Check to see if we're vectorizing multiple exits. */
1940 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1941 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1943 if (info->inner_loop_cond)
1945 stmt_vec_info inner_loop_cond_info
1946 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1947 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1948 /* If we have an estimate on the number of iterations of the inner
1949 loop use that to limit the scale for costing, otherwise use
1950 --param vect-inner-loop-cost-factor literally. */
1951 widest_int nit;
1952 if (estimated_stmt_executions (loop->inner, &nit))
1953 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1954 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1957 return loop_vinfo;
1962 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1963 statements update the vectorization factor. */
1965 static void
1966 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1968 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1969 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1970 int nbbs = loop->num_nodes;
1971 poly_uint64 vectorization_factor;
1972 int i;
1974 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1976 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1977 gcc_assert (known_ne (vectorization_factor, 0U));
1979 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1980 vectorization factor of the loop is the unrolling factor required by
1981 the SLP instances. If that unrolling factor is 1, we say, that we
1982 perform pure SLP on loop - cross iteration parallelism is not
1983 exploited. */
1984 bool only_slp_in_loop = true;
1985 for (i = 0; i < nbbs; i++)
1987 basic_block bb = bbs[i];
1988 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1989 gsi_next (&si))
1991 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1992 if (!stmt_info)
1993 continue;
1994 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1995 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1996 && !PURE_SLP_STMT (stmt_info))
1997 /* STMT needs both SLP and loop-based vectorization. */
1998 only_slp_in_loop = false;
2000 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2001 gsi_next (&si))
2003 if (is_gimple_debug (gsi_stmt (si)))
2004 continue;
2005 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2006 stmt_info = vect_stmt_to_vectorize (stmt_info);
2007 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2008 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2009 && !PURE_SLP_STMT (stmt_info))
2010 /* STMT needs both SLP and loop-based vectorization. */
2011 only_slp_in_loop = false;
2015 if (only_slp_in_loop)
2017 if (dump_enabled_p ())
2018 dump_printf_loc (MSG_NOTE, vect_location,
2019 "Loop contains only SLP stmts\n");
2020 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2022 else
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_NOTE, vect_location,
2026 "Loop contains SLP and non-SLP stmts\n");
2027 /* Both the vectorization factor and unroll factor have the form
2028 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2029 so they must have a common multiple. */
2030 vectorization_factor
2031 = force_common_multiple (vectorization_factor,
2032 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2035 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2036 if (dump_enabled_p ())
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "Updating vectorization factor to ");
2040 dump_dec (MSG_NOTE, vectorization_factor);
2041 dump_printf (MSG_NOTE, ".\n");
2045 /* Return true if STMT_INFO describes a double reduction phi and if
2046 the other phi in the reduction is also relevant for vectorization.
2047 This rejects cases such as:
2049 outer1:
2050 x_1 = PHI <x_3(outer2), ...>;
2053 inner:
2054 x_2 = ...;
2057 outer2:
2058 x_3 = PHI <x_2(inner)>;
2060 if nothing in x_2 or elsewhere makes x_1 relevant. */
2062 static bool
2063 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2065 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2066 return false;
2068 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2071 /* Function vect_analyze_loop_operations.
2073 Scan the loop stmts and make sure they are all vectorizable. */
2075 static opt_result
2076 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2078 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2079 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2080 int nbbs = loop->num_nodes;
2081 int i;
2082 stmt_vec_info stmt_info;
2083 bool need_to_vectorize = false;
2084 bool ok;
2086 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2088 auto_vec<stmt_info_for_cost> cost_vec;
2090 for (i = 0; i < nbbs; i++)
2092 basic_block bb = bbs[i];
2094 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2095 gsi_next (&si))
2097 gphi *phi = si.phi ();
2098 ok = true;
2100 stmt_info = loop_vinfo->lookup_stmt (phi);
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2103 (gimple *) phi);
2104 if (virtual_operand_p (gimple_phi_result (phi)))
2105 continue;
2107 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2108 (i.e., a phi in the tail of the outer-loop). */
2109 if (! is_loop_header_bb_p (bb))
2111 /* FORNOW: we currently don't support the case that these phis
2112 are not used in the outerloop (unless it is double reduction,
2113 i.e., this phi is vect_reduction_def), cause this case
2114 requires to actually do something here. */
2115 if (STMT_VINFO_LIVE_P (stmt_info)
2116 && !vect_active_double_reduction_p (stmt_info))
2117 return opt_result::failure_at (phi,
2118 "Unsupported loop-closed phi"
2119 " in outer-loop.\n");
2121 /* If PHI is used in the outer loop, we check that its operand
2122 is defined in the inner loop. */
2123 if (STMT_VINFO_RELEVANT_P (stmt_info))
2125 tree phi_op;
2127 if (gimple_phi_num_args (phi) != 1)
2128 return opt_result::failure_at (phi, "unsupported phi");
2130 phi_op = PHI_ARG_DEF (phi, 0);
2131 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2132 if (!op_def_info)
2133 return opt_result::failure_at (phi, "unsupported phi\n");
2135 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2136 && (STMT_VINFO_RELEVANT (op_def_info)
2137 != vect_used_in_outer_by_reduction))
2138 return opt_result::failure_at (phi, "unsupported phi\n");
2140 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2141 || (STMT_VINFO_DEF_TYPE (stmt_info)
2142 == vect_double_reduction_def))
2143 && !vectorizable_lc_phi (loop_vinfo,
2144 stmt_info, NULL, NULL))
2145 return opt_result::failure_at (phi, "unsupported phi\n");
2148 continue;
2151 gcc_assert (stmt_info);
2153 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2154 || STMT_VINFO_LIVE_P (stmt_info))
2155 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2156 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2157 /* A scalar-dependence cycle that we don't support. */
2158 return opt_result::failure_at (phi,
2159 "not vectorized:"
2160 " scalar dependence cycle.\n");
2162 if (STMT_VINFO_RELEVANT_P (stmt_info))
2164 need_to_vectorize = true;
2165 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2166 && ! PURE_SLP_STMT (stmt_info))
2167 ok = vectorizable_induction (loop_vinfo,
2168 stmt_info, NULL, NULL,
2169 &cost_vec);
2170 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2171 || (STMT_VINFO_DEF_TYPE (stmt_info)
2172 == vect_double_reduction_def)
2173 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2174 && ! PURE_SLP_STMT (stmt_info))
2175 ok = vectorizable_reduction (loop_vinfo,
2176 stmt_info, NULL, NULL, &cost_vec);
2177 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2178 == vect_first_order_recurrence)
2179 && ! PURE_SLP_STMT (stmt_info))
2180 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2181 &cost_vec);
2184 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2185 if (ok
2186 && STMT_VINFO_LIVE_P (stmt_info)
2187 && !PURE_SLP_STMT (stmt_info))
2188 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2189 -1, false, &cost_vec);
2191 if (!ok)
2192 return opt_result::failure_at (phi,
2193 "not vectorized: relevant phi not "
2194 "supported: %G",
2195 static_cast <gimple *> (phi));
2198 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2199 gsi_next (&si))
2201 gimple *stmt = gsi_stmt (si);
2202 if (!gimple_clobber_p (stmt)
2203 && !is_gimple_debug (stmt))
2205 opt_result res
2206 = vect_analyze_stmt (loop_vinfo,
2207 loop_vinfo->lookup_stmt (stmt),
2208 &need_to_vectorize,
2209 NULL, NULL, &cost_vec);
2210 if (!res)
2211 return res;
2214 } /* bbs */
2216 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2218 /* All operations in the loop are either irrelevant (deal with loop
2219 control, or dead), or only used outside the loop and can be moved
2220 out of the loop (e.g. invariants, inductions). The loop can be
2221 optimized away by scalar optimizations. We're better off not
2222 touching this loop. */
2223 if (!need_to_vectorize)
2225 if (dump_enabled_p ())
2226 dump_printf_loc (MSG_NOTE, vect_location,
2227 "All the computation can be taken out of the loop.\n");
2228 return opt_result::failure_at
2229 (vect_location,
2230 "not vectorized: redundant loop. no profit to vectorize.\n");
2233 return opt_result::success ();
2236 /* Return true if we know that the iteration count is smaller than the
2237 vectorization factor. Return false if it isn't, or if we can't be sure
2238 either way. */
2240 static bool
2241 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2243 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2245 HOST_WIDE_INT max_niter;
2246 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2247 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2248 else
2249 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2251 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2252 return true;
2254 return false;
2257 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2258 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2259 definitely no, or -1 if it's worth retrying. */
2261 static int
2262 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2263 unsigned *suggested_unroll_factor)
2265 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2266 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2268 /* Only loops that can handle partially-populated vectors can have iteration
2269 counts less than the vectorization factor. */
2270 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2271 && vect_known_niters_smaller_than_vf (loop_vinfo))
2273 if (dump_enabled_p ())
2274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275 "not vectorized: iteration count smaller than "
2276 "vectorization factor.\n");
2277 return 0;
2280 /* If we know the number of iterations we can do better, for the
2281 epilogue we can also decide whether the main loop leaves us
2282 with enough iterations, prefering a smaller vector epilog then
2283 also possibly used for the case we skip the vector loop. */
2284 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2286 widest_int scalar_niters
2287 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2288 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2290 loop_vec_info orig_loop_vinfo
2291 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2292 unsigned lowest_vf
2293 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2294 int prolog_peeling = 0;
2295 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2296 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2297 if (prolog_peeling >= 0
2298 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2299 lowest_vf))
2301 unsigned gap
2302 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2303 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2304 % lowest_vf + gap);
2307 /* Reject vectorizing for a single scalar iteration, even if
2308 we could in principle implement that using partial vectors. */
2309 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2310 if (scalar_niters <= peeling_gap + 1)
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "not vectorized: loop only has a single "
2315 "scalar iteration.\n");
2316 return 0;
2319 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2321 /* Check that the loop processes at least one full vector. */
2322 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2323 if (known_lt (scalar_niters, vf))
2325 if (dump_enabled_p ())
2326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2327 "loop does not have enough iterations "
2328 "to support vectorization.\n");
2329 return 0;
2332 /* If we need to peel an extra epilogue iteration to handle data
2333 accesses with gaps, check that there are enough scalar iterations
2334 available.
2336 The check above is redundant with this one when peeling for gaps,
2337 but the distinction is useful for diagnostics. */
2338 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2339 && known_le (scalar_niters, vf))
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "loop does not have enough iterations "
2344 "to support peeling for gaps.\n");
2345 return 0;
2350 /* If using the "very cheap" model. reject cases in which we'd keep
2351 a copy of the scalar code (even if we might be able to vectorize it). */
2352 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2353 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2354 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2355 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "some scalar iterations would need to be peeled\n");
2360 return 0;
2363 int min_profitable_iters, min_profitable_estimate;
2364 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2365 &min_profitable_estimate,
2366 suggested_unroll_factor);
2368 if (min_profitable_iters < 0)
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2372 "not vectorized: vectorization not profitable.\n");
2373 if (dump_enabled_p ())
2374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2375 "not vectorized: vector version will never be "
2376 "profitable.\n");
2377 return -1;
2380 int min_scalar_loop_bound = (param_min_vect_loop_bound
2381 * assumed_vf);
2383 /* Use the cost model only if it is more conservative than user specified
2384 threshold. */
2385 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2386 min_profitable_iters);
2388 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2390 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2391 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2395 "not vectorized: vectorization not profitable.\n");
2396 if (dump_enabled_p ())
2397 dump_printf_loc (MSG_NOTE, vect_location,
2398 "not vectorized: iteration count smaller than user "
2399 "specified loop bound parameter or minimum profitable "
2400 "iterations (whichever is more conservative).\n");
2401 return 0;
2404 /* The static profitablity threshold min_profitable_estimate includes
2405 the cost of having to check at runtime whether the scalar loop
2406 should be used instead. If it turns out that we don't need or want
2407 such a check, the threshold we should use for the static estimate
2408 is simply the point at which the vector loop becomes more profitable
2409 than the scalar loop. */
2410 if (min_profitable_estimate > min_profitable_iters
2411 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2412 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2413 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2414 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2416 if (dump_enabled_p ())
2417 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2418 " choice between the scalar and vector loops\n");
2419 min_profitable_estimate = min_profitable_iters;
2422 /* If the vector loop needs multiple iterations to be beneficial then
2423 things are probably too close to call, and the conservative thing
2424 would be to stick with the scalar code. */
2425 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2426 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2428 if (dump_enabled_p ())
2429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2430 "one iteration of the vector loop would be"
2431 " more expensive than the equivalent number of"
2432 " iterations of the scalar loop\n");
2433 return 0;
2436 HOST_WIDE_INT estimated_niter;
2438 /* If we are vectorizing an epilogue then we know the maximum number of
2439 scalar iterations it will cover is at least one lower than the
2440 vectorization factor of the main loop. */
2441 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2442 estimated_niter
2443 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2444 else
2446 estimated_niter = estimated_stmt_executions_int (loop);
2447 if (estimated_niter == -1)
2448 estimated_niter = likely_max_stmt_executions_int (loop);
2450 if (estimated_niter != -1
2451 && ((unsigned HOST_WIDE_INT) estimated_niter
2452 < MAX (th, (unsigned) min_profitable_estimate)))
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2456 "not vectorized: estimated iteration count too "
2457 "small.\n");
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE, vect_location,
2460 "not vectorized: estimated iteration count smaller "
2461 "than specified loop bound parameter or minimum "
2462 "profitable iterations (whichever is more "
2463 "conservative).\n");
2464 return -1;
2467 return 1;
2470 static opt_result
2471 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2472 vec<data_reference_p> *datarefs,
2473 unsigned int *n_stmts)
2475 *n_stmts = 0;
2476 for (unsigned i = 0; i < loop->num_nodes; i++)
2477 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2478 !gsi_end_p (gsi); gsi_next (&gsi))
2480 gimple *stmt = gsi_stmt (gsi);
2481 if (is_gimple_debug (stmt))
2482 continue;
2483 ++(*n_stmts);
2484 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2485 NULL, 0);
2486 if (!res)
2488 if (is_gimple_call (stmt) && loop->safelen)
2490 tree fndecl = gimple_call_fndecl (stmt), op;
2491 if (fndecl == NULL_TREE
2492 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2494 fndecl = gimple_call_arg (stmt, 0);
2495 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2496 fndecl = TREE_OPERAND (fndecl, 0);
2497 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2499 if (fndecl != NULL_TREE)
2501 cgraph_node *node = cgraph_node::get (fndecl);
2502 if (node != NULL && node->simd_clones != NULL)
2504 unsigned int j, n = gimple_call_num_args (stmt);
2505 for (j = 0; j < n; j++)
2507 op = gimple_call_arg (stmt, j);
2508 if (DECL_P (op)
2509 || (REFERENCE_CLASS_P (op)
2510 && get_base_address (op)))
2511 break;
2513 op = gimple_call_lhs (stmt);
2514 /* Ignore #pragma omp declare simd functions
2515 if they don't have data references in the
2516 call stmt itself. */
2517 if (j == n
2518 && !(op
2519 && (DECL_P (op)
2520 || (REFERENCE_CLASS_P (op)
2521 && get_base_address (op)))))
2522 continue;
2526 return res;
2528 /* If dependence analysis will give up due to the limit on the
2529 number of datarefs stop here and fail fatally. */
2530 if (datarefs->length ()
2531 > (unsigned)param_loop_max_datarefs_for_datadeps)
2532 return opt_result::failure_at (stmt, "exceeded param "
2533 "loop-max-datarefs-for-datadeps\n");
2535 return opt_result::success ();
2538 /* Look for SLP-only access groups and turn each individual access into its own
2539 group. */
2540 static void
2541 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2543 unsigned int i;
2544 struct data_reference *dr;
2546 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2548 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2549 FOR_EACH_VEC_ELT (datarefs, i, dr)
2551 gcc_assert (DR_REF (dr));
2552 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2554 /* Check if the load is a part of an interleaving chain. */
2555 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2557 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2558 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2559 unsigned int group_size = DR_GROUP_SIZE (first_element);
2561 /* Check if SLP-only groups. */
2562 if (!STMT_SLP_TYPE (stmt_info)
2563 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2565 /* Dissolve the group. */
2566 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2568 stmt_vec_info vinfo = first_element;
2569 while (vinfo)
2571 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2572 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2573 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2574 DR_GROUP_SIZE (vinfo) = 1;
2575 if (STMT_VINFO_STRIDED_P (first_element)
2576 /* We cannot handle stores with gaps. */
2577 || DR_IS_WRITE (dr_info->dr))
2579 STMT_VINFO_STRIDED_P (vinfo) = true;
2580 DR_GROUP_GAP (vinfo) = 0;
2582 else
2583 DR_GROUP_GAP (vinfo) = group_size - 1;
2584 /* Duplicate and adjust alignment info, it needs to
2585 be present on each group leader, see dr_misalignment. */
2586 if (vinfo != first_element)
2588 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2589 dr_info2->target_alignment = dr_info->target_alignment;
2590 int misalignment = dr_info->misalignment;
2591 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2593 HOST_WIDE_INT diff
2594 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2595 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2596 unsigned HOST_WIDE_INT align_c
2597 = dr_info->target_alignment.to_constant ();
2598 misalignment = (misalignment + diff) % align_c;
2600 dr_info2->misalignment = misalignment;
2602 vinfo = next;
2609 /* Determine if operating on full vectors for LOOP_VINFO might leave
2610 some scalar iterations still to do. If so, decide how we should
2611 handle those scalar iterations. The possibilities are:
2613 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2614 In this case:
2616 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2617 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2618 LOOP_VINFO_PEELING_FOR_NITER == false
2620 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2621 to handle the remaining scalar iterations. In this case:
2623 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2624 LOOP_VINFO_PEELING_FOR_NITER == true
2626 There are two choices:
2628 (2a) Consider vectorizing the epilogue loop at the same VF as the
2629 main loop, but using partial vectors instead of full vectors.
2630 In this case:
2632 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2634 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2635 In this case:
2637 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2640 opt_result
2641 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2643 /* Determine whether there would be any scalar iterations left over. */
2644 bool need_peeling_or_partial_vectors_p
2645 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2647 /* Decide whether to vectorize the loop with partial vectors. */
2648 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2649 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2650 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2651 && need_peeling_or_partial_vectors_p)
2653 /* For partial-vector-usage=1, try to push the handling of partial
2654 vectors to the epilogue, with the main loop continuing to operate
2655 on full vectors.
2657 If we are unrolling we also do not want to use partial vectors. This
2658 is to avoid the overhead of generating multiple masks and also to
2659 avoid having to execute entire iterations of FALSE masked instructions
2660 when dealing with one or less full iterations.
2662 ??? We could then end up failing to use partial vectors if we
2663 decide to peel iterations into a prologue, and if the main loop
2664 then ends up processing fewer than VF iterations. */
2665 if ((param_vect_partial_vector_usage == 1
2666 || loop_vinfo->suggested_unroll_factor > 1)
2667 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2668 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2669 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2670 else
2671 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_NOTE, vect_location,
2676 "operating on %s vectors%s.\n",
2677 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2678 ? "partial" : "full",
2679 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2680 ? " for epilogue loop" : "");
2682 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2683 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2684 && need_peeling_or_partial_vectors_p);
2686 /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2687 analysis that we don't know whether the loop is vectorized by partial
2688 vectors (More details see tree-vect-loop-manip.cc).
2690 However, SELECT_VL vectorizaton style should only applied on partial
2691 vectorization since SELECT_VL is the GIMPLE IR that calculates the
2692 number of elements to be process for each iteration.
2694 After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2695 if it is not partial vectorized loop. */
2696 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2697 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2699 return opt_result::success ();
2702 /* Function vect_analyze_loop_2.
2704 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2705 analyses will record information in some members of LOOP_VINFO. FATAL
2706 indicates if some analysis meets fatal error. If one non-NULL pointer
2707 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2708 worked out suggested unroll factor, while one NULL pointer shows it's
2709 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2710 is to hold the slp decision when the suggested unroll factor is worked
2711 out. */
2712 static opt_result
2713 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2714 unsigned *suggested_unroll_factor,
2715 bool& slp_done_for_suggested_uf)
2717 opt_result ok = opt_result::success ();
2718 int res;
2719 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2720 poly_uint64 min_vf = 2;
2721 loop_vec_info orig_loop_vinfo = NULL;
2723 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2724 loop_vec_info of the first vectorized loop. */
2725 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2726 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2727 else
2728 orig_loop_vinfo = loop_vinfo;
2729 gcc_assert (orig_loop_vinfo);
2731 /* The first group of checks is independent of the vector size. */
2732 fatal = true;
2734 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2735 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2736 return opt_result::failure_at (vect_location,
2737 "not vectorized: simd if(0)\n");
2739 /* Find all data references in the loop (which correspond to vdefs/vuses)
2740 and analyze their evolution in the loop. */
2742 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2744 /* Gather the data references and count stmts in the loop. */
2745 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2747 opt_result res
2748 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2749 &LOOP_VINFO_DATAREFS (loop_vinfo),
2750 &LOOP_VINFO_N_STMTS (loop_vinfo));
2751 if (!res)
2753 if (dump_enabled_p ())
2754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2755 "not vectorized: loop contains function "
2756 "calls or data references that cannot "
2757 "be analyzed\n");
2758 return res;
2760 loop_vinfo->shared->save_datarefs ();
2762 else
2763 loop_vinfo->shared->check_datarefs ();
2765 /* Analyze the data references and also adjust the minimal
2766 vectorization factor according to the loads and stores. */
2768 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2769 if (!ok)
2771 if (dump_enabled_p ())
2772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2773 "bad data references.\n");
2774 return ok;
2777 /* Check if we are applying unroll factor now. */
2778 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2779 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2781 /* If the slp decision is false when suggested unroll factor is worked
2782 out, and we are applying suggested unroll factor, we can simply skip
2783 all slp related analyses this time. */
2784 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2786 /* Classify all cross-iteration scalar data-flow cycles.
2787 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2788 vect_analyze_scalar_cycles (loop_vinfo, slp);
2790 vect_pattern_recog (loop_vinfo);
2792 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2794 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2795 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2797 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2798 if (!ok)
2800 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2802 "bad data access.\n");
2803 return ok;
2806 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2808 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2809 if (!ok)
2811 if (dump_enabled_p ())
2812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2813 "unexpected pattern.\n");
2814 return ok;
2817 /* While the rest of the analysis below depends on it in some way. */
2818 fatal = false;
2820 /* Analyze data dependences between the data-refs in the loop
2821 and adjust the maximum vectorization factor according to
2822 the dependences.
2823 FORNOW: fail at the first data dependence that we encounter. */
2825 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2826 if (!ok)
2828 if (dump_enabled_p ())
2829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2830 "bad data dependence.\n");
2831 return ok;
2833 if (max_vf != MAX_VECTORIZATION_FACTOR
2834 && maybe_lt (max_vf, min_vf))
2835 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2836 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2838 ok = vect_determine_vectorization_factor (loop_vinfo);
2839 if (!ok)
2841 if (dump_enabled_p ())
2842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2843 "can't determine vectorization factor.\n");
2844 return ok;
2847 /* Compute the scalar iteration cost. */
2848 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2850 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2852 if (slp)
2854 /* Check the SLP opportunities in the loop, analyze and build
2855 SLP trees. */
2856 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2857 if (!ok)
2858 return ok;
2860 /* If there are any SLP instances mark them as pure_slp. */
2861 slp = vect_make_slp_decision (loop_vinfo);
2862 if (slp)
2864 /* Find stmts that need to be both vectorized and SLPed. */
2865 vect_detect_hybrid_slp (loop_vinfo);
2867 /* Update the vectorization factor based on the SLP decision. */
2868 vect_update_vf_for_slp (loop_vinfo);
2870 /* Optimize the SLP graph with the vectorization factor fixed. */
2871 vect_optimize_slp (loop_vinfo);
2873 /* Gather the loads reachable from the SLP graph entries. */
2874 vect_gather_slp_loads (loop_vinfo);
2878 bool saved_can_use_partial_vectors_p
2879 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2881 /* We don't expect to have to roll back to anything other than an empty
2882 set of rgroups. */
2883 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2885 /* This is the point where we can re-start analysis with SLP forced off. */
2886 start_over:
2888 /* Apply the suggested unrolling factor, this was determined by the backend
2889 during finish_cost the first time we ran the analyzis for this
2890 vector mode. */
2891 if (applying_suggested_uf)
2892 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2894 /* Now the vectorization factor is final. */
2895 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2896 gcc_assert (known_ne (vectorization_factor, 0U));
2898 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2900 dump_printf_loc (MSG_NOTE, vect_location,
2901 "vectorization_factor = ");
2902 dump_dec (MSG_NOTE, vectorization_factor);
2903 dump_printf (MSG_NOTE, ", niters = %wd\n",
2904 LOOP_VINFO_INT_NITERS (loop_vinfo));
2907 if (max_vf != MAX_VECTORIZATION_FACTOR
2908 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2909 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2911 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2913 /* Analyze the alignment of the data-refs in the loop.
2914 Fail if a data reference is found that cannot be vectorized. */
2916 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2917 if (!ok)
2919 if (dump_enabled_p ())
2920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2921 "bad data alignment.\n");
2922 return ok;
2925 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2926 It is important to call pruning after vect_analyze_data_ref_accesses,
2927 since we use grouping information gathered by interleaving analysis. */
2928 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2929 if (!ok)
2930 return ok;
2932 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2933 vectorization, since we do not want to add extra peeling or
2934 add versioning for alignment. */
2935 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2936 /* This pass will decide on using loop versioning and/or loop peeling in
2937 order to enhance the alignment of data references in the loop. */
2938 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2939 if (!ok)
2940 return ok;
2942 if (slp)
2944 /* Analyze operations in the SLP instances. Note this may
2945 remove unsupported SLP instances which makes the above
2946 SLP kind detection invalid. */
2947 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2948 vect_slp_analyze_operations (loop_vinfo);
2949 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2951 ok = opt_result::failure_at (vect_location,
2952 "unsupported SLP instances\n");
2953 goto again;
2956 /* Check whether any load in ALL SLP instances is possibly permuted. */
2957 slp_tree load_node, slp_root;
2958 unsigned i, x;
2959 slp_instance instance;
2960 bool can_use_lanes = true;
2961 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2963 slp_root = SLP_INSTANCE_TREE (instance);
2964 int group_size = SLP_TREE_LANES (slp_root);
2965 tree vectype = SLP_TREE_VECTYPE (slp_root);
2966 bool loads_permuted = false;
2967 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2969 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2970 continue;
2971 unsigned j;
2972 stmt_vec_info load_info;
2973 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2974 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2976 loads_permuted = true;
2977 break;
2981 /* If the loads and stores can be handled with load/store-lane
2982 instructions record it and move on to the next instance. */
2983 if (loads_permuted
2984 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2985 && vect_store_lanes_supported (vectype, group_size, false)
2986 != IFN_LAST)
2988 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2989 if (STMT_VINFO_GROUPED_ACCESS
2990 (SLP_TREE_REPRESENTATIVE (load_node)))
2992 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2993 (SLP_TREE_REPRESENTATIVE (load_node));
2994 /* Use SLP for strided accesses (or if we can't
2995 load-lanes). */
2996 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2997 || vect_load_lanes_supported
2998 (STMT_VINFO_VECTYPE (stmt_vinfo),
2999 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3000 break;
3003 can_use_lanes
3004 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3006 if (can_use_lanes && dump_enabled_p ())
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "SLP instance %p can use load/store-lanes\n",
3009 (void *) instance);
3011 else
3013 can_use_lanes = false;
3014 break;
3018 /* If all SLP instances can use load/store-lanes abort SLP and try again
3019 with SLP disabled. */
3020 if (can_use_lanes)
3022 ok = opt_result::failure_at (vect_location,
3023 "Built SLP cancelled: can use "
3024 "load/store-lanes\n");
3025 if (dump_enabled_p ())
3026 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3027 "Built SLP cancelled: all SLP instances support "
3028 "load/store-lanes\n");
3029 goto again;
3033 /* Dissolve SLP-only groups. */
3034 vect_dissolve_slp_only_groups (loop_vinfo);
3036 /* Scan all the remaining operations in the loop that are not subject
3037 to SLP and make sure they are vectorizable. */
3038 ok = vect_analyze_loop_operations (loop_vinfo);
3039 if (!ok)
3041 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3043 "bad operation or unsupported loop bound.\n");
3044 return ok;
3047 /* For now, we don't expect to mix both masking and length approaches for one
3048 loop, disable it if both are recorded. */
3049 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3050 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3051 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3055 "can't vectorize a loop with partial vectors"
3056 " because we don't expect to mix different"
3057 " approaches with partial vectors for the"
3058 " same loop.\n");
3059 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3062 /* If we still have the option of using partial vectors,
3063 check whether we can generate the necessary loop controls. */
3064 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3066 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3068 if (!vect_verify_full_masking (loop_vinfo)
3069 && !vect_verify_full_masking_avx512 (loop_vinfo))
3070 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3072 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3073 if (!vect_verify_loop_lens (loop_vinfo))
3074 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3077 /* If we're vectorizing a loop that uses length "controls" and
3078 can iterate more than once, we apply decrementing IV approach
3079 in loop control. */
3080 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3081 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3082 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3083 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3084 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3085 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3086 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3088 /* If a loop uses length controls and has a decrementing loop control IV,
3089 we will normally pass that IV through a MIN_EXPR to calcaluate the
3090 basis for the length controls. E.g. in a loop that processes one
3091 element per scalar iteration, the number of elements would be
3092 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3094 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3095 step, since only the final iteration of the vector loop can have
3096 inactive lanes.
3098 However, some targets have a dedicated instruction for calculating the
3099 preferred length, given the total number of elements that still need to
3100 be processed. This is encapsulated in the SELECT_VL internal function.
3102 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3103 to determine the basis for the length controls. However, unlike the
3104 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3105 lanes inactive in any iteration of the vector loop, not just the last
3106 iteration. This SELECT_VL approach therefore requires us to use pointer
3107 IVs with variable steps.
3109 Once we've decided how many elements should be processed by one
3110 iteration of the vector loop, we need to populate the rgroup controls.
3111 If a loop has multiple rgroups, we need to make sure that those rgroups
3112 "line up" (that is, they must be consistent about which elements are
3113 active and which aren't). This is done by vect_adjust_loop_lens_control.
3115 In principle, it would be possible to use vect_adjust_loop_lens_control
3116 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3117 However:
3119 (1) In practice, it only makes sense to use SELECT_VL when a vector
3120 operation will be controlled directly by the result. It is not
3121 worth using SELECT_VL if it would only be the input to other
3122 calculations.
3124 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3125 pointer IV will need N updates by a variable amount (N-1 updates
3126 within the iteration and 1 update to move to the next iteration).
3128 Because of this, we prefer to use the MIN_EXPR approach whenever there
3129 is more than one length control.
3131 In addition, SELECT_VL always operates to a granularity of 1 unit.
3132 If we wanted to use it to control an SLP operation on N consecutive
3133 elements, we would need to make the SELECT_VL inputs measure scalar
3134 iterations (rather than elements) and then multiply the SELECT_VL
3135 result by N. But using SELECT_VL this way is inefficient because
3136 of (1) above.
3138 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3139 satisfied:
3141 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3142 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3144 Since SELECT_VL (variable step) will make SCEV analysis failed and then
3145 we will fail to gain benefits of following unroll optimizations. We prefer
3146 using the MIN_EXPR approach in this situation. */
3147 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3149 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3150 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3151 OPTIMIZE_FOR_SPEED)
3152 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3153 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3154 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3155 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3156 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3159 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3160 assuming that the loop will be used as a main loop. We will redo
3161 this analysis later if we instead decide to use the loop as an
3162 epilogue loop. */
3163 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3164 if (!ok)
3165 return ok;
3167 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3168 to be able to handle fewer than VF scalars, or needs to have a lower VF
3169 than the main loop. */
3170 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3171 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3173 poly_uint64 unscaled_vf
3174 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3175 orig_loop_vinfo->suggested_unroll_factor);
3176 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3177 return opt_result::failure_at (vect_location,
3178 "Vectorization factor too high for"
3179 " epilogue loop.\n");
3182 /* Check the costings of the loop make vectorizing worthwhile. */
3183 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3184 if (res < 0)
3186 ok = opt_result::failure_at (vect_location,
3187 "Loop costings may not be worthwhile.\n");
3188 goto again;
3190 if (!res)
3191 return opt_result::failure_at (vect_location,
3192 "Loop costings not worthwhile.\n");
3194 /* If an epilogue loop is required make sure we can create one. */
3195 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3196 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3197 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3199 if (dump_enabled_p ())
3200 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3201 if (!vect_can_advance_ivs_p (loop_vinfo)
3202 || !slpeel_can_duplicate_loop_p (loop,
3203 LOOP_VINFO_IV_EXIT (loop_vinfo),
3204 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3206 ok = opt_result::failure_at (vect_location,
3207 "not vectorized: can't create required "
3208 "epilog loop\n");
3209 goto again;
3213 /* During peeling, we need to check if number of loop iterations is
3214 enough for both peeled prolog loop and vector loop. This check
3215 can be merged along with threshold check of loop versioning, so
3216 increase threshold for this case if necessary.
3218 If we are analyzing an epilogue we still want to check what its
3219 versioning threshold would be. If we decide to vectorize the epilogues we
3220 will want to use the lowest versioning threshold of all epilogues and main
3221 loop. This will enable us to enter a vectorized epilogue even when
3222 versioning the loop. We can't simply check whether the epilogue requires
3223 versioning though since we may have skipped some versioning checks when
3224 analyzing the epilogue. For instance, checks for alias versioning will be
3225 skipped when dealing with epilogues as we assume we already checked them
3226 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3227 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3229 poly_uint64 niters_th = 0;
3230 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3232 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3234 /* Niters for peeled prolog loop. */
3235 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3237 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3238 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3239 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3241 else
3242 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3245 /* Niters for at least one iteration of vectorized loop. */
3246 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3247 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3248 /* One additional iteration because of peeling for gap. */
3249 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3250 niters_th += 1;
3252 /* Use the same condition as vect_transform_loop to decide when to use
3253 the cost to determine a versioning threshold. */
3254 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3255 && ordered_p (th, niters_th))
3256 niters_th = ordered_max (poly_uint64 (th), niters_th);
3258 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3261 gcc_assert (known_eq (vectorization_factor,
3262 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3264 slp_done_for_suggested_uf = slp;
3266 /* Ok to vectorize! */
3267 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3268 return opt_result::success ();
3270 again:
3271 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3272 gcc_assert (!ok);
3274 /* Try again with SLP forced off but if we didn't do any SLP there is
3275 no point in re-trying. */
3276 if (!slp)
3277 return ok;
3279 /* If the slp decision is true when suggested unroll factor is worked
3280 out, and we are applying suggested unroll factor, we don't need to
3281 re-try any more. */
3282 if (applying_suggested_uf && slp_done_for_suggested_uf)
3283 return ok;
3285 /* If there are reduction chains re-trying will fail anyway. */
3286 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3287 return ok;
3289 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3290 via interleaving or lane instructions. */
3291 slp_instance instance;
3292 slp_tree node;
3293 unsigned i, j;
3294 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3296 stmt_vec_info vinfo;
3297 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3298 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3299 continue;
3300 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3301 unsigned int size = DR_GROUP_SIZE (vinfo);
3302 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3303 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3304 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3305 && ! vect_grouped_store_supported (vectype, size))
3306 return opt_result::failure_at (vinfo->stmt,
3307 "unsupported grouped store\n");
3308 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3310 vinfo = SLP_TREE_REPRESENTATIVE (node);
3311 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3313 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3314 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3315 size = DR_GROUP_SIZE (vinfo);
3316 vectype = STMT_VINFO_VECTYPE (vinfo);
3317 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3318 && ! vect_grouped_load_supported (vectype, single_element_p,
3319 size))
3320 return opt_result::failure_at (vinfo->stmt,
3321 "unsupported grouped load\n");
3326 if (dump_enabled_p ())
3327 dump_printf_loc (MSG_NOTE, vect_location,
3328 "re-trying with SLP disabled\n");
3330 /* Roll back state appropriately. No SLP this time. */
3331 slp = false;
3332 /* Restore vectorization factor as it were without SLP. */
3333 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3334 /* Free the SLP instances. */
3335 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3336 vect_free_slp_instance (instance);
3337 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3338 /* Reset SLP type to loop_vect on all stmts. */
3339 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3341 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3342 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3343 !gsi_end_p (si); gsi_next (&si))
3345 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3346 STMT_SLP_TYPE (stmt_info) = loop_vect;
3347 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3348 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3350 /* vectorizable_reduction adjusts reduction stmt def-types,
3351 restore them to that of the PHI. */
3352 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3353 = STMT_VINFO_DEF_TYPE (stmt_info);
3354 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3355 (STMT_VINFO_REDUC_DEF (stmt_info)))
3356 = STMT_VINFO_DEF_TYPE (stmt_info);
3359 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3360 !gsi_end_p (si); gsi_next (&si))
3362 if (is_gimple_debug (gsi_stmt (si)))
3363 continue;
3364 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3365 STMT_SLP_TYPE (stmt_info) = loop_vect;
3366 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3368 stmt_vec_info pattern_stmt_info
3369 = STMT_VINFO_RELATED_STMT (stmt_info);
3370 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3371 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3373 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3374 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3375 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3376 !gsi_end_p (pi); gsi_next (&pi))
3377 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3378 = loop_vect;
3382 /* Free optimized alias test DDRS. */
3383 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3384 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3385 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3386 /* Reset target cost data. */
3387 delete loop_vinfo->vector_costs;
3388 loop_vinfo->vector_costs = nullptr;
3389 /* Reset accumulated rgroup information. */
3390 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3391 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3392 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3393 /* Reset assorted flags. */
3394 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3395 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3396 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3397 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3398 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3399 = saved_can_use_partial_vectors_p;
3401 goto start_over;
3404 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3405 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3406 OLD_LOOP_VINFO is better unless something specifically indicates
3407 otherwise.
3409 Note that this deliberately isn't a partial order. */
3411 static bool
3412 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3413 loop_vec_info old_loop_vinfo)
3415 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3416 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3418 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3419 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3421 /* Always prefer a VF of loop->simdlen over any other VF. */
3422 if (loop->simdlen)
3424 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3425 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3426 if (new_simdlen_p != old_simdlen_p)
3427 return new_simdlen_p;
3430 const auto *old_costs = old_loop_vinfo->vector_costs;
3431 const auto *new_costs = new_loop_vinfo->vector_costs;
3432 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3433 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3435 return new_costs->better_main_loop_than_p (old_costs);
3438 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3439 true if we should. */
3441 static bool
3442 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3443 loop_vec_info old_loop_vinfo)
3445 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3446 return false;
3448 if (dump_enabled_p ())
3449 dump_printf_loc (MSG_NOTE, vect_location,
3450 "***** Preferring vector mode %s to vector mode %s\n",
3451 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3452 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3453 return true;
3456 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3457 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3458 MODE_I to the next mode useful to analyze.
3459 Return the loop_vinfo on success and wrapped null on failure. */
3461 static opt_loop_vec_info
3462 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3463 const vect_loop_form_info *loop_form_info,
3464 loop_vec_info main_loop_vinfo,
3465 const vector_modes &vector_modes, unsigned &mode_i,
3466 machine_mode &autodetected_vector_mode,
3467 bool &fatal)
3469 loop_vec_info loop_vinfo
3470 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3472 machine_mode vector_mode = vector_modes[mode_i];
3473 loop_vinfo->vector_mode = vector_mode;
3474 unsigned int suggested_unroll_factor = 1;
3475 bool slp_done_for_suggested_uf = false;
3477 /* Run the main analysis. */
3478 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3479 &suggested_unroll_factor,
3480 slp_done_for_suggested_uf);
3481 if (dump_enabled_p ())
3482 dump_printf_loc (MSG_NOTE, vect_location,
3483 "***** Analysis %s with vector mode %s\n",
3484 res ? "succeeded" : " failed",
3485 GET_MODE_NAME (loop_vinfo->vector_mode));
3487 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3489 if (dump_enabled_p ())
3490 dump_printf_loc (MSG_NOTE, vect_location,
3491 "***** Re-trying analysis for unrolling"
3492 " with unroll factor %d and slp %s.\n",
3493 suggested_unroll_factor,
3494 slp_done_for_suggested_uf ? "on" : "off");
3495 loop_vec_info unroll_vinfo
3496 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3497 unroll_vinfo->vector_mode = vector_mode;
3498 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3499 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3500 slp_done_for_suggested_uf);
3501 if (new_res)
3503 delete loop_vinfo;
3504 loop_vinfo = unroll_vinfo;
3506 else
3507 delete unroll_vinfo;
3510 /* Remember the autodetected vector mode. */
3511 if (vector_mode == VOIDmode)
3512 autodetected_vector_mode = loop_vinfo->vector_mode;
3514 /* Advance mode_i, first skipping modes that would result in the
3515 same analysis result. */
3516 while (mode_i + 1 < vector_modes.length ()
3517 && vect_chooses_same_modes_p (loop_vinfo,
3518 vector_modes[mode_i + 1]))
3520 if (dump_enabled_p ())
3521 dump_printf_loc (MSG_NOTE, vect_location,
3522 "***** The result for vector mode %s would"
3523 " be the same\n",
3524 GET_MODE_NAME (vector_modes[mode_i + 1]));
3525 mode_i += 1;
3527 if (mode_i + 1 < vector_modes.length ()
3528 && VECTOR_MODE_P (autodetected_vector_mode)
3529 && (related_vector_mode (vector_modes[mode_i + 1],
3530 GET_MODE_INNER (autodetected_vector_mode))
3531 == autodetected_vector_mode)
3532 && (related_vector_mode (autodetected_vector_mode,
3533 GET_MODE_INNER (vector_modes[mode_i + 1]))
3534 == vector_modes[mode_i + 1]))
3536 if (dump_enabled_p ())
3537 dump_printf_loc (MSG_NOTE, vect_location,
3538 "***** Skipping vector mode %s, which would"
3539 " repeat the analysis for %s\n",
3540 GET_MODE_NAME (vector_modes[mode_i + 1]),
3541 GET_MODE_NAME (autodetected_vector_mode));
3542 mode_i += 1;
3544 mode_i++;
3546 if (!res)
3548 delete loop_vinfo;
3549 if (fatal)
3550 gcc_checking_assert (main_loop_vinfo == NULL);
3551 return opt_loop_vec_info::propagate_failure (res);
3554 return opt_loop_vec_info::success (loop_vinfo);
3557 /* Function vect_analyze_loop.
3559 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3560 for it. The different analyses will record information in the
3561 loop_vec_info struct. */
3562 opt_loop_vec_info
3563 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3565 DUMP_VECT_SCOPE ("analyze_loop_nest");
3567 if (loop_outer (loop)
3568 && loop_vec_info_for_loop (loop_outer (loop))
3569 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3570 return opt_loop_vec_info::failure_at (vect_location,
3571 "outer-loop already vectorized.\n");
3573 if (!find_loop_nest (loop, &shared->loop_nest))
3574 return opt_loop_vec_info::failure_at
3575 (vect_location,
3576 "not vectorized: loop nest containing two or more consecutive inner"
3577 " loops cannot be vectorized\n");
3579 /* Analyze the loop form. */
3580 vect_loop_form_info loop_form_info;
3581 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3582 if (!res)
3584 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586 "bad loop form.\n");
3587 return opt_loop_vec_info::propagate_failure (res);
3589 if (!integer_onep (loop_form_info.assumptions))
3591 /* We consider to vectorize this loop by versioning it under
3592 some assumptions. In order to do this, we need to clear
3593 existing information computed by scev and niter analyzer. */
3594 scev_reset_htab ();
3595 free_numbers_of_iterations_estimates (loop);
3596 /* Also set flag for this loop so that following scev and niter
3597 analysis are done under the assumptions. */
3598 loop_constraint_set (loop, LOOP_C_FINITE);
3600 else
3601 /* Clear the existing niter information to make sure the nonwrapping flag
3602 will be calculated and set propriately. */
3603 free_numbers_of_iterations_estimates (loop);
3605 auto_vector_modes vector_modes;
3606 /* Autodetect first vector size we try. */
3607 vector_modes.safe_push (VOIDmode);
3608 unsigned int autovec_flags
3609 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3610 loop->simdlen != 0);
3611 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3612 && !unlimited_cost_model (loop));
3613 machine_mode autodetected_vector_mode = VOIDmode;
3614 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3615 unsigned int mode_i = 0;
3616 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3618 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3619 a mode has not been analyzed. */
3620 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3621 for (unsigned i = 0; i < vector_modes.length (); ++i)
3622 cached_vf_per_mode.safe_push (0);
3624 /* First determine the main loop vectorization mode, either the first
3625 one that works, starting with auto-detecting the vector mode and then
3626 following the targets order of preference, or the one with the
3627 lowest cost if pick_lowest_cost_p. */
3628 while (1)
3630 bool fatal;
3631 unsigned int last_mode_i = mode_i;
3632 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3633 failed. */
3634 cached_vf_per_mode[last_mode_i] = -1;
3635 opt_loop_vec_info loop_vinfo
3636 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3637 NULL, vector_modes, mode_i,
3638 autodetected_vector_mode, fatal);
3639 if (fatal)
3640 break;
3642 if (loop_vinfo)
3644 /* Analyzis has been successful so update the VF value. The
3645 VF should always be a multiple of unroll_factor and we want to
3646 capture the original VF here. */
3647 cached_vf_per_mode[last_mode_i]
3648 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3649 loop_vinfo->suggested_unroll_factor);
3650 /* Once we hit the desired simdlen for the first time,
3651 discard any previous attempts. */
3652 if (simdlen
3653 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3655 delete first_loop_vinfo;
3656 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3657 simdlen = 0;
3659 else if (pick_lowest_cost_p
3660 && first_loop_vinfo
3661 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3663 /* Pick loop_vinfo over first_loop_vinfo. */
3664 delete first_loop_vinfo;
3665 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3667 if (first_loop_vinfo == NULL)
3668 first_loop_vinfo = loop_vinfo;
3669 else
3671 delete loop_vinfo;
3672 loop_vinfo = opt_loop_vec_info::success (NULL);
3675 /* Commit to first_loop_vinfo if we have no reason to try
3676 alternatives. */
3677 if (!simdlen && !pick_lowest_cost_p)
3678 break;
3680 if (mode_i == vector_modes.length ()
3681 || autodetected_vector_mode == VOIDmode)
3682 break;
3684 /* Try the next biggest vector size. */
3685 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_NOTE, vect_location,
3687 "***** Re-trying analysis with vector mode %s\n",
3688 GET_MODE_NAME (vector_modes[mode_i]));
3690 if (!first_loop_vinfo)
3691 return opt_loop_vec_info::propagate_failure (res);
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 "***** Choosing vector mode %s\n",
3696 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3698 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3699 enabled, SIMDUID is not set, it is the innermost loop and we have
3700 either already found the loop's SIMDLEN or there was no SIMDLEN to
3701 begin with.
3702 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3703 bool vect_epilogues = (!simdlen
3704 && loop->inner == NULL
3705 && param_vect_epilogues_nomask
3706 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3707 /* No code motion support for multiple epilogues so for now
3708 not supported when multiple exits. */
3709 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3710 && !loop->simduid);
3711 if (!vect_epilogues)
3712 return first_loop_vinfo;
3714 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3715 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3717 /* For epilogues start the analysis from the first mode. The motivation
3718 behind starting from the beginning comes from cases where the VECTOR_MODES
3719 array may contain length-agnostic and length-specific modes. Their
3720 ordering is not guaranteed, so we could end up picking a mode for the main
3721 loop that is after the epilogue's optimal mode. */
3722 vector_modes[0] = autodetected_vector_mode;
3723 mode_i = 0;
3725 bool supports_partial_vectors =
3726 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3727 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3729 while (1)
3731 /* If the target does not support partial vectors we can shorten the
3732 number of modes to analyze for the epilogue as we know we can't pick a
3733 mode that would lead to a VF at least as big as the
3734 FIRST_VINFO_VF. */
3735 if (!supports_partial_vectors
3736 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3738 mode_i++;
3739 if (mode_i == vector_modes.length ())
3740 break;
3741 continue;
3744 if (dump_enabled_p ())
3745 dump_printf_loc (MSG_NOTE, vect_location,
3746 "***** Re-trying epilogue analysis with vector "
3747 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3749 bool fatal;
3750 opt_loop_vec_info loop_vinfo
3751 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3752 first_loop_vinfo,
3753 vector_modes, mode_i,
3754 autodetected_vector_mode, fatal);
3755 if (fatal)
3756 break;
3758 if (loop_vinfo)
3760 if (pick_lowest_cost_p)
3762 /* Keep trying to roll back vectorization attempts while the
3763 loop_vec_infos they produced were worse than this one. */
3764 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3765 while (!vinfos.is_empty ()
3766 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3768 gcc_assert (vect_epilogues);
3769 delete vinfos.pop ();
3772 /* For now only allow one epilogue loop. */
3773 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3775 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3776 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3777 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3778 || maybe_ne (lowest_th, 0U));
3779 /* Keep track of the known smallest versioning
3780 threshold. */
3781 if (ordered_p (lowest_th, th))
3782 lowest_th = ordered_min (lowest_th, th);
3784 else
3786 delete loop_vinfo;
3787 loop_vinfo = opt_loop_vec_info::success (NULL);
3790 /* For now only allow one epilogue loop, but allow
3791 pick_lowest_cost_p to replace it, so commit to the
3792 first epilogue if we have no reason to try alternatives. */
3793 if (!pick_lowest_cost_p)
3794 break;
3797 if (mode_i == vector_modes.length ())
3798 break;
3802 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3804 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3805 if (dump_enabled_p ())
3806 dump_printf_loc (MSG_NOTE, vect_location,
3807 "***** Choosing epilogue vector mode %s\n",
3808 GET_MODE_NAME
3809 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3812 return first_loop_vinfo;
3815 /* Return true if there is an in-order reduction function for CODE, storing
3816 it in *REDUC_FN if so. */
3818 static bool
3819 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3821 /* We support MINUS_EXPR by negating the operand. This also preserves an
3822 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3823 (-0.0) = -0.0. */
3824 if (code == PLUS_EXPR || code == MINUS_EXPR)
3826 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3827 return true;
3829 return false;
3832 /* Function reduction_fn_for_scalar_code
3834 Input:
3835 CODE - tree_code of a reduction operations.
3837 Output:
3838 REDUC_FN - the corresponding internal function to be used to reduce the
3839 vector of partial results into a single scalar result, or IFN_LAST
3840 if the operation is a supported reduction operation, but does not have
3841 such an internal function.
3843 Return FALSE if CODE currently cannot be vectorized as reduction. */
3845 bool
3846 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3848 if (code.is_tree_code ())
3849 switch (tree_code (code))
3851 case MAX_EXPR:
3852 *reduc_fn = IFN_REDUC_MAX;
3853 return true;
3855 case MIN_EXPR:
3856 *reduc_fn = IFN_REDUC_MIN;
3857 return true;
3859 case PLUS_EXPR:
3860 *reduc_fn = IFN_REDUC_PLUS;
3861 return true;
3863 case BIT_AND_EXPR:
3864 *reduc_fn = IFN_REDUC_AND;
3865 return true;
3867 case BIT_IOR_EXPR:
3868 *reduc_fn = IFN_REDUC_IOR;
3869 return true;
3871 case BIT_XOR_EXPR:
3872 *reduc_fn = IFN_REDUC_XOR;
3873 return true;
3875 case MULT_EXPR:
3876 case MINUS_EXPR:
3877 *reduc_fn = IFN_LAST;
3878 return true;
3880 default:
3881 return false;
3883 else
3884 switch (combined_fn (code))
3886 CASE_CFN_FMAX:
3887 *reduc_fn = IFN_REDUC_FMAX;
3888 return true;
3890 CASE_CFN_FMIN:
3891 *reduc_fn = IFN_REDUC_FMIN;
3892 return true;
3894 default:
3895 return false;
3899 /* If there is a neutral value X such that a reduction would not be affected
3900 by the introduction of additional X elements, return that X, otherwise
3901 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3902 of the scalar elements. If the reduction has just a single initial value
3903 then INITIAL_VALUE is that value, otherwise it is null.
3904 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3905 In that case no signed zero is returned. */
3907 tree
3908 neutral_op_for_reduction (tree scalar_type, code_helper code,
3909 tree initial_value, bool as_initial)
3911 if (code.is_tree_code ())
3912 switch (tree_code (code))
3914 case DOT_PROD_EXPR:
3915 case SAD_EXPR:
3916 case MINUS_EXPR:
3917 case BIT_IOR_EXPR:
3918 case BIT_XOR_EXPR:
3919 return build_zero_cst (scalar_type);
3920 case WIDEN_SUM_EXPR:
3921 case PLUS_EXPR:
3922 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3923 return build_real (scalar_type, dconstm0);
3924 else
3925 return build_zero_cst (scalar_type);
3927 case MULT_EXPR:
3928 return build_one_cst (scalar_type);
3930 case BIT_AND_EXPR:
3931 return build_all_ones_cst (scalar_type);
3933 case MAX_EXPR:
3934 case MIN_EXPR:
3935 return initial_value;
3937 default:
3938 return NULL_TREE;
3940 else
3941 switch (combined_fn (code))
3943 CASE_CFN_FMIN:
3944 CASE_CFN_FMAX:
3945 return initial_value;
3947 default:
3948 return NULL_TREE;
3952 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3953 STMT is printed with a message MSG. */
3955 static void
3956 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3958 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3961 /* Return true if we need an in-order reduction for operation CODE
3962 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3963 overflow must wrap. */
3965 bool
3966 needs_fold_left_reduction_p (tree type, code_helper code)
3968 /* CHECKME: check for !flag_finite_math_only too? */
3969 if (SCALAR_FLOAT_TYPE_P (type))
3971 if (code.is_tree_code ())
3972 switch (tree_code (code))
3974 case MIN_EXPR:
3975 case MAX_EXPR:
3976 return false;
3978 default:
3979 return !flag_associative_math;
3981 else
3982 switch (combined_fn (code))
3984 CASE_CFN_FMIN:
3985 CASE_CFN_FMAX:
3986 return false;
3988 default:
3989 return !flag_associative_math;
3993 if (INTEGRAL_TYPE_P (type))
3994 return (!code.is_tree_code ()
3995 || !operation_no_trapping_overflow (type, tree_code (code)));
3997 if (SAT_FIXED_POINT_TYPE_P (type))
3998 return true;
4000 return false;
4003 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4004 has a handled computation expression. Store the main reduction
4005 operation in *CODE. */
4007 static bool
4008 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4009 tree loop_arg, code_helper *code,
4010 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4012 auto_bitmap visited;
4013 tree lookfor = PHI_RESULT (phi);
4014 ssa_op_iter curri;
4015 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4016 while (USE_FROM_PTR (curr) != loop_arg)
4017 curr = op_iter_next_use (&curri);
4018 curri.i = curri.numops;
4021 path.safe_push (std::make_pair (curri, curr));
4022 tree use = USE_FROM_PTR (curr);
4023 if (use == lookfor)
4024 break;
4025 gimple *def = SSA_NAME_DEF_STMT (use);
4026 if (gimple_nop_p (def)
4027 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4029 pop:
4032 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4033 curri = x.first;
4034 curr = x.second;
4036 curr = op_iter_next_use (&curri);
4037 /* Skip already visited or non-SSA operands (from iterating
4038 over PHI args). */
4039 while (curr != NULL_USE_OPERAND_P
4040 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4041 || ! bitmap_set_bit (visited,
4042 SSA_NAME_VERSION
4043 (USE_FROM_PTR (curr)))));
4045 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4046 if (curr == NULL_USE_OPERAND_P)
4047 break;
4049 else
4051 if (gimple_code (def) == GIMPLE_PHI)
4052 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4053 else
4054 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4055 while (curr != NULL_USE_OPERAND_P
4056 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4057 || ! bitmap_set_bit (visited,
4058 SSA_NAME_VERSION
4059 (USE_FROM_PTR (curr)))))
4060 curr = op_iter_next_use (&curri);
4061 if (curr == NULL_USE_OPERAND_P)
4062 goto pop;
4065 while (1);
4066 if (dump_file && (dump_flags & TDF_DETAILS))
4068 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4069 unsigned i;
4070 std::pair<ssa_op_iter, use_operand_p> *x;
4071 FOR_EACH_VEC_ELT (path, i, x)
4072 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4073 dump_printf (MSG_NOTE, "\n");
4076 /* Check whether the reduction path detected is valid. */
4077 bool fail = path.length () == 0;
4078 bool neg = false;
4079 int sign = -1;
4080 *code = ERROR_MARK;
4081 for (unsigned i = 1; i < path.length (); ++i)
4083 gimple *use_stmt = USE_STMT (path[i].second);
4084 gimple_match_op op;
4085 if (!gimple_extract_op (use_stmt, &op))
4087 fail = true;
4088 break;
4090 unsigned int opi = op.num_ops;
4091 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4093 /* The following make sure we can compute the operand index
4094 easily plus it mostly disallows chaining via COND_EXPR condition
4095 operands. */
4096 for (opi = 0; opi < op.num_ops; ++opi)
4097 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4098 break;
4100 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4102 for (opi = 0; opi < op.num_ops; ++opi)
4103 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4104 break;
4106 if (opi == op.num_ops)
4108 fail = true;
4109 break;
4111 op.code = canonicalize_code (op.code, op.type);
4112 if (op.code == MINUS_EXPR)
4114 op.code = PLUS_EXPR;
4115 /* Track whether we negate the reduction value each iteration. */
4116 if (op.ops[1] == op.ops[opi])
4117 neg = ! neg;
4119 else if (op.code == IFN_COND_SUB)
4121 op.code = IFN_COND_ADD;
4122 /* Track whether we negate the reduction value each iteration. */
4123 if (op.ops[2] == op.ops[opi])
4124 neg = ! neg;
4126 if (CONVERT_EXPR_CODE_P (op.code)
4127 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4129 else if (*code == ERROR_MARK)
4131 *code = op.code;
4132 sign = TYPE_SIGN (op.type);
4134 else if (op.code != *code)
4136 fail = true;
4137 break;
4139 else if ((op.code == MIN_EXPR
4140 || op.code == MAX_EXPR)
4141 && sign != TYPE_SIGN (op.type))
4143 fail = true;
4144 break;
4146 /* Check there's only a single stmt the op is used on. For the
4147 not value-changing tail and the last stmt allow out-of-loop uses.
4148 ??? We could relax this and handle arbitrary live stmts by
4149 forcing a scalar epilogue for example. */
4150 imm_use_iterator imm_iter;
4151 use_operand_p use_p;
4152 gimple *op_use_stmt;
4153 unsigned cnt = 0;
4154 bool cond_fn_p = op.code.is_internal_fn ()
4155 && (conditional_internal_fn_code (internal_fn (op.code))
4156 != ERROR_MARK);
4158 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4160 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4161 op1 twice (once as definition, once as else) in the same operation.
4162 Allow this. */
4163 if (cond_fn_p && op_use_stmt == use_stmt)
4165 gcall *call = as_a<gcall *> (use_stmt);
4166 unsigned else_pos
4167 = internal_fn_else_index (internal_fn (op.code));
4169 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4171 if (j == else_pos)
4172 continue;
4173 if (gimple_call_arg (call, j) == op.ops[opi])
4174 cnt++;
4177 else if (!is_gimple_debug (op_use_stmt)
4178 && (*code != ERROR_MARK
4179 || flow_bb_inside_loop_p (loop,
4180 gimple_bb (op_use_stmt))))
4181 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4182 cnt++;
4185 if (cnt != 1)
4187 fail = true;
4188 break;
4191 return ! fail && ! neg && *code != ERROR_MARK;
4194 bool
4195 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4196 tree loop_arg, enum tree_code code)
4198 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4199 code_helper code_;
4200 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4201 && code_ == code);
4206 /* Function vect_is_simple_reduction
4208 (1) Detect a cross-iteration def-use cycle that represents a simple
4209 reduction computation. We look for the following pattern:
4211 loop_header:
4212 a1 = phi < a0, a2 >
4213 a3 = ...
4214 a2 = operation (a3, a1)
4218 a3 = ...
4219 loop_header:
4220 a1 = phi < a0, a2 >
4221 a2 = operation (a3, a1)
4223 such that:
4224 1. operation is commutative and associative and it is safe to
4225 change the order of the computation
4226 2. no uses for a2 in the loop (a2 is used out of the loop)
4227 3. no uses of a1 in the loop besides the reduction operation
4228 4. no uses of a1 outside the loop.
4230 Conditions 1,4 are tested here.
4231 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4233 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4234 nested cycles.
4236 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4237 reductions:
4239 a1 = phi < a0, a2 >
4240 inner loop (def of a3)
4241 a2 = phi < a3 >
4243 (4) Detect condition expressions, ie:
4244 for (int i = 0; i < N; i++)
4245 if (a[i] < val)
4246 ret_val = a[i];
4250 static stmt_vec_info
4251 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4252 bool *double_reduc, bool *reduc_chain_p, bool slp)
4254 gphi *phi = as_a <gphi *> (phi_info->stmt);
4255 gimple *phi_use_stmt = NULL;
4256 imm_use_iterator imm_iter;
4257 use_operand_p use_p;
4259 *double_reduc = false;
4260 *reduc_chain_p = false;
4261 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4263 tree phi_name = PHI_RESULT (phi);
4264 /* ??? If there are no uses of the PHI result the inner loop reduction
4265 won't be detected as possibly double-reduction by vectorizable_reduction
4266 because that tries to walk the PHI arg from the preheader edge which
4267 can be constant. See PR60382. */
4268 if (has_zero_uses (phi_name))
4269 return NULL;
4270 class loop *loop = (gimple_bb (phi))->loop_father;
4271 unsigned nphi_def_loop_uses = 0;
4272 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4274 gimple *use_stmt = USE_STMT (use_p);
4275 if (is_gimple_debug (use_stmt))
4276 continue;
4278 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4280 if (dump_enabled_p ())
4281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4282 "intermediate value used outside loop.\n");
4284 return NULL;
4287 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4288 op1 twice (once as definition, once as else) in the same operation.
4289 Only count it as one. */
4290 if (use_stmt != phi_use_stmt)
4292 nphi_def_loop_uses++;
4293 phi_use_stmt = use_stmt;
4297 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4298 if (TREE_CODE (latch_def) != SSA_NAME)
4300 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4302 "reduction: not ssa_name: %T\n", latch_def);
4303 return NULL;
4306 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4307 if (!def_stmt_info
4308 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4309 return NULL;
4311 bool nested_in_vect_loop
4312 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4313 unsigned nlatch_def_loop_uses = 0;
4314 auto_vec<gphi *, 3> lcphis;
4315 bool inner_loop_of_double_reduc = false;
4316 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4318 gimple *use_stmt = USE_STMT (use_p);
4319 if (is_gimple_debug (use_stmt))
4320 continue;
4321 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4322 nlatch_def_loop_uses++;
4323 else
4325 /* We can have more than one loop-closed PHI. */
4326 lcphis.safe_push (as_a <gphi *> (use_stmt));
4327 if (nested_in_vect_loop
4328 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4329 == vect_double_reduction_def))
4330 inner_loop_of_double_reduc = true;
4334 /* If we are vectorizing an inner reduction we are executing that
4335 in the original order only in case we are not dealing with a
4336 double reduction. */
4337 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4339 if (dump_enabled_p ())
4340 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4341 "detected nested cycle: ");
4342 return def_stmt_info;
4345 /* When the inner loop of a double reduction ends up with more than
4346 one loop-closed PHI we have failed to classify alternate such
4347 PHIs as double reduction, leading to wrong code. See PR103237. */
4348 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4350 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4352 "unhandle double reduction\n");
4353 return NULL;
4356 /* If this isn't a nested cycle or if the nested cycle reduction value
4357 is used ouside of the inner loop we cannot handle uses of the reduction
4358 value. */
4359 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4361 if (dump_enabled_p ())
4362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4363 "reduction used in loop.\n");
4364 return NULL;
4367 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4368 defined in the inner loop. */
4369 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4371 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4372 if (gimple_phi_num_args (def_stmt) != 1
4373 || TREE_CODE (op1) != SSA_NAME)
4375 if (dump_enabled_p ())
4376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4377 "unsupported phi node definition.\n");
4379 return NULL;
4382 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4383 and the latch definition op1. */
4384 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4385 if (gimple_bb (def1)
4386 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4387 && loop->inner
4388 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4389 && (is_gimple_assign (def1) || is_gimple_call (def1))
4390 && is_a <gphi *> (phi_use_stmt)
4391 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4392 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4393 loop_latch_edge (loop->inner))))
4395 if (dump_enabled_p ())
4396 report_vect_op (MSG_NOTE, def_stmt,
4397 "detected double reduction: ");
4399 *double_reduc = true;
4400 return def_stmt_info;
4403 return NULL;
4406 /* Look for the expression computing latch_def from then loop PHI result. */
4407 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4408 code_helper code;
4409 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4410 path))
4412 STMT_VINFO_REDUC_CODE (phi_info) = code;
4413 if (code == COND_EXPR && !nested_in_vect_loop)
4414 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4416 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4417 reduction chain for which the additional restriction is that
4418 all operations in the chain are the same. */
4419 auto_vec<stmt_vec_info, 8> reduc_chain;
4420 unsigned i;
4421 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4422 for (i = path.length () - 1; i >= 1; --i)
4424 gimple *stmt = USE_STMT (path[i].second);
4425 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4426 gimple_match_op op;
4427 if (!gimple_extract_op (stmt, &op))
4428 gcc_unreachable ();
4429 if (gassign *assign = dyn_cast<gassign *> (stmt))
4430 STMT_VINFO_REDUC_IDX (stmt_info)
4431 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4432 else
4434 gcall *call = as_a<gcall *> (stmt);
4435 STMT_VINFO_REDUC_IDX (stmt_info)
4436 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4438 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4439 && (i == 1 || i == path.length () - 1));
4440 if ((op.code != code && !leading_conversion)
4441 /* We can only handle the final value in epilogue
4442 generation for reduction chains. */
4443 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4444 is_slp_reduc = false;
4445 /* For reduction chains we support a trailing/leading
4446 conversions. We do not store those in the actual chain. */
4447 if (leading_conversion)
4448 continue;
4449 reduc_chain.safe_push (stmt_info);
4451 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4453 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4455 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4456 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4458 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4459 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4461 /* Save the chain for further analysis in SLP detection. */
4462 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4463 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4465 *reduc_chain_p = true;
4466 if (dump_enabled_p ())
4467 dump_printf_loc (MSG_NOTE, vect_location,
4468 "reduction: detected reduction chain\n");
4470 else if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "reduction: detected reduction\n");
4474 return def_stmt_info;
4477 if (dump_enabled_p ())
4478 dump_printf_loc (MSG_NOTE, vect_location,
4479 "reduction: unknown pattern\n");
4481 return NULL;
4484 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4485 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4486 or -1 if not known. */
4488 static int
4489 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4491 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4492 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4494 if (dump_enabled_p ())
4495 dump_printf_loc (MSG_NOTE, vect_location,
4496 "cost model: epilogue peel iters set to vf/2 "
4497 "because loop iterations are unknown .\n");
4498 return assumed_vf / 2;
4500 else
4502 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4503 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4504 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4505 /* If we need to peel for gaps, but no peeling is required, we have to
4506 peel VF iterations. */
4507 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4508 peel_iters_epilogue = assumed_vf;
4509 return peel_iters_epilogue;
4513 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4515 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4516 int *peel_iters_epilogue,
4517 stmt_vector_for_cost *scalar_cost_vec,
4518 stmt_vector_for_cost *prologue_cost_vec,
4519 stmt_vector_for_cost *epilogue_cost_vec)
4521 int retval = 0;
4523 *peel_iters_epilogue
4524 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4526 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4528 /* If peeled iterations are known but number of scalar loop
4529 iterations are unknown, count a taken branch per peeled loop. */
4530 if (peel_iters_prologue > 0)
4531 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4532 vect_prologue);
4533 if (*peel_iters_epilogue > 0)
4534 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4535 vect_epilogue);
4538 stmt_info_for_cost *si;
4539 int j;
4540 if (peel_iters_prologue)
4541 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4542 retval += record_stmt_cost (prologue_cost_vec,
4543 si->count * peel_iters_prologue,
4544 si->kind, si->stmt_info, si->misalign,
4545 vect_prologue);
4546 if (*peel_iters_epilogue)
4547 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4548 retval += record_stmt_cost (epilogue_cost_vec,
4549 si->count * *peel_iters_epilogue,
4550 si->kind, si->stmt_info, si->misalign,
4551 vect_epilogue);
4553 return retval;
4556 /* Function vect_estimate_min_profitable_iters
4558 Return the number of iterations required for the vector version of the
4559 loop to be profitable relative to the cost of the scalar version of the
4560 loop.
4562 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4563 of iterations for vectorization. -1 value means loop vectorization
4564 is not profitable. This returned value may be used for dynamic
4565 profitability check.
4567 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4568 for static check against estimated number of iterations. */
4570 static void
4571 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4572 int *ret_min_profitable_niters,
4573 int *ret_min_profitable_estimate,
4574 unsigned *suggested_unroll_factor)
4576 int min_profitable_iters;
4577 int min_profitable_estimate;
4578 int peel_iters_prologue;
4579 int peel_iters_epilogue;
4580 unsigned vec_inside_cost = 0;
4581 int vec_outside_cost = 0;
4582 unsigned vec_prologue_cost = 0;
4583 unsigned vec_epilogue_cost = 0;
4584 int scalar_single_iter_cost = 0;
4585 int scalar_outside_cost = 0;
4586 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4587 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4588 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4590 /* Cost model disabled. */
4591 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4593 if (dump_enabled_p ())
4594 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4595 *ret_min_profitable_niters = 0;
4596 *ret_min_profitable_estimate = 0;
4597 return;
4600 /* Requires loop versioning tests to handle misalignment. */
4601 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4603 /* FIXME: Make cost depend on complexity of individual check. */
4604 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4605 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4606 if (dump_enabled_p ())
4607 dump_printf (MSG_NOTE,
4608 "cost model: Adding cost of checks for loop "
4609 "versioning to treat misalignment.\n");
4612 /* Requires loop versioning with alias checks. */
4613 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4615 /* FIXME: Make cost depend on complexity of individual check. */
4616 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4617 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4618 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4619 if (len)
4620 /* Count LEN - 1 ANDs and LEN comparisons. */
4621 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4622 scalar_stmt, vect_prologue);
4623 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4624 if (len)
4626 /* Count LEN - 1 ANDs and LEN comparisons. */
4627 unsigned int nstmts = len * 2 - 1;
4628 /* +1 for each bias that needs adding. */
4629 for (unsigned int i = 0; i < len; ++i)
4630 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4631 nstmts += 1;
4632 (void) add_stmt_cost (target_cost_data, nstmts,
4633 scalar_stmt, vect_prologue);
4635 if (dump_enabled_p ())
4636 dump_printf (MSG_NOTE,
4637 "cost model: Adding cost of checks for loop "
4638 "versioning aliasing.\n");
4641 /* Requires loop versioning with niter checks. */
4642 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4644 /* FIXME: Make cost depend on complexity of individual check. */
4645 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4646 NULL, NULL, NULL_TREE, 0, vect_prologue);
4647 if (dump_enabled_p ())
4648 dump_printf (MSG_NOTE,
4649 "cost model: Adding cost of checks for loop "
4650 "versioning niters.\n");
4653 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4654 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4655 vect_prologue);
4657 /* Count statements in scalar loop. Using this as scalar cost for a single
4658 iteration for now.
4660 TODO: Add outer loop support.
4662 TODO: Consider assigning different costs to different scalar
4663 statements. */
4665 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4667 /* Add additional cost for the peeled instructions in prologue and epilogue
4668 loop. (For fully-masked loops there will be no peeling.)
4670 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4671 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4673 TODO: Build an expression that represents peel_iters for prologue and
4674 epilogue to be used in a run-time test. */
4676 bool prologue_need_br_taken_cost = false;
4677 bool prologue_need_br_not_taken_cost = false;
4679 /* Calculate peel_iters_prologue. */
4680 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4681 peel_iters_prologue = 0;
4682 else if (npeel < 0)
4684 peel_iters_prologue = assumed_vf / 2;
4685 if (dump_enabled_p ())
4686 dump_printf (MSG_NOTE, "cost model: "
4687 "prologue peel iters set to vf/2.\n");
4689 /* If peeled iterations are unknown, count a taken branch and a not taken
4690 branch per peeled loop. Even if scalar loop iterations are known,
4691 vector iterations are not known since peeled prologue iterations are
4692 not known. Hence guards remain the same. */
4693 prologue_need_br_taken_cost = true;
4694 prologue_need_br_not_taken_cost = true;
4696 else
4698 peel_iters_prologue = npeel;
4699 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4700 /* If peeled iterations are known but number of scalar loop
4701 iterations are unknown, count a taken branch per peeled loop. */
4702 prologue_need_br_taken_cost = true;
4705 bool epilogue_need_br_taken_cost = false;
4706 bool epilogue_need_br_not_taken_cost = false;
4708 /* Calculate peel_iters_epilogue. */
4709 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4710 /* We need to peel exactly one iteration for gaps. */
4711 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4712 else if (npeel < 0)
4714 /* If peeling for alignment is unknown, loop bound of main loop
4715 becomes unknown. */
4716 peel_iters_epilogue = assumed_vf / 2;
4717 if (dump_enabled_p ())
4718 dump_printf (MSG_NOTE, "cost model: "
4719 "epilogue peel iters set to vf/2 because "
4720 "peeling for alignment is unknown.\n");
4722 /* See the same reason above in peel_iters_prologue calculation. */
4723 epilogue_need_br_taken_cost = true;
4724 epilogue_need_br_not_taken_cost = true;
4726 else
4728 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4729 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4730 /* If peeled iterations are known but number of scalar loop
4731 iterations are unknown, count a taken branch per peeled loop. */
4732 epilogue_need_br_taken_cost = true;
4735 stmt_info_for_cost *si;
4736 int j;
4737 /* Add costs associated with peel_iters_prologue. */
4738 if (peel_iters_prologue)
4739 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4741 (void) add_stmt_cost (target_cost_data,
4742 si->count * peel_iters_prologue, si->kind,
4743 si->stmt_info, si->node, si->vectype,
4744 si->misalign, vect_prologue);
4747 /* Add costs associated with peel_iters_epilogue. */
4748 if (peel_iters_epilogue)
4749 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4751 (void) add_stmt_cost (target_cost_data,
4752 si->count * peel_iters_epilogue, si->kind,
4753 si->stmt_info, si->node, si->vectype,
4754 si->misalign, vect_epilogue);
4757 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4759 if (prologue_need_br_taken_cost)
4760 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4761 vect_prologue);
4763 if (prologue_need_br_not_taken_cost)
4764 (void) add_stmt_cost (target_cost_data, 1,
4765 cond_branch_not_taken, vect_prologue);
4767 if (epilogue_need_br_taken_cost)
4768 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4769 vect_epilogue);
4771 if (epilogue_need_br_not_taken_cost)
4772 (void) add_stmt_cost (target_cost_data, 1,
4773 cond_branch_not_taken, vect_epilogue);
4775 /* Take care of special costs for rgroup controls of partial vectors. */
4776 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4777 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4778 == vect_partial_vectors_avx512))
4780 /* Calculate how many masks we need to generate. */
4781 unsigned int num_masks = 0;
4782 bool need_saturation = false;
4783 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4784 if (rgm.type)
4786 unsigned nvectors = rgm.factor;
4787 num_masks += nvectors;
4788 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4789 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4790 need_saturation = true;
4793 /* ??? The target isn't able to identify the costs below as
4794 producing masks so it cannot penaltize cases where we'd run
4795 out of mask registers for example. */
4797 /* ??? We are also failing to account for smaller vector masks
4798 we generate by splitting larger masks in vect_get_loop_mask. */
4800 /* In the worst case, we need to generate each mask in the prologue
4801 and in the loop body. We need one splat per group and one
4802 compare per mask.
4804 Sometimes the prologue mask will fold to a constant,
4805 so the actual prologue cost might be smaller. However, it's
4806 simpler and safer to use the worst-case cost; if this ends up
4807 being the tie-breaker between vectorizing or not, then it's
4808 probably better not to vectorize. */
4809 (void) add_stmt_cost (target_cost_data,
4810 num_masks
4811 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4812 vector_stmt, NULL, NULL, NULL_TREE, 0,
4813 vect_prologue);
4814 (void) add_stmt_cost (target_cost_data,
4815 num_masks
4816 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4817 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4819 /* When we need saturation we need it both in the prologue and
4820 the epilogue. */
4821 if (need_saturation)
4823 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4824 NULL, NULL, NULL_TREE, 0, vect_prologue);
4825 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4826 NULL, NULL, NULL_TREE, 0, vect_body);
4829 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4830 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4831 == vect_partial_vectors_while_ult))
4833 /* Calculate how many masks we need to generate. */
4834 unsigned int num_masks = 0;
4835 rgroup_controls *rgm;
4836 unsigned int num_vectors_m1;
4837 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4838 num_vectors_m1, rgm)
4839 if (rgm->type)
4840 num_masks += num_vectors_m1 + 1;
4841 gcc_assert (num_masks > 0);
4843 /* In the worst case, we need to generate each mask in the prologue
4844 and in the loop body. One of the loop body mask instructions
4845 replaces the comparison in the scalar loop, and since we don't
4846 count the scalar comparison against the scalar body, we shouldn't
4847 count that vector instruction against the vector body either.
4849 Sometimes we can use unpacks instead of generating prologue
4850 masks and sometimes the prologue mask will fold to a constant,
4851 so the actual prologue cost might be smaller. However, it's
4852 simpler and safer to use the worst-case cost; if this ends up
4853 being the tie-breaker between vectorizing or not, then it's
4854 probably better not to vectorize. */
4855 (void) add_stmt_cost (target_cost_data, num_masks,
4856 vector_stmt, NULL, NULL, NULL_TREE, 0,
4857 vect_prologue);
4858 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4859 vector_stmt, NULL, NULL, NULL_TREE, 0,
4860 vect_body);
4862 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4864 /* Referring to the functions vect_set_loop_condition_partial_vectors
4865 and vect_set_loop_controls_directly, we need to generate each
4866 length in the prologue and in the loop body if required. Although
4867 there are some possible optimizations, we consider the worst case
4868 here. */
4870 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4871 signed char partial_load_store_bias
4872 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4873 bool need_iterate_p
4874 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4875 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4877 /* Calculate how many statements to be added. */
4878 unsigned int prologue_stmts = 0;
4879 unsigned int body_stmts = 0;
4881 rgroup_controls *rgc;
4882 unsigned int num_vectors_m1;
4883 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4884 if (rgc->type)
4886 /* May need one SHIFT for nitems_total computation. */
4887 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4888 if (nitems != 1 && !niters_known_p)
4889 prologue_stmts += 1;
4891 /* May need one MAX and one MINUS for wrap around. */
4892 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4893 prologue_stmts += 2;
4895 /* Need one MAX and one MINUS for each batch limit excepting for
4896 the 1st one. */
4897 prologue_stmts += num_vectors_m1 * 2;
4899 unsigned int num_vectors = num_vectors_m1 + 1;
4901 /* Need to set up lengths in prologue, only one MIN required
4902 for each since start index is zero. */
4903 prologue_stmts += num_vectors;
4905 /* If we have a non-zero partial load bias, we need one PLUS
4906 to adjust the load length. */
4907 if (partial_load_store_bias != 0)
4908 body_stmts += 1;
4910 unsigned int length_update_cost = 0;
4911 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4912 /* For decrement IV style, Each only need a single SELECT_VL
4913 or MIN since beginning to calculate the number of elements
4914 need to be processed in current iteration. */
4915 length_update_cost = 1;
4916 else
4917 /* For increment IV stype, Each may need two MINs and one MINUS to
4918 update lengths in body for next iteration. */
4919 length_update_cost = 3;
4921 if (need_iterate_p)
4922 body_stmts += length_update_cost * num_vectors;
4925 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4926 scalar_stmt, vect_prologue);
4927 (void) add_stmt_cost (target_cost_data, body_stmts,
4928 scalar_stmt, vect_body);
4931 /* FORNOW: The scalar outside cost is incremented in one of the
4932 following ways:
4934 1. The vectorizer checks for alignment and aliasing and generates
4935 a condition that allows dynamic vectorization. A cost model
4936 check is ANDED with the versioning condition. Hence scalar code
4937 path now has the added cost of the versioning check.
4939 if (cost > th & versioning_check)
4940 jmp to vector code
4942 Hence run-time scalar is incremented by not-taken branch cost.
4944 2. The vectorizer then checks if a prologue is required. If the
4945 cost model check was not done before during versioning, it has to
4946 be done before the prologue check.
4948 if (cost <= th)
4949 prologue = scalar_iters
4950 if (prologue == 0)
4951 jmp to vector code
4952 else
4953 execute prologue
4954 if (prologue == num_iters)
4955 go to exit
4957 Hence the run-time scalar cost is incremented by a taken branch,
4958 plus a not-taken branch, plus a taken branch cost.
4960 3. The vectorizer then checks if an epilogue is required. If the
4961 cost model check was not done before during prologue check, it
4962 has to be done with the epilogue check.
4964 if (prologue == 0)
4965 jmp to vector code
4966 else
4967 execute prologue
4968 if (prologue == num_iters)
4969 go to exit
4970 vector code:
4971 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4972 jmp to epilogue
4974 Hence the run-time scalar cost should be incremented by 2 taken
4975 branches.
4977 TODO: The back end may reorder the BBS's differently and reverse
4978 conditions/branch directions. Change the estimates below to
4979 something more reasonable. */
4981 /* If the number of iterations is known and we do not do versioning, we can
4982 decide whether to vectorize at compile time. Hence the scalar version
4983 do not carry cost model guard costs. */
4984 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4985 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4987 /* Cost model check occurs at versioning. */
4988 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4989 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4990 else
4992 /* Cost model check occurs at prologue generation. */
4993 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4994 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4995 + vect_get_stmt_cost (cond_branch_not_taken);
4996 /* Cost model check occurs at epilogue generation. */
4997 else
4998 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5002 /* Complete the target-specific cost calculations. */
5003 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5004 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5005 suggested_unroll_factor);
5007 if (suggested_unroll_factor && *suggested_unroll_factor > 1
5008 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5009 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5010 *suggested_unroll_factor,
5011 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5015 "can't unroll as unrolled vectorization factor larger"
5016 " than maximum vectorization factor: "
5017 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5018 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5019 *suggested_unroll_factor = 1;
5022 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5024 if (dump_enabled_p ())
5026 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5027 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5028 vec_inside_cost);
5029 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5030 vec_prologue_cost);
5031 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5032 vec_epilogue_cost);
5033 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5034 scalar_single_iter_cost);
5035 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5036 scalar_outside_cost);
5037 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5038 vec_outside_cost);
5039 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5040 peel_iters_prologue);
5041 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5042 peel_iters_epilogue);
5045 /* Calculate number of iterations required to make the vector version
5046 profitable, relative to the loop bodies only. The following condition
5047 must hold true:
5048 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5049 where
5050 SIC = scalar iteration cost, VIC = vector iteration cost,
5051 VOC = vector outside cost, VF = vectorization factor,
5052 NPEEL = prologue iterations + epilogue iterations,
5053 SOC = scalar outside cost for run time cost model check. */
5055 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5056 - vec_inside_cost);
5057 if (saving_per_viter <= 0)
5059 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5060 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5061 "vectorization did not happen for a simd loop");
5063 if (dump_enabled_p ())
5064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5065 "cost model: the vector iteration cost = %d "
5066 "divided by the scalar iteration cost = %d "
5067 "is greater or equal to the vectorization factor = %d"
5068 ".\n",
5069 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5070 *ret_min_profitable_niters = -1;
5071 *ret_min_profitable_estimate = -1;
5072 return;
5075 /* ??? The "if" arm is written to handle all cases; see below for what
5076 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5077 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5079 /* Rewriting the condition above in terms of the number of
5080 vector iterations (vniters) rather than the number of
5081 scalar iterations (niters) gives:
5083 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5085 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5087 For integer N, X and Y when X > 0:
5089 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5090 int outside_overhead = (vec_outside_cost
5091 - scalar_single_iter_cost * peel_iters_prologue
5092 - scalar_single_iter_cost * peel_iters_epilogue
5093 - scalar_outside_cost);
5094 /* We're only interested in cases that require at least one
5095 vector iteration. */
5096 int min_vec_niters = 1;
5097 if (outside_overhead > 0)
5098 min_vec_niters = outside_overhead / saving_per_viter + 1;
5100 if (dump_enabled_p ())
5101 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5102 min_vec_niters);
5104 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5106 /* Now that we know the minimum number of vector iterations,
5107 find the minimum niters for which the scalar cost is larger:
5109 SIC * niters > VIC * vniters + VOC - SOC
5111 We know that the minimum niters is no more than
5112 vniters * VF + NPEEL, but it might be (and often is) less
5113 than that if a partial vector iteration is cheaper than the
5114 equivalent scalar code. */
5115 int threshold = (vec_inside_cost * min_vec_niters
5116 + vec_outside_cost
5117 - scalar_outside_cost);
5118 if (threshold <= 0)
5119 min_profitable_iters = 1;
5120 else
5121 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5123 else
5124 /* Convert the number of vector iterations into a number of
5125 scalar iterations. */
5126 min_profitable_iters = (min_vec_niters * assumed_vf
5127 + peel_iters_prologue
5128 + peel_iters_epilogue);
5130 else
5132 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5133 * assumed_vf
5134 - vec_inside_cost * peel_iters_prologue
5135 - vec_inside_cost * peel_iters_epilogue);
5136 if (min_profitable_iters <= 0)
5137 min_profitable_iters = 0;
5138 else
5140 min_profitable_iters /= saving_per_viter;
5142 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5143 <= (((int) vec_inside_cost * min_profitable_iters)
5144 + (((int) vec_outside_cost - scalar_outside_cost)
5145 * assumed_vf)))
5146 min_profitable_iters++;
5150 if (dump_enabled_p ())
5151 dump_printf (MSG_NOTE,
5152 " Calculated minimum iters for profitability: %d\n",
5153 min_profitable_iters);
5155 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5156 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5157 /* We want the vectorized loop to execute at least once. */
5158 min_profitable_iters = assumed_vf + peel_iters_prologue;
5159 else if (min_profitable_iters < peel_iters_prologue)
5160 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5161 vectorized loop executes at least once. */
5162 min_profitable_iters = peel_iters_prologue;
5164 if (dump_enabled_p ())
5165 dump_printf_loc (MSG_NOTE, vect_location,
5166 " Runtime profitability threshold = %d\n",
5167 min_profitable_iters);
5169 *ret_min_profitable_niters = min_profitable_iters;
5171 /* Calculate number of iterations required to make the vector version
5172 profitable, relative to the loop bodies only.
5174 Non-vectorized variant is SIC * niters and it must win over vector
5175 variant on the expected loop trip count. The following condition must hold true:
5176 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5178 if (vec_outside_cost <= 0)
5179 min_profitable_estimate = 0;
5180 /* ??? This "else if" arm is written to handle all cases; see below for
5181 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5182 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5184 /* This is a repeat of the code above, but with + SOC rather
5185 than - SOC. */
5186 int outside_overhead = (vec_outside_cost
5187 - scalar_single_iter_cost * peel_iters_prologue
5188 - scalar_single_iter_cost * peel_iters_epilogue
5189 + scalar_outside_cost);
5190 int min_vec_niters = 1;
5191 if (outside_overhead > 0)
5192 min_vec_niters = outside_overhead / saving_per_viter + 1;
5194 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5196 int threshold = (vec_inside_cost * min_vec_niters
5197 + vec_outside_cost
5198 + scalar_outside_cost);
5199 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5201 else
5202 min_profitable_estimate = (min_vec_niters * assumed_vf
5203 + peel_iters_prologue
5204 + peel_iters_epilogue);
5206 else
5208 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5209 * assumed_vf
5210 - vec_inside_cost * peel_iters_prologue
5211 - vec_inside_cost * peel_iters_epilogue)
5212 / ((scalar_single_iter_cost * assumed_vf)
5213 - vec_inside_cost);
5215 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5216 if (dump_enabled_p ())
5217 dump_printf_loc (MSG_NOTE, vect_location,
5218 " Static estimate profitability threshold = %d\n",
5219 min_profitable_estimate);
5221 *ret_min_profitable_estimate = min_profitable_estimate;
5224 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5225 vector elements (not bits) for a vector with NELT elements. */
5226 static void
5227 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5228 vec_perm_builder *sel)
5230 /* The encoding is a single stepped pattern. Any wrap-around is handled
5231 by vec_perm_indices. */
5232 sel->new_vector (nelt, 1, 3);
5233 for (unsigned int i = 0; i < 3; i++)
5234 sel->quick_push (i + offset);
5237 /* Checks whether the target supports whole-vector shifts for vectors of mode
5238 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5239 it supports vec_perm_const with masks for all necessary shift amounts. */
5240 static bool
5241 have_whole_vector_shift (machine_mode mode)
5243 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5244 return true;
5246 /* Variable-length vectors should be handled via the optab. */
5247 unsigned int nelt;
5248 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5249 return false;
5251 vec_perm_builder sel;
5252 vec_perm_indices indices;
5253 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5255 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5256 indices.new_vector (sel, 2, nelt);
5257 if (!can_vec_perm_const_p (mode, mode, indices, false))
5258 return false;
5260 return true;
5263 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5264 multiplication operands have differing signs and (b) we intend
5265 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5266 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5268 static bool
5269 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5270 stmt_vec_info stmt_info)
5272 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5273 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5274 return false;
5276 tree rhs1 = gimple_assign_rhs1 (assign);
5277 tree rhs2 = gimple_assign_rhs2 (assign);
5278 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5279 return false;
5281 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5282 gcc_assert (reduc_info->is_reduc_info);
5283 return !directly_supported_p (DOT_PROD_EXPR,
5284 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5285 optab_vector_mixed_sign);
5288 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5289 functions. Design better to avoid maintenance issues. */
5291 /* Function vect_model_reduction_cost.
5293 Models cost for a reduction operation, including the vector ops
5294 generated within the strip-mine loop in some cases, the initial
5295 definition before the loop, and the epilogue code that must be generated. */
5297 static void
5298 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5299 stmt_vec_info stmt_info, internal_fn reduc_fn,
5300 vect_reduction_type reduction_type,
5301 int ncopies, stmt_vector_for_cost *cost_vec)
5303 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5304 tree vectype;
5305 machine_mode mode;
5306 class loop *loop = NULL;
5308 if (loop_vinfo)
5309 loop = LOOP_VINFO_LOOP (loop_vinfo);
5311 /* Condition reductions generate two reductions in the loop. */
5312 if (reduction_type == COND_REDUCTION)
5313 ncopies *= 2;
5315 vectype = STMT_VINFO_VECTYPE (stmt_info);
5316 mode = TYPE_MODE (vectype);
5317 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5319 gimple_match_op op;
5320 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5321 gcc_unreachable ();
5323 bool emulated_mixed_dot_prod
5324 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5325 if (reduction_type == EXTRACT_LAST_REDUCTION)
5326 /* No extra instructions are needed in the prologue. The loop body
5327 operations are costed in vectorizable_condition. */
5328 inside_cost = 0;
5329 else if (reduction_type == FOLD_LEFT_REDUCTION)
5331 /* No extra instructions needed in the prologue. */
5332 prologue_cost = 0;
5334 if (reduc_fn != IFN_LAST)
5335 /* Count one reduction-like operation per vector. */
5336 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5337 stmt_info, 0, vect_body);
5338 else
5340 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5341 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5342 inside_cost = record_stmt_cost (cost_vec, nelements,
5343 vec_to_scalar, stmt_info, 0,
5344 vect_body);
5345 inside_cost += record_stmt_cost (cost_vec, nelements,
5346 scalar_stmt, stmt_info, 0,
5347 vect_body);
5350 else
5352 /* Add in the cost of the initial definitions. */
5353 int prologue_stmts;
5354 if (reduction_type == COND_REDUCTION)
5355 /* For cond reductions we have four vectors: initial index, step,
5356 initial result of the data reduction, initial value of the index
5357 reduction. */
5358 prologue_stmts = 4;
5359 else if (emulated_mixed_dot_prod)
5360 /* We need the initial reduction value and two invariants:
5361 one that contains the minimum signed value and one that
5362 contains half of its negative. */
5363 prologue_stmts = 3;
5364 else
5365 prologue_stmts = 1;
5366 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5367 scalar_to_vec, stmt_info, 0,
5368 vect_prologue);
5371 /* Determine cost of epilogue code.
5373 We have a reduction operator that will reduce the vector in one statement.
5374 Also requires scalar extract. */
5376 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5378 if (reduc_fn != IFN_LAST)
5380 if (reduction_type == COND_REDUCTION)
5382 /* An EQ stmt and an COND_EXPR stmt. */
5383 epilogue_cost += record_stmt_cost (cost_vec, 2,
5384 vector_stmt, stmt_info, 0,
5385 vect_epilogue);
5386 /* Reduction of the max index and a reduction of the found
5387 values. */
5388 epilogue_cost += record_stmt_cost (cost_vec, 2,
5389 vec_to_scalar, stmt_info, 0,
5390 vect_epilogue);
5391 /* A broadcast of the max value. */
5392 epilogue_cost += record_stmt_cost (cost_vec, 1,
5393 scalar_to_vec, stmt_info, 0,
5394 vect_epilogue);
5396 else
5398 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5399 stmt_info, 0, vect_epilogue);
5400 epilogue_cost += record_stmt_cost (cost_vec, 1,
5401 vec_to_scalar, stmt_info, 0,
5402 vect_epilogue);
5405 else if (reduction_type == COND_REDUCTION)
5407 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5408 /* Extraction of scalar elements. */
5409 epilogue_cost += record_stmt_cost (cost_vec,
5410 2 * estimated_nunits,
5411 vec_to_scalar, stmt_info, 0,
5412 vect_epilogue);
5413 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5414 epilogue_cost += record_stmt_cost (cost_vec,
5415 2 * estimated_nunits - 3,
5416 scalar_stmt, stmt_info, 0,
5417 vect_epilogue);
5419 else if (reduction_type == EXTRACT_LAST_REDUCTION
5420 || reduction_type == FOLD_LEFT_REDUCTION)
5421 /* No extra instructions need in the epilogue. */
5423 else
5425 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5426 tree bitsize = TYPE_SIZE (op.type);
5427 int element_bitsize = tree_to_uhwi (bitsize);
5428 int nelements = vec_size_in_bits / element_bitsize;
5430 if (op.code == COND_EXPR)
5431 op.code = MAX_EXPR;
5433 /* We have a whole vector shift available. */
5434 if (VECTOR_MODE_P (mode)
5435 && directly_supported_p (op.code, vectype)
5436 && have_whole_vector_shift (mode))
5438 /* Final reduction via vector shifts and the reduction operator.
5439 Also requires scalar extract. */
5440 epilogue_cost += record_stmt_cost (cost_vec,
5441 exact_log2 (nelements) * 2,
5442 vector_stmt, stmt_info, 0,
5443 vect_epilogue);
5444 epilogue_cost += record_stmt_cost (cost_vec, 1,
5445 vec_to_scalar, stmt_info, 0,
5446 vect_epilogue);
5448 else
5449 /* Use extracts and reduction op for final reduction. For N
5450 elements, we have N extracts and N-1 reduction ops. */
5451 epilogue_cost += record_stmt_cost (cost_vec,
5452 nelements + nelements - 1,
5453 vector_stmt, stmt_info, 0,
5454 vect_epilogue);
5458 if (dump_enabled_p ())
5459 dump_printf (MSG_NOTE,
5460 "vect_model_reduction_cost: inside_cost = %d, "
5461 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5462 prologue_cost, epilogue_cost);
5465 /* SEQ is a sequence of instructions that initialize the reduction
5466 described by REDUC_INFO. Emit them in the appropriate place. */
5468 static void
5469 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5470 stmt_vec_info reduc_info, gimple *seq)
5472 if (reduc_info->reused_accumulator)
5474 /* When reusing an accumulator from the main loop, we only need
5475 initialization instructions if the main loop can be skipped.
5476 In that case, emit the initialization instructions at the end
5477 of the guard block that does the skip. */
5478 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5479 gcc_assert (skip_edge);
5480 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5481 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5483 else
5485 /* The normal case: emit the initialization instructions on the
5486 preheader edge. */
5487 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5488 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5492 /* Function get_initial_def_for_reduction
5494 Input:
5495 REDUC_INFO - the info_for_reduction
5496 INIT_VAL - the initial value of the reduction variable
5497 NEUTRAL_OP - a value that has no effect on the reduction, as per
5498 neutral_op_for_reduction
5500 Output:
5501 Return a vector variable, initialized according to the operation that
5502 STMT_VINFO performs. This vector will be used as the initial value
5503 of the vector of partial results.
5505 The value we need is a vector in which element 0 has value INIT_VAL
5506 and every other element has value NEUTRAL_OP. */
5508 static tree
5509 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5510 stmt_vec_info reduc_info,
5511 tree init_val, tree neutral_op)
5513 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5514 tree scalar_type = TREE_TYPE (init_val);
5515 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5516 tree init_def;
5517 gimple_seq stmts = NULL;
5519 gcc_assert (vectype);
5521 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5522 || SCALAR_FLOAT_TYPE_P (scalar_type));
5524 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5525 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5527 if (operand_equal_p (init_val, neutral_op))
5529 /* If both elements are equal then the vector described above is
5530 just a splat. */
5531 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5532 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5534 else
5536 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5537 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5538 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5540 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5541 element 0. */
5542 init_def = gimple_build_vector_from_val (&stmts, vectype,
5543 neutral_op);
5544 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5545 vectype, init_def, init_val);
5547 else
5549 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5550 tree_vector_builder elts (vectype, 1, 2);
5551 elts.quick_push (init_val);
5552 elts.quick_push (neutral_op);
5553 init_def = gimple_build_vector (&stmts, &elts);
5557 if (stmts)
5558 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5559 return init_def;
5562 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5563 which performs a reduction involving GROUP_SIZE scalar statements.
5564 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5565 is nonnull, introducing extra elements of that value will not change the
5566 result. */
5568 static void
5569 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5570 stmt_vec_info reduc_info,
5571 vec<tree> *vec_oprnds,
5572 unsigned int number_of_vectors,
5573 unsigned int group_size, tree neutral_op)
5575 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5576 unsigned HOST_WIDE_INT nunits;
5577 unsigned j, number_of_places_left_in_vector;
5578 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5579 unsigned int i;
5581 gcc_assert (group_size == initial_values.length () || neutral_op);
5583 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5584 created vectors. It is greater than 1 if unrolling is performed.
5586 For example, we have two scalar operands, s1 and s2 (e.g., group of
5587 strided accesses of size two), while NUNITS is four (i.e., four scalars
5588 of this type can be packed in a vector). The output vector will contain
5589 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5590 will be 2).
5592 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5593 vectors containing the operands.
5595 For example, NUNITS is four as before, and the group size is 8
5596 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5597 {s5, s6, s7, s8}. */
5599 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5600 nunits = group_size;
5602 number_of_places_left_in_vector = nunits;
5603 bool constant_p = true;
5604 tree_vector_builder elts (vector_type, nunits, 1);
5605 elts.quick_grow (nunits);
5606 gimple_seq ctor_seq = NULL;
5607 for (j = 0; j < nunits * number_of_vectors; ++j)
5609 tree op;
5610 i = j % group_size;
5612 /* Get the def before the loop. In reduction chain we have only
5613 one initial value. Else we have as many as PHIs in the group. */
5614 if (i >= initial_values.length () || (j > i && neutral_op))
5615 op = neutral_op;
5616 else
5617 op = initial_values[i];
5619 /* Create 'vect_ = {op0,op1,...,opn}'. */
5620 number_of_places_left_in_vector--;
5621 elts[nunits - number_of_places_left_in_vector - 1] = op;
5622 if (!CONSTANT_CLASS_P (op))
5623 constant_p = false;
5625 if (number_of_places_left_in_vector == 0)
5627 tree init;
5628 if (constant_p && !neutral_op
5629 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5630 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5631 /* Build the vector directly from ELTS. */
5632 init = gimple_build_vector (&ctor_seq, &elts);
5633 else if (neutral_op)
5635 /* Build a vector of the neutral value and shift the
5636 other elements into place. */
5637 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5638 neutral_op);
5639 int k = nunits;
5640 while (k > 0 && elts[k - 1] == neutral_op)
5641 k -= 1;
5642 while (k > 0)
5644 k -= 1;
5645 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5646 vector_type, init, elts[k]);
5649 else
5651 /* First time round, duplicate ELTS to fill the
5652 required number of vectors. */
5653 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5654 elts, number_of_vectors, *vec_oprnds);
5655 break;
5657 vec_oprnds->quick_push (init);
5659 number_of_places_left_in_vector = nunits;
5660 elts.new_vector (vector_type, nunits, 1);
5661 elts.quick_grow (nunits);
5662 constant_p = true;
5665 if (ctor_seq != NULL)
5666 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5669 /* For a statement STMT_INFO taking part in a reduction operation return
5670 the stmt_vec_info the meta information is stored on. */
5672 stmt_vec_info
5673 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5675 stmt_info = vect_orig_stmt (stmt_info);
5676 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5677 if (!is_a <gphi *> (stmt_info->stmt)
5678 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5679 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5680 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5681 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5683 if (gimple_phi_num_args (phi) == 1)
5684 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5686 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5688 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5689 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5690 stmt_info = info;
5692 return stmt_info;
5695 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5696 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5697 return false. */
5699 static bool
5700 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5701 stmt_vec_info reduc_info)
5703 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5704 if (!main_loop_vinfo)
5705 return false;
5707 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5708 return false;
5710 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5711 auto_vec<tree, 16> main_loop_results (num_phis);
5712 auto_vec<tree, 16> initial_values (num_phis);
5713 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5715 /* The epilogue loop can be entered either from the main loop or
5716 from an earlier guard block. */
5717 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5718 for (tree incoming_value : reduc_info->reduc_initial_values)
5720 /* Look for:
5722 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5723 INITIAL_VALUE(guard block)>. */
5724 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5726 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5727 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5729 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5730 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5732 main_loop_results.quick_push (from_main_loop);
5733 initial_values.quick_push (from_skip);
5736 else
5737 /* The main loop dominates the epilogue loop. */
5738 main_loop_results.splice (reduc_info->reduc_initial_values);
5740 /* See if the main loop has the kind of accumulator we need. */
5741 vect_reusable_accumulator *accumulator
5742 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5743 if (!accumulator
5744 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5745 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5746 accumulator->reduc_info->reduc_scalar_results.begin ()))
5747 return false;
5749 /* Handle the case where we can reduce wider vectors to narrower ones. */
5750 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5751 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5752 unsigned HOST_WIDE_INT m;
5753 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5754 TYPE_VECTOR_SUBPARTS (vectype), &m))
5755 return false;
5756 /* Check the intermediate vector types and operations are available. */
5757 tree prev_vectype = old_vectype;
5758 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5759 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5761 intermediate_nunits = exact_div (intermediate_nunits, 2);
5762 tree intermediate_vectype = get_related_vectype_for_scalar_type
5763 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5764 if (!intermediate_vectype
5765 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5766 intermediate_vectype)
5767 || !can_vec_extract (TYPE_MODE (prev_vectype),
5768 TYPE_MODE (intermediate_vectype)))
5769 return false;
5770 prev_vectype = intermediate_vectype;
5773 /* Non-SLP reductions might apply an adjustment after the reduction
5774 operation, in order to simplify the initialization of the accumulator.
5775 If the epilogue loop carries on from where the main loop left off,
5776 it should apply the same adjustment to the final reduction result.
5778 If the epilogue loop can also be entered directly (rather than via
5779 the main loop), we need to be able to handle that case in the same way,
5780 with the same adjustment. (In principle we could add a PHI node
5781 to select the correct adjustment, but in practice that shouldn't be
5782 necessary.) */
5783 tree main_adjustment
5784 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5785 if (loop_vinfo->main_loop_edge && main_adjustment)
5787 gcc_assert (num_phis == 1);
5788 tree initial_value = initial_values[0];
5789 /* Check that we can use INITIAL_VALUE as the adjustment and
5790 initialize the accumulator with a neutral value instead. */
5791 if (!operand_equal_p (initial_value, main_adjustment))
5792 return false;
5793 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5794 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5795 code, initial_value);
5797 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5798 reduc_info->reduc_initial_values.truncate (0);
5799 reduc_info->reduc_initial_values.splice (initial_values);
5800 reduc_info->reused_accumulator = accumulator;
5801 return true;
5804 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5805 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5807 static tree
5808 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5809 gimple_seq *seq)
5811 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5812 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5813 tree stype = TREE_TYPE (vectype);
5814 tree new_temp = vec_def;
5815 while (nunits > nunits1)
5817 nunits /= 2;
5818 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5819 stype, nunits);
5820 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5822 /* The target has to make sure we support lowpart/highpart
5823 extraction, either via direct vector extract or through
5824 an integer mode punning. */
5825 tree dst1, dst2;
5826 gimple *epilog_stmt;
5827 if (convert_optab_handler (vec_extract_optab,
5828 TYPE_MODE (TREE_TYPE (new_temp)),
5829 TYPE_MODE (vectype1))
5830 != CODE_FOR_nothing)
5832 /* Extract sub-vectors directly once vec_extract becomes
5833 a conversion optab. */
5834 dst1 = make_ssa_name (vectype1);
5835 epilog_stmt
5836 = gimple_build_assign (dst1, BIT_FIELD_REF,
5837 build3 (BIT_FIELD_REF, vectype1,
5838 new_temp, TYPE_SIZE (vectype1),
5839 bitsize_int (0)));
5840 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5841 dst2 = make_ssa_name (vectype1);
5842 epilog_stmt
5843 = gimple_build_assign (dst2, BIT_FIELD_REF,
5844 build3 (BIT_FIELD_REF, vectype1,
5845 new_temp, TYPE_SIZE (vectype1),
5846 bitsize_int (bitsize)));
5847 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5849 else
5851 /* Extract via punning to appropriately sized integer mode
5852 vector. */
5853 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5854 tree etype = build_vector_type (eltype, 2);
5855 gcc_assert (convert_optab_handler (vec_extract_optab,
5856 TYPE_MODE (etype),
5857 TYPE_MODE (eltype))
5858 != CODE_FOR_nothing);
5859 tree tem = make_ssa_name (etype);
5860 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5861 build1 (VIEW_CONVERT_EXPR,
5862 etype, new_temp));
5863 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5864 new_temp = tem;
5865 tem = make_ssa_name (eltype);
5866 epilog_stmt
5867 = gimple_build_assign (tem, BIT_FIELD_REF,
5868 build3 (BIT_FIELD_REF, eltype,
5869 new_temp, TYPE_SIZE (eltype),
5870 bitsize_int (0)));
5871 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5872 dst1 = make_ssa_name (vectype1);
5873 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5874 build1 (VIEW_CONVERT_EXPR,
5875 vectype1, tem));
5876 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5877 tem = make_ssa_name (eltype);
5878 epilog_stmt
5879 = gimple_build_assign (tem, BIT_FIELD_REF,
5880 build3 (BIT_FIELD_REF, eltype,
5881 new_temp, TYPE_SIZE (eltype),
5882 bitsize_int (bitsize)));
5883 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5884 dst2 = make_ssa_name (vectype1);
5885 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5886 build1 (VIEW_CONVERT_EXPR,
5887 vectype1, tem));
5888 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5891 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5894 return new_temp;
5897 /* Retrieves the definining statement to be used for a reduction.
5898 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5899 the reduction definitions. */
5901 tree
5902 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5903 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5904 vec <gimple *> &vec_stmts)
5906 tree def;
5908 if (slp_node)
5910 if (!main_exit_p)
5911 slp_node = slp_node_instance->reduc_phis;
5912 def = vect_get_slp_vect_def (slp_node, i);
5914 else
5916 if (!main_exit_p)
5917 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5918 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5919 def = gimple_get_lhs (vec_stmts[0]);
5922 return def;
5925 /* Function vect_create_epilog_for_reduction
5927 Create code at the loop-epilog to finalize the result of a reduction
5928 computation.
5930 STMT_INFO is the scalar reduction stmt that is being vectorized.
5931 SLP_NODE is an SLP node containing a group of reduction statements. The
5932 first one in this group is STMT_INFO.
5933 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5934 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5935 (counting from 0)
5936 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5937 exit this edge is always the main loop exit.
5939 This function:
5940 1. Completes the reduction def-use cycles.
5941 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5942 by calling the function specified by REDUC_FN if available, or by
5943 other means (whole-vector shifts or a scalar loop).
5944 The function also creates a new phi node at the loop exit to preserve
5945 loop-closed form, as illustrated below.
5947 The flow at the entry to this function:
5949 loop:
5950 vec_def = phi <vec_init, null> # REDUCTION_PHI
5951 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5952 s_loop = scalar_stmt # (scalar) STMT_INFO
5953 loop_exit:
5954 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5955 use <s_out0>
5956 use <s_out0>
5958 The above is transformed by this function into:
5960 loop:
5961 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5962 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5963 s_loop = scalar_stmt # (scalar) STMT_INFO
5964 loop_exit:
5965 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5966 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5967 v_out2 = reduce <v_out1>
5968 s_out3 = extract_field <v_out2, 0>
5969 s_out4 = adjust_result <s_out3>
5970 use <s_out4>
5971 use <s_out4>
5974 static void
5975 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5976 stmt_vec_info stmt_info,
5977 slp_tree slp_node,
5978 slp_instance slp_node_instance,
5979 edge loop_exit)
5981 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5982 gcc_assert (reduc_info->is_reduc_info);
5983 /* For double reductions we need to get at the inner loop reduction
5984 stmt which has the meta info attached. Our stmt_info is that of the
5985 loop-closed PHI of the inner loop which we remember as
5986 def for the reduction PHI generation. */
5987 bool double_reduc = false;
5988 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5989 stmt_vec_info rdef_info = stmt_info;
5990 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5992 gcc_assert (!slp_node);
5993 double_reduc = true;
5994 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5995 (stmt_info->stmt, 0));
5996 stmt_info = vect_stmt_to_vectorize (stmt_info);
5998 gphi *reduc_def_stmt
5999 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
6000 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6001 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6002 tree vectype;
6003 machine_mode mode;
6004 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6005 basic_block exit_bb;
6006 tree scalar_dest;
6007 tree scalar_type;
6008 gimple *new_phi = NULL, *phi = NULL;
6009 gimple_stmt_iterator exit_gsi;
6010 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6011 gimple *epilog_stmt = NULL;
6012 gimple *exit_phi;
6013 tree bitsize;
6014 tree def;
6015 tree orig_name, scalar_result;
6016 imm_use_iterator imm_iter, phi_imm_iter;
6017 use_operand_p use_p, phi_use_p;
6018 gimple *use_stmt;
6019 auto_vec<tree> reduc_inputs;
6020 int j, i;
6021 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6022 unsigned int group_size = 1, k;
6023 auto_vec<gimple *> phis;
6024 /* SLP reduction without reduction chain, e.g.,
6025 # a1 = phi <a2, a0>
6026 # b1 = phi <b2, b0>
6027 a2 = operation (a1)
6028 b2 = operation (b1) */
6029 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6030 bool direct_slp_reduc;
6031 tree induction_index = NULL_TREE;
6033 if (slp_node)
6034 group_size = SLP_TREE_LANES (slp_node);
6036 if (nested_in_vect_loop_p (loop, stmt_info))
6038 outer_loop = loop;
6039 loop = loop->inner;
6040 gcc_assert (!slp_node && double_reduc);
6043 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6044 gcc_assert (vectype);
6045 mode = TYPE_MODE (vectype);
6047 tree induc_val = NULL_TREE;
6048 tree adjustment_def = NULL;
6049 if (slp_node)
6051 else
6053 /* Optimize: for induction condition reduction, if we can't use zero
6054 for induc_val, use initial_def. */
6055 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6056 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6057 else if (double_reduc)
6059 else
6060 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6063 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6064 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6065 if (slp_reduc)
6066 /* All statements produce live-out values. */
6067 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6068 else if (slp_node)
6070 /* The last statement in the reduction chain produces the live-out
6071 value. Note SLP optimization can shuffle scalar stmts to
6072 optimize permutations so we have to search for the last stmt. */
6073 for (k = 0; k < group_size; ++k)
6074 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6076 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6077 break;
6081 unsigned vec_num;
6082 int ncopies;
6083 if (slp_node)
6085 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6086 ncopies = 1;
6088 else
6090 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6091 vec_num = 1;
6092 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6095 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6096 which is updated with the current index of the loop for every match of
6097 the original loop's cond_expr (VEC_STMT). This results in a vector
6098 containing the last time the condition passed for that vector lane.
6099 The first match will be a 1 to allow 0 to be used for non-matching
6100 indexes. If there are no matches at all then the vector will be all
6101 zeroes.
6103 PR92772: This algorithm is broken for architectures that support
6104 masked vectors, but do not provide fold_extract_last. */
6105 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6107 auto_vec<std::pair<tree, bool>, 2> ccompares;
6108 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6109 cond_info = vect_stmt_to_vectorize (cond_info);
6110 while (cond_info != reduc_info)
6112 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6114 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6115 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6116 ccompares.safe_push
6117 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6118 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6120 cond_info
6121 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6122 1 + STMT_VINFO_REDUC_IDX
6123 (cond_info)));
6124 cond_info = vect_stmt_to_vectorize (cond_info);
6126 gcc_assert (ccompares.length () != 0);
6128 tree indx_before_incr, indx_after_incr;
6129 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6130 int scalar_precision
6131 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6132 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6133 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6134 (TYPE_MODE (vectype), cr_index_scalar_type,
6135 TYPE_VECTOR_SUBPARTS (vectype));
6137 /* First we create a simple vector induction variable which starts
6138 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6139 vector size (STEP). */
6141 /* Create a {1,2,3,...} vector. */
6142 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6144 /* Create a vector of the step value. */
6145 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6146 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6148 /* Create an induction variable. */
6149 gimple_stmt_iterator incr_gsi;
6150 bool insert_after;
6151 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6152 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6153 insert_after, &indx_before_incr, &indx_after_incr);
6155 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6156 filled with zeros (VEC_ZERO). */
6158 /* Create a vector of 0s. */
6159 tree zero = build_zero_cst (cr_index_scalar_type);
6160 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6162 /* Create a vector phi node. */
6163 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6164 new_phi = create_phi_node (new_phi_tree, loop->header);
6165 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6166 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6168 /* Now take the condition from the loops original cond_exprs
6169 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6170 every match uses values from the induction variable
6171 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6172 (NEW_PHI_TREE).
6173 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6174 the new cond_expr (INDEX_COND_EXPR). */
6175 gimple_seq stmts = NULL;
6176 for (int i = ccompares.length () - 1; i != -1; --i)
6178 tree ccompare = ccompares[i].first;
6179 if (ccompares[i].second)
6180 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6181 cr_index_vector_type,
6182 ccompare,
6183 indx_before_incr, new_phi_tree);
6184 else
6185 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6186 cr_index_vector_type,
6187 ccompare,
6188 new_phi_tree, indx_before_incr);
6190 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6192 /* Update the phi with the vec cond. */
6193 induction_index = new_phi_tree;
6194 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6195 loop_latch_edge (loop), UNKNOWN_LOCATION);
6198 /* 2. Create epilog code.
6199 The reduction epilog code operates across the elements of the vector
6200 of partial results computed by the vectorized loop.
6201 The reduction epilog code consists of:
6203 step 1: compute the scalar result in a vector (v_out2)
6204 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6205 step 3: adjust the scalar result (s_out3) if needed.
6207 Step 1 can be accomplished using one the following three schemes:
6208 (scheme 1) using reduc_fn, if available.
6209 (scheme 2) using whole-vector shifts, if available.
6210 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6211 combined.
6213 The overall epilog code looks like this:
6215 s_out0 = phi <s_loop> # original EXIT_PHI
6216 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6217 v_out2 = reduce <v_out1> # step 1
6218 s_out3 = extract_field <v_out2, 0> # step 2
6219 s_out4 = adjust_result <s_out3> # step 3
6221 (step 3 is optional, and steps 1 and 2 may be combined).
6222 Lastly, the uses of s_out0 are replaced by s_out4. */
6225 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6226 v_out1 = phi <VECT_DEF>
6227 Store them in NEW_PHIS. */
6228 if (double_reduc)
6229 loop = outer_loop;
6230 /* We need to reduce values in all exits. */
6231 exit_bb = loop_exit->dest;
6232 exit_gsi = gsi_after_labels (exit_bb);
6233 reduc_inputs.create (slp_node ? vec_num : ncopies);
6234 vec <gimple *> vec_stmts = vNULL;
6235 for (unsigned i = 0; i < vec_num; i++)
6237 gimple_seq stmts = NULL;
6238 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6239 main_exit_p, i, vec_stmts);
6240 for (j = 0; j < ncopies; j++)
6242 tree new_def = copy_ssa_name (def);
6243 phi = create_phi_node (new_def, exit_bb);
6244 if (j)
6245 def = gimple_get_lhs (vec_stmts[j]);
6246 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6247 new_def = gimple_convert (&stmts, vectype, new_def);
6248 reduc_inputs.quick_push (new_def);
6250 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6253 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6254 (i.e. when reduc_fn is not available) and in the final adjustment
6255 code (if needed). Also get the original scalar reduction variable as
6256 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6257 represents a reduction pattern), the tree-code and scalar-def are
6258 taken from the original stmt that the pattern-stmt (STMT) replaces.
6259 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6260 are taken from STMT. */
6262 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6263 if (orig_stmt_info != stmt_info)
6265 /* Reduction pattern */
6266 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6267 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6270 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6271 scalar_type = TREE_TYPE (scalar_dest);
6272 scalar_results.truncate (0);
6273 scalar_results.reserve_exact (group_size);
6274 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6275 bitsize = TYPE_SIZE (scalar_type);
6277 /* True if we should implement SLP_REDUC using native reduction operations
6278 instead of scalar operations. */
6279 direct_slp_reduc = (reduc_fn != IFN_LAST
6280 && slp_reduc
6281 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6283 /* In case of reduction chain, e.g.,
6284 # a1 = phi <a3, a0>
6285 a2 = operation (a1)
6286 a3 = operation (a2),
6288 we may end up with more than one vector result. Here we reduce them
6289 to one vector.
6291 The same is true for a SLP reduction, e.g.,
6292 # a1 = phi <a2, a0>
6293 # b1 = phi <b2, b0>
6294 a2 = operation (a1)
6295 b2 = operation (a2),
6297 where we can end up with more than one vector as well. We can
6298 easily accumulate vectors when the number of vector elements is
6299 a multiple of the SLP group size.
6301 The same is true if we couldn't use a single defuse cycle. */
6302 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6303 || direct_slp_reduc
6304 || (slp_reduc
6305 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6306 || ncopies > 1)
6308 gimple_seq stmts = NULL;
6309 tree single_input = reduc_inputs[0];
6310 for (k = 1; k < reduc_inputs.length (); k++)
6311 single_input = gimple_build (&stmts, code, vectype,
6312 single_input, reduc_inputs[k]);
6313 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6315 reduc_inputs.truncate (0);
6316 reduc_inputs.safe_push (single_input);
6319 tree orig_reduc_input = reduc_inputs[0];
6321 /* If this loop is an epilogue loop that can be skipped after the
6322 main loop, we can only share a reduction operation between the
6323 main loop and the epilogue if we put it at the target of the
6324 skip edge.
6326 We can still reuse accumulators if this check fails. Doing so has
6327 the minor(?) benefit of making the epilogue loop's scalar result
6328 independent of the main loop's scalar result. */
6329 bool unify_with_main_loop_p = false;
6330 if (reduc_info->reused_accumulator
6331 && loop_vinfo->skip_this_loop_edge
6332 && single_succ_p (exit_bb)
6333 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6335 unify_with_main_loop_p = true;
6337 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6338 reduc_inputs[0] = make_ssa_name (vectype);
6339 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6340 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6341 UNKNOWN_LOCATION);
6342 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6343 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6344 exit_gsi = gsi_after_labels (reduc_block);
6347 /* Shouldn't be used beyond this point. */
6348 exit_bb = nullptr;
6350 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6351 && reduc_fn != IFN_LAST)
6353 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6354 various data values where the condition matched and another vector
6355 (INDUCTION_INDEX) containing all the indexes of those matches. We
6356 need to extract the last matching index (which will be the index with
6357 highest value) and use this to index into the data vector.
6358 For the case where there were no matches, the data vector will contain
6359 all default values and the index vector will be all zeros. */
6361 /* Get various versions of the type of the vector of indexes. */
6362 tree index_vec_type = TREE_TYPE (induction_index);
6363 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6364 tree index_scalar_type = TREE_TYPE (index_vec_type);
6365 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6367 /* Get an unsigned integer version of the type of the data vector. */
6368 int scalar_precision
6369 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6370 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6371 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6372 vectype);
6374 /* First we need to create a vector (ZERO_VEC) of zeros and another
6375 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6376 can create using a MAX reduction and then expanding.
6377 In the case where the loop never made any matches, the max index will
6378 be zero. */
6380 /* Vector of {0, 0, 0,...}. */
6381 tree zero_vec = build_zero_cst (vectype);
6383 /* Find maximum value from the vector of found indexes. */
6384 tree max_index = make_ssa_name (index_scalar_type);
6385 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6386 1, induction_index);
6387 gimple_call_set_lhs (max_index_stmt, max_index);
6388 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6390 /* Vector of {max_index, max_index, max_index,...}. */
6391 tree max_index_vec = make_ssa_name (index_vec_type);
6392 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6393 max_index);
6394 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6395 max_index_vec_rhs);
6396 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6398 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6399 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6400 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6401 otherwise. Only one value should match, resulting in a vector
6402 (VEC_COND) with one data value and the rest zeros.
6403 In the case where the loop never made any matches, every index will
6404 match, resulting in a vector with all data values (which will all be
6405 the default value). */
6407 /* Compare the max index vector to the vector of found indexes to find
6408 the position of the max value. */
6409 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6410 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6411 induction_index,
6412 max_index_vec);
6413 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6415 /* Use the compare to choose either values from the data vector or
6416 zero. */
6417 tree vec_cond = make_ssa_name (vectype);
6418 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6419 vec_compare,
6420 reduc_inputs[0],
6421 zero_vec);
6422 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6424 /* Finally we need to extract the data value from the vector (VEC_COND)
6425 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6426 reduction, but because this doesn't exist, we can use a MAX reduction
6427 instead. The data value might be signed or a float so we need to cast
6428 it first.
6429 In the case where the loop never made any matches, the data values are
6430 all identical, and so will reduce down correctly. */
6432 /* Make the matched data values unsigned. */
6433 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6434 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6435 vec_cond);
6436 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6437 VIEW_CONVERT_EXPR,
6438 vec_cond_cast_rhs);
6439 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6441 /* Reduce down to a scalar value. */
6442 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6443 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6444 1, vec_cond_cast);
6445 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6446 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6448 /* Convert the reduced value back to the result type and set as the
6449 result. */
6450 gimple_seq stmts = NULL;
6451 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6452 data_reduc);
6453 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6454 scalar_results.safe_push (new_temp);
6456 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6457 && reduc_fn == IFN_LAST)
6459 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6460 idx = 0;
6461 idx_val = induction_index[0];
6462 val = data_reduc[0];
6463 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6464 if (induction_index[i] > idx_val)
6465 val = data_reduc[i], idx_val = induction_index[i];
6466 return val; */
6468 tree data_eltype = TREE_TYPE (vectype);
6469 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6470 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6471 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6472 /* Enforced by vectorizable_reduction, which ensures we have target
6473 support before allowing a conditional reduction on variable-length
6474 vectors. */
6475 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6476 tree idx_val = NULL_TREE, val = NULL_TREE;
6477 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6479 tree old_idx_val = idx_val;
6480 tree old_val = val;
6481 idx_val = make_ssa_name (idx_eltype);
6482 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6483 build3 (BIT_FIELD_REF, idx_eltype,
6484 induction_index,
6485 bitsize_int (el_size),
6486 bitsize_int (off)));
6487 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6488 val = make_ssa_name (data_eltype);
6489 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6490 build3 (BIT_FIELD_REF,
6491 data_eltype,
6492 reduc_inputs[0],
6493 bitsize_int (el_size),
6494 bitsize_int (off)));
6495 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6496 if (off != 0)
6498 tree new_idx_val = idx_val;
6499 if (off != v_size - el_size)
6501 new_idx_val = make_ssa_name (idx_eltype);
6502 epilog_stmt = gimple_build_assign (new_idx_val,
6503 MAX_EXPR, idx_val,
6504 old_idx_val);
6505 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6507 tree cond = make_ssa_name (boolean_type_node);
6508 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6509 idx_val, old_idx_val);
6510 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6511 tree new_val = make_ssa_name (data_eltype);
6512 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6513 cond, val, old_val);
6514 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6515 idx_val = new_idx_val;
6516 val = new_val;
6519 /* Convert the reduced value back to the result type and set as the
6520 result. */
6521 gimple_seq stmts = NULL;
6522 val = gimple_convert (&stmts, scalar_type, val);
6523 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6524 scalar_results.safe_push (val);
6527 /* 2.3 Create the reduction code, using one of the three schemes described
6528 above. In SLP we simply need to extract all the elements from the
6529 vector (without reducing them), so we use scalar shifts. */
6530 else if (reduc_fn != IFN_LAST && !slp_reduc)
6532 tree tmp;
6533 tree vec_elem_type;
6535 /* Case 1: Create:
6536 v_out2 = reduc_expr <v_out1> */
6538 if (dump_enabled_p ())
6539 dump_printf_loc (MSG_NOTE, vect_location,
6540 "Reduce using direct vector reduction.\n");
6542 gimple_seq stmts = NULL;
6543 vec_elem_type = TREE_TYPE (vectype);
6544 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6545 vec_elem_type, reduc_inputs[0]);
6546 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6547 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6549 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6550 && induc_val)
6552 /* Earlier we set the initial value to be a vector if induc_val
6553 values. Check the result and if it is induc_val then replace
6554 with the original initial value, unless induc_val is
6555 the same as initial_def already. */
6556 tree zcompare = make_ssa_name (boolean_type_node);
6557 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6558 new_temp, induc_val);
6559 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6560 tree initial_def = reduc_info->reduc_initial_values[0];
6561 tmp = make_ssa_name (new_scalar_dest);
6562 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6563 initial_def, new_temp);
6564 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6565 new_temp = tmp;
6568 scalar_results.safe_push (new_temp);
6570 else if (direct_slp_reduc)
6572 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6573 with the elements for other SLP statements replaced with the
6574 neutral value. We can then do a normal reduction on each vector. */
6576 /* Enforced by vectorizable_reduction. */
6577 gcc_assert (reduc_inputs.length () == 1);
6578 gcc_assert (pow2p_hwi (group_size));
6580 gimple_seq seq = NULL;
6582 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6583 and the same element size as VECTYPE. */
6584 tree index = build_index_vector (vectype, 0, 1);
6585 tree index_type = TREE_TYPE (index);
6586 tree index_elt_type = TREE_TYPE (index_type);
6587 tree mask_type = truth_type_for (index_type);
6589 /* Create a vector that, for each element, identifies which of
6590 the REDUC_GROUP_SIZE results should use it. */
6591 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6592 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6593 build_vector_from_val (index_type, index_mask));
6595 /* Get a neutral vector value. This is simply a splat of the neutral
6596 scalar value if we have one, otherwise the initial scalar value
6597 is itself a neutral value. */
6598 tree vector_identity = NULL_TREE;
6599 tree neutral_op = NULL_TREE;
6600 if (slp_node)
6602 tree initial_value = NULL_TREE;
6603 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6604 initial_value = reduc_info->reduc_initial_values[0];
6605 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6606 initial_value, false);
6608 if (neutral_op)
6609 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6610 neutral_op);
6611 for (unsigned int i = 0; i < group_size; ++i)
6613 /* If there's no univeral neutral value, we can use the
6614 initial scalar value from the original PHI. This is used
6615 for MIN and MAX reduction, for example. */
6616 if (!neutral_op)
6618 tree scalar_value = reduc_info->reduc_initial_values[i];
6619 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6620 scalar_value);
6621 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6622 scalar_value);
6625 /* Calculate the equivalent of:
6627 sel[j] = (index[j] == i);
6629 which selects the elements of REDUC_INPUTS[0] that should
6630 be included in the result. */
6631 tree compare_val = build_int_cst (index_elt_type, i);
6632 compare_val = build_vector_from_val (index_type, compare_val);
6633 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6634 index, compare_val);
6636 /* Calculate the equivalent of:
6638 vec = seq ? reduc_inputs[0] : vector_identity;
6640 VEC is now suitable for a full vector reduction. */
6641 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6642 sel, reduc_inputs[0], vector_identity);
6644 /* Do the reduction and convert it to the appropriate type. */
6645 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6646 TREE_TYPE (vectype), vec);
6647 scalar = gimple_convert (&seq, scalar_type, scalar);
6648 scalar_results.safe_push (scalar);
6650 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6652 else
6654 bool reduce_with_shift;
6655 tree vec_temp;
6657 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6659 /* See if the target wants to do the final (shift) reduction
6660 in a vector mode of smaller size and first reduce upper/lower
6661 halves against each other. */
6662 enum machine_mode mode1 = mode;
6663 tree stype = TREE_TYPE (vectype);
6664 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6665 unsigned nunits1 = nunits;
6666 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6667 && reduc_inputs.length () == 1)
6669 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6670 /* For SLP reductions we have to make sure lanes match up, but
6671 since we're doing individual element final reduction reducing
6672 vector width here is even more important.
6673 ??? We can also separate lanes with permutes, for the common
6674 case of power-of-two group-size odd/even extracts would work. */
6675 if (slp_reduc && nunits != nunits1)
6677 nunits1 = least_common_multiple (nunits1, group_size);
6678 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6681 if (!slp_reduc
6682 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6683 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6685 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6686 stype, nunits1);
6687 reduce_with_shift = have_whole_vector_shift (mode1);
6688 if (!VECTOR_MODE_P (mode1)
6689 || !directly_supported_p (code, vectype1))
6690 reduce_with_shift = false;
6692 /* First reduce the vector to the desired vector size we should
6693 do shift reduction on by combining upper and lower halves. */
6694 gimple_seq stmts = NULL;
6695 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6696 code, &stmts);
6697 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6698 reduc_inputs[0] = new_temp;
6700 if (reduce_with_shift && !slp_reduc)
6702 int element_bitsize = tree_to_uhwi (bitsize);
6703 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6704 for variable-length vectors and also requires direct target support
6705 for loop reductions. */
6706 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6707 int nelements = vec_size_in_bits / element_bitsize;
6708 vec_perm_builder sel;
6709 vec_perm_indices indices;
6711 int elt_offset;
6713 tree zero_vec = build_zero_cst (vectype1);
6714 /* Case 2: Create:
6715 for (offset = nelements/2; offset >= 1; offset/=2)
6717 Create: va' = vec_shift <va, offset>
6718 Create: va = vop <va, va'>
6719 } */
6721 tree rhs;
6723 if (dump_enabled_p ())
6724 dump_printf_loc (MSG_NOTE, vect_location,
6725 "Reduce using vector shifts\n");
6727 gimple_seq stmts = NULL;
6728 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6729 for (elt_offset = nelements / 2;
6730 elt_offset >= 1;
6731 elt_offset /= 2)
6733 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6734 indices.new_vector (sel, 2, nelements);
6735 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6736 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6737 new_temp, zero_vec, mask);
6738 new_temp = gimple_build (&stmts, code,
6739 vectype1, new_name, new_temp);
6741 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6743 /* 2.4 Extract the final scalar result. Create:
6744 s_out3 = extract_field <v_out2, bitpos> */
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_NOTE, vect_location,
6748 "extract scalar result\n");
6750 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6751 bitsize, bitsize_zero_node);
6752 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6753 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6754 gimple_assign_set_lhs (epilog_stmt, new_temp);
6755 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6756 scalar_results.safe_push (new_temp);
6758 else
6760 /* Case 3: Create:
6761 s = extract_field <v_out2, 0>
6762 for (offset = element_size;
6763 offset < vector_size;
6764 offset += element_size;)
6766 Create: s' = extract_field <v_out2, offset>
6767 Create: s = op <s, s'> // For non SLP cases
6768 } */
6770 if (dump_enabled_p ())
6771 dump_printf_loc (MSG_NOTE, vect_location,
6772 "Reduce using scalar code.\n");
6774 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6775 int element_bitsize = tree_to_uhwi (bitsize);
6776 tree compute_type = TREE_TYPE (vectype);
6777 gimple_seq stmts = NULL;
6778 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6780 int bit_offset;
6781 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6782 vec_temp, bitsize, bitsize_zero_node);
6784 /* In SLP we don't need to apply reduction operation, so we just
6785 collect s' values in SCALAR_RESULTS. */
6786 if (slp_reduc)
6787 scalar_results.safe_push (new_temp);
6789 for (bit_offset = element_bitsize;
6790 bit_offset < vec_size_in_bits;
6791 bit_offset += element_bitsize)
6793 tree bitpos = bitsize_int (bit_offset);
6794 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6795 compute_type, vec_temp,
6796 bitsize, bitpos);
6797 if (slp_reduc)
6799 /* In SLP we don't need to apply reduction operation, so
6800 we just collect s' values in SCALAR_RESULTS. */
6801 new_temp = new_name;
6802 scalar_results.safe_push (new_name);
6804 else
6805 new_temp = gimple_build (&stmts, code, compute_type,
6806 new_name, new_temp);
6810 /* The only case where we need to reduce scalar results in SLP, is
6811 unrolling. If the size of SCALAR_RESULTS is greater than
6812 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6813 REDUC_GROUP_SIZE. */
6814 if (slp_reduc)
6816 tree res, first_res, new_res;
6818 /* Reduce multiple scalar results in case of SLP unrolling. */
6819 for (j = group_size; scalar_results.iterate (j, &res);
6820 j++)
6822 first_res = scalar_results[j % group_size];
6823 new_res = gimple_build (&stmts, code, compute_type,
6824 first_res, res);
6825 scalar_results[j % group_size] = new_res;
6827 scalar_results.truncate (group_size);
6828 for (k = 0; k < group_size; k++)
6829 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6830 scalar_results[k]);
6832 else
6834 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6835 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6836 scalar_results.safe_push (new_temp);
6839 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6842 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6843 && induc_val)
6845 /* Earlier we set the initial value to be a vector if induc_val
6846 values. Check the result and if it is induc_val then replace
6847 with the original initial value, unless induc_val is
6848 the same as initial_def already. */
6849 tree zcompare = make_ssa_name (boolean_type_node);
6850 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6851 induc_val);
6852 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6853 tree initial_def = reduc_info->reduc_initial_values[0];
6854 tree tmp = make_ssa_name (new_scalar_dest);
6855 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6856 initial_def, new_temp);
6857 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6858 scalar_results[0] = tmp;
6862 /* 2.5 Adjust the final result by the initial value of the reduction
6863 variable. (When such adjustment is not needed, then
6864 'adjustment_def' is zero). For example, if code is PLUS we create:
6865 new_temp = loop_exit_def + adjustment_def */
6867 if (adjustment_def)
6869 gcc_assert (!slp_reduc);
6870 gimple_seq stmts = NULL;
6871 if (double_reduc)
6873 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6874 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6875 new_temp = gimple_build (&stmts, code, vectype,
6876 reduc_inputs[0], adjustment_def);
6878 else
6880 new_temp = scalar_results[0];
6881 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6882 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6883 adjustment_def);
6884 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6885 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6886 new_temp, adjustment_def);
6887 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6890 epilog_stmt = gimple_seq_last_stmt (stmts);
6891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6892 scalar_results[0] = new_temp;
6895 /* Record this operation if it could be reused by the epilogue loop. */
6896 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6897 && reduc_inputs.length () == 1)
6898 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6899 { orig_reduc_input, reduc_info });
6901 if (double_reduc)
6902 loop = outer_loop;
6904 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6905 phis with new adjusted scalar results, i.e., replace use <s_out0>
6906 with use <s_out4>.
6908 Transform:
6909 loop_exit:
6910 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6911 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6912 v_out2 = reduce <v_out1>
6913 s_out3 = extract_field <v_out2, 0>
6914 s_out4 = adjust_result <s_out3>
6915 use <s_out0>
6916 use <s_out0>
6918 into:
6920 loop_exit:
6921 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6922 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6923 v_out2 = reduce <v_out1>
6924 s_out3 = extract_field <v_out2, 0>
6925 s_out4 = adjust_result <s_out3>
6926 use <s_out4>
6927 use <s_out4> */
6929 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6930 for (k = 0; k < live_out_stmts.size (); k++)
6932 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6933 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6935 phis.create (3);
6936 /* Find the loop-closed-use at the loop exit of the original scalar
6937 result. (The reduction result is expected to have two immediate uses,
6938 one at the latch block, and one at the loop exit). For double
6939 reductions we are looking for exit phis of the outer loop. */
6940 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6942 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6944 if (!is_gimple_debug (USE_STMT (use_p)))
6945 phis.safe_push (USE_STMT (use_p));
6947 else
6949 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6951 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6953 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6955 if (!flow_bb_inside_loop_p (loop,
6956 gimple_bb (USE_STMT (phi_use_p)))
6957 && !is_gimple_debug (USE_STMT (phi_use_p)))
6958 phis.safe_push (USE_STMT (phi_use_p));
6964 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6966 /* Replace the uses: */
6967 orig_name = PHI_RESULT (exit_phi);
6969 /* Look for a single use at the target of the skip edge. */
6970 if (unify_with_main_loop_p)
6972 use_operand_p use_p;
6973 gimple *user;
6974 if (!single_imm_use (orig_name, &use_p, &user))
6975 gcc_unreachable ();
6976 orig_name = gimple_get_lhs (user);
6979 scalar_result = scalar_results[k];
6980 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6982 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6983 SET_USE (use_p, scalar_result);
6984 update_stmt (use_stmt);
6988 phis.release ();
6992 /* Return a vector of type VECTYPE that is equal to the vector select
6993 operation "MASK ? VEC : IDENTITY". Insert the select statements
6994 before GSI. */
6996 static tree
6997 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6998 tree vec, tree identity)
7000 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7001 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7002 mask, vec, identity);
7003 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7004 return cond;
7007 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7008 order, starting with LHS. Insert the extraction statements before GSI and
7009 associate the new scalar SSA names with variable SCALAR_DEST.
7010 If MASK is nonzero mask the input and then operate on it unconditionally.
7011 Return the SSA name for the result. */
7013 static tree
7014 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7015 tree_code code, tree lhs, tree vector_rhs,
7016 tree mask)
7018 tree vectype = TREE_TYPE (vector_rhs);
7019 tree scalar_type = TREE_TYPE (vectype);
7020 tree bitsize = TYPE_SIZE (scalar_type);
7021 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7022 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7024 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7025 to perform an unconditional element-wise reduction of it. */
7026 if (mask)
7028 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7029 "masked_vector_rhs");
7030 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7031 false);
7032 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7033 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7034 mask, vector_rhs, vector_identity);
7035 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7036 vector_rhs = masked_vector_rhs;
7039 for (unsigned HOST_WIDE_INT bit_offset = 0;
7040 bit_offset < vec_size_in_bits;
7041 bit_offset += element_bitsize)
7043 tree bitpos = bitsize_int (bit_offset);
7044 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7045 bitsize, bitpos);
7047 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7048 rhs = make_ssa_name (scalar_dest, stmt);
7049 gimple_assign_set_lhs (stmt, rhs);
7050 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7052 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7053 tree new_name = make_ssa_name (scalar_dest, stmt);
7054 gimple_assign_set_lhs (stmt, new_name);
7055 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7056 lhs = new_name;
7058 return lhs;
7061 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7062 type of the vector input. */
7064 static internal_fn
7065 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7067 internal_fn mask_reduc_fn;
7068 internal_fn mask_len_reduc_fn;
7070 switch (reduc_fn)
7072 case IFN_FOLD_LEFT_PLUS:
7073 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7074 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7075 break;
7077 default:
7078 return IFN_LAST;
7081 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7082 OPTIMIZE_FOR_SPEED))
7083 return mask_reduc_fn;
7084 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7085 OPTIMIZE_FOR_SPEED))
7086 return mask_len_reduc_fn;
7087 return IFN_LAST;
7090 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7091 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7092 statement. CODE is the operation performed by STMT_INFO and OPS are
7093 its scalar operands. REDUC_INDEX is the index of the operand in
7094 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7095 implements in-order reduction, or IFN_LAST if we should open-code it.
7096 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7097 that should be used to control the operation in a fully-masked loop. */
7099 static bool
7100 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7101 stmt_vec_info stmt_info,
7102 gimple_stmt_iterator *gsi,
7103 gimple **vec_stmt, slp_tree slp_node,
7104 gimple *reduc_def_stmt,
7105 code_helper code, internal_fn reduc_fn,
7106 tree *ops, int num_ops, tree vectype_in,
7107 int reduc_index, vec_loop_masks *masks,
7108 vec_loop_lens *lens)
7110 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7111 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7112 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7114 int ncopies;
7115 if (slp_node)
7116 ncopies = 1;
7117 else
7118 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7120 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7121 gcc_assert (ncopies == 1);
7123 bool is_cond_op = false;
7124 if (!code.is_tree_code ())
7126 code = conditional_internal_fn_code (internal_fn (code));
7127 gcc_assert (code != ERROR_MARK);
7128 is_cond_op = true;
7131 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7133 if (slp_node)
7135 if (is_cond_op)
7137 if (dump_enabled_p ())
7138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7139 "fold-left reduction on SLP not supported.\n");
7140 return false;
7143 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7144 TYPE_VECTOR_SUBPARTS (vectype_in)));
7147 /* The operands either come from a binary operation or an IFN_COND operation.
7148 The former is a gimple assign with binary rhs and the latter is a
7149 gimple call with four arguments. */
7150 gcc_assert (num_ops == 2 || num_ops == 4);
7151 tree op0, opmask;
7152 if (!is_cond_op)
7153 op0 = ops[1 - reduc_index];
7154 else
7156 op0 = ops[2 + (1 - reduc_index)];
7157 opmask = ops[0];
7158 gcc_assert (!slp_node);
7161 int group_size = 1;
7162 stmt_vec_info scalar_dest_def_info;
7163 auto_vec<tree> vec_oprnds0, vec_opmask;
7164 if (slp_node)
7166 auto_vec<vec<tree> > vec_defs (2);
7167 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7168 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7169 vec_defs[0].release ();
7170 vec_defs[1].release ();
7171 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7172 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7174 else
7176 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7177 op0, &vec_oprnds0);
7178 scalar_dest_def_info = stmt_info;
7180 /* For an IFN_COND_OP we also need the vector mask operand. */
7181 if (is_cond_op)
7182 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7183 opmask, &vec_opmask);
7186 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7187 tree scalar_dest = gimple_get_lhs (sdef);
7188 tree scalar_type = TREE_TYPE (scalar_dest);
7189 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7191 int vec_num = vec_oprnds0.length ();
7192 gcc_assert (vec_num == 1 || slp_node);
7193 tree vec_elem_type = TREE_TYPE (vectype_out);
7194 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7196 tree vector_identity = NULL_TREE;
7197 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7199 vector_identity = build_zero_cst (vectype_out);
7200 if (!HONOR_SIGNED_ZEROS (vectype_out))
7202 else
7204 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7205 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7206 vector_identity);
7210 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7211 int i;
7212 tree def0;
7213 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7215 gimple *new_stmt;
7216 tree mask = NULL_TREE;
7217 tree len = NULL_TREE;
7218 tree bias = NULL_TREE;
7219 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7220 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7221 else if (is_cond_op)
7222 mask = vec_opmask[0];
7223 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7225 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7226 i, 1);
7227 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7228 bias = build_int_cst (intQI_type_node, biasval);
7229 if (!is_cond_op)
7230 mask = build_minus_one_cst (truth_type_for (vectype_in));
7233 /* Handle MINUS by adding the negative. */
7234 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7236 tree negated = make_ssa_name (vectype_out);
7237 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7238 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7239 def0 = negated;
7242 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7243 && mask && mask_reduc_fn == IFN_LAST)
7244 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7245 vector_identity);
7247 /* On the first iteration the input is simply the scalar phi
7248 result, and for subsequent iterations it is the output of
7249 the preceding operation. */
7250 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7252 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7253 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7254 def0, mask, len, bias);
7255 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7256 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7257 def0, mask);
7258 else
7259 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7260 def0);
7261 /* For chained SLP reductions the output of the previous reduction
7262 operation serves as the input of the next. For the final statement
7263 the output cannot be a temporary - we reuse the original
7264 scalar destination of the last statement. */
7265 if (i != vec_num - 1)
7267 gimple_set_lhs (new_stmt, scalar_dest_var);
7268 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7269 gimple_set_lhs (new_stmt, reduc_var);
7272 else
7274 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7275 tree_code (code), reduc_var, def0,
7276 mask);
7277 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7278 /* Remove the statement, so that we can use the same code paths
7279 as for statements that we've just created. */
7280 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7281 gsi_remove (&tmp_gsi, true);
7284 if (i == vec_num - 1)
7286 gimple_set_lhs (new_stmt, scalar_dest);
7287 vect_finish_replace_stmt (loop_vinfo,
7288 scalar_dest_def_info,
7289 new_stmt);
7291 else
7292 vect_finish_stmt_generation (loop_vinfo,
7293 scalar_dest_def_info,
7294 new_stmt, gsi);
7296 if (slp_node)
7297 slp_node->push_vec_def (new_stmt);
7298 else
7300 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7301 *vec_stmt = new_stmt;
7305 return true;
7308 /* Function is_nonwrapping_integer_induction.
7310 Check if STMT_VINO (which is part of loop LOOP) both increments and
7311 does not cause overflow. */
7313 static bool
7314 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7316 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7317 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7318 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7319 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7320 widest_int ni, max_loop_value, lhs_max;
7321 wi::overflow_type overflow = wi::OVF_NONE;
7323 /* Make sure the loop is integer based. */
7324 if (TREE_CODE (base) != INTEGER_CST
7325 || TREE_CODE (step) != INTEGER_CST)
7326 return false;
7328 /* Check that the max size of the loop will not wrap. */
7330 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7331 return true;
7333 if (! max_stmt_executions (loop, &ni))
7334 return false;
7336 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7337 &overflow);
7338 if (overflow)
7339 return false;
7341 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7342 TYPE_SIGN (lhs_type), &overflow);
7343 if (overflow)
7344 return false;
7346 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7347 <= TYPE_PRECISION (lhs_type));
7350 /* Check if masking can be supported by inserting a conditional expression.
7351 CODE is the code for the operation. COND_FN is the conditional internal
7352 function, if it exists. VECTYPE_IN is the type of the vector input. */
7353 static bool
7354 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7355 tree vectype_in)
7357 if (cond_fn != IFN_LAST
7358 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7359 OPTIMIZE_FOR_SPEED))
7360 return false;
7362 if (code.is_tree_code ())
7363 switch (tree_code (code))
7365 case DOT_PROD_EXPR:
7366 case SAD_EXPR:
7367 return true;
7369 default:
7370 break;
7372 return false;
7375 /* Insert a conditional expression to enable masked vectorization. CODE is the
7376 code for the operation. VOP is the array of operands. MASK is the loop
7377 mask. GSI is a statement iterator used to place the new conditional
7378 expression. */
7379 static void
7380 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7381 gimple_stmt_iterator *gsi)
7383 switch (tree_code (code))
7385 case DOT_PROD_EXPR:
7387 tree vectype = TREE_TYPE (vop[1]);
7388 tree zero = build_zero_cst (vectype);
7389 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7390 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7391 mask, vop[1], zero);
7392 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7393 vop[1] = masked_op1;
7394 break;
7397 case SAD_EXPR:
7399 tree vectype = TREE_TYPE (vop[1]);
7400 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7401 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7402 mask, vop[1], vop[0]);
7403 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7404 vop[1] = masked_op1;
7405 break;
7408 default:
7409 gcc_unreachable ();
7413 /* Function vectorizable_reduction.
7415 Check if STMT_INFO performs a reduction operation that can be vectorized.
7416 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7417 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7418 Return true if STMT_INFO is vectorizable in this way.
7420 This function also handles reduction idioms (patterns) that have been
7421 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7422 may be of this form:
7423 X = pattern_expr (arg0, arg1, ..., X)
7424 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7425 sequence that had been detected and replaced by the pattern-stmt
7426 (STMT_INFO).
7428 This function also handles reduction of condition expressions, for example:
7429 for (int i = 0; i < N; i++)
7430 if (a[i] < value)
7431 last = a[i];
7432 This is handled by vectorising the loop and creating an additional vector
7433 containing the loop indexes for which "a[i] < value" was true. In the
7434 function epilogue this is reduced to a single max value and then used to
7435 index into the vector of results.
7437 In some cases of reduction patterns, the type of the reduction variable X is
7438 different than the type of the other arguments of STMT_INFO.
7439 In such cases, the vectype that is used when transforming STMT_INFO into
7440 a vector stmt is different than the vectype that is used to determine the
7441 vectorization factor, because it consists of a different number of elements
7442 than the actual number of elements that are being operated upon in parallel.
7444 For example, consider an accumulation of shorts into an int accumulator.
7445 On some targets it's possible to vectorize this pattern operating on 8
7446 shorts at a time (hence, the vectype for purposes of determining the
7447 vectorization factor should be V8HI); on the other hand, the vectype that
7448 is used to create the vector form is actually V4SI (the type of the result).
7450 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7451 indicates what is the actual level of parallelism (V8HI in the example), so
7452 that the right vectorization factor would be derived. This vectype
7453 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7454 be used to create the vectorized stmt. The right vectype for the vectorized
7455 stmt is obtained from the type of the result X:
7456 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7458 This means that, contrary to "regular" reductions (or "regular" stmts in
7459 general), the following equation:
7460 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7461 does *NOT* necessarily hold for reduction patterns. */
7463 bool
7464 vectorizable_reduction (loop_vec_info loop_vinfo,
7465 stmt_vec_info stmt_info, slp_tree slp_node,
7466 slp_instance slp_node_instance,
7467 stmt_vector_for_cost *cost_vec)
7469 tree vectype_in = NULL_TREE;
7470 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7471 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7472 stmt_vec_info cond_stmt_vinfo = NULL;
7473 int i;
7474 int ncopies;
7475 bool single_defuse_cycle = false;
7476 bool nested_cycle = false;
7477 bool double_reduc = false;
7478 int vec_num;
7479 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7480 tree cond_reduc_val = NULL_TREE;
7482 /* Make sure it was already recognized as a reduction computation. */
7483 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7484 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7485 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7486 return false;
7488 /* The stmt we store reduction analysis meta on. */
7489 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7490 reduc_info->is_reduc_info = true;
7492 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7494 if (is_a <gphi *> (stmt_info->stmt))
7496 if (slp_node)
7498 /* We eventually need to set a vector type on invariant
7499 arguments. */
7500 unsigned j;
7501 slp_tree child;
7502 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7503 if (!vect_maybe_update_slp_op_vectype
7504 (child, SLP_TREE_VECTYPE (slp_node)))
7506 if (dump_enabled_p ())
7507 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7508 "incompatible vector types for "
7509 "invariants\n");
7510 return false;
7513 /* Analysis for double-reduction is done on the outer
7514 loop PHI, nested cycles have no further restrictions. */
7515 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7517 else
7518 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7519 return true;
7522 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7523 stmt_vec_info phi_info = stmt_info;
7524 if (!is_a <gphi *> (stmt_info->stmt))
7526 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7527 return true;
7529 if (slp_node)
7531 slp_node_instance->reduc_phis = slp_node;
7532 /* ??? We're leaving slp_node to point to the PHIs, we only
7533 need it to get at the number of vector stmts which wasn't
7534 yet initialized for the instance root. */
7536 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7538 use_operand_p use_p;
7539 gimple *use_stmt;
7540 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7541 &use_p, &use_stmt);
7542 gcc_assert (res);
7543 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7546 /* PHIs should not participate in patterns. */
7547 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7548 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7550 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7551 and compute the reduction chain length. Discover the real
7552 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7553 tree reduc_def
7554 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7555 loop_latch_edge
7556 (gimple_bb (reduc_def_phi)->loop_father));
7557 unsigned reduc_chain_length = 0;
7558 bool only_slp_reduc_chain = true;
7559 stmt_info = NULL;
7560 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7561 while (reduc_def != PHI_RESULT (reduc_def_phi))
7563 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7564 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7565 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7567 if (dump_enabled_p ())
7568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7569 "reduction chain broken by patterns.\n");
7570 return false;
7572 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7573 only_slp_reduc_chain = false;
7574 /* For epilogue generation live members of the chain need
7575 to point back to the PHI via their original stmt for
7576 info_for_reduction to work. For SLP we need to look at
7577 all lanes here - even though we only will vectorize from
7578 the SLP node with live lane zero the other live lanes also
7579 need to be identified as part of a reduction to be able
7580 to skip code generation for them. */
7581 if (slp_for_stmt_info)
7583 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7584 if (STMT_VINFO_LIVE_P (s))
7585 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7587 else if (STMT_VINFO_LIVE_P (vdef))
7588 STMT_VINFO_REDUC_DEF (def) = phi_info;
7589 gimple_match_op op;
7590 if (!gimple_extract_op (vdef->stmt, &op))
7592 if (dump_enabled_p ())
7593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7594 "reduction chain includes unsupported"
7595 " statement type.\n");
7596 return false;
7598 if (CONVERT_EXPR_CODE_P (op.code))
7600 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7602 if (dump_enabled_p ())
7603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 "conversion in the reduction chain.\n");
7605 return false;
7608 else if (!stmt_info)
7609 /* First non-conversion stmt. */
7610 stmt_info = vdef;
7611 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7612 reduc_chain_length++;
7613 if (!stmt_info && slp_node)
7614 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7616 /* PHIs should not participate in patterns. */
7617 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7619 if (nested_in_vect_loop_p (loop, stmt_info))
7621 loop = loop->inner;
7622 nested_cycle = true;
7625 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7626 element. */
7627 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7629 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7630 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7632 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7633 gcc_assert (slp_node
7634 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7636 /* 1. Is vectorizable reduction? */
7637 /* Not supportable if the reduction variable is used in the loop, unless
7638 it's a reduction chain. */
7639 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7640 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7641 return false;
7643 /* Reductions that are not used even in an enclosing outer-loop,
7644 are expected to be "live" (used out of the loop). */
7645 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7646 && !STMT_VINFO_LIVE_P (stmt_info))
7647 return false;
7649 /* 2. Has this been recognized as a reduction pattern?
7651 Check if STMT represents a pattern that has been recognized
7652 in earlier analysis stages. For stmts that represent a pattern,
7653 the STMT_VINFO_RELATED_STMT field records the last stmt in
7654 the original sequence that constitutes the pattern. */
7656 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7657 if (orig_stmt_info)
7659 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7660 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7663 /* 3. Check the operands of the operation. The first operands are defined
7664 inside the loop body. The last operand is the reduction variable,
7665 which is defined by the loop-header-phi. */
7667 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7668 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7669 gimple_match_op op;
7670 if (!gimple_extract_op (stmt_info->stmt, &op))
7671 gcc_unreachable ();
7672 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7673 || op.code == WIDEN_SUM_EXPR
7674 || op.code == SAD_EXPR);
7676 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7677 && !SCALAR_FLOAT_TYPE_P (op.type))
7678 return false;
7680 /* Do not try to vectorize bit-precision reductions. */
7681 if (!type_has_mode_precision_p (op.type))
7682 return false;
7684 /* For lane-reducing ops we're reducing the number of reduction PHIs
7685 which means the only use of that may be in the lane-reducing operation. */
7686 if (lane_reduc_code_p
7687 && reduc_chain_length != 1
7688 && !only_slp_reduc_chain)
7690 if (dump_enabled_p ())
7691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7692 "lane-reducing reduction with extra stmts.\n");
7693 return false;
7696 /* All uses but the last are expected to be defined in the loop.
7697 The last use is the reduction variable. In case of nested cycle this
7698 assumption is not true: we use reduc_index to record the index of the
7699 reduction variable. */
7700 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7701 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7702 /* We need to skip an extra operand for COND_EXPRs with embedded
7703 comparison. */
7704 unsigned opno_adjust = 0;
7705 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7706 opno_adjust = 1;
7707 for (i = 0; i < (int) op.num_ops; i++)
7709 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7710 if (i == 0 && op.code == COND_EXPR)
7711 continue;
7713 stmt_vec_info def_stmt_info;
7714 enum vect_def_type dt;
7715 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7716 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7717 &vectype_op[i], &def_stmt_info))
7719 if (dump_enabled_p ())
7720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7721 "use not simple.\n");
7722 return false;
7724 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7725 continue;
7727 /* For an IFN_COND_OP we might hit the reduction definition operand
7728 twice (once as definition, once as else). */
7729 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7730 continue;
7732 /* There should be only one cycle def in the stmt, the one
7733 leading to reduc_def. */
7734 if (VECTORIZABLE_CYCLE_DEF (dt))
7735 return false;
7737 if (!vectype_op[i])
7738 vectype_op[i]
7739 = get_vectype_for_scalar_type (loop_vinfo,
7740 TREE_TYPE (op.ops[i]), slp_op[i]);
7742 /* To properly compute ncopies we are interested in the widest
7743 non-reduction input type in case we're looking at a widening
7744 accumulation that we later handle in vect_transform_reduction. */
7745 if (lane_reduc_code_p
7746 && vectype_op[i]
7747 && (!vectype_in
7748 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7749 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7750 vectype_in = vectype_op[i];
7752 if (op.code == COND_EXPR)
7754 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7755 if (dt == vect_constant_def)
7757 cond_reduc_dt = dt;
7758 cond_reduc_val = op.ops[i];
7760 if (dt == vect_induction_def
7761 && def_stmt_info
7762 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7764 cond_reduc_dt = dt;
7765 cond_stmt_vinfo = def_stmt_info;
7769 if (!vectype_in)
7770 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7771 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7773 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7774 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7775 /* If we have a condition reduction, see if we can simplify it further. */
7776 if (v_reduc_type == COND_REDUCTION)
7778 if (slp_node)
7779 return false;
7781 /* When the condition uses the reduction value in the condition, fail. */
7782 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7784 if (dump_enabled_p ())
7785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7786 "condition depends on previous iteration\n");
7787 return false;
7790 if (reduc_chain_length == 1
7791 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7792 OPTIMIZE_FOR_SPEED)
7793 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7794 vectype_in,
7795 OPTIMIZE_FOR_SPEED)))
7797 if (dump_enabled_p ())
7798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7799 "optimizing condition reduction with"
7800 " FOLD_EXTRACT_LAST.\n");
7801 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7803 else if (cond_reduc_dt == vect_induction_def)
7805 tree base
7806 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7807 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7809 gcc_assert (TREE_CODE (base) == INTEGER_CST
7810 && TREE_CODE (step) == INTEGER_CST);
7811 cond_reduc_val = NULL_TREE;
7812 enum tree_code cond_reduc_op_code = ERROR_MARK;
7813 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7814 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7816 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7817 above base; punt if base is the minimum value of the type for
7818 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7819 else if (tree_int_cst_sgn (step) == -1)
7821 cond_reduc_op_code = MIN_EXPR;
7822 if (tree_int_cst_sgn (base) == -1)
7823 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7824 else if (tree_int_cst_lt (base,
7825 TYPE_MAX_VALUE (TREE_TYPE (base))))
7826 cond_reduc_val
7827 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7829 else
7831 cond_reduc_op_code = MAX_EXPR;
7832 if (tree_int_cst_sgn (base) == 1)
7833 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7834 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7835 base))
7836 cond_reduc_val
7837 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7839 if (cond_reduc_val)
7841 if (dump_enabled_p ())
7842 dump_printf_loc (MSG_NOTE, vect_location,
7843 "condition expression based on "
7844 "integer induction.\n");
7845 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7846 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7847 = cond_reduc_val;
7848 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7851 else if (cond_reduc_dt == vect_constant_def)
7853 enum vect_def_type cond_initial_dt;
7854 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7855 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7856 if (cond_initial_dt == vect_constant_def
7857 && types_compatible_p (TREE_TYPE (cond_initial_val),
7858 TREE_TYPE (cond_reduc_val)))
7860 tree e = fold_binary (LE_EXPR, boolean_type_node,
7861 cond_initial_val, cond_reduc_val);
7862 if (e && (integer_onep (e) || integer_zerop (e)))
7864 if (dump_enabled_p ())
7865 dump_printf_loc (MSG_NOTE, vect_location,
7866 "condition expression based on "
7867 "compile time constant.\n");
7868 /* Record reduction code at analysis stage. */
7869 STMT_VINFO_REDUC_CODE (reduc_info)
7870 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7871 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7877 if (STMT_VINFO_LIVE_P (phi_info))
7878 return false;
7880 if (slp_node)
7881 ncopies = 1;
7882 else
7883 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7885 gcc_assert (ncopies >= 1);
7887 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7889 if (nested_cycle)
7891 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7892 == vect_double_reduction_def);
7893 double_reduc = true;
7896 /* 4.2. Check support for the epilog operation.
7898 If STMT represents a reduction pattern, then the type of the
7899 reduction variable may be different than the type of the rest
7900 of the arguments. For example, consider the case of accumulation
7901 of shorts into an int accumulator; The original code:
7902 S1: int_a = (int) short_a;
7903 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7905 was replaced with:
7906 STMT: int_acc = widen_sum <short_a, int_acc>
7908 This means that:
7909 1. The tree-code that is used to create the vector operation in the
7910 epilog code (that reduces the partial results) is not the
7911 tree-code of STMT, but is rather the tree-code of the original
7912 stmt from the pattern that STMT is replacing. I.e, in the example
7913 above we want to use 'widen_sum' in the loop, but 'plus' in the
7914 epilog.
7915 2. The type (mode) we use to check available target support
7916 for the vector operation to be created in the *epilog*, is
7917 determined by the type of the reduction variable (in the example
7918 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7919 However the type (mode) we use to check available target support
7920 for the vector operation to be created *inside the loop*, is
7921 determined by the type of the other arguments to STMT (in the
7922 example we'd check this: optab_handler (widen_sum_optab,
7923 vect_short_mode)).
7925 This is contrary to "regular" reductions, in which the types of all
7926 the arguments are the same as the type of the reduction variable.
7927 For "regular" reductions we can therefore use the same vector type
7928 (and also the same tree-code) when generating the epilog code and
7929 when generating the code inside the loop. */
7931 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7933 /* If conversion might have created a conditional operation like
7934 IFN_COND_ADD already. Use the internal code for the following checks. */
7935 if (orig_code.is_internal_fn ())
7937 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7938 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7941 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7943 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7944 if (reduction_type == TREE_CODE_REDUCTION)
7946 /* Check whether it's ok to change the order of the computation.
7947 Generally, when vectorizing a reduction we change the order of the
7948 computation. This may change the behavior of the program in some
7949 cases, so we need to check that this is ok. One exception is when
7950 vectorizing an outer-loop: the inner-loop is executed sequentially,
7951 and therefore vectorizing reductions in the inner-loop during
7952 outer-loop vectorization is safe. Likewise when we are vectorizing
7953 a series of reductions using SLP and the VF is one the reductions
7954 are performed in scalar order. */
7955 if (slp_node
7956 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7957 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7959 else if (needs_fold_left_reduction_p (op.type, orig_code))
7961 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7962 is not directy used in stmt. */
7963 if (!only_slp_reduc_chain
7964 && reduc_chain_length != 1)
7966 if (dump_enabled_p ())
7967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7968 "in-order reduction chain without SLP.\n");
7969 return false;
7971 STMT_VINFO_REDUC_TYPE (reduc_info)
7972 = reduction_type = FOLD_LEFT_REDUCTION;
7974 else if (!commutative_binary_op_p (orig_code, op.type)
7975 || !associative_binary_op_p (orig_code, op.type))
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "reduction: not commutative/associative\n");
7980 return false;
7984 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7985 && ncopies > 1)
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989 "multiple types in double reduction or condition "
7990 "reduction or fold-left reduction.\n");
7991 return false;
7994 internal_fn reduc_fn = IFN_LAST;
7995 if (reduction_type == TREE_CODE_REDUCTION
7996 || reduction_type == FOLD_LEFT_REDUCTION
7997 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7998 || reduction_type == CONST_COND_REDUCTION)
8000 if (reduction_type == FOLD_LEFT_REDUCTION
8001 ? fold_left_reduction_fn (orig_code, &reduc_fn)
8002 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8004 if (reduc_fn != IFN_LAST
8005 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8006 OPTIMIZE_FOR_SPEED))
8008 if (dump_enabled_p ())
8009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8010 "reduc op not supported by target.\n");
8012 reduc_fn = IFN_LAST;
8015 else
8017 if (!nested_cycle || double_reduc)
8019 if (dump_enabled_p ())
8020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8021 "no reduc code for scalar code.\n");
8023 return false;
8027 else if (reduction_type == COND_REDUCTION)
8029 int scalar_precision
8030 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8031 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8032 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8033 vectype_out);
8035 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8036 OPTIMIZE_FOR_SPEED))
8037 reduc_fn = IFN_REDUC_MAX;
8039 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8041 if (reduction_type != EXTRACT_LAST_REDUCTION
8042 && (!nested_cycle || double_reduc)
8043 && reduc_fn == IFN_LAST
8044 && !nunits_out.is_constant ())
8046 if (dump_enabled_p ())
8047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8048 "missing target support for reduction on"
8049 " variable-length vectors.\n");
8050 return false;
8053 /* For SLP reductions, see if there is a neutral value we can use. */
8054 tree neutral_op = NULL_TREE;
8055 if (slp_node)
8057 tree initial_value = NULL_TREE;
8058 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8059 initial_value = vect_phi_initial_value (reduc_def_phi);
8060 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8061 orig_code, initial_value);
8064 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8066 /* We can't support in-order reductions of code such as this:
8068 for (int i = 0; i < n1; ++i)
8069 for (int j = 0; j < n2; ++j)
8070 l += a[j];
8072 since GCC effectively transforms the loop when vectorizing:
8074 for (int i = 0; i < n1 / VF; ++i)
8075 for (int j = 0; j < n2; ++j)
8076 for (int k = 0; k < VF; ++k)
8077 l += a[j];
8079 which is a reassociation of the original operation. */
8080 if (dump_enabled_p ())
8081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8082 "in-order double reduction not supported.\n");
8084 return false;
8087 if (reduction_type == FOLD_LEFT_REDUCTION
8088 && slp_node
8089 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8091 /* We cannot use in-order reductions in this case because there is
8092 an implicit reassociation of the operations involved. */
8093 if (dump_enabled_p ())
8094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8095 "in-order unchained SLP reductions not supported.\n");
8096 return false;
8099 /* For double reductions, and for SLP reductions with a neutral value,
8100 we construct a variable-length initial vector by loading a vector
8101 full of the neutral value and then shift-and-inserting the start
8102 values into the low-numbered elements. */
8103 if ((double_reduc || neutral_op)
8104 && !nunits_out.is_constant ()
8105 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8106 vectype_out, OPTIMIZE_FOR_SPEED))
8108 if (dump_enabled_p ())
8109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8110 "reduction on variable-length vectors requires"
8111 " target support for a vector-shift-and-insert"
8112 " operation.\n");
8113 return false;
8116 /* Check extra constraints for variable-length unchained SLP reductions. */
8117 if (slp_node
8118 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8119 && !nunits_out.is_constant ())
8121 /* We checked above that we could build the initial vector when
8122 there's a neutral element value. Check here for the case in
8123 which each SLP statement has its own initial value and in which
8124 that value needs to be repeated for every instance of the
8125 statement within the initial vector. */
8126 unsigned int group_size = SLP_TREE_LANES (slp_node);
8127 if (!neutral_op
8128 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8129 TREE_TYPE (vectype_out)))
8131 if (dump_enabled_p ())
8132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8133 "unsupported form of SLP reduction for"
8134 " variable-length vectors: cannot build"
8135 " initial vector.\n");
8136 return false;
8138 /* The epilogue code relies on the number of elements being a multiple
8139 of the group size. The duplicate-and-interleave approach to setting
8140 up the initial vector does too. */
8141 if (!multiple_p (nunits_out, group_size))
8143 if (dump_enabled_p ())
8144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8145 "unsupported form of SLP reduction for"
8146 " variable-length vectors: the vector size"
8147 " is not a multiple of the number of results.\n");
8148 return false;
8152 if (reduction_type == COND_REDUCTION)
8154 widest_int ni;
8156 if (! max_loop_iterations (loop, &ni))
8158 if (dump_enabled_p ())
8159 dump_printf_loc (MSG_NOTE, vect_location,
8160 "loop count not known, cannot create cond "
8161 "reduction.\n");
8162 return false;
8164 /* Convert backedges to iterations. */
8165 ni += 1;
8167 /* The additional index will be the same type as the condition. Check
8168 that the loop can fit into this less one (because we'll use up the
8169 zero slot for when there are no matches). */
8170 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8171 if (wi::geu_p (ni, wi::to_widest (max_index)))
8173 if (dump_enabled_p ())
8174 dump_printf_loc (MSG_NOTE, vect_location,
8175 "loop size is greater than data size.\n");
8176 return false;
8180 /* In case the vectorization factor (VF) is bigger than the number
8181 of elements that we can fit in a vectype (nunits), we have to generate
8182 more than one vector stmt - i.e - we need to "unroll" the
8183 vector stmt by a factor VF/nunits. For more details see documentation
8184 in vectorizable_operation. */
8186 /* If the reduction is used in an outer loop we need to generate
8187 VF intermediate results, like so (e.g. for ncopies=2):
8188 r0 = phi (init, r0)
8189 r1 = phi (init, r1)
8190 r0 = x0 + r0;
8191 r1 = x1 + r1;
8192 (i.e. we generate VF results in 2 registers).
8193 In this case we have a separate def-use cycle for each copy, and therefore
8194 for each copy we get the vector def for the reduction variable from the
8195 respective phi node created for this copy.
8197 Otherwise (the reduction is unused in the loop nest), we can combine
8198 together intermediate results, like so (e.g. for ncopies=2):
8199 r = phi (init, r)
8200 r = x0 + r;
8201 r = x1 + r;
8202 (i.e. we generate VF/2 results in a single register).
8203 In this case for each copy we get the vector def for the reduction variable
8204 from the vectorized reduction operation generated in the previous iteration.
8206 This only works when we see both the reduction PHI and its only consumer
8207 in vectorizable_reduction and there are no intermediate stmts
8208 participating. When unrolling we want each unrolled iteration to have its
8209 own reduction accumulator since one of the main goals of unrolling a
8210 reduction is to reduce the aggregate loop-carried latency. */
8211 if (ncopies > 1
8212 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8213 && reduc_chain_length == 1
8214 && loop_vinfo->suggested_unroll_factor == 1)
8215 single_defuse_cycle = true;
8217 if (single_defuse_cycle || lane_reduc_code_p)
8219 gcc_assert (op.code != COND_EXPR);
8221 /* 4. Supportable by target? */
8222 bool ok = true;
8224 /* 4.1. check support for the operation in the loop
8226 This isn't necessary for the lane reduction codes, since they
8227 can only be produced by pattern matching, and it's up to the
8228 pattern matcher to test for support. The main reason for
8229 specifically skipping this step is to avoid rechecking whether
8230 mixed-sign dot-products can be implemented using signed
8231 dot-products. */
8232 machine_mode vec_mode = TYPE_MODE (vectype_in);
8233 if (!lane_reduc_code_p
8234 && !directly_supported_p (op.code, vectype_in, optab_vector))
8236 if (dump_enabled_p ())
8237 dump_printf (MSG_NOTE, "op not supported by target.\n");
8238 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8239 || !vect_can_vectorize_without_simd_p (op.code))
8240 ok = false;
8241 else
8242 if (dump_enabled_p ())
8243 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8246 if (vect_emulated_vector_p (vectype_in)
8247 && !vect_can_vectorize_without_simd_p (op.code))
8249 if (dump_enabled_p ())
8250 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8251 return false;
8254 /* lane-reducing operations have to go through vect_transform_reduction.
8255 For the other cases try without the single cycle optimization. */
8256 if (!ok)
8258 if (lane_reduc_code_p)
8259 return false;
8260 else
8261 single_defuse_cycle = false;
8264 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8266 /* If the reduction stmt is one of the patterns that have lane
8267 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8268 if ((ncopies > 1 && ! single_defuse_cycle)
8269 && lane_reduc_code_p)
8271 if (dump_enabled_p ())
8272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273 "multi def-use cycle not possible for lane-reducing "
8274 "reduction operation\n");
8275 return false;
8278 if (slp_node
8279 && !(!single_defuse_cycle
8280 && !lane_reduc_code_p
8281 && reduction_type != FOLD_LEFT_REDUCTION))
8282 for (i = 0; i < (int) op.num_ops; i++)
8283 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8285 if (dump_enabled_p ())
8286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8287 "incompatible vector types for invariants\n");
8288 return false;
8291 if (slp_node)
8292 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8293 else
8294 vec_num = 1;
8296 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8297 reduction_type, ncopies, cost_vec);
8298 /* Cost the reduction op inside the loop if transformed via
8299 vect_transform_reduction. Otherwise this is costed by the
8300 separate vectorizable_* routines. */
8301 if (single_defuse_cycle || lane_reduc_code_p)
8303 int factor = 1;
8304 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8305 /* Three dot-products and a subtraction. */
8306 factor = 4;
8307 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8308 stmt_info, 0, vect_body);
8311 if (dump_enabled_p ()
8312 && reduction_type == FOLD_LEFT_REDUCTION)
8313 dump_printf_loc (MSG_NOTE, vect_location,
8314 "using an in-order (fold-left) reduction.\n");
8315 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8316 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8317 reductions go through their own vectorizable_* routines. */
8318 if (!single_defuse_cycle
8319 && !lane_reduc_code_p
8320 && reduction_type != FOLD_LEFT_REDUCTION)
8322 stmt_vec_info tem
8323 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8324 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8326 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8327 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8329 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8330 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8332 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8334 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8335 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8336 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8338 if (reduction_type != FOLD_LEFT_REDUCTION
8339 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8340 && (cond_fn == IFN_LAST
8341 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8342 OPTIMIZE_FOR_SPEED)))
8344 if (dump_enabled_p ())
8345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8346 "can't operate on partial vectors because"
8347 " no conditional operation is available.\n");
8348 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8350 else if (reduction_type == FOLD_LEFT_REDUCTION
8351 && reduc_fn == IFN_LAST
8352 && !expand_vec_cond_expr_p (vectype_in,
8353 truth_type_for (vectype_in),
8354 SSA_NAME))
8356 if (dump_enabled_p ())
8357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8358 "can't operate on partial vectors because"
8359 " no conditional operation is available.\n");
8360 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8362 else if (reduction_type == FOLD_LEFT_REDUCTION
8363 && internal_fn_mask_index (reduc_fn) == -1
8364 && FLOAT_TYPE_P (vectype_in)
8365 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8367 if (dump_enabled_p ())
8368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8369 "can't operate on partial vectors because"
8370 " signed zeros cannot be preserved.\n");
8371 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8373 else
8375 internal_fn mask_reduc_fn
8376 = get_masked_reduction_fn (reduc_fn, vectype_in);
8378 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8379 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8380 vectype_in, 1);
8381 else
8382 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8383 vectype_in, NULL);
8386 return true;
8389 /* STMT_INFO is a dot-product reduction whose multiplication operands
8390 have different signs. Emit a sequence to emulate the operation
8391 using a series of signed DOT_PROD_EXPRs and return the last
8392 statement generated. VEC_DEST is the result of the vector operation
8393 and VOP lists its inputs. */
8395 static gassign *
8396 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8397 gimple_stmt_iterator *gsi, tree vec_dest,
8398 tree vop[3])
8400 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8401 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8402 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8403 gimple *new_stmt;
8405 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8406 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8407 std::swap (vop[0], vop[1]);
8409 /* Convert all inputs to signed types. */
8410 for (int i = 0; i < 3; ++i)
8411 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8413 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8414 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8415 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8416 vop[i] = tmp;
8419 /* In the comments below we assume 8-bit inputs for simplicity,
8420 but the approach works for any full integer type. */
8422 /* Create a vector of -128. */
8423 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8424 tree min_narrow = build_vector_from_val (narrow_vectype,
8425 min_narrow_elttype);
8427 /* Create a vector of 64. */
8428 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8429 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8430 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8432 /* Emit: SUB_RES = VOP[0] - 128. */
8433 tree sub_res = make_ssa_name (narrow_vectype);
8434 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8435 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8437 /* Emit:
8439 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8440 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8441 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8443 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8444 Doing the two 64 * y steps first allows more time to compute x. */
8445 tree stage1 = make_ssa_name (wide_vectype);
8446 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8447 vop[1], half_narrow, vop[2]);
8448 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8450 tree stage2 = make_ssa_name (wide_vectype);
8451 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8452 vop[1], half_narrow, stage1);
8453 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8455 tree stage3 = make_ssa_name (wide_vectype);
8456 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8457 sub_res, vop[1], stage2);
8458 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8460 /* Convert STAGE3 to the reduction type. */
8461 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8464 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8465 value. */
8467 bool
8468 vect_transform_reduction (loop_vec_info loop_vinfo,
8469 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8470 gimple **vec_stmt, slp_tree slp_node)
8472 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8473 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8474 int i;
8475 int ncopies;
8476 int vec_num;
8478 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8479 gcc_assert (reduc_info->is_reduc_info);
8481 if (nested_in_vect_loop_p (loop, stmt_info))
8483 loop = loop->inner;
8484 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8487 gimple_match_op op;
8488 if (!gimple_extract_op (stmt_info->stmt, &op))
8489 gcc_unreachable ();
8491 /* All uses but the last are expected to be defined in the loop.
8492 The last use is the reduction variable. In case of nested cycle this
8493 assumption is not true: we use reduc_index to record the index of the
8494 reduction variable. */
8495 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8496 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8497 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8498 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8500 if (slp_node)
8502 ncopies = 1;
8503 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8505 else
8507 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8508 vec_num = 1;
8511 code_helper code = canonicalize_code (op.code, op.type);
8512 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8514 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8515 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8516 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8518 /* Transform. */
8519 tree new_temp = NULL_TREE;
8520 auto_vec<tree> vec_oprnds0;
8521 auto_vec<tree> vec_oprnds1;
8522 auto_vec<tree> vec_oprnds2;
8523 tree def0;
8525 if (dump_enabled_p ())
8526 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8528 /* FORNOW: Multiple types are not supported for condition. */
8529 if (code == COND_EXPR)
8530 gcc_assert (ncopies == 1);
8532 /* A binary COND_OP reduction must have the same definition and else
8533 value. */
8534 bool cond_fn_p = code.is_internal_fn ()
8535 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8536 if (cond_fn_p)
8538 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8539 || code == IFN_COND_MUL || code == IFN_COND_AND
8540 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8541 gcc_assert (op.num_ops == 4
8542 && (op.ops[reduc_index]
8543 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8546 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8548 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8549 if (reduction_type == FOLD_LEFT_REDUCTION)
8551 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8552 gcc_assert (code.is_tree_code () || cond_fn_p);
8553 return vectorize_fold_left_reduction
8554 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8555 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8556 reduc_index, masks, lens);
8559 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8560 gcc_assert (single_defuse_cycle
8561 || code == DOT_PROD_EXPR
8562 || code == WIDEN_SUM_EXPR
8563 || code == SAD_EXPR);
8565 /* Create the destination vector */
8566 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8567 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8569 /* Get NCOPIES vector definitions for all operands except the reduction
8570 definition. */
8571 if (!cond_fn_p)
8573 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8574 single_defuse_cycle && reduc_index == 0
8575 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8576 single_defuse_cycle && reduc_index == 1
8577 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8578 op.num_ops == 3
8579 && !(single_defuse_cycle && reduc_index == 2)
8580 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8582 else
8584 /* For a conditional operation pass the truth type as mask
8585 vectype. */
8586 gcc_assert (single_defuse_cycle
8587 && (reduc_index == 1 || reduc_index == 2));
8588 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8589 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8590 reduc_index == 1 ? NULL_TREE : op.ops[1],
8591 NULL_TREE, &vec_oprnds1,
8592 reduc_index == 2 ? NULL_TREE : op.ops[2],
8593 NULL_TREE, &vec_oprnds2);
8596 /* For single def-use cycles get one copy of the vectorized reduction
8597 definition. */
8598 if (single_defuse_cycle)
8600 gcc_assert (!slp_node);
8601 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8602 op.ops[reduc_index],
8603 reduc_index == 0 ? &vec_oprnds0
8604 : (reduc_index == 1 ? &vec_oprnds1
8605 : &vec_oprnds2));
8608 bool emulated_mixed_dot_prod
8609 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8610 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8612 gimple *new_stmt;
8613 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8614 if (masked_loop_p && !mask_by_cond_expr)
8616 /* No conditional ifns have been defined for dot-product yet. */
8617 gcc_assert (code != DOT_PROD_EXPR);
8619 /* Make sure that the reduction accumulator is vop[0]. */
8620 if (reduc_index == 1)
8622 gcc_assert (commutative_binary_op_p (code, op.type));
8623 std::swap (vop[0], vop[1]);
8625 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8626 vec_num * ncopies, vectype_in, i);
8627 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8628 vop[0], vop[1], vop[0]);
8629 new_temp = make_ssa_name (vec_dest, call);
8630 gimple_call_set_lhs (call, new_temp);
8631 gimple_call_set_nothrow (call, true);
8632 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8633 new_stmt = call;
8635 else
8637 if (op.num_ops >= 3)
8638 vop[2] = vec_oprnds2[i];
8640 if (masked_loop_p && mask_by_cond_expr)
8642 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8643 vec_num * ncopies, vectype_in, i);
8644 build_vect_cond_expr (code, vop, mask, gsi);
8647 if (emulated_mixed_dot_prod)
8648 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8649 vec_dest, vop);
8651 else if (code.is_internal_fn () && !cond_fn_p)
8652 new_stmt = gimple_build_call_internal (internal_fn (code),
8653 op.num_ops,
8654 vop[0], vop[1], vop[2]);
8655 else if (code.is_internal_fn () && cond_fn_p)
8656 new_stmt = gimple_build_call_internal (internal_fn (code),
8657 op.num_ops,
8658 vop[0], vop[1], vop[2],
8659 vop[1]);
8660 else
8661 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8662 vop[0], vop[1], vop[2]);
8663 new_temp = make_ssa_name (vec_dest, new_stmt);
8664 gimple_set_lhs (new_stmt, new_temp);
8665 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8668 if (slp_node)
8669 slp_node->push_vec_def (new_stmt);
8670 else if (single_defuse_cycle
8671 && i < ncopies - 1)
8673 if (reduc_index == 0)
8674 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8675 else if (reduc_index == 1)
8676 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8677 else if (reduc_index == 2)
8678 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8680 else
8681 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8684 if (!slp_node)
8685 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8687 return true;
8690 /* Transform phase of a cycle PHI. */
8692 bool
8693 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8694 stmt_vec_info stmt_info, gimple **vec_stmt,
8695 slp_tree slp_node, slp_instance slp_node_instance)
8697 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8698 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8699 int i;
8700 int ncopies;
8701 int j;
8702 bool nested_cycle = false;
8703 int vec_num;
8705 if (nested_in_vect_loop_p (loop, stmt_info))
8707 loop = loop->inner;
8708 nested_cycle = true;
8711 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8712 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8713 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8714 gcc_assert (reduc_info->is_reduc_info);
8716 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8717 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8718 /* Leave the scalar phi in place. */
8719 return true;
8721 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8722 /* For a nested cycle we do not fill the above. */
8723 if (!vectype_in)
8724 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8725 gcc_assert (vectype_in);
8727 if (slp_node)
8729 /* The size vect_schedule_slp_instance computes is off for us. */
8730 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8731 * SLP_TREE_LANES (slp_node), vectype_in);
8732 ncopies = 1;
8734 else
8736 vec_num = 1;
8737 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8740 /* Check whether we should use a single PHI node and accumulate
8741 vectors to one before the backedge. */
8742 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8743 ncopies = 1;
8745 /* Create the destination vector */
8746 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8747 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8748 vectype_out);
8750 /* Get the loop-entry arguments. */
8751 tree vec_initial_def = NULL_TREE;
8752 auto_vec<tree> vec_initial_defs;
8753 if (slp_node)
8755 vec_initial_defs.reserve (vec_num);
8756 if (nested_cycle)
8758 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8759 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8760 &vec_initial_defs);
8762 else
8764 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8765 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8766 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8768 unsigned int num_phis = stmts.length ();
8769 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8770 num_phis = 1;
8771 initial_values.reserve (num_phis);
8772 for (unsigned int i = 0; i < num_phis; ++i)
8774 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8775 initial_values.quick_push (vect_phi_initial_value (this_phi));
8777 if (vec_num == 1)
8778 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8779 if (!initial_values.is_empty ())
8781 tree initial_value
8782 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8783 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8784 tree neutral_op
8785 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8786 code, initial_value);
8787 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8788 &vec_initial_defs, vec_num,
8789 stmts.length (), neutral_op);
8793 else
8795 /* Get at the scalar def before the loop, that defines the initial
8796 value of the reduction variable. */
8797 tree initial_def = vect_phi_initial_value (phi);
8798 reduc_info->reduc_initial_values.safe_push (initial_def);
8799 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8800 and we can't use zero for induc_val, use initial_def. Similarly
8801 for REDUC_MIN and initial_def larger than the base. */
8802 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8804 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8805 if (TREE_CODE (initial_def) == INTEGER_CST
8806 && !integer_zerop (induc_val)
8807 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8808 && tree_int_cst_lt (initial_def, induc_val))
8809 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8810 && tree_int_cst_lt (induc_val, initial_def))))
8812 induc_val = initial_def;
8813 /* Communicate we used the initial_def to epilouge
8814 generation. */
8815 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8817 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8819 else if (nested_cycle)
8821 /* Do not use an adjustment def as that case is not supported
8822 correctly if ncopies is not one. */
8823 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8824 ncopies, initial_def,
8825 &vec_initial_defs);
8827 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8828 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8829 /* Fill the initial vector with the initial scalar value. */
8830 vec_initial_def
8831 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8832 initial_def, initial_def);
8833 else
8835 if (ncopies == 1)
8836 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8837 if (!reduc_info->reduc_initial_values.is_empty ())
8839 initial_def = reduc_info->reduc_initial_values[0];
8840 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8841 tree neutral_op
8842 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8843 code, initial_def);
8844 gcc_assert (neutral_op);
8845 /* Try to simplify the vector initialization by applying an
8846 adjustment after the reduction has been performed. */
8847 if (!reduc_info->reused_accumulator
8848 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8849 && !operand_equal_p (neutral_op, initial_def))
8851 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8852 = initial_def;
8853 initial_def = neutral_op;
8855 vec_initial_def
8856 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8857 initial_def, neutral_op);
8862 if (vec_initial_def)
8864 vec_initial_defs.create (ncopies);
8865 for (i = 0; i < ncopies; ++i)
8866 vec_initial_defs.quick_push (vec_initial_def);
8869 if (auto *accumulator = reduc_info->reused_accumulator)
8871 tree def = accumulator->reduc_input;
8872 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8874 unsigned int nreduc;
8875 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8876 (TREE_TYPE (def)),
8877 TYPE_VECTOR_SUBPARTS (vectype_out),
8878 &nreduc);
8879 gcc_assert (res);
8880 gimple_seq stmts = NULL;
8881 /* Reduce the single vector to a smaller one. */
8882 if (nreduc != 1)
8884 /* Perform the reduction in the appropriate type. */
8885 tree rvectype = vectype_out;
8886 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8887 TREE_TYPE (TREE_TYPE (def))))
8888 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8889 TYPE_VECTOR_SUBPARTS
8890 (vectype_out));
8891 def = vect_create_partial_epilog (def, rvectype,
8892 STMT_VINFO_REDUC_CODE
8893 (reduc_info),
8894 &stmts);
8896 /* The epilogue loop might use a different vector mode, like
8897 VNx2DI vs. V2DI. */
8898 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8900 tree reduc_type = build_vector_type_for_mode
8901 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8902 def = gimple_convert (&stmts, reduc_type, def);
8904 /* Adjust the input so we pick up the partially reduced value
8905 for the skip edge in vect_create_epilog_for_reduction. */
8906 accumulator->reduc_input = def;
8907 /* And the reduction could be carried out using a different sign. */
8908 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8909 def = gimple_convert (&stmts, vectype_out, def);
8910 if (loop_vinfo->main_loop_edge)
8912 /* While we'd like to insert on the edge this will split
8913 blocks and disturb bookkeeping, we also will eventually
8914 need this on the skip edge. Rely on sinking to
8915 fixup optimal placement and insert in the pred. */
8916 gimple_stmt_iterator gsi
8917 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8918 /* Insert before a cond that eventually skips the
8919 epilogue. */
8920 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8921 gsi_prev (&gsi);
8922 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8924 else
8925 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8926 stmts);
8928 if (loop_vinfo->main_loop_edge)
8929 vec_initial_defs[0]
8930 = vect_get_main_loop_result (loop_vinfo, def,
8931 vec_initial_defs[0]);
8932 else
8933 vec_initial_defs.safe_push (def);
8936 /* Generate the reduction PHIs upfront. */
8937 for (i = 0; i < vec_num; i++)
8939 tree vec_init_def = vec_initial_defs[i];
8940 for (j = 0; j < ncopies; j++)
8942 /* Create the reduction-phi that defines the reduction
8943 operand. */
8944 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8946 /* Set the loop-entry arg of the reduction-phi. */
8947 if (j != 0 && nested_cycle)
8948 vec_init_def = vec_initial_defs[j];
8949 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8950 UNKNOWN_LOCATION);
8952 /* The loop-latch arg is set in epilogue processing. */
8954 if (slp_node)
8955 slp_node->push_vec_def (new_phi);
8956 else
8958 if (j == 0)
8959 *vec_stmt = new_phi;
8960 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8965 return true;
8968 /* Vectorizes LC PHIs. */
8970 bool
8971 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8972 stmt_vec_info stmt_info, gimple **vec_stmt,
8973 slp_tree slp_node)
8975 if (!loop_vinfo
8976 || !is_a <gphi *> (stmt_info->stmt)
8977 || gimple_phi_num_args (stmt_info->stmt) != 1)
8978 return false;
8980 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8981 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8982 return false;
8984 if (!vec_stmt) /* transformation not required. */
8986 /* Deal with copies from externs or constants that disguise as
8987 loop-closed PHI nodes (PR97886). */
8988 if (slp_node
8989 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8990 SLP_TREE_VECTYPE (slp_node)))
8992 if (dump_enabled_p ())
8993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8994 "incompatible vector types for invariants\n");
8995 return false;
8997 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8998 return true;
9001 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9002 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9003 basic_block bb = gimple_bb (stmt_info->stmt);
9004 edge e = single_pred_edge (bb);
9005 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9006 auto_vec<tree> vec_oprnds;
9007 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9008 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9009 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9010 for (unsigned i = 0; i < vec_oprnds.length (); i++)
9012 /* Create the vectorized LC PHI node. */
9013 gphi *new_phi = create_phi_node (vec_dest, bb);
9014 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9015 if (slp_node)
9016 slp_node->push_vec_def (new_phi);
9017 else
9018 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9020 if (!slp_node)
9021 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9023 return true;
9026 /* Vectorizes PHIs. */
9028 bool
9029 vectorizable_phi (vec_info *,
9030 stmt_vec_info stmt_info, gimple **vec_stmt,
9031 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9033 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9034 return false;
9036 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9037 return false;
9039 tree vectype = SLP_TREE_VECTYPE (slp_node);
9041 if (!vec_stmt) /* transformation not required. */
9043 slp_tree child;
9044 unsigned i;
9045 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9046 if (!child)
9048 if (dump_enabled_p ())
9049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9050 "PHI node with unvectorized backedge def\n");
9051 return false;
9053 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9055 if (dump_enabled_p ())
9056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9057 "incompatible vector types for invariants\n");
9058 return false;
9060 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9061 && !useless_type_conversion_p (vectype,
9062 SLP_TREE_VECTYPE (child)))
9064 /* With bools we can have mask and non-mask precision vectors
9065 or different non-mask precisions. while pattern recog is
9066 supposed to guarantee consistency here bugs in it can cause
9067 mismatches (PR103489 and PR103800 for example).
9068 Deal with them here instead of ICEing later. */
9069 if (dump_enabled_p ())
9070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9071 "incompatible vector type setup from "
9072 "bool pattern detection\n");
9073 return false;
9076 /* For single-argument PHIs assume coalescing which means zero cost
9077 for the scalar and the vector PHIs. This avoids artificially
9078 favoring the vector path (but may pessimize it in some cases). */
9079 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9080 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9081 vector_stmt, stmt_info, vectype, 0, vect_body);
9082 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9083 return true;
9086 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9087 basic_block bb = gimple_bb (stmt_info->stmt);
9088 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9089 auto_vec<gphi *> new_phis;
9090 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9092 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9094 /* Skip not yet vectorized defs. */
9095 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9096 && SLP_TREE_VEC_DEFS (child).is_empty ())
9097 continue;
9099 auto_vec<tree> vec_oprnds;
9100 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9101 if (!new_phis.exists ())
9103 new_phis.create (vec_oprnds.length ());
9104 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9106 /* Create the vectorized LC PHI node. */
9107 new_phis.quick_push (create_phi_node (vec_dest, bb));
9108 slp_node->push_vec_def (new_phis[j]);
9111 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9112 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9113 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9115 /* We should have at least one already vectorized child. */
9116 gcc_assert (new_phis.exists ());
9118 return true;
9121 /* Vectorizes first order recurrences. An overview of the transformation
9122 is described below. Suppose we have the following loop.
9124 int t = 0;
9125 for (int i = 0; i < n; ++i)
9127 b[i] = a[i] - t;
9128 t = a[i];
9131 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9132 looks (simplified) like:
9134 scalar.preheader:
9135 init = 0;
9137 scalar.body:
9138 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9139 _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9140 _1 = a[i]
9141 b[i] = _1 - _2
9142 if (i < n) goto scalar.body
9144 In this example, _2 is a recurrence because it's value depends on the
9145 previous iteration. We vectorize this as (VF = 4)
9147 vector.preheader:
9148 vect_init = vect_cst(..., ..., ..., 0)
9150 vector.body
9151 i = PHI <0(vector.preheader), i+4(vector.body)>
9152 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9153 vect_2 = a[i, i+1, i+2, i+3];
9154 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9155 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9156 if (..) goto vector.body
9158 In this function, vectorizable_recurr, we code generate both the
9159 vector PHI node and the permute since those together compute the
9160 vectorized value of the scalar PHI. We do not yet have the
9161 backedge value to fill in there nor into the vec_perm. Those
9162 are filled in maybe_set_vectorized_backedge_value and
9163 vect_schedule_scc.
9165 TODO: Since the scalar loop does not have a use of the recurrence
9166 outside of the loop the natural way to implement peeling via
9167 vectorizing the live value doesn't work. For now peeling of loops
9168 with a recurrence is not implemented. For SLP the supported cases
9169 are restricted to those requiring a single vector recurrence PHI. */
9171 bool
9172 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9173 gimple **vec_stmt, slp_tree slp_node,
9174 stmt_vector_for_cost *cost_vec)
9176 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9177 return false;
9179 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9181 /* So far we only support first-order recurrence auto-vectorization. */
9182 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9183 return false;
9185 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9186 unsigned ncopies;
9187 if (slp_node)
9188 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9189 else
9190 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9191 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9192 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9193 /* We need to be able to make progress with a single vector. */
9194 if (maybe_gt (dist * 2, nunits))
9196 if (dump_enabled_p ())
9197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9198 "first order recurrence exceeds half of "
9199 "a vector\n");
9200 return false;
9203 /* First-order recurrence autovectorization needs to handle permutation
9204 with indices = [nunits-1, nunits, nunits+1, ...]. */
9205 vec_perm_builder sel (nunits, 1, 3);
9206 for (int i = 0; i < 3; ++i)
9207 sel.quick_push (nunits - dist + i);
9208 vec_perm_indices indices (sel, 2, nunits);
9210 if (!vec_stmt) /* transformation not required. */
9212 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9213 indices))
9214 return false;
9216 if (slp_node)
9218 /* We eventually need to set a vector type on invariant
9219 arguments. */
9220 unsigned j;
9221 slp_tree child;
9222 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9223 if (!vect_maybe_update_slp_op_vectype
9224 (child, SLP_TREE_VECTYPE (slp_node)))
9226 if (dump_enabled_p ())
9227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9228 "incompatible vector types for "
9229 "invariants\n");
9230 return false;
9233 /* The recurrence costs the initialization vector and one permute
9234 for each copy. */
9235 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9236 stmt_info, 0, vect_prologue);
9237 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9238 stmt_info, 0, vect_body);
9239 if (dump_enabled_p ())
9240 dump_printf_loc (MSG_NOTE, vect_location,
9241 "vectorizable_recurr: inside_cost = %d, "
9242 "prologue_cost = %d .\n", inside_cost,
9243 prologue_cost);
9245 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9246 return true;
9249 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9250 basic_block bb = gimple_bb (phi);
9251 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9252 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9254 gimple_seq stmts = NULL;
9255 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9256 gsi_insert_seq_on_edge_immediate (pe, stmts);
9258 tree vec_init = build_vector_from_val (vectype, preheader);
9259 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9261 /* Create the vectorized first-order PHI node. */
9262 tree vec_dest = vect_get_new_vect_var (vectype,
9263 vect_simple_var, "vec_recur_");
9264 gphi *new_phi = create_phi_node (vec_dest, bb);
9265 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9267 /* Insert shuffles the first-order recurrence autovectorization.
9268 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9269 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9271 /* Insert the required permute after the latch definition. The
9272 second and later operands are tentative and will be updated when we have
9273 vectorized the latch definition. */
9274 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9275 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9276 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9277 gsi_next (&gsi2);
9279 for (unsigned i = 0; i < ncopies; ++i)
9281 vec_dest = make_ssa_name (vectype);
9282 gassign *vperm
9283 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9284 i == 0 ? gimple_phi_result (new_phi) : NULL,
9285 NULL, perm);
9286 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9288 if (slp_node)
9289 slp_node->push_vec_def (vperm);
9290 else
9291 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9294 if (!slp_node)
9295 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9296 return true;
9299 /* Return true if VECTYPE represents a vector that requires lowering
9300 by the vector lowering pass. */
9302 bool
9303 vect_emulated_vector_p (tree vectype)
9305 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9306 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9307 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9310 /* Return true if we can emulate CODE on an integer mode representation
9311 of a vector. */
9313 bool
9314 vect_can_vectorize_without_simd_p (tree_code code)
9316 switch (code)
9318 case PLUS_EXPR:
9319 case MINUS_EXPR:
9320 case NEGATE_EXPR:
9321 case BIT_AND_EXPR:
9322 case BIT_IOR_EXPR:
9323 case BIT_XOR_EXPR:
9324 case BIT_NOT_EXPR:
9325 return true;
9327 default:
9328 return false;
9332 /* Likewise, but taking a code_helper. */
9334 bool
9335 vect_can_vectorize_without_simd_p (code_helper code)
9337 return (code.is_tree_code ()
9338 && vect_can_vectorize_without_simd_p (tree_code (code)));
9341 /* Create vector init for vectorized iv. */
9342 static tree
9343 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9344 tree step_expr, poly_uint64 nunits,
9345 tree vectype,
9346 enum vect_induction_op_type induction_type)
9348 unsigned HOST_WIDE_INT const_nunits;
9349 tree vec_shift, vec_init, new_name;
9350 unsigned i;
9351 tree itype = TREE_TYPE (vectype);
9353 /* iv_loop is the loop to be vectorized. Create:
9354 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9355 new_name = gimple_convert (stmts, itype, init_expr);
9356 switch (induction_type)
9358 case vect_step_op_shr:
9359 case vect_step_op_shl:
9360 /* Build the Initial value from shift_expr. */
9361 vec_init = gimple_build_vector_from_val (stmts,
9362 vectype,
9363 new_name);
9364 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9365 build_zero_cst (itype), step_expr);
9366 vec_init = gimple_build (stmts,
9367 (induction_type == vect_step_op_shr
9368 ? RSHIFT_EXPR : LSHIFT_EXPR),
9369 vectype, vec_init, vec_shift);
9370 break;
9372 case vect_step_op_neg:
9374 vec_init = gimple_build_vector_from_val (stmts,
9375 vectype,
9376 new_name);
9377 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9378 vectype, vec_init);
9379 /* The encoding has 2 interleaved stepped patterns. */
9380 vec_perm_builder sel (nunits, 2, 3);
9381 sel.quick_grow (6);
9382 for (i = 0; i < 3; i++)
9384 sel[2 * i] = i;
9385 sel[2 * i + 1] = i + nunits;
9387 vec_perm_indices indices (sel, 2, nunits);
9388 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9389 fail when vec_init is const vector. In that situation vec_perm is not
9390 really needed. */
9391 tree perm_mask_even
9392 = vect_gen_perm_mask_any (vectype, indices);
9393 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9394 vectype,
9395 vec_init, vec_neg,
9396 perm_mask_even);
9398 break;
9400 case vect_step_op_mul:
9402 /* Use unsigned mult to avoid UD integer overflow. */
9403 gcc_assert (nunits.is_constant (&const_nunits));
9404 tree utype = unsigned_type_for (itype);
9405 tree uvectype = build_vector_type (utype,
9406 TYPE_VECTOR_SUBPARTS (vectype));
9407 new_name = gimple_convert (stmts, utype, new_name);
9408 vec_init = gimple_build_vector_from_val (stmts,
9409 uvectype,
9410 new_name);
9411 tree_vector_builder elts (uvectype, const_nunits, 1);
9412 tree elt_step = build_one_cst (utype);
9414 elts.quick_push (elt_step);
9415 for (i = 1; i < const_nunits; i++)
9417 /* Create: new_name_i = new_name + step_expr. */
9418 elt_step = gimple_build (stmts, MULT_EXPR,
9419 utype, elt_step, step_expr);
9420 elts.quick_push (elt_step);
9422 /* Create a vector from [new_name_0, new_name_1, ...,
9423 new_name_nunits-1]. */
9424 tree vec_mul = gimple_build_vector (stmts, &elts);
9425 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9426 vec_init, vec_mul);
9427 vec_init = gimple_convert (stmts, vectype, vec_init);
9429 break;
9431 default:
9432 gcc_unreachable ();
9435 return vec_init;
9438 /* Peel init_expr by skip_niter for induction_type. */
9439 tree
9440 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9441 tree skip_niters, tree step_expr,
9442 enum vect_induction_op_type induction_type)
9444 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9445 tree type = TREE_TYPE (init_expr);
9446 unsigned prec = TYPE_PRECISION (type);
9447 switch (induction_type)
9449 case vect_step_op_neg:
9450 if (TREE_INT_CST_LOW (skip_niters) % 2)
9451 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9452 /* else no change. */
9453 break;
9455 case vect_step_op_shr:
9456 case vect_step_op_shl:
9457 skip_niters = gimple_convert (stmts, type, skip_niters);
9458 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9459 /* When shift mount >= precision, need to avoid UD.
9460 In the original loop, there's no UD, and according to semantic,
9461 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9462 if (!tree_fits_uhwi_p (step_expr)
9463 || tree_to_uhwi (step_expr) >= prec)
9465 if (induction_type == vect_step_op_shl
9466 || TYPE_UNSIGNED (type))
9467 init_expr = build_zero_cst (type);
9468 else
9469 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9470 init_expr,
9471 wide_int_to_tree (type, prec - 1));
9473 else
9474 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9475 ? RSHIFT_EXPR : LSHIFT_EXPR),
9476 type, init_expr, step_expr);
9477 break;
9479 case vect_step_op_mul:
9481 tree utype = unsigned_type_for (type);
9482 init_expr = gimple_convert (stmts, utype, init_expr);
9483 wide_int skipn = wi::to_wide (skip_niters);
9484 wide_int begin = wi::to_wide (step_expr);
9485 auto_mpz base, exp, mod, res;
9486 wi::to_mpz (begin, base, TYPE_SIGN (type));
9487 wi::to_mpz (skipn, exp, UNSIGNED);
9488 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9489 mpz_powm (res, base, exp, mod);
9490 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9491 tree mult_expr = wide_int_to_tree (utype, begin);
9492 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9493 init_expr, mult_expr);
9494 init_expr = gimple_convert (stmts, type, init_expr);
9496 break;
9498 default:
9499 gcc_unreachable ();
9502 return init_expr;
9505 /* Create vector step for vectorized iv. */
9506 static tree
9507 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9508 poly_uint64 vf,
9509 enum vect_induction_op_type induction_type)
9511 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9512 tree new_name = NULL;
9513 /* Step should be pow (step, vf) for mult induction. */
9514 if (induction_type == vect_step_op_mul)
9516 gcc_assert (vf.is_constant ());
9517 wide_int begin = wi::to_wide (step_expr);
9519 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9520 begin = wi::mul (begin, wi::to_wide (step_expr));
9522 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9524 else if (induction_type == vect_step_op_neg)
9525 /* Do nothing. */
9527 else
9528 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9529 expr, step_expr);
9530 return new_name;
9533 static tree
9534 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9535 stmt_vec_info stmt_info,
9536 tree new_name, tree vectype,
9537 enum vect_induction_op_type induction_type)
9539 /* No step is needed for neg induction. */
9540 if (induction_type == vect_step_op_neg)
9541 return NULL;
9543 tree t = unshare_expr (new_name);
9544 gcc_assert (CONSTANT_CLASS_P (new_name)
9545 || TREE_CODE (new_name) == SSA_NAME);
9546 tree new_vec = build_vector_from_val (vectype, t);
9547 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9548 new_vec, vectype, NULL);
9549 return vec_step;
9552 /* Update vectorized iv with vect_step, induc_def is init. */
9553 static tree
9554 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9555 tree induc_def, tree vec_step,
9556 enum vect_induction_op_type induction_type)
9558 tree vec_def = induc_def;
9559 switch (induction_type)
9561 case vect_step_op_mul:
9563 /* Use unsigned mult to avoid UD integer overflow. */
9564 tree uvectype
9565 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9566 TYPE_VECTOR_SUBPARTS (vectype));
9567 vec_def = gimple_convert (stmts, uvectype, vec_def);
9568 vec_step = gimple_convert (stmts, uvectype, vec_step);
9569 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9570 vec_def, vec_step);
9571 vec_def = gimple_convert (stmts, vectype, vec_def);
9573 break;
9575 case vect_step_op_shr:
9576 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9577 vec_def, vec_step);
9578 break;
9580 case vect_step_op_shl:
9581 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9582 vec_def, vec_step);
9583 break;
9584 case vect_step_op_neg:
9585 vec_def = induc_def;
9586 /* Do nothing. */
9587 break;
9588 default:
9589 gcc_unreachable ();
9592 return vec_def;
9596 /* Function vectorizable_induction
9598 Check if STMT_INFO performs an nonlinear induction computation that can be
9599 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9600 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9601 basic block.
9602 Return true if STMT_INFO is vectorizable in this way. */
9604 static bool
9605 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9606 stmt_vec_info stmt_info,
9607 gimple **vec_stmt, slp_tree slp_node,
9608 stmt_vector_for_cost *cost_vec)
9610 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9611 unsigned ncopies;
9612 bool nested_in_vect_loop = false;
9613 class loop *iv_loop;
9614 tree vec_def;
9615 edge pe = loop_preheader_edge (loop);
9616 basic_block new_bb;
9617 tree vec_init, vec_step;
9618 tree new_name;
9619 gimple *new_stmt;
9620 gphi *induction_phi;
9621 tree induc_def, vec_dest;
9622 tree init_expr, step_expr;
9623 tree niters_skip;
9624 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9625 unsigned i;
9626 gimple_stmt_iterator si;
9628 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9630 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9631 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9632 enum vect_induction_op_type induction_type
9633 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9635 gcc_assert (induction_type > vect_step_op_add);
9637 if (slp_node)
9638 ncopies = 1;
9639 else
9640 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9641 gcc_assert (ncopies >= 1);
9643 /* FORNOW. Only handle nonlinear induction in the same loop. */
9644 if (nested_in_vect_loop_p (loop, stmt_info))
9646 if (dump_enabled_p ())
9647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9648 "nonlinear induction in nested loop.\n");
9649 return false;
9652 iv_loop = loop;
9653 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9655 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9656 update for each iv and a permutation to generate wanted vector iv. */
9657 if (slp_node)
9659 if (dump_enabled_p ())
9660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9661 "SLP induction not supported for nonlinear"
9662 " induction.\n");
9663 return false;
9666 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9668 if (dump_enabled_p ())
9669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670 "floating point nonlinear induction vectorization"
9671 " not supported.\n");
9672 return false;
9675 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9676 init_expr = vect_phi_initial_value (phi);
9677 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9678 && TREE_CODE (step_expr) == INTEGER_CST);
9679 /* step_expr should be aligned with init_expr,
9680 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9681 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9683 if (TREE_CODE (init_expr) == INTEGER_CST)
9684 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9685 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9687 /* INIT_EXPR could be a bit_field, bail out for such case. */
9688 if (dump_enabled_p ())
9689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9690 "nonlinear induction vectorization failed:"
9691 " component type of vectype is not a nop conversion"
9692 " from type of init_expr.\n");
9693 return false;
9696 switch (induction_type)
9698 case vect_step_op_neg:
9699 if (TREE_CODE (init_expr) != INTEGER_CST
9700 && TREE_CODE (init_expr) != REAL_CST)
9702 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9703 if (!directly_supported_p (NEGATE_EXPR, vectype))
9704 return false;
9706 /* The encoding has 2 interleaved stepped patterns. */
9707 vec_perm_builder sel (nunits, 2, 3);
9708 machine_mode mode = TYPE_MODE (vectype);
9709 sel.quick_grow (6);
9710 for (i = 0; i < 3; i++)
9712 sel[i * 2] = i;
9713 sel[i * 2 + 1] = i + nunits;
9715 vec_perm_indices indices (sel, 2, nunits);
9716 if (!can_vec_perm_const_p (mode, mode, indices))
9717 return false;
9719 break;
9721 case vect_step_op_mul:
9723 /* Check for backend support of MULT_EXPR. */
9724 if (!directly_supported_p (MULT_EXPR, vectype))
9725 return false;
9727 /* ?? How to construct vector step for variable number vector.
9728 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9729 if (!vf.is_constant ())
9730 return false;
9732 break;
9734 case vect_step_op_shr:
9735 /* Check for backend support of RSHIFT_EXPR. */
9736 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9737 return false;
9739 /* Don't shift more than type precision to avoid UD. */
9740 if (!tree_fits_uhwi_p (step_expr)
9741 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9742 TYPE_PRECISION (TREE_TYPE (init_expr))))
9743 return false;
9744 break;
9746 case vect_step_op_shl:
9747 /* Check for backend support of RSHIFT_EXPR. */
9748 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9749 return false;
9751 /* Don't shift more than type precision to avoid UD. */
9752 if (!tree_fits_uhwi_p (step_expr)
9753 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9754 TYPE_PRECISION (TREE_TYPE (init_expr))))
9755 return false;
9757 break;
9759 default:
9760 gcc_unreachable ();
9763 if (!vec_stmt) /* transformation not required. */
9765 unsigned inside_cost = 0, prologue_cost = 0;
9766 /* loop cost for vec_loop. Neg induction doesn't have any
9767 inside_cost. */
9768 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9769 stmt_info, 0, vect_body);
9771 /* loop cost for vec_loop. Neg induction doesn't have any
9772 inside_cost. */
9773 if (induction_type == vect_step_op_neg)
9774 inside_cost = 0;
9776 /* prologue cost for vec_init and vec_step. */
9777 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9778 stmt_info, 0, vect_prologue);
9780 if (dump_enabled_p ())
9781 dump_printf_loc (MSG_NOTE, vect_location,
9782 "vect_model_induction_cost: inside_cost = %d, "
9783 "prologue_cost = %d. \n", inside_cost,
9784 prologue_cost);
9786 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9787 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9788 return true;
9791 /* Transform. */
9793 /* Compute a vector variable, initialized with the first VF values of
9794 the induction variable. E.g., for an iv with IV_PHI='X' and
9795 evolution S, for a vector of 4 units, we want to compute:
9796 [X, X + S, X + 2*S, X + 3*S]. */
9798 if (dump_enabled_p ())
9799 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9801 pe = loop_preheader_edge (iv_loop);
9802 /* Find the first insertion point in the BB. */
9803 basic_block bb = gimple_bb (phi);
9804 si = gsi_after_labels (bb);
9806 gimple_seq stmts = NULL;
9808 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9809 /* If we are using the loop mask to "peel" for alignment then we need
9810 to adjust the start value here. */
9811 if (niters_skip != NULL_TREE)
9812 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9813 step_expr, induction_type);
9815 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9816 step_expr, nunits, vectype,
9817 induction_type);
9818 if (stmts)
9820 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9821 gcc_assert (!new_bb);
9824 stmts = NULL;
9825 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9826 vf, induction_type);
9827 if (stmts)
9829 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9830 gcc_assert (!new_bb);
9833 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9834 new_name, vectype,
9835 induction_type);
9836 /* Create the following def-use cycle:
9837 loop prolog:
9838 vec_init = ...
9839 vec_step = ...
9840 loop:
9841 vec_iv = PHI <vec_init, vec_loop>
9843 STMT
9845 vec_loop = vec_iv + vec_step; */
9847 /* Create the induction-phi that defines the induction-operand. */
9848 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9849 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9850 induc_def = PHI_RESULT (induction_phi);
9852 /* Create the iv update inside the loop. */
9853 stmts = NULL;
9854 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9855 induc_def, vec_step,
9856 induction_type);
9858 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9859 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9861 /* Set the arguments of the phi node: */
9862 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9863 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9864 UNKNOWN_LOCATION);
9866 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9867 *vec_stmt = induction_phi;
9869 /* In case that vectorization factor (VF) is bigger than the number
9870 of elements that we can fit in a vectype (nunits), we have to generate
9871 more than one vector stmt - i.e - we need to "unroll" the
9872 vector stmt by a factor VF/nunits. For more details see documentation
9873 in vectorizable_operation. */
9875 if (ncopies > 1)
9877 stmts = NULL;
9878 /* FORNOW. This restriction should be relaxed. */
9879 gcc_assert (!nested_in_vect_loop);
9881 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9882 nunits, induction_type);
9884 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9885 new_name, vectype,
9886 induction_type);
9887 vec_def = induc_def;
9888 for (i = 1; i < ncopies; i++)
9890 /* vec_i = vec_prev + vec_step. */
9891 stmts = NULL;
9892 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9893 vec_def, vec_step,
9894 induction_type);
9895 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9896 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9897 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9901 if (dump_enabled_p ())
9902 dump_printf_loc (MSG_NOTE, vect_location,
9903 "transform induction: created def-use cycle: %G%G",
9904 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9906 return true;
9909 /* Function vectorizable_induction
9911 Check if STMT_INFO performs an induction computation that can be vectorized.
9912 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9913 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9914 Return true if STMT_INFO is vectorizable in this way. */
9916 bool
9917 vectorizable_induction (loop_vec_info loop_vinfo,
9918 stmt_vec_info stmt_info,
9919 gimple **vec_stmt, slp_tree slp_node,
9920 stmt_vector_for_cost *cost_vec)
9922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9923 unsigned ncopies;
9924 bool nested_in_vect_loop = false;
9925 class loop *iv_loop;
9926 tree vec_def;
9927 edge pe = loop_preheader_edge (loop);
9928 basic_block new_bb;
9929 tree new_vec, vec_init, vec_step, t;
9930 tree new_name;
9931 gimple *new_stmt;
9932 gphi *induction_phi;
9933 tree induc_def, vec_dest;
9934 tree init_expr, step_expr;
9935 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9936 unsigned i;
9937 tree expr;
9938 gimple_stmt_iterator si;
9939 enum vect_induction_op_type induction_type
9940 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9942 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9943 if (!phi)
9944 return false;
9946 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9947 return false;
9949 /* Make sure it was recognized as induction computation. */
9950 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9951 return false;
9953 /* Handle nonlinear induction in a separate place. */
9954 if (induction_type != vect_step_op_add)
9955 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9956 vec_stmt, slp_node, cost_vec);
9958 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9959 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9961 if (slp_node)
9962 ncopies = 1;
9963 else
9964 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9965 gcc_assert (ncopies >= 1);
9967 /* FORNOW. These restrictions should be relaxed. */
9968 if (nested_in_vect_loop_p (loop, stmt_info))
9970 imm_use_iterator imm_iter;
9971 use_operand_p use_p;
9972 gimple *exit_phi;
9973 edge latch_e;
9974 tree loop_arg;
9976 if (ncopies > 1)
9978 if (dump_enabled_p ())
9979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9980 "multiple types in nested loop.\n");
9981 return false;
9984 exit_phi = NULL;
9985 latch_e = loop_latch_edge (loop->inner);
9986 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9987 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9989 gimple *use_stmt = USE_STMT (use_p);
9990 if (is_gimple_debug (use_stmt))
9991 continue;
9993 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9995 exit_phi = use_stmt;
9996 break;
9999 if (exit_phi)
10001 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10002 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10003 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10005 if (dump_enabled_p ())
10006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10007 "inner-loop induction only used outside "
10008 "of the outer vectorized loop.\n");
10009 return false;
10013 nested_in_vect_loop = true;
10014 iv_loop = loop->inner;
10016 else
10017 iv_loop = loop;
10018 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10020 if (slp_node && !nunits.is_constant ())
10022 /* The current SLP code creates the step value element-by-element. */
10023 if (dump_enabled_p ())
10024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10025 "SLP induction not supported for variable-length"
10026 " vectors.\n");
10027 return false;
10030 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10032 if (dump_enabled_p ())
10033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10034 "floating point induction vectorization disabled\n");
10035 return false;
10038 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10039 gcc_assert (step_expr != NULL_TREE);
10040 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10042 /* Check for backend support of PLUS/MINUS_EXPR. */
10043 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10044 || !directly_supported_p (MINUS_EXPR, step_vectype))
10045 return false;
10047 if (!vec_stmt) /* transformation not required. */
10049 unsigned inside_cost = 0, prologue_cost = 0;
10050 if (slp_node)
10052 /* We eventually need to set a vector type on invariant
10053 arguments. */
10054 unsigned j;
10055 slp_tree child;
10056 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10057 if (!vect_maybe_update_slp_op_vectype
10058 (child, SLP_TREE_VECTYPE (slp_node)))
10060 if (dump_enabled_p ())
10061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10062 "incompatible vector types for "
10063 "invariants\n");
10064 return false;
10066 /* loop cost for vec_loop. */
10067 inside_cost
10068 = record_stmt_cost (cost_vec,
10069 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10070 vector_stmt, stmt_info, 0, vect_body);
10071 /* prologue cost for vec_init (if not nested) and step. */
10072 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10073 scalar_to_vec,
10074 stmt_info, 0, vect_prologue);
10076 else /* if (!slp_node) */
10078 /* loop cost for vec_loop. */
10079 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10080 stmt_info, 0, vect_body);
10081 /* prologue cost for vec_init and vec_step. */
10082 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10083 stmt_info, 0, vect_prologue);
10085 if (dump_enabled_p ())
10086 dump_printf_loc (MSG_NOTE, vect_location,
10087 "vect_model_induction_cost: inside_cost = %d, "
10088 "prologue_cost = %d .\n", inside_cost,
10089 prologue_cost);
10091 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10092 DUMP_VECT_SCOPE ("vectorizable_induction");
10093 return true;
10096 /* Transform. */
10098 /* Compute a vector variable, initialized with the first VF values of
10099 the induction variable. E.g., for an iv with IV_PHI='X' and
10100 evolution S, for a vector of 4 units, we want to compute:
10101 [X, X + S, X + 2*S, X + 3*S]. */
10103 if (dump_enabled_p ())
10104 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10106 pe = loop_preheader_edge (iv_loop);
10107 /* Find the first insertion point in the BB. */
10108 basic_block bb = gimple_bb (phi);
10109 si = gsi_after_labels (bb);
10111 /* For SLP induction we have to generate several IVs as for example
10112 with group size 3 we need
10113 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10114 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10115 if (slp_node)
10117 /* Enforced above. */
10118 unsigned int const_nunits = nunits.to_constant ();
10120 /* The initial values are vectorized, but any lanes > group_size
10121 need adjustment. */
10122 slp_tree init_node
10123 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10125 /* Gather steps. Since we do not vectorize inductions as
10126 cycles we have to reconstruct the step from SCEV data. */
10127 unsigned group_size = SLP_TREE_LANES (slp_node);
10128 tree *steps = XALLOCAVEC (tree, group_size);
10129 tree *inits = XALLOCAVEC (tree, group_size);
10130 stmt_vec_info phi_info;
10131 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10133 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10134 if (!init_node)
10135 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10136 pe->dest_idx);
10139 /* Now generate the IVs. */
10140 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10141 gcc_assert ((const_nunits * nvects) % group_size == 0);
10142 unsigned nivs;
10143 if (nested_in_vect_loop)
10144 nivs = nvects;
10145 else
10147 /* Compute the number of distinct IVs we need. First reduce
10148 group_size if it is a multiple of const_nunits so we get
10149 one IV for a group_size of 4 but const_nunits 2. */
10150 unsigned group_sizep = group_size;
10151 if (group_sizep % const_nunits == 0)
10152 group_sizep = group_sizep / const_nunits;
10153 nivs = least_common_multiple (group_sizep,
10154 const_nunits) / const_nunits;
10156 tree stept = TREE_TYPE (step_vectype);
10157 tree lupdate_mul = NULL_TREE;
10158 if (!nested_in_vect_loop)
10160 /* The number of iterations covered in one vector iteration. */
10161 unsigned lup_mul = (nvects * const_nunits) / group_size;
10162 lupdate_mul
10163 = build_vector_from_val (step_vectype,
10164 SCALAR_FLOAT_TYPE_P (stept)
10165 ? build_real_from_wide (stept, lup_mul,
10166 UNSIGNED)
10167 : build_int_cstu (stept, lup_mul));
10169 tree peel_mul = NULL_TREE;
10170 gimple_seq init_stmts = NULL;
10171 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10173 if (SCALAR_FLOAT_TYPE_P (stept))
10174 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10175 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10176 else
10177 peel_mul = gimple_convert (&init_stmts, stept,
10178 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10179 peel_mul = gimple_build_vector_from_val (&init_stmts,
10180 step_vectype, peel_mul);
10182 unsigned ivn;
10183 auto_vec<tree> vec_steps;
10184 for (ivn = 0; ivn < nivs; ++ivn)
10186 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10187 tree_vector_builder init_elts (vectype, const_nunits, 1);
10188 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10189 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10191 /* The scalar steps of the IVs. */
10192 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10193 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10194 step_elts.quick_push (elt);
10195 if (!init_node)
10197 /* The scalar inits of the IVs if not vectorized. */
10198 elt = inits[(ivn*const_nunits + eltn) % group_size];
10199 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10200 TREE_TYPE (elt)))
10201 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10202 TREE_TYPE (vectype), elt);
10203 init_elts.quick_push (elt);
10205 /* The number of steps to add to the initial values. */
10206 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10207 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10208 ? build_real_from_wide (stept,
10209 mul_elt, UNSIGNED)
10210 : build_int_cstu (stept, mul_elt));
10212 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10213 vec_steps.safe_push (vec_step);
10214 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10215 if (peel_mul)
10216 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10217 step_mul, peel_mul);
10218 if (!init_node)
10219 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10221 /* Create the induction-phi that defines the induction-operand. */
10222 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10223 "vec_iv_");
10224 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10225 induc_def = PHI_RESULT (induction_phi);
10227 /* Create the iv update inside the loop */
10228 tree up = vec_step;
10229 if (lupdate_mul)
10230 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10231 vec_step, lupdate_mul);
10232 gimple_seq stmts = NULL;
10233 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10234 vec_def = gimple_build (&stmts,
10235 PLUS_EXPR, step_vectype, vec_def, up);
10236 vec_def = gimple_convert (&stmts, vectype, vec_def);
10237 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10238 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10239 UNKNOWN_LOCATION);
10241 if (init_node)
10242 vec_init = vect_get_slp_vect_def (init_node, ivn);
10243 if (!nested_in_vect_loop
10244 && !integer_zerop (step_mul))
10246 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10247 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10248 vec_step, step_mul);
10249 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10250 vec_def, up);
10251 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10254 /* Set the arguments of the phi node: */
10255 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10257 slp_node->push_vec_def (induction_phi);
10259 if (!nested_in_vect_loop)
10261 /* Fill up to the number of vectors we need for the whole group. */
10262 nivs = least_common_multiple (group_size,
10263 const_nunits) / const_nunits;
10264 vec_steps.reserve (nivs-ivn);
10265 for (; ivn < nivs; ++ivn)
10267 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10268 vec_steps.quick_push (vec_steps[0]);
10272 /* Re-use IVs when we can. We are generating further vector
10273 stmts by adding VF' * stride to the IVs generated above. */
10274 if (ivn < nvects)
10276 unsigned vfp
10277 = least_common_multiple (group_size, const_nunits) / group_size;
10278 tree lupdate_mul
10279 = build_vector_from_val (step_vectype,
10280 SCALAR_FLOAT_TYPE_P (stept)
10281 ? build_real_from_wide (stept,
10282 vfp, UNSIGNED)
10283 : build_int_cstu (stept, vfp));
10284 for (; ivn < nvects; ++ivn)
10286 gimple *iv
10287 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10288 tree def = gimple_get_lhs (iv);
10289 if (ivn < 2*nivs)
10290 vec_steps[ivn - nivs]
10291 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10292 vec_steps[ivn - nivs], lupdate_mul);
10293 gimple_seq stmts = NULL;
10294 def = gimple_convert (&stmts, step_vectype, def);
10295 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10296 def, vec_steps[ivn % nivs]);
10297 def = gimple_convert (&stmts, vectype, def);
10298 if (gimple_code (iv) == GIMPLE_PHI)
10299 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10300 else
10302 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10303 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10305 slp_node->push_vec_def (def);
10309 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10310 gcc_assert (!new_bb);
10312 return true;
10315 init_expr = vect_phi_initial_value (phi);
10317 gimple_seq stmts = NULL;
10318 if (!nested_in_vect_loop)
10320 /* Convert the initial value to the IV update type. */
10321 tree new_type = TREE_TYPE (step_expr);
10322 init_expr = gimple_convert (&stmts, new_type, init_expr);
10324 /* If we are using the loop mask to "peel" for alignment then we need
10325 to adjust the start value here. */
10326 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10327 if (skip_niters != NULL_TREE)
10329 if (FLOAT_TYPE_P (vectype))
10330 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10331 skip_niters);
10332 else
10333 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10334 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10335 skip_niters, step_expr);
10336 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10337 init_expr, skip_step);
10341 if (stmts)
10343 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10344 gcc_assert (!new_bb);
10347 /* Create the vector that holds the initial_value of the induction. */
10348 if (nested_in_vect_loop)
10350 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10351 been created during vectorization of previous stmts. We obtain it
10352 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10353 auto_vec<tree> vec_inits;
10354 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10355 init_expr, &vec_inits);
10356 vec_init = vec_inits[0];
10357 /* If the initial value is not of proper type, convert it. */
10358 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10360 new_stmt
10361 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10362 vect_simple_var,
10363 "vec_iv_"),
10364 VIEW_CONVERT_EXPR,
10365 build1 (VIEW_CONVERT_EXPR, vectype,
10366 vec_init));
10367 vec_init = gimple_assign_lhs (new_stmt);
10368 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10369 new_stmt);
10370 gcc_assert (!new_bb);
10373 else
10375 /* iv_loop is the loop to be vectorized. Create:
10376 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10377 stmts = NULL;
10378 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10380 unsigned HOST_WIDE_INT const_nunits;
10381 if (nunits.is_constant (&const_nunits))
10383 tree_vector_builder elts (step_vectype, const_nunits, 1);
10384 elts.quick_push (new_name);
10385 for (i = 1; i < const_nunits; i++)
10387 /* Create: new_name_i = new_name + step_expr */
10388 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10389 new_name, step_expr);
10390 elts.quick_push (new_name);
10392 /* Create a vector from [new_name_0, new_name_1, ...,
10393 new_name_nunits-1] */
10394 vec_init = gimple_build_vector (&stmts, &elts);
10396 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10397 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10398 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10399 new_name, step_expr);
10400 else
10402 /* Build:
10403 [base, base, base, ...]
10404 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10405 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10406 gcc_assert (flag_associative_math);
10407 tree index = build_index_vector (step_vectype, 0, 1);
10408 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10409 new_name);
10410 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10411 step_expr);
10412 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10413 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10414 vec_init, step_vec);
10415 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10416 vec_init, base_vec);
10418 vec_init = gimple_convert (&stmts, vectype, vec_init);
10420 if (stmts)
10422 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10423 gcc_assert (!new_bb);
10428 /* Create the vector that holds the step of the induction. */
10429 gimple_stmt_iterator *step_iv_si = NULL;
10430 if (nested_in_vect_loop)
10431 /* iv_loop is nested in the loop to be vectorized. Generate:
10432 vec_step = [S, S, S, S] */
10433 new_name = step_expr;
10434 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10436 /* When we're using loop_len produced by SELEC_VL, the non-final
10437 iterations are not always processing VF elements. So vectorize
10438 induction variable instead of
10440 _21 = vect_vec_iv_.6_22 + { VF, ... };
10442 We should generate:
10444 _35 = .SELECT_VL (ivtmp_33, VF);
10445 vect_cst__22 = [vec_duplicate_expr] _35;
10446 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10447 gcc_assert (!slp_node);
10448 gimple_seq seq = NULL;
10449 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10450 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10451 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10452 unshare_expr (len)),
10453 &seq, true, NULL_TREE);
10454 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10455 step_expr);
10456 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10457 step_iv_si = &si;
10459 else
10461 /* iv_loop is the loop to be vectorized. Generate:
10462 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10463 gimple_seq seq = NULL;
10464 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10466 expr = build_int_cst (integer_type_node, vf);
10467 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10469 else
10470 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10471 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10472 expr, step_expr);
10473 if (seq)
10475 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10476 gcc_assert (!new_bb);
10480 t = unshare_expr (new_name);
10481 gcc_assert (CONSTANT_CLASS_P (new_name)
10482 || TREE_CODE (new_name) == SSA_NAME);
10483 new_vec = build_vector_from_val (step_vectype, t);
10484 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10485 new_vec, step_vectype, step_iv_si);
10488 /* Create the following def-use cycle:
10489 loop prolog:
10490 vec_init = ...
10491 vec_step = ...
10492 loop:
10493 vec_iv = PHI <vec_init, vec_loop>
10495 STMT
10497 vec_loop = vec_iv + vec_step; */
10499 /* Create the induction-phi that defines the induction-operand. */
10500 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10501 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10502 induc_def = PHI_RESULT (induction_phi);
10504 /* Create the iv update inside the loop */
10505 stmts = NULL;
10506 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10507 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10508 vec_def = gimple_convert (&stmts, vectype, vec_def);
10509 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10510 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10512 /* Set the arguments of the phi node: */
10513 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10514 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10515 UNKNOWN_LOCATION);
10517 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10518 *vec_stmt = induction_phi;
10520 /* In case that vectorization factor (VF) is bigger than the number
10521 of elements that we can fit in a vectype (nunits), we have to generate
10522 more than one vector stmt - i.e - we need to "unroll" the
10523 vector stmt by a factor VF/nunits. For more details see documentation
10524 in vectorizable_operation. */
10526 if (ncopies > 1)
10528 gimple_seq seq = NULL;
10529 /* FORNOW. This restriction should be relaxed. */
10530 gcc_assert (!nested_in_vect_loop);
10531 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10532 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10534 /* Create the vector that holds the step of the induction. */
10535 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10537 expr = build_int_cst (integer_type_node, nunits);
10538 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10540 else
10541 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10542 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10543 expr, step_expr);
10544 if (seq)
10546 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10547 gcc_assert (!new_bb);
10550 t = unshare_expr (new_name);
10551 gcc_assert (CONSTANT_CLASS_P (new_name)
10552 || TREE_CODE (new_name) == SSA_NAME);
10553 new_vec = build_vector_from_val (step_vectype, t);
10554 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10555 new_vec, step_vectype, NULL);
10557 vec_def = induc_def;
10558 for (i = 1; i < ncopies + 1; i++)
10560 /* vec_i = vec_prev + vec_step */
10561 gimple_seq stmts = NULL;
10562 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10563 vec_def = gimple_build (&stmts,
10564 PLUS_EXPR, step_vectype, vec_def, vec_step);
10565 vec_def = gimple_convert (&stmts, vectype, vec_def);
10567 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10568 if (i < ncopies)
10570 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10571 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10573 else
10575 /* vec_1 = vec_iv + (VF/n * S)
10576 vec_2 = vec_1 + (VF/n * S)
10578 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10580 vec_n is used as vec_loop to save the large step register and
10581 related operations. */
10582 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10583 UNKNOWN_LOCATION);
10588 if (dump_enabled_p ())
10589 dump_printf_loc (MSG_NOTE, vect_location,
10590 "transform induction: created def-use cycle: %G%G",
10591 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10593 return true;
10596 /* Function vectorizable_live_operation_1.
10598 helper function for vectorizable_live_operation. */
10600 static tree
10601 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10602 stmt_vec_info stmt_info, basic_block exit_bb,
10603 tree vectype, int ncopies, slp_tree slp_node,
10604 tree bitsize, tree bitstart, tree vec_lhs,
10605 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10607 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10609 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10610 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10611 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10612 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10614 gimple_seq stmts = NULL;
10615 tree new_tree;
10617 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10618 if (integer_zerop (bitstart))
10620 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10621 vec_lhs_phi, bitsize, bitstart);
10623 /* Convert the extracted vector element to the scalar type. */
10624 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10626 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10628 /* Emit:
10630 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10632 where VEC_LHS is the vectorized live-out result and MASK is
10633 the loop mask for the final iteration. */
10634 gcc_assert (ncopies == 1 && !slp_node);
10635 gimple_seq tem = NULL;
10636 gimple_stmt_iterator gsi = gsi_last (tem);
10637 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10638 &LOOP_VINFO_LENS (loop_vinfo),
10639 1, vectype, 0, 0);
10641 /* BIAS - 1. */
10642 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10643 tree bias_minus_one
10644 = int_const_binop (MINUS_EXPR,
10645 build_int_cst (TREE_TYPE (len), biasval),
10646 build_one_cst (TREE_TYPE (len)));
10648 /* LAST_INDEX = LEN + (BIAS - 1). */
10649 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10650 len, bias_minus_one);
10652 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10653 tree scalar_res
10654 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10655 vec_lhs_phi, last_index);
10657 /* Convert the extracted vector element to the scalar type. */
10658 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10660 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10662 /* Emit:
10664 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10666 where VEC_LHS is the vectorized live-out result and MASK is
10667 the loop mask for the final iteration. */
10668 gcc_assert (!slp_node);
10669 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10670 gimple_seq tem = NULL;
10671 gimple_stmt_iterator gsi = gsi_last (tem);
10672 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10673 &LOOP_VINFO_MASKS (loop_vinfo),
10674 1, vectype, 0);
10675 tree scalar_res;
10676 gimple_seq_add_seq (&stmts, tem);
10678 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10679 mask, vec_lhs_phi);
10681 /* Convert the extracted vector element to the scalar type. */
10682 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10684 else
10686 tree bftype = TREE_TYPE (vectype);
10687 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10688 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10689 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10690 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10691 &stmts, true, NULL_TREE);
10694 *exit_gsi = gsi_after_labels (exit_bb);
10695 if (stmts)
10696 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10698 return new_tree;
10701 /* Find the edge that's the final one in the path from SRC to DEST and
10702 return it. This edge must exist in at most one forwarder edge between. */
10704 static edge
10705 find_connected_edge (edge src, basic_block dest)
10707 if (src->dest == dest)
10708 return src;
10710 return find_edge (src->dest, dest);
10713 /* Function vectorizable_live_operation.
10715 STMT_INFO computes a value that is used outside the loop. Check if
10716 it can be supported. */
10718 bool
10719 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10720 slp_tree slp_node, slp_instance slp_node_instance,
10721 int slp_index, bool vec_stmt_p,
10722 stmt_vector_for_cost *cost_vec)
10724 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10725 imm_use_iterator imm_iter;
10726 tree lhs, lhs_type, bitsize;
10727 tree vectype = (slp_node
10728 ? SLP_TREE_VECTYPE (slp_node)
10729 : STMT_VINFO_VECTYPE (stmt_info));
10730 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10731 int ncopies;
10732 gimple *use_stmt;
10733 use_operand_p use_p;
10734 auto_vec<tree> vec_oprnds;
10735 int vec_entry = 0;
10736 poly_uint64 vec_index = 0;
10738 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10739 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10741 /* If a stmt of a reduction is live, vectorize it via
10742 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10743 validity so just trigger the transform here. */
10744 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10746 if (!vec_stmt_p)
10747 return true;
10748 if (slp_node)
10750 /* For reduction chains the meta-info is attached to
10751 the group leader. */
10752 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10753 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10754 /* For SLP reductions we vectorize the epilogue for
10755 all involved stmts together. */
10756 else if (slp_index != 0)
10757 return true;
10759 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10760 gcc_assert (reduc_info->is_reduc_info);
10761 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10762 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10763 return true;
10765 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10766 slp_node_instance,
10767 LOOP_VINFO_IV_EXIT (loop_vinfo));
10769 /* If early break we only have to materialize the reduction on the merge
10770 block, but we have to find an alternate exit first. */
10771 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10773 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10774 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10776 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10777 slp_node, slp_node_instance,
10778 exit);
10779 break;
10783 return true;
10786 /* If STMT is not relevant and it is a simple assignment and its inputs are
10787 invariant then it can remain in place, unvectorized. The original last
10788 scalar value that it computes will be used. */
10789 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10791 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10792 if (dump_enabled_p ())
10793 dump_printf_loc (MSG_NOTE, vect_location,
10794 "statement is simple and uses invariant. Leaving in "
10795 "place.\n");
10796 return true;
10799 if (slp_node)
10800 ncopies = 1;
10801 else
10802 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10804 if (slp_node)
10806 gcc_assert (slp_index >= 0);
10808 /* Get the last occurrence of the scalar index from the concatenation of
10809 all the slp vectors. Calculate which slp vector it is and the index
10810 within. */
10811 int num_scalar = SLP_TREE_LANES (slp_node);
10812 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10813 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10815 /* Calculate which vector contains the result, and which lane of
10816 that vector we need. */
10817 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10819 if (dump_enabled_p ())
10820 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10821 "Cannot determine which vector holds the"
10822 " final result.\n");
10823 return false;
10827 if (!vec_stmt_p)
10829 /* No transformation required. */
10830 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10832 if (slp_node)
10834 if (dump_enabled_p ())
10835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10836 "can't operate on partial vectors "
10837 "because an SLP statement is live after "
10838 "the loop.\n");
10839 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10841 else if (ncopies > 1)
10843 if (dump_enabled_p ())
10844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10845 "can't operate on partial vectors "
10846 "because ncopies is greater than 1.\n");
10847 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10849 else
10851 gcc_assert (ncopies == 1 && !slp_node);
10852 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10853 OPTIMIZE_FOR_SPEED))
10854 vect_record_loop_mask (loop_vinfo,
10855 &LOOP_VINFO_MASKS (loop_vinfo),
10856 1, vectype, NULL);
10857 else if (can_vec_extract_var_idx_p (
10858 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10859 vect_record_loop_len (loop_vinfo,
10860 &LOOP_VINFO_LENS (loop_vinfo),
10861 1, vectype, 1);
10862 else
10864 if (dump_enabled_p ())
10865 dump_printf_loc (
10866 MSG_MISSED_OPTIMIZATION, vect_location,
10867 "can't operate on partial vectors "
10868 "because the target doesn't support extract "
10869 "last reduction.\n");
10870 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10874 /* ??? Enable for loop costing as well. */
10875 if (!loop_vinfo)
10876 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10877 0, vect_epilogue);
10878 return true;
10881 /* Use the lhs of the original scalar statement. */
10882 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10883 if (dump_enabled_p ())
10884 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10885 "stmt %G", stmt);
10887 lhs = gimple_get_lhs (stmt);
10888 lhs_type = TREE_TYPE (lhs);
10890 bitsize = vector_element_bits_tree (vectype);
10892 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10893 tree vec_lhs, vec_lhs0, bitstart;
10894 gimple *vec_stmt, *vec_stmt0;
10895 if (slp_node)
10897 gcc_assert (!loop_vinfo
10898 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10899 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10901 /* Get the correct slp vectorized stmt. */
10902 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10903 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10905 /* In case we need to early break vectorize also get the first stmt. */
10906 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10907 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10909 /* Get entry to use. */
10910 bitstart = bitsize_int (vec_index);
10911 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10913 else
10915 /* For multiple copies, get the last copy. */
10916 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10917 vec_lhs = gimple_get_lhs (vec_stmt);
10919 /* In case we need to early break vectorize also get the first stmt. */
10920 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10921 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10923 /* Get the last lane in the vector. */
10924 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10927 if (loop_vinfo)
10929 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10930 requirement, insert one phi node for it. It looks like:
10931 loop;
10933 # lhs' = PHI <lhs>
10935 loop;
10937 # vec_lhs' = PHI <vec_lhs>
10938 new_tree = lane_extract <vec_lhs', ...>;
10939 lhs' = new_tree; */
10941 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10942 /* Check if we have a loop where the chosen exit is not the main exit,
10943 in these cases for an early break we restart the iteration the vector code
10944 did. For the live values we want the value at the start of the iteration
10945 rather than at the end. */
10946 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10947 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10948 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10949 if (!is_gimple_debug (use_stmt)
10950 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10951 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10953 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10954 phi_arg_index_from_use (use_p));
10955 bool main_exit_edge = e == main_e
10956 || find_connected_edge (main_e, e->src);
10958 /* Early exits have an merge block, we want the merge block itself
10959 so use ->src. For main exit the merge block is the
10960 destination. */
10961 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10962 tree tmp_vec_lhs = vec_lhs;
10963 tree tmp_bitstart = bitstart;
10965 /* For early exit where the exit is not in the BB that leads
10966 to the latch then we're restarting the iteration in the
10967 scalar loop. So get the first live value. */
10968 restart_loop = restart_loop || !main_exit_edge;
10969 if (restart_loop
10970 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10972 tmp_vec_lhs = vec_lhs0;
10973 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10976 gimple_stmt_iterator exit_gsi;
10977 tree new_tree
10978 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10979 dest, vectype, ncopies,
10980 slp_node, bitsize,
10981 tmp_bitstart, tmp_vec_lhs,
10982 lhs_type, &exit_gsi);
10984 if (gimple_phi_num_args (use_stmt) == 1)
10986 auto gsi = gsi_for_stmt (use_stmt);
10987 remove_phi_node (&gsi, false);
10988 tree lhs_phi = gimple_phi_result (use_stmt);
10989 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10990 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10992 else
10993 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
10996 /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10997 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10998 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11000 else
11002 /* For basic-block vectorization simply insert the lane-extraction. */
11003 tree bftype = TREE_TYPE (vectype);
11004 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11005 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11006 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11007 vec_lhs, bitsize, bitstart);
11008 gimple_seq stmts = NULL;
11009 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11010 &stmts, true, NULL_TREE);
11011 if (TREE_CODE (new_tree) == SSA_NAME
11012 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11013 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11014 if (is_a <gphi *> (vec_stmt))
11016 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11017 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11019 else
11021 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11022 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11025 /* Replace use of lhs with newly computed result. If the use stmt is a
11026 single arg PHI, just replace all uses of PHI result. It's necessary
11027 because lcssa PHI defining lhs may be before newly inserted stmt. */
11028 use_operand_p use_p;
11029 stmt_vec_info use_stmt_info;
11030 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11031 if (!is_gimple_debug (use_stmt)
11032 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11033 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11035 /* ??? This can happen when the live lane ends up being
11036 rooted in a vector construction code-generated by an
11037 external SLP node (and code-generation for that already
11038 happened). See gcc.dg/vect/bb-slp-47.c.
11039 Doing this is what would happen if that vector CTOR
11040 were not code-generated yet so it is not too bad.
11041 ??? In fact we'd likely want to avoid this situation
11042 in the first place. */
11043 if (TREE_CODE (new_tree) == SSA_NAME
11044 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11045 && gimple_code (use_stmt) != GIMPLE_PHI
11046 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11047 use_stmt))
11049 if (dump_enabled_p ())
11050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11051 "Using original scalar computation for "
11052 "live lane because use preceeds vector "
11053 "def\n");
11054 continue;
11056 /* ??? It can also happen that we end up pulling a def into
11057 a loop where replacing out-of-loop uses would require
11058 a new LC SSA PHI node. Retain the original scalar in
11059 those cases as well. PR98064. */
11060 if (TREE_CODE (new_tree) == SSA_NAME
11061 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11062 && (gimple_bb (use_stmt)->loop_father
11063 != gimple_bb (vec_stmt)->loop_father)
11064 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11065 gimple_bb (use_stmt)->loop_father))
11067 if (dump_enabled_p ())
11068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11069 "Using original scalar computation for "
11070 "live lane because there is an out-of-loop "
11071 "definition for it\n");
11072 continue;
11074 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11075 SET_USE (use_p, new_tree);
11076 update_stmt (use_stmt);
11080 return true;
11083 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11085 static void
11086 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11088 ssa_op_iter op_iter;
11089 imm_use_iterator imm_iter;
11090 def_operand_p def_p;
11091 gimple *ustmt;
11093 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11095 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11097 basic_block bb;
11099 if (!is_gimple_debug (ustmt))
11100 continue;
11102 bb = gimple_bb (ustmt);
11104 if (!flow_bb_inside_loop_p (loop, bb))
11106 if (gimple_debug_bind_p (ustmt))
11108 if (dump_enabled_p ())
11109 dump_printf_loc (MSG_NOTE, vect_location,
11110 "killing debug use\n");
11112 gimple_debug_bind_reset_value (ustmt);
11113 update_stmt (ustmt);
11115 else
11116 gcc_unreachable ();
11122 /* Given loop represented by LOOP_VINFO, return true if computation of
11123 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11124 otherwise. */
11126 static bool
11127 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11129 /* Constant case. */
11130 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11132 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11133 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11135 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11136 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11137 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11138 return true;
11141 widest_int max;
11142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11143 /* Check the upper bound of loop niters. */
11144 if (get_max_loop_iterations (loop, &max))
11146 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11147 signop sgn = TYPE_SIGN (type);
11148 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11149 if (max < type_max)
11150 return true;
11152 return false;
11155 /* Return a mask type with half the number of elements as OLD_TYPE,
11156 given that it should have mode NEW_MODE. */
11158 tree
11159 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11161 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11162 return build_truth_vector_type_for_mode (nunits, new_mode);
11165 /* Return a mask type with twice as many elements as OLD_TYPE,
11166 given that it should have mode NEW_MODE. */
11168 tree
11169 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11171 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11172 return build_truth_vector_type_for_mode (nunits, new_mode);
11175 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11176 contain a sequence of NVECTORS masks that each control a vector of type
11177 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11178 these vector masks with the vector version of SCALAR_MASK. */
11180 void
11181 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11182 unsigned int nvectors, tree vectype, tree scalar_mask)
11184 gcc_assert (nvectors != 0);
11186 if (scalar_mask)
11188 scalar_cond_masked_key cond (scalar_mask, nvectors);
11189 loop_vinfo->scalar_cond_masked_set.add (cond);
11192 masks->mask_set.add (std::make_pair (vectype, nvectors));
11195 /* Given a complete set of masks MASKS, extract mask number INDEX
11196 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11197 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11199 See the comment above vec_loop_masks for more details about the mask
11200 arrangement. */
11202 tree
11203 vect_get_loop_mask (loop_vec_info loop_vinfo,
11204 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11205 unsigned int nvectors, tree vectype, unsigned int index)
11207 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11208 == vect_partial_vectors_while_ult)
11210 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11211 tree mask_type = rgm->type;
11213 /* Populate the rgroup's mask array, if this is the first time we've
11214 used it. */
11215 if (rgm->controls.is_empty ())
11217 rgm->controls.safe_grow_cleared (nvectors, true);
11218 for (unsigned int i = 0; i < nvectors; ++i)
11220 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11221 /* Provide a dummy definition until the real one is available. */
11222 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11223 rgm->controls[i] = mask;
11227 tree mask = rgm->controls[index];
11228 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11229 TYPE_VECTOR_SUBPARTS (vectype)))
11231 /* A loop mask for data type X can be reused for data type Y
11232 if X has N times more elements than Y and if Y's elements
11233 are N times bigger than X's. In this case each sequence
11234 of N elements in the loop mask will be all-zero or all-one.
11235 We can then view-convert the mask so that each sequence of
11236 N elements is replaced by a single element. */
11237 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11238 TYPE_VECTOR_SUBPARTS (vectype)));
11239 gimple_seq seq = NULL;
11240 mask_type = truth_type_for (vectype);
11241 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11242 if (seq)
11243 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11245 return mask;
11247 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11248 == vect_partial_vectors_avx512)
11250 /* The number of scalars per iteration and the number of vectors are
11251 both compile-time constants. */
11252 unsigned int nscalars_per_iter
11253 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11254 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11256 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11258 /* The stored nV is dependent on the mask type produced. */
11259 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11260 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11261 == rgm->factor);
11262 nvectors = rgm->factor;
11264 /* Populate the rgroup's mask array, if this is the first time we've
11265 used it. */
11266 if (rgm->controls.is_empty ())
11268 rgm->controls.safe_grow_cleared (nvectors, true);
11269 for (unsigned int i = 0; i < nvectors; ++i)
11271 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11272 /* Provide a dummy definition until the real one is available. */
11273 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11274 rgm->controls[i] = mask;
11277 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11278 TYPE_VECTOR_SUBPARTS (vectype)))
11279 return rgm->controls[index];
11281 /* Split the vector if needed. Since we are dealing with integer mode
11282 masks with AVX512 we can operate on the integer representation
11283 performing the whole vector shifting. */
11284 unsigned HOST_WIDE_INT factor;
11285 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11286 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11287 gcc_assert (ok);
11288 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11289 tree mask_type = truth_type_for (vectype);
11290 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11291 unsigned vi = index / factor;
11292 unsigned vpart = index % factor;
11293 tree vec = rgm->controls[vi];
11294 gimple_seq seq = NULL;
11295 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11296 lang_hooks.types.type_for_mode
11297 (TYPE_MODE (rgm->type), 1), vec);
11298 /* For integer mode masks simply shift the right bits into position. */
11299 if (vpart != 0)
11300 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11301 build_int_cst (integer_type_node,
11302 (TYPE_VECTOR_SUBPARTS (vectype)
11303 * vpart)));
11304 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11305 (TYPE_MODE (mask_type), 1), vec);
11306 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11307 if (seq)
11308 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11309 return vec;
11311 else
11312 gcc_unreachable ();
11315 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11316 lengths for controlling an operation on VECTYPE. The operation splits
11317 each element of VECTYPE into FACTOR separate subelements, measuring the
11318 length as a number of these subelements. */
11320 void
11321 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11322 unsigned int nvectors, tree vectype, unsigned int factor)
11324 gcc_assert (nvectors != 0);
11325 if (lens->length () < nvectors)
11326 lens->safe_grow_cleared (nvectors, true);
11327 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11329 /* The number of scalars per iteration, scalar occupied bytes and
11330 the number of vectors are both compile-time constants. */
11331 unsigned int nscalars_per_iter
11332 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11333 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11335 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11337 /* For now, we only support cases in which all loads and stores fall back
11338 to VnQI or none do. */
11339 gcc_assert (!rgl->max_nscalars_per_iter
11340 || (rgl->factor == 1 && factor == 1)
11341 || (rgl->max_nscalars_per_iter * rgl->factor
11342 == nscalars_per_iter * factor));
11343 rgl->max_nscalars_per_iter = nscalars_per_iter;
11344 rgl->type = vectype;
11345 rgl->factor = factor;
11349 /* Given a complete set of lengths LENS, extract length number INDEX
11350 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11351 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11352 multipled by the number of elements that should be processed.
11353 Insert any set-up statements before GSI. */
11355 tree
11356 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11357 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11358 unsigned int index, unsigned int factor)
11360 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11361 bool use_bias_adjusted_len =
11362 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11364 /* Populate the rgroup's len array, if this is the first time we've
11365 used it. */
11366 if (rgl->controls.is_empty ())
11368 rgl->controls.safe_grow_cleared (nvectors, true);
11369 for (unsigned int i = 0; i < nvectors; ++i)
11371 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11372 gcc_assert (len_type != NULL_TREE);
11374 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11376 /* Provide a dummy definition until the real one is available. */
11377 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11378 rgl->controls[i] = len;
11380 if (use_bias_adjusted_len)
11382 gcc_assert (i == 0);
11383 tree adjusted_len =
11384 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11385 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11386 rgl->bias_adjusted_ctrl = adjusted_len;
11391 if (use_bias_adjusted_len)
11392 return rgl->bias_adjusted_ctrl;
11394 tree loop_len = rgl->controls[index];
11395 if (rgl->factor == 1 && factor == 1)
11397 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11398 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11399 if (maybe_ne (nunits1, nunits2))
11401 /* A loop len for data type X can be reused for data type Y
11402 if X has N times more elements than Y and if Y's elements
11403 are N times bigger than X's. */
11404 gcc_assert (multiple_p (nunits1, nunits2));
11405 factor = exact_div (nunits1, nunits2).to_constant ();
11406 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11407 gimple_seq seq = NULL;
11408 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11409 build_int_cst (iv_type, factor));
11410 if (seq)
11411 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11414 return loop_len;
11417 /* Scale profiling counters by estimation for LOOP which is vectorized
11418 by factor VF.
11419 If FLAT is true, the loop we started with had unrealistically flat
11420 profile. */
11422 static void
11423 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11425 /* For flat profiles do not scale down proportionally by VF and only
11426 cap by known iteration count bounds. */
11427 if (flat)
11429 if (dump_file && (dump_flags & TDF_DETAILS))
11430 fprintf (dump_file,
11431 "Vectorized loop profile seems flat; not scaling iteration "
11432 "count down by the vectorization factor %i\n", vf);
11433 scale_loop_profile (loop, profile_probability::always (),
11434 get_likely_max_loop_iterations_int (loop));
11435 return;
11437 /* Loop body executes VF fewer times and exit increases VF times. */
11438 profile_count entry_count = loop_preheader_edge (loop)->count ();
11440 /* If we have unreliable loop profile avoid dropping entry
11441 count bellow header count. This can happen since loops
11442 has unrealistically low trip counts. */
11443 while (vf > 1
11444 && loop->header->count > entry_count
11445 && loop->header->count < entry_count * vf)
11447 if (dump_file && (dump_flags & TDF_DETAILS))
11448 fprintf (dump_file,
11449 "Vectorization factor %i seems too large for profile "
11450 "prevoiusly believed to be consistent; reducing.\n", vf);
11451 vf /= 2;
11454 if (entry_count.nonzero_p ())
11455 set_edge_probability_and_rescale_others
11456 (exit_e,
11457 entry_count.probability_in (loop->header->count / vf));
11458 /* Avoid producing very large exit probability when we do not have
11459 sensible profile. */
11460 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11461 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11462 loop->latch->count = single_pred_edge (loop->latch)->count ();
11464 scale_loop_profile (loop, profile_probability::always () / vf,
11465 get_likely_max_loop_iterations_int (loop));
11468 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11469 latch edge values originally defined by it. */
11471 static void
11472 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11473 stmt_vec_info def_stmt_info)
11475 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11476 if (!def || TREE_CODE (def) != SSA_NAME)
11477 return;
11478 stmt_vec_info phi_info;
11479 imm_use_iterator iter;
11480 use_operand_p use_p;
11481 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11483 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11484 if (!phi)
11485 continue;
11486 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11487 && (phi_info = loop_vinfo->lookup_stmt (phi))
11488 && STMT_VINFO_RELEVANT_P (phi_info)))
11489 continue;
11490 loop_p loop = gimple_bb (phi)->loop_father;
11491 edge e = loop_latch_edge (loop);
11492 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11493 continue;
11495 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11496 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11497 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11499 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11500 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11501 gcc_assert (phi_defs.length () == latch_defs.length ());
11502 for (unsigned i = 0; i < phi_defs.length (); ++i)
11503 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11504 gimple_get_lhs (latch_defs[i]), e,
11505 gimple_phi_arg_location (phi, e->dest_idx));
11507 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11509 /* For first order recurrences we have to update both uses of
11510 the latch definition, the one in the PHI node and the one
11511 in the generated VEC_PERM_EXPR. */
11512 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11513 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11514 gcc_assert (phi_defs.length () == latch_defs.length ());
11515 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11516 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11517 for (unsigned i = 0; i < phi_defs.length (); ++i)
11519 gassign *perm = as_a <gassign *> (phi_defs[i]);
11520 if (i > 0)
11521 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11522 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11523 update_stmt (perm);
11525 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11526 gimple_phi_arg_location (phi, e->dest_idx));
11531 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11532 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11533 stmt_vec_info. */
11535 static bool
11536 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11537 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11539 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11540 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11542 if (dump_enabled_p ())
11543 dump_printf_loc (MSG_NOTE, vect_location,
11544 "------>vectorizing statement: %G", stmt_info->stmt);
11546 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11547 vect_loop_kill_debug_uses (loop, stmt_info);
11549 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11550 && !STMT_VINFO_LIVE_P (stmt_info))
11552 if (is_gimple_call (stmt_info->stmt)
11553 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11555 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11556 *seen_store = stmt_info;
11557 return false;
11559 return false;
11562 if (STMT_VINFO_VECTYPE (stmt_info))
11564 poly_uint64 nunits
11565 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11566 if (!STMT_SLP_TYPE (stmt_info)
11567 && maybe_ne (nunits, vf)
11568 && dump_enabled_p ())
11569 /* For SLP VF is set according to unrolling factor, and not
11570 to vector size, hence for SLP this print is not valid. */
11571 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11574 /* Pure SLP statements have already been vectorized. We still need
11575 to apply loop vectorization to hybrid SLP statements. */
11576 if (PURE_SLP_STMT (stmt_info))
11577 return false;
11579 if (dump_enabled_p ())
11580 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11582 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11583 *seen_store = stmt_info;
11585 return true;
11588 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11589 in the hash_map with its corresponding values. */
11591 static tree
11592 find_in_mapping (tree t, void *context)
11594 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11596 tree *value = mapping->get (t);
11597 return value ? *value : t;
11600 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11601 original loop that has now been vectorized.
11603 The inits of the data_references need to be advanced with the number of
11604 iterations of the main loop. This has been computed in vect_do_peeling and
11605 is stored in parameter ADVANCE. We first restore the data_references
11606 initial offset with the values recored in ORIG_DRS_INIT.
11608 Since the loop_vec_info of this EPILOGUE was constructed for the original
11609 loop, its stmt_vec_infos all point to the original statements. These need
11610 to be updated to point to their corresponding copies as well as the SSA_NAMES
11611 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11613 The data_reference's connections also need to be updated. Their
11614 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11615 stmt_vec_infos, their statements need to point to their corresponding copy,
11616 if they are gather loads or scatter stores then their reference needs to be
11617 updated to point to its corresponding copy and finally we set
11618 'base_misaligned' to false as we have already peeled for alignment in the
11619 prologue of the main loop. */
11621 static void
11622 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11624 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11625 auto_vec<gimple *> stmt_worklist;
11626 hash_map<tree,tree> mapping;
11627 gimple *orig_stmt, *new_stmt;
11628 gimple_stmt_iterator epilogue_gsi;
11629 gphi_iterator epilogue_phi_gsi;
11630 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11631 basic_block *epilogue_bbs = get_loop_body (epilogue);
11632 unsigned i;
11634 free (LOOP_VINFO_BBS (epilogue_vinfo));
11635 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11637 /* Advance data_reference's with the number of iterations of the previous
11638 loop and its prologue. */
11639 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11642 /* The EPILOGUE loop is a copy of the original loop so they share the same
11643 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11644 point to the copied statements. We also create a mapping of all LHS' in
11645 the original loop and all the LHS' in the EPILOGUE and create worklists to
11646 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11647 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11649 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11650 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11652 new_stmt = epilogue_phi_gsi.phi ();
11654 gcc_assert (gimple_uid (new_stmt) > 0);
11655 stmt_vinfo
11656 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11658 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11659 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11661 mapping.put (gimple_phi_result (orig_stmt),
11662 gimple_phi_result (new_stmt));
11663 /* PHI nodes can not have patterns or related statements. */
11664 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11665 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11668 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11669 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11671 new_stmt = gsi_stmt (epilogue_gsi);
11672 if (is_gimple_debug (new_stmt))
11673 continue;
11675 gcc_assert (gimple_uid (new_stmt) > 0);
11676 stmt_vinfo
11677 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11679 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11680 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11682 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11683 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11685 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11687 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11688 for (gimple_stmt_iterator gsi = gsi_start (seq);
11689 !gsi_end_p (gsi); gsi_next (&gsi))
11690 stmt_worklist.safe_push (gsi_stmt (gsi));
11693 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11694 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11696 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11697 stmt_worklist.safe_push (stmt);
11698 /* Set BB such that the assert in
11699 'get_initial_def_for_reduction' is able to determine that
11700 the BB of the related stmt is inside this loop. */
11701 gimple_set_bb (stmt,
11702 gimple_bb (new_stmt));
11703 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11704 gcc_assert (related_vinfo == NULL
11705 || related_vinfo == stmt_vinfo);
11710 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11711 using the original main loop and thus need to be updated to refer to the
11712 cloned variables used in the epilogue. */
11713 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11715 gimple *stmt = stmt_worklist[i];
11716 tree *new_op;
11718 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11720 tree op = gimple_op (stmt, j);
11721 if ((new_op = mapping.get(op)))
11722 gimple_set_op (stmt, j, *new_op);
11723 else
11725 /* PR92429: The last argument of simplify_replace_tree disables
11726 folding when replacing arguments. This is required as
11727 otherwise you might end up with different statements than the
11728 ones analyzed in vect_loop_analyze, leading to different
11729 vectorization. */
11730 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11731 &find_in_mapping, &mapping, false);
11732 gimple_set_op (stmt, j, op);
11737 struct data_reference *dr;
11738 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11739 FOR_EACH_VEC_ELT (datarefs, i, dr)
11741 orig_stmt = DR_STMT (dr);
11742 gcc_assert (gimple_uid (orig_stmt) > 0);
11743 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11744 /* Data references for gather loads and scatter stores do not use the
11745 updated offset we set using ADVANCE. Instead we have to make sure the
11746 reference in the data references point to the corresponding copy of
11747 the original in the epilogue. Make sure to update both
11748 gather/scatters recognized by dataref analysis and also other
11749 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11750 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11751 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11752 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11754 DR_REF (dr)
11755 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11756 &find_in_mapping, &mapping);
11757 DR_BASE_ADDRESS (dr)
11758 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11759 &find_in_mapping, &mapping);
11761 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11762 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11763 /* The vector size of the epilogue is smaller than that of the main loop
11764 so the alignment is either the same or lower. This means the dr will
11765 thus by definition be aligned. */
11766 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11769 epilogue_vinfo->shared->datarefs_copy.release ();
11770 epilogue_vinfo->shared->save_datarefs ();
11773 /* When vectorizing early break statements instructions that happen before
11774 the early break in the current BB need to be moved to after the early
11775 break. This function deals with that and assumes that any validity
11776 checks has already been performed.
11778 While moving the instructions if it encounters a VUSE or VDEF it then
11779 corrects the VUSES as it moves the statements along. GDEST is the location
11780 in which to insert the new statements. */
11782 static void
11783 move_early_exit_stmts (loop_vec_info loop_vinfo)
11785 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11787 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11788 return;
11790 /* Move all stmts that need moving. */
11791 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11792 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11794 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11796 /* Check to see if statement is still required for vect or has been
11797 elided. */
11798 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11799 if (!stmt_info)
11800 continue;
11802 if (dump_enabled_p ())
11803 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11805 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11806 gsi_move_before (&stmt_gsi, &dest_gsi);
11807 gsi_prev (&dest_gsi);
11810 /* Update all the stmts with their new reaching VUSES. */
11811 tree vuse
11812 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11813 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11815 if (dump_enabled_p ())
11816 dump_printf_loc (MSG_NOTE, vect_location,
11817 "updating vuse to %T for load %G", vuse, p);
11818 gimple_set_vuse (p, vuse);
11819 update_stmt (p);
11823 /* Function vect_transform_loop.
11825 The analysis phase has determined that the loop is vectorizable.
11826 Vectorize the loop - created vectorized stmts to replace the scalar
11827 stmts in the loop, and update the loop exit condition.
11828 Returns scalar epilogue loop if any. */
11830 class loop *
11831 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11833 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11834 class loop *epilogue = NULL;
11835 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11836 int nbbs = loop->num_nodes;
11837 int i;
11838 tree niters_vector = NULL_TREE;
11839 tree step_vector = NULL_TREE;
11840 tree niters_vector_mult_vf = NULL_TREE;
11841 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11842 unsigned int lowest_vf = constant_lower_bound (vf);
11843 gimple *stmt;
11844 bool check_profitability = false;
11845 unsigned int th;
11846 bool flat = maybe_flat_loop_profile (loop);
11848 DUMP_VECT_SCOPE ("vec_transform_loop");
11850 loop_vinfo->shared->check_datarefs ();
11852 /* Use the more conservative vectorization threshold. If the number
11853 of iterations is constant assume the cost check has been performed
11854 by our caller. If the threshold makes all loops profitable that
11855 run at least the (estimated) vectorization factor number of times
11856 checking is pointless, too. */
11857 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11858 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11860 if (dump_enabled_p ())
11861 dump_printf_loc (MSG_NOTE, vect_location,
11862 "Profitability threshold is %d loop iterations.\n",
11863 th);
11864 check_profitability = true;
11867 /* Make sure there exists a single-predecessor exit bb. Do this before
11868 versioning. */
11869 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11870 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11872 split_loop_exit_edge (e, true);
11873 if (dump_enabled_p ())
11874 dump_printf (MSG_NOTE, "split exit edge\n");
11877 /* Version the loop first, if required, so the profitability check
11878 comes first. */
11880 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11882 class loop *sloop
11883 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11884 sloop->force_vectorize = false;
11885 check_profitability = false;
11888 /* Make sure there exists a single-predecessor exit bb also on the
11889 scalar loop copy. Do this after versioning but before peeling
11890 so CFG structure is fine for both scalar and if-converted loop
11891 to make slpeel_duplicate_current_defs_from_edges face matched
11892 loop closed PHI nodes on the exit. */
11893 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11895 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11896 if (! single_pred_p (e->dest))
11898 split_loop_exit_edge (e, true);
11899 if (dump_enabled_p ())
11900 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11904 tree niters = vect_build_loop_niters (loop_vinfo);
11905 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11906 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11907 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11908 tree advance;
11909 drs_init_vec orig_drs_init;
11911 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11912 &step_vector, &niters_vector_mult_vf, th,
11913 check_profitability, niters_no_overflow,
11914 &advance);
11915 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11916 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11918 /* Ifcvt duplicates loop preheader, loop body and produces an basic
11919 block after loop exit. We need to scale all that. */
11920 basic_block preheader
11921 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11922 preheader->count
11923 = preheader->count.apply_probability
11924 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11925 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11926 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11927 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11928 = preheader->count;
11931 if (niters_vector == NULL_TREE)
11933 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11934 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11935 && known_eq (lowest_vf, vf))
11937 niters_vector
11938 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11939 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11940 step_vector = build_one_cst (TREE_TYPE (niters));
11942 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11943 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11944 &step_vector, niters_no_overflow);
11945 else
11946 /* vect_do_peeling subtracted the number of peeled prologue
11947 iterations from LOOP_VINFO_NITERS. */
11948 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11949 &niters_vector, &step_vector,
11950 niters_no_overflow);
11953 /* 1) Make sure the loop header has exactly two entries
11954 2) Make sure we have a preheader basic block. */
11956 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11958 split_edge (loop_preheader_edge (loop));
11960 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11961 /* This will deal with any possible peeling. */
11962 vect_prepare_for_masked_peels (loop_vinfo);
11964 /* Handle any code motion that we need to for early-break vectorization after
11965 we've done peeling but just before we start vectorizing. */
11966 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11967 move_early_exit_stmts (loop_vinfo);
11969 /* Schedule the SLP instances first, then handle loop vectorization
11970 below. */
11971 if (!loop_vinfo->slp_instances.is_empty ())
11973 DUMP_VECT_SCOPE ("scheduling SLP instances");
11974 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11977 /* FORNOW: the vectorizer supports only loops which body consist
11978 of one basic block (header + empty latch). When the vectorizer will
11979 support more involved loop forms, the order by which the BBs are
11980 traversed need to be reconsidered. */
11982 for (i = 0; i < nbbs; i++)
11984 basic_block bb = bbs[i];
11985 stmt_vec_info stmt_info;
11987 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11988 gsi_next (&si))
11990 gphi *phi = si.phi ();
11991 if (dump_enabled_p ())
11992 dump_printf_loc (MSG_NOTE, vect_location,
11993 "------>vectorizing phi: %G", (gimple *) phi);
11994 stmt_info = loop_vinfo->lookup_stmt (phi);
11995 if (!stmt_info)
11996 continue;
11998 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11999 vect_loop_kill_debug_uses (loop, stmt_info);
12001 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12002 && !STMT_VINFO_LIVE_P (stmt_info))
12003 continue;
12005 if (STMT_VINFO_VECTYPE (stmt_info)
12006 && (maybe_ne
12007 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12008 && dump_enabled_p ())
12009 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12011 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12012 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12013 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12014 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12015 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12017 && ! PURE_SLP_STMT (stmt_info))
12019 if (dump_enabled_p ())
12020 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12021 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12025 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12026 gsi_next (&si))
12028 gphi *phi = si.phi ();
12029 stmt_info = loop_vinfo->lookup_stmt (phi);
12030 if (!stmt_info)
12031 continue;
12033 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12034 && !STMT_VINFO_LIVE_P (stmt_info))
12035 continue;
12037 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12038 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12039 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12040 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12041 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12042 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12043 && ! PURE_SLP_STMT (stmt_info))
12044 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12047 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12048 !gsi_end_p (si);)
12050 stmt = gsi_stmt (si);
12051 /* During vectorization remove existing clobber stmts. */
12052 if (gimple_clobber_p (stmt))
12054 unlink_stmt_vdef (stmt);
12055 gsi_remove (&si, true);
12056 release_defs (stmt);
12058 else
12060 /* Ignore vector stmts created in the outer loop. */
12061 stmt_info = loop_vinfo->lookup_stmt (stmt);
12063 /* vector stmts created in the outer-loop during vectorization of
12064 stmts in an inner-loop may not have a stmt_info, and do not
12065 need to be vectorized. */
12066 stmt_vec_info seen_store = NULL;
12067 if (stmt_info)
12069 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12071 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12072 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12073 !gsi_end_p (subsi); gsi_next (&subsi))
12075 stmt_vec_info pat_stmt_info
12076 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12077 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12078 &si, &seen_store);
12080 stmt_vec_info pat_stmt_info
12081 = STMT_VINFO_RELATED_STMT (stmt_info);
12082 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12083 &si, &seen_store))
12084 maybe_set_vectorized_backedge_value (loop_vinfo,
12085 pat_stmt_info);
12087 else
12089 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12090 &seen_store))
12091 maybe_set_vectorized_backedge_value (loop_vinfo,
12092 stmt_info);
12095 gsi_next (&si);
12096 if (seen_store)
12098 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12099 /* Interleaving. If IS_STORE is TRUE, the
12100 vectorization of the interleaving chain was
12101 completed - free all the stores in the chain. */
12102 vect_remove_stores (loop_vinfo,
12103 DR_GROUP_FIRST_ELEMENT (seen_store));
12104 else
12105 /* Free the attached stmt_vec_info and remove the stmt. */
12106 loop_vinfo->remove_stmt (stmt_info);
12111 /* Stub out scalar statements that must not survive vectorization.
12112 Doing this here helps with grouped statements, or statements that
12113 are involved in patterns. */
12114 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12115 !gsi_end_p (gsi); gsi_next (&gsi))
12117 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12118 if (!call || !gimple_call_internal_p (call))
12119 continue;
12120 internal_fn ifn = gimple_call_internal_fn (call);
12121 if (ifn == IFN_MASK_LOAD)
12123 tree lhs = gimple_get_lhs (call);
12124 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12126 tree zero = build_zero_cst (TREE_TYPE (lhs));
12127 gimple *new_stmt = gimple_build_assign (lhs, zero);
12128 gsi_replace (&gsi, new_stmt, true);
12131 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12133 tree lhs = gimple_get_lhs (call);
12134 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12136 tree else_arg
12137 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12138 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12139 gsi_replace (&gsi, new_stmt, true);
12143 } /* BBs in loop */
12145 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12146 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12147 if (integer_onep (step_vector))
12148 niters_no_overflow = true;
12149 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12150 niters_vector, step_vector, niters_vector_mult_vf,
12151 !niters_no_overflow);
12153 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12155 /* True if the final iteration might not handle a full vector's
12156 worth of scalar iterations. */
12157 bool final_iter_may_be_partial
12158 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12159 /* The minimum number of iterations performed by the epilogue. This
12160 is 1 when peeling for gaps because we always need a final scalar
12161 iteration. */
12162 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12163 /* +1 to convert latch counts to loop iteration counts,
12164 -min_epilogue_iters to remove iterations that cannot be performed
12165 by the vector code. */
12166 int bias_for_lowest = 1 - min_epilogue_iters;
12167 int bias_for_assumed = bias_for_lowest;
12168 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12169 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12171 /* When the amount of peeling is known at compile time, the first
12172 iteration will have exactly alignment_npeels active elements.
12173 In the worst case it will have at least one. */
12174 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12175 bias_for_lowest += lowest_vf - min_first_active;
12176 bias_for_assumed += assumed_vf - min_first_active;
12178 /* In these calculations the "- 1" converts loop iteration counts
12179 back to latch counts. */
12180 if (loop->any_upper_bound)
12182 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12183 loop->nb_iterations_upper_bound
12184 = (final_iter_may_be_partial
12185 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12186 lowest_vf) - 1
12187 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12188 lowest_vf) - 1);
12189 if (main_vinfo
12190 /* Both peeling for alignment and peeling for gaps can end up
12191 with the scalar epilogue running for more than VF-1 iterations. */
12192 && !main_vinfo->peeling_for_alignment
12193 && !main_vinfo->peeling_for_gaps)
12195 unsigned int bound;
12196 poly_uint64 main_iters
12197 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12198 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12199 main_iters
12200 = upper_bound (main_iters,
12201 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12202 if (can_div_away_from_zero_p (main_iters,
12203 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12204 &bound))
12205 loop->nb_iterations_upper_bound
12206 = wi::umin ((bound_wide_int) (bound - 1),
12207 loop->nb_iterations_upper_bound);
12210 if (loop->any_likely_upper_bound)
12211 loop->nb_iterations_likely_upper_bound
12212 = (final_iter_may_be_partial
12213 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12214 + bias_for_lowest, lowest_vf) - 1
12215 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12216 + bias_for_lowest, lowest_vf) - 1);
12217 if (loop->any_estimate)
12218 loop->nb_iterations_estimate
12219 = (final_iter_may_be_partial
12220 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12221 assumed_vf) - 1
12222 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12223 assumed_vf) - 1);
12224 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12225 assumed_vf, flat);
12227 if (dump_enabled_p ())
12229 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12231 dump_printf_loc (MSG_NOTE, vect_location,
12232 "LOOP VECTORIZED\n");
12233 if (loop->inner)
12234 dump_printf_loc (MSG_NOTE, vect_location,
12235 "OUTER LOOP VECTORIZED\n");
12236 dump_printf (MSG_NOTE, "\n");
12238 else
12239 dump_printf_loc (MSG_NOTE, vect_location,
12240 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12241 GET_MODE_NAME (loop_vinfo->vector_mode));
12244 /* Loops vectorized with a variable factor won't benefit from
12245 unrolling/peeling. */
12246 if (!vf.is_constant ())
12248 loop->unroll = 1;
12249 if (dump_enabled_p ())
12250 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12251 " variable-length vectorization factor\n");
12253 /* Free SLP instances here because otherwise stmt reference counting
12254 won't work. */
12255 slp_instance instance;
12256 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12257 vect_free_slp_instance (instance);
12258 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12259 /* Clear-up safelen field since its value is invalid after vectorization
12260 since vectorized loop can have loop-carried dependencies. */
12261 loop->safelen = 0;
12263 if (epilogue)
12265 update_epilogue_loop_vinfo (epilogue, advance);
12267 epilogue->simduid = loop->simduid;
12268 epilogue->force_vectorize = loop->force_vectorize;
12269 epilogue->dont_vectorize = false;
12272 return epilogue;
12275 /* The code below is trying to perform simple optimization - revert
12276 if-conversion for masked stores, i.e. if the mask of a store is zero
12277 do not perform it and all stored value producers also if possible.
12278 For example,
12279 for (i=0; i<n; i++)
12280 if (c[i])
12282 p1[i] += 1;
12283 p2[i] = p3[i] +2;
12285 this transformation will produce the following semi-hammock:
12287 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12289 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12290 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12291 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12292 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12293 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12294 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12298 void
12299 optimize_mask_stores (class loop *loop)
12301 basic_block *bbs = get_loop_body (loop);
12302 unsigned nbbs = loop->num_nodes;
12303 unsigned i;
12304 basic_block bb;
12305 class loop *bb_loop;
12306 gimple_stmt_iterator gsi;
12307 gimple *stmt;
12308 auto_vec<gimple *> worklist;
12309 auto_purge_vect_location sentinel;
12311 vect_location = find_loop_location (loop);
12312 /* Pick up all masked stores in loop if any. */
12313 for (i = 0; i < nbbs; i++)
12315 bb = bbs[i];
12316 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12317 gsi_next (&gsi))
12319 stmt = gsi_stmt (gsi);
12320 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12321 worklist.safe_push (stmt);
12325 free (bbs);
12326 if (worklist.is_empty ())
12327 return;
12329 /* Loop has masked stores. */
12330 while (!worklist.is_empty ())
12332 gimple *last, *last_store;
12333 edge e, efalse;
12334 tree mask;
12335 basic_block store_bb, join_bb;
12336 gimple_stmt_iterator gsi_to;
12337 tree vdef, new_vdef;
12338 gphi *phi;
12339 tree vectype;
12340 tree zero;
12342 last = worklist.pop ();
12343 mask = gimple_call_arg (last, 2);
12344 bb = gimple_bb (last);
12345 /* Create then_bb and if-then structure in CFG, then_bb belongs to
12346 the same loop as if_bb. It could be different to LOOP when two
12347 level loop-nest is vectorized and mask_store belongs to the inner
12348 one. */
12349 e = split_block (bb, last);
12350 bb_loop = bb->loop_father;
12351 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12352 join_bb = e->dest;
12353 store_bb = create_empty_bb (bb);
12354 add_bb_to_loop (store_bb, bb_loop);
12355 e->flags = EDGE_TRUE_VALUE;
12356 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12357 /* Put STORE_BB to likely part. */
12358 efalse->probability = profile_probability::likely ();
12359 e->probability = efalse->probability.invert ();
12360 store_bb->count = efalse->count ();
12361 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12362 if (dom_info_available_p (CDI_DOMINATORS))
12363 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12364 if (dump_enabled_p ())
12365 dump_printf_loc (MSG_NOTE, vect_location,
12366 "Create new block %d to sink mask stores.",
12367 store_bb->index);
12368 /* Create vector comparison with boolean result. */
12369 vectype = TREE_TYPE (mask);
12370 zero = build_zero_cst (vectype);
12371 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12372 gsi = gsi_last_bb (bb);
12373 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12374 /* Create new PHI node for vdef of the last masked store:
12375 .MEM_2 = VDEF <.MEM_1>
12376 will be converted to
12377 .MEM.3 = VDEF <.MEM_1>
12378 and new PHI node will be created in join bb
12379 .MEM_2 = PHI <.MEM_1, .MEM_3>
12381 vdef = gimple_vdef (last);
12382 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12383 gimple_set_vdef (last, new_vdef);
12384 phi = create_phi_node (vdef, join_bb);
12385 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12387 /* Put all masked stores with the same mask to STORE_BB if possible. */
12388 while (true)
12390 gimple_stmt_iterator gsi_from;
12391 gimple *stmt1 = NULL;
12393 /* Move masked store to STORE_BB. */
12394 last_store = last;
12395 gsi = gsi_for_stmt (last);
12396 gsi_from = gsi;
12397 /* Shift GSI to the previous stmt for further traversal. */
12398 gsi_prev (&gsi);
12399 gsi_to = gsi_start_bb (store_bb);
12400 gsi_move_before (&gsi_from, &gsi_to);
12401 /* Setup GSI_TO to the non-empty block start. */
12402 gsi_to = gsi_start_bb (store_bb);
12403 if (dump_enabled_p ())
12404 dump_printf_loc (MSG_NOTE, vect_location,
12405 "Move stmt to created bb\n%G", last);
12406 /* Move all stored value producers if possible. */
12407 while (!gsi_end_p (gsi))
12409 tree lhs;
12410 imm_use_iterator imm_iter;
12411 use_operand_p use_p;
12412 bool res;
12414 /* Skip debug statements. */
12415 if (is_gimple_debug (gsi_stmt (gsi)))
12417 gsi_prev (&gsi);
12418 continue;
12420 stmt1 = gsi_stmt (gsi);
12421 /* Do not consider statements writing to memory or having
12422 volatile operand. */
12423 if (gimple_vdef (stmt1)
12424 || gimple_has_volatile_ops (stmt1))
12425 break;
12426 gsi_from = gsi;
12427 gsi_prev (&gsi);
12428 lhs = gimple_get_lhs (stmt1);
12429 if (!lhs)
12430 break;
12432 /* LHS of vectorized stmt must be SSA_NAME. */
12433 if (TREE_CODE (lhs) != SSA_NAME)
12434 break;
12436 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12438 /* Remove dead scalar statement. */
12439 if (has_zero_uses (lhs))
12441 gsi_remove (&gsi_from, true);
12442 continue;
12446 /* Check that LHS does not have uses outside of STORE_BB. */
12447 res = true;
12448 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12450 gimple *use_stmt;
12451 use_stmt = USE_STMT (use_p);
12452 if (is_gimple_debug (use_stmt))
12453 continue;
12454 if (gimple_bb (use_stmt) != store_bb)
12456 res = false;
12457 break;
12460 if (!res)
12461 break;
12463 if (gimple_vuse (stmt1)
12464 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12465 break;
12467 /* Can move STMT1 to STORE_BB. */
12468 if (dump_enabled_p ())
12469 dump_printf_loc (MSG_NOTE, vect_location,
12470 "Move stmt to created bb\n%G", stmt1);
12471 gsi_move_before (&gsi_from, &gsi_to);
12472 /* Shift GSI_TO for further insertion. */
12473 gsi_prev (&gsi_to);
12475 /* Put other masked stores with the same mask to STORE_BB. */
12476 if (worklist.is_empty ()
12477 || gimple_call_arg (worklist.last (), 2) != mask
12478 || worklist.last () != stmt1)
12479 break;
12480 last = worklist.pop ();
12482 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12486 /* Decide whether it is possible to use a zero-based induction variable
12487 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12488 the value that the induction variable must be able to hold in order
12489 to ensure that the rgroups eventually have no active vector elements.
12490 Return -1 otherwise. */
12492 widest_int
12493 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12495 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12496 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12497 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12499 /* Calculate the value that the induction variable must be able
12500 to hit in order to ensure that we end the loop with an all-false mask.
12501 This involves adding the maximum number of inactive trailing scalar
12502 iterations. */
12503 widest_int iv_limit = -1;
12504 if (max_loop_iterations (loop, &iv_limit))
12506 if (niters_skip)
12508 /* Add the maximum number of skipped iterations to the
12509 maximum iteration count. */
12510 if (TREE_CODE (niters_skip) == INTEGER_CST)
12511 iv_limit += wi::to_widest (niters_skip);
12512 else
12513 iv_limit += max_vf - 1;
12515 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12516 /* Make a conservatively-correct assumption. */
12517 iv_limit += max_vf - 1;
12519 /* IV_LIMIT is the maximum number of latch iterations, which is also
12520 the maximum in-range IV value. Round this value down to the previous
12521 vector alignment boundary and then add an extra full iteration. */
12522 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12523 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12525 return iv_limit;
12528 /* For the given rgroup_controls RGC, check whether an induction variable
12529 would ever hit a value that produces a set of all-false masks or zero
12530 lengths before wrapping around. Return true if it's possible to wrap
12531 around before hitting the desirable value, otherwise return false. */
12533 bool
12534 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12536 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12538 if (iv_limit == -1)
12539 return true;
12541 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12542 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12543 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12545 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12546 return true;
12548 return false;