RISC-V: Make dynamic LMUL cost model more accurate for conversion codes
[official-gcc.git] / gcc / tree-vect-loop.cc
blob f51ae3e719e753059389cf9495b6d65b3b1191cb
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different vector sizes will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
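/* As an illustrative sketch only (not code used by the pass), the
   transformation described above can be written as standalone C using
   GCC's vector_size extension; N is assumed to be a multiple of 8 so
   that no scalar epilogue is needed:

     #define N 1024
     typedef short v8hi __attribute__ ((vector_size (16)));   // 8 x short

     short a[N], b[N], c[N];

     void
     scalar_add (void)
     {
       for (int i = 0; i < N; i++)
         a[i] = b[i] + c[i];
     }

     void
     hand_vectorized_add (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];
     }

   Compiling scalar_add with -O3 (or -O2 -ftree-vectorize) on a target with
   16-byte vectors should produce code equivalent to the second function.  */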
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
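/* A minimal sketch of the strip-mined form for the 4-byte/16-byte example
   above (VF == 4), assuming N is a multiple of VF so that no epilogue loop
   is required; the a[i:VF] notation above corresponds to the vector
   accesses here:

     #define N 1024
     typedef int v4si __attribute__ ((vector_size (16)));   // 4 x int

     int a[N], b[N], c[N];

     void
     strip_mined_add (void)
     {
       for (int i = 0; i < N; i += 4)   // i advances by VF each iteration
         *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];
     }
*/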
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
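/* For instance (illustrative only): in the loop below the scalar J has the
   affine evolution {j_0, +, 3}_1, so *INIT would be j_0 and *STEP would be 3,
   whereas an update like j = j * 2 has no such evolution and is rejected here
   (it may instead be recognized by vect_is_nonlinear_iv_evolution below):

     void
     f (int *a, int n, int j)
     {
       for (int i = 0; i < n; i++)
         {
           a[i] = j;
           j += 3;
         }
     }
*/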
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
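/* An illustrative example of the neg form (the mul and shift forms look the
   same with x = x * 3, x = x << 1 or x = x >> 1 as the update); for the neg
   update the recorded step is the fake constant -1 mentioned above:

     void
     f (int *a, int n, int x)
     {
       for (int i = 0; i < n; i++)
         {
           a[i] = x;
           x = -x;
         }
     }
*/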
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
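/* In source terms a double reduction typically arises from an accumulation
   carried across both loops of a nest, e.g. (illustrative):

     int
     sum2d (int a[][16], int n)
     {
       int sum = 0;
       for (int i = 0; i < n; i++)
         for (int j = 0; j < 16; j++)
           sum += a[i][j];
       return sum;
     }

   Here x_1 above corresponds to the outer-loop PHI of sum and x_2 to the
   inner-loop PHI, for which this predicate returns true during outer-loop
   analysis.  */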
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
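/* An illustrative source form of such a recurrence; PREV carries the value
   of a[i] from the previous iteration, and vectorizing it requires a
   shuffle/permute as noted in the function body below:

     void
     f (int *a, int *b, int n, int prev)
     {
       for (int i = 0; i < n; i++)
         {
           b[i] = a[i] + prev;
           prev = a[i];
         }
     }
*/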
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses.
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
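/* Example3 (illustrative): a cross-iteration cycle that combines several
   operations, such as

     for (i=0; i<N; i++)
       s = s * a[i] + b[i];

   is generally not recognized as an induction or a supported reduction and
   would be classified as an unknown def-use cycle, preventing vectorization
   of the statements involved.  */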
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
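/* As a concrete (illustrative) example: for a counted loop whose main exit
   tests i < n and which is known to execute at least once, the latch runs
   n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS
   (the number of header executions) is n; ASSUMPTIONS is boolean_true_node
   unless the niter expression is only valid under extra conditions.  */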
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment. Analyze
968 all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 early_breaks (false),
1044 no_data_dependencies (false),
1045 has_mask_store (false),
1046 scalar_loop_scaling (profile_probability::uninitialized ()),
1047 scalar_loop (NULL),
1048 orig_loop_info (NULL),
1049 vec_loop_iv_exit (NULL),
1050 vec_epilogue_loop_iv_exit (NULL),
1051 scalar_loop_iv_exit (NULL)
1053 /* CHECKME: We want to visit all BBs before their successors (except for
1054 latch blocks, for which this assertion wouldn't hold). In the simple
1055 case of the loop forms we allow, a dfs order of the BBs would be the same
1056 as a reversed postorder traversal, so we are safe. */
1058 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1059 bbs, loop->num_nodes, loop);
1060 gcc_assert (nbbs == loop->num_nodes);
1062 for (unsigned int i = 0; i < nbbs; i++)
1064 basic_block bb = bbs[i];
1065 gimple_stmt_iterator si;
1067 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1069 gimple *phi = gsi_stmt (si);
1070 gimple_set_uid (phi, 0);
1071 add_stmt (phi);
1074 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1076 gimple *stmt = gsi_stmt (si);
1077 gimple_set_uid (stmt, 0);
1078 if (is_gimple_debug (stmt))
1079 continue;
1080 add_stmt (stmt);
1081 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1082 third argument is the #pragma omp simd if (x) condition: when it is 0,
1083 the loop shouldn't be vectorized; when it is a non-zero constant, it should
1084 be vectorized normally; otherwise the loop is versioned, with the vectorized
1085 copy used if the condition is non-zero at runtime. */
1086 if (loop_in->simduid
1087 && is_gimple_call (stmt)
1088 && gimple_call_internal_p (stmt)
1089 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1090 && gimple_call_num_args (stmt) >= 3
1091 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1092 && (loop_in->simduid
1093 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1095 tree arg = gimple_call_arg (stmt, 2);
1096 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1097 simd_if_cond = arg;
1098 else
1099 gcc_assert (integer_nonzerop (arg));
1104 epilogue_vinfos.create (6);
1107 /* Free all levels of rgroup CONTROLS. */
1109 void
1110 release_vec_loop_controls (vec<rgroup_controls> *controls)
1112 rgroup_controls *rgc;
1113 unsigned int i;
1114 FOR_EACH_VEC_ELT (*controls, i, rgc)
1115 rgc->controls.release ();
1116 controls->release ();
1119 /* Free all memory used by the _loop_vec_info, as well as all the
1120 stmt_vec_info structs of all the stmts in the loop. */
1122 _loop_vec_info::~_loop_vec_info ()
1124 free (bbs);
1126 release_vec_loop_controls (&masks.rgc_vec);
1127 release_vec_loop_controls (&lens);
1128 delete ivexpr_map;
1129 delete scan_map;
1130 epilogue_vinfos.release ();
1131 delete scalar_costs;
1132 delete vector_costs;
1134 /* When we release an epilogue vinfo that we do not intend to use
1135 avoid clearing AUX of the main loop which should continue to
1136 point to the main loop vinfo since otherwise we'll leak that. */
1137 if (loop->aux == this)
1138 loop->aux = NULL;
1141 /* Return an invariant or register for EXPR and emit necessary
1142 computations in the LOOP_VINFO loop preheader. */
1144 tree
1145 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1147 if (is_gimple_reg (expr)
1148 || is_gimple_min_invariant (expr))
1149 return expr;
1151 if (! loop_vinfo->ivexpr_map)
1152 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1153 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1154 if (! cached)
1156 gimple_seq stmts = NULL;
1157 cached = force_gimple_operand (unshare_expr (expr),
1158 &stmts, true, NULL_TREE);
1159 if (stmts)
1161 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1162 gsi_insert_seq_on_edge_immediate (e, stmts);
1165 return cached;
1168 /* Return true if we can use CMP_TYPE as the comparison type to produce
1169 all masks required to mask LOOP_VINFO. */
1171 static bool
1172 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1174 rgroup_controls *rgm;
1175 unsigned int i;
1176 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1177 if (rgm->type != NULL_TREE
1178 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1179 cmp_type, rgm->type,
1180 OPTIMIZE_FOR_SPEED))
1181 return false;
1182 return true;
1185 /* Calculate the maximum number of scalars per iteration for every
1186 rgroup in LOOP_VINFO. */
1188 static unsigned int
1189 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1191 unsigned int res = 1;
1192 unsigned int i;
1193 rgroup_controls *rgm;
1194 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1195 res = MAX (res, rgm->max_nscalars_per_iter);
1196 return res;
1199 /* Calculate the minimum precision necessary to represent:
1201 MAX_NITERS * FACTOR
1203 as an unsigned integer, where MAX_NITERS is the maximum number of
1204 loop header iterations for the original scalar form of LOOP_VINFO. */
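/* For example, with MAX_NITERS == 1000 and FACTOR == 4 the product is 4000,
   which needs 12 bits as an unsigned value, so 12 is returned.  A rough
   standalone sketch of the same computation (hypothetical helper using a
   64-bit product instead of the widest_int arithmetic used below):

     static unsigned
     min_prec_unsigned (unsigned long long max_niters, unsigned factor)
     {
       unsigned long long limit = max_niters * factor;
       // bits needed to represent LIMIT as an unsigned integer
       return limit ? 64 - __builtin_clzll (limit) : 1;
     }
*/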
1206 static unsigned
1207 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1209 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1211 /* Get the maximum number of iterations that is representable
1212 in the counter type. */
1213 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1214 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1216 /* Get a more refined estimate for the number of iterations. */
1217 widest_int max_back_edges;
1218 if (max_loop_iterations (loop, &max_back_edges))
1219 max_ni = wi::smin (max_ni, max_back_edges + 1);
1221 /* Work out how many bits we need to represent the limit. */
1222 return wi::min_precision (max_ni * factor, UNSIGNED);
1225 /* True if the loop needs peeling or partial vectors when vectorized. */
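/* For example, with a constant VF of 4, a known niter count of 10 and no
   peeling for alignment or gaps, 10 is not a multiple of 4, so peeling or
   partial vectors are needed; with a niter count of 12 they are not.  */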
1227 static bool
1228 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1230 unsigned HOST_WIDE_INT const_vf;
1231 HOST_WIDE_INT max_niter
1232 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1234 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1235 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1236 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1237 (loop_vinfo));
1239 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1240 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1242 /* Work out the (constant) number of iterations that need to be
1243 peeled for reasons other than niters. */
1244 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1245 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1246 peel_niter += 1;
1247 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1248 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1249 return true;
1251 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1252 /* ??? When peeling for gaps but not alignment, we could
1253 try to check whether the (variable) niters is known to be
1254 VF * N + 1. That's something of a niche case though. */
1255 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1256 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1257 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1258 < (unsigned) exact_log2 (const_vf))
1259 /* In case of versioning, check if the maximum number of
1260 iterations is greater than th. If they are identical,
1261 the epilogue is unnecessary. */
1262 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1263 || ((unsigned HOST_WIDE_INT) max_niter
1264 > (th / const_vf) * const_vf))))
1265 return true;
1267 return false;
1270 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1271 whether we can actually generate the masks required. Return true if so,
1272 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1274 static bool
1275 vect_verify_full_masking (loop_vec_info loop_vinfo)
1277 unsigned int min_ni_width;
1279 /* Use a normal loop if there are no statements that need masking.
1280 This only happens in rare degenerate cases: it means that the loop
1281 has no loads, no stores, and no live-out values. */
1282 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1283 return false;
1285 /* Produce the rgroup controls. */
1286 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1288 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1289 tree vectype = mask.first;
1290 unsigned nvectors = mask.second;
1292 if (masks->rgc_vec.length () < nvectors)
1293 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1294 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1295 /* The number of scalars per iteration and the number of vectors are
1296 both compile-time constants. */
1297 unsigned int nscalars_per_iter
1298 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1299 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1301 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1303 rgm->max_nscalars_per_iter = nscalars_per_iter;
1304 rgm->type = truth_type_for (vectype);
1305 rgm->factor = 1;
1309 unsigned int max_nscalars_per_iter
1310 = vect_get_max_nscalars_per_iter (loop_vinfo);
1312 /* Work out how many bits we need to represent the limit. */
1313 min_ni_width
1314 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1316 /* Find a scalar mode for which WHILE_ULT is supported. */
1317 opt_scalar_int_mode cmp_mode_iter;
1318 tree cmp_type = NULL_TREE;
1319 tree iv_type = NULL_TREE;
1320 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1321 unsigned int iv_precision = UINT_MAX;
1323 if (iv_limit != -1)
1324 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1325 UNSIGNED);
1327 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1329 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1330 if (cmp_bits >= min_ni_width
1331 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1333 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1334 if (this_type
1335 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1337 /* Although we could stop as soon as we find a valid mode,
1338 there are at least two reasons why that's not always the
1339 best choice:
1341 - An IV that's Pmode or wider is more likely to be reusable
1342 in address calculations than an IV that's narrower than
1343 Pmode.
1345 - Doing the comparison in IV_PRECISION or wider allows
1346 a natural 0-based IV, whereas using a narrower comparison
1347 type requires mitigations against wrap-around.
1349 Conversely, if the IV limit is variable, doing the comparison
1350 in a wider type than the original type can introduce
1351 unnecessary extensions, so picking the widest valid mode
1352 is not always a good choice either.
1354 Here we prefer the first IV type that's Pmode or wider,
1355 and the first comparison type that's IV_PRECISION or wider.
1356 (The comparison type must be no wider than the IV type,
1357 to avoid extensions in the vector loop.)
1359 ??? We might want to try continuing beyond Pmode for ILP32
1360 targets if CMP_BITS < IV_PRECISION. */
1361 iv_type = this_type;
1362 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1363 cmp_type = this_type;
1364 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1365 break;
1370 if (!cmp_type)
1372 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1373 return false;
1376 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1377 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1378 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1379 return true;
1382 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1383 whether we can actually generate AVX512 style masks. Return true if so,
1384 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1386 static bool
1387 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1389 /* Produce a differently organized rgc_vec and check differently whether
1390 we can produce masks. */
1392 /* Use a normal loop if there are no statements that need masking.
1393 This only happens in rare degenerate cases: it means that the loop
1394 has no loads, no stores, and no live-out values. */
1395 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1396 return false;
1398 /* For the decrementing IV we need to represent all values in
1399 [0, niter + niter_skip] where niter_skip is the number of elements we
1400 skip in the first iteration for prologue peeling. */
1401 tree iv_type = NULL_TREE;
1402 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1403 unsigned int iv_precision = UINT_MAX;
1404 if (iv_limit != -1)
1405 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1407 /* First compute the type for the IV we use to track the remaining
1408 scalar iterations. */
1409 opt_scalar_int_mode cmp_mode_iter;
1410 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1412 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1413 if (cmp_bits >= iv_precision
1414 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1416 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1417 if (iv_type)
1418 break;
1421 if (!iv_type)
1422 return false;
1424 /* Produce the rgroup controls. */
1425 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1427 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1428 tree vectype = mask.first;
1429 unsigned nvectors = mask.second;
1431 /* The number of scalars per iteration and the number of vectors are
1432 both compile-time constants. */
1433 unsigned int nscalars_per_iter
1434 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1435 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1437 /* We index the rgroup_controls vector with nscalars_per_iter
1438 which we keep constant and instead have a varying nvectors,
1439 remembering the vector mask with the fewest nV. */
1440 if (masks->rgc_vec.length () < nscalars_per_iter)
1441 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1442 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1444 if (!rgm->type || rgm->factor > nvectors)
1446 rgm->type = truth_type_for (vectype);
1447 rgm->compare_type = NULL_TREE;
1448 rgm->max_nscalars_per_iter = nscalars_per_iter;
1449 rgm->factor = nvectors;
1450 rgm->bias_adjusted_ctrl = NULL_TREE;
1454 /* There is no fixed compare type we are going to use but we have to
1455 be able to get at one for each mask group. */
1456 unsigned int min_ni_width
1457 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1459 bool ok = true;
1460 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1462 tree mask_type = rgc.type;
1463 if (!mask_type)
1464 continue;
1466 /* For now vect_get_loop_mask only supports integer mode masks
1467 when we need to split it. */
1468 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1469 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1471 ok = false;
1472 break;
1475 /* If iv_type is usable as compare type use that - we can elide the
1476 saturation in that case. */
1477 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1479 tree cmp_vectype
1480 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1481 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1482 rgc.compare_type = cmp_vectype;
1484 if (!rgc.compare_type)
1485 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1487 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1488 if (cmp_bits >= min_ni_width
1489 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1491 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1492 if (!cmp_type)
1493 continue;
1495 /* Check whether we can produce the mask with cmp_type. */
1496 tree cmp_vectype
1497 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1498 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1500 rgc.compare_type = cmp_vectype;
1501 break;
1505 if (!rgc.compare_type)
1507 ok = false;
1508 break;
1511 if (!ok)
1513 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1514 return false;
1517 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1518 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1519 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1520 return true;
1523 /* Check whether we can use vector accesses with length, based on a precision
1524 comparison. So far, to keep it simple, we only allow the case where the
1525 precision of the target-supported length is larger than the precision
1526 required by the loop niters. */
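/* For instance, with VF == 4 and 10 scalar iterations the per-iteration
   lengths would be 4, 4 and 2 (ignoring any load/store bias the target may
   require).  A rough scalar model of that control flow, with the inner loop
   standing in for a single length-controlled vector access:

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i += 4)
         {
           int len = n - i < 4 ? n - i : 4;
           for (int j = 0; j < len; j++)
             a[i + j] = b[i + j];
         }
     }
*/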
1528 static bool
1529 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1531 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1532 return false;
1534 machine_mode len_load_mode, len_store_mode;
1535 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1536 .exists (&len_load_mode))
1537 return false;
1538 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1539 .exists (&len_store_mode))
1540 return false;
1542 signed char partial_load_bias = internal_len_load_store_bias
1543 (IFN_LEN_LOAD, len_load_mode);
1545 signed char partial_store_bias = internal_len_load_store_bias
1546 (IFN_LEN_STORE, len_store_mode);
1548 gcc_assert (partial_load_bias == partial_store_bias);
1550 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1551 return false;
1553 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1554 len_loads with a length of zero. In order to avoid that we prohibit
1555 more than one loop length here. */
1556 if (partial_load_bias == -1
1557 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1558 return false;
1560 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1562 unsigned int max_nitems_per_iter = 1;
1563 unsigned int i;
1564 rgroup_controls *rgl;
1565 /* Find the maximum number of items per iteration for every rgroup. */
1566 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1568 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1569 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1572 /* Work out how many bits we need to represent the length limit. */
1573 unsigned int min_ni_prec
1574 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1576 /* Now use the maximum of the precisions below for one suitable IV type:
1577 - the IV's natural precision
1578 - the precision needed to hold: the maximum number of scalar
1579 iterations multiplied by the scale factor (min_ni_prec above)
1580 - the Pmode precision
1582 If min_ni_prec is less than the precision of the current niters,
1583 we prefer to still use the niters type. Prefer Pmode or a
1584 wider IV to avoid narrow conversions. */
1586 unsigned int ni_prec
1587 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1588 min_ni_prec = MAX (min_ni_prec, ni_prec);
1589 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1591 tree iv_type = NULL_TREE;
1592 opt_scalar_int_mode tmode_iter;
1593 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1595 scalar_mode tmode = tmode_iter.require ();
1596 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1598 /* ??? Do we really want to construct one IV whose precision exceeds
1599 BITS_PER_WORD? */
1600 if (tbits > BITS_PER_WORD)
1601 break;
1603 /* Find the first available standard integral type. */
1604 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1606 iv_type = build_nonstandard_integer_type (tbits, true);
1607 break;
1611 if (!iv_type)
1613 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615 "can't vectorize with length-based partial vectors"
1616 " because there is no suitable iv type.\n");
1617 return false;
1620 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1621 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1622 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1624 return true;
1627 /* Calculate the cost of one scalar iteration of the loop. */
1628 static void
1629 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1631 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1632 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1633 int nbbs = loop->num_nodes, factor;
1634 int innerloop_iters, i;
1636 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1638 /* Gather costs for statements in the scalar loop. */
1640 /* FORNOW. */
1641 innerloop_iters = 1;
1642 if (loop->inner)
1643 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1645 for (i = 0; i < nbbs; i++)
1647 gimple_stmt_iterator si;
1648 basic_block bb = bbs[i];
1650 if (bb->loop_father == loop->inner)
1651 factor = innerloop_iters;
1652 else
1653 factor = 1;
1655 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1657 gimple *stmt = gsi_stmt (si);
1658 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1660 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1661 continue;
1663 /* Skip stmts that are not vectorized inside the loop. */
1664 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1665 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1666 && (!STMT_VINFO_LIVE_P (vstmt_info)
1667 || !VECTORIZABLE_CYCLE_DEF
1668 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1669 continue;
1671 vect_cost_for_stmt kind;
1672 if (STMT_VINFO_DATA_REF (stmt_info))
1674 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1675 kind = scalar_load;
1676 else
1677 kind = scalar_store;
1679 else if (vect_nop_conversion_p (stmt_info))
1680 continue;
1681 else
1682 kind = scalar_stmt;
1684 /* We are using vect_prologue here to avoid scaling twice
1685 by the inner loop factor. */
1686 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1687 factor, kind, stmt_info, 0, vect_prologue);
1691 /* Now accumulate cost. */
1692 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1693 add_stmt_costs (loop_vinfo->scalar_costs,
1694 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1695 loop_vinfo->scalar_costs->finish_cost (nullptr);
1698 /* Function vect_analyze_loop_form.
1700 Verify that certain CFG restrictions hold, including:
1701 - the loop has a pre-header
1702 - the loop has a single entry
1703 - nested loops can have only a single exit.
1704 - the loop exit condition is simple enough
1705 - the number of iterations can be analyzed, i.e., a countable loop. The
1706 niter may be analyzable only under some assumptions. */
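/* A shape that satisfies these restrictions is the classic counted
   do-while-style loop produced by loop header copying: all executable
   statements in the header block, an empty latch, and a single countable
   exit condition, e.g. (illustrative):

     void
     f (int *a, int n)
     {
       int i = 0;
       if (n > 0)
         do
           {
             a[i] = i;
             i++;
           }
         while (i < n);
     }
*/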
1708 opt_result
1709 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1711 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1713 edge exit_e = vec_init_loop_exit_info (loop);
1714 if (!exit_e)
1715 return opt_result::failure_at (vect_location,
1716 "not vectorized:"
1717 " could not determine main exit from"
1718 " loop with multiple exits.\n");
1719 info->loop_exit = exit_e;
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "using as main loop exit: %d -> %d [AUX: %p]\n",
1723 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1725 /* Check if we have any control flow that doesn't leave the loop. */
1726 class loop *v_loop = loop->inner ? loop->inner : loop;
1727 basic_block *bbs= get_loop_body (v_loop);
1728 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1729 if (EDGE_COUNT (bbs[i]->succs) != 1
1730 && (EDGE_COUNT (bbs[i]->succs) != 2
1731 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1732 return opt_result::failure_at (vect_location,
1733 "not vectorized:"
1734 " unsupported control flow in loop.\n");
1736 /* Different restrictions apply when we are considering an inner-most loop,
1737 vs. an outer (nested) loop.
1738 (FORNOW. May want to relax some of these restrictions in the future). */
1740 info->inner_loop_cond = NULL;
1741 if (!loop->inner)
1743 /* Inner-most loop. We currently require that the number of BBs is
1744 exactly 2 (the header and latch). Vectorizable inner-most loops
1745 look like this:
1747 (pre-header)
1749 header <--------+
1750 | | |
1751 | +--> latch --+
1753 (exit-bb) */
1755 if (empty_block_p (loop->header))
1756 return opt_result::failure_at (vect_location,
1757 "not vectorized: empty loop.\n");
1759 else
1761 class loop *innerloop = loop->inner;
1762 edge entryedge;
1764 /* Nested loop. We currently require that the loop is doubly-nested,
1765 contains a single inner loop, and the number of BBs is exactly 5.
1766 Vectorizable outer-loops look like this:
1768 (pre-header)
1770 header <---+
1772 inner-loop |
1774 tail ------+
1776 (exit-bb)
1778 The inner-loop has the properties expected of inner-most loops
1779 as described above. */
1781 if ((loop->inner)->inner || (loop->inner)->next)
1782 return opt_result::failure_at (vect_location,
1783 "not vectorized:"
1784 " multiple nested loops.\n");
1786 entryedge = loop_preheader_edge (innerloop);
1787 if (entryedge->src != loop->header
1788 || !single_exit (innerloop)
1789 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1790 return opt_result::failure_at (vect_location,
1791 "not vectorized:"
1792 " unsupported outerloop form.\n");
1794 /* Analyze the inner-loop. */
1795 vect_loop_form_info inner;
1796 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1797 if (!res)
1799 if (dump_enabled_p ())
1800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1801 "not vectorized: Bad inner loop.\n");
1802 return res;
1805 /* Don't support analyzing niter under assumptions for inner
1806 loop. */
1807 if (!integer_onep (inner.assumptions))
1808 return opt_result::failure_at (vect_location,
1809 "not vectorized: Bad inner loop.\n");
1811 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1812 return opt_result::failure_at (vect_location,
1813 "not vectorized: inner-loop count not"
1814 " invariant.\n");
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_NOTE, vect_location,
1818 "Considering outer-loop vectorization.\n");
1819 info->inner_loop_cond = inner.conds[0];
1822 if (EDGE_COUNT (loop->header->preds) != 2)
1823 return opt_result::failure_at (vect_location,
1824 "not vectorized:"
1825 " too many incoming edges.\n");
1827 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1828 that the loop is represented as a do-while (with a proper if-guard
1829 before the loop if needed), where the loop header contains all the
1830 executable statements, and the latch is empty. */
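/* For illustration only (a, i and n are hypothetical names): a source loop

       while (i < n)
         {
           a[i] = a[i] * 2;
           i++;
         }

   is expected to reach the vectorizer in the guarded do-while shape

       if (i < n)
         do
           {
             a[i] = a[i] * 2;
             i++;
           }
         while (i < n);

   so that the exit test is the last statement of the header and the latch
   block stays empty.  */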
1831 if (!empty_block_p (loop->latch)
1832 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1833 return opt_result::failure_at (vect_location,
1834 "not vectorized: latch block not empty.\n");
1836 /* Make sure the exit is not abnormal. */
1837 auto_vec<edge> exits = get_loop_exit_edges (loop);
1838 for (edge e : exits)
1840 if (e->flags & EDGE_ABNORMAL)
1841 return opt_result::failure_at (vect_location,
1842 "not vectorized:"
1843 " abnormal loop exit edge.\n");
1846 info->conds
1847 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1848 &info->number_of_iterations,
1849 &info->number_of_iterationsm1);
1851 if (info->conds.is_empty ())
1852 return opt_result::failure_at
1853 (vect_location,
1854 "not vectorized: complicated exit condition.\n");
1856 /* Determine what the primary and alternate exit conds are. */
1857 for (unsigned i = 0; i < info->conds.length (); i++)
1859 gcond *cond = info->conds[i];
1860 if (exit_e->src == gimple_bb (cond))
1861 std::swap (info->conds[0], info->conds[i]);
1864 if (integer_zerop (info->assumptions)
1865 || !info->number_of_iterations
1866 || chrec_contains_undetermined (info->number_of_iterations))
1867 return opt_result::failure_at
1868 (info->conds[0],
1869 "not vectorized: number of iterations cannot be computed.\n");
1871 if (integer_zerop (info->number_of_iterations))
1872 return opt_result::failure_at
1873 (info->conds[0],
1874 "not vectorized: number of iterations = 0.\n");
1876 if (!(tree_fits_shwi_p (info->number_of_iterations)
1877 && tree_to_shwi (info->number_of_iterations) > 0))
1879 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_NOTE, vect_location,
1882 "Symbolic number of iterations is ");
1883 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1884 dump_printf (MSG_NOTE, "\n");
1888 return opt_result::success ();
1891 /* Create a loop_vec_info for LOOP with SHARED and the
1892 vect_analyze_loop_form result. */
1894 loop_vec_info
1895 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1896 const vect_loop_form_info *info,
1897 loop_vec_info main_loop_info)
1899 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1900 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1901 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1902 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1903 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1904 /* Also record the assumptions for versioning. */
1905 if (!integer_onep (info->assumptions) && !main_loop_info)
1906 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1908 for (gcond *cond : info->conds)
1910 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1911 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1912 /* Mark the statement as a condition. */
1913 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1916 for (unsigned i = 1; i < info->conds.length (); i ++)
1917 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1918 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1920 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1922 /* Check to see if we're vectorizing multiple exits. */
1923 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1924 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1926 if (info->inner_loop_cond)
1928 stmt_vec_info inner_loop_cond_info
1929 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1930 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931 /* If we have an estimate on the number of iterations of the inner
1932 loop, use that to limit the scale for costing; otherwise use
1933 --param vect-inner-loop-cost-factor literally. */
1934 widest_int nit;
1935 if (estimated_stmt_executions (loop->inner, &nit))
1936 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1937 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1940 return loop_vinfo;
1945 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1946 statements, update the vectorization factor. */
1948 static void
1949 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1951 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1952 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1953 int nbbs = loop->num_nodes;
1954 poly_uint64 vectorization_factor;
1955 int i;
1957 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1959 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1960 gcc_assert (known_ne (vectorization_factor, 0U));
1962 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1963 vectorization factor of the loop is the unrolling factor required by
1964 the SLP instances. If that unrolling factor is 1, we say that we
1965 perform pure SLP on the loop - cross-iteration parallelism is not
1966 exploited. */
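/* A hedged example (not from the original sources), assuming 4-lane int
   vectors and hypothetical names a and n: in

       for (int i = 0; i < n; i++)
         {
           a[4 * i + 0] = 1;
           a[4 * i + 1] = 2;
           a[4 * i + 2] = 3;
           a[4 * i + 3] = 4;
         }

   the four stores form one SLP group that already fills a vector, so the
   unrolling factor is 1 and the loop is pure SLP; with only two such stores
   per iteration the unrolling factor would be 2, combining two scalar
   iterations per vector iteration.  */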
1967 bool only_slp_in_loop = true;
1968 for (i = 0; i < nbbs; i++)
1970 basic_block bb = bbs[i];
1971 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1972 gsi_next (&si))
1974 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1975 if (!stmt_info)
1976 continue;
1977 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1978 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1979 && !PURE_SLP_STMT (stmt_info))
1980 /* STMT needs both SLP and loop-based vectorization. */
1981 only_slp_in_loop = false;
1983 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1984 gsi_next (&si))
1986 if (is_gimple_debug (gsi_stmt (si)))
1987 continue;
1988 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1989 stmt_info = vect_stmt_to_vectorize (stmt_info);
1990 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1991 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1992 && !PURE_SLP_STMT (stmt_info))
1993 /* STMT needs both SLP and loop-based vectorization. */
1994 only_slp_in_loop = false;
1998 if (only_slp_in_loop)
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_NOTE, vect_location,
2002 "Loop contains only SLP stmts\n");
2003 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2005 else
2007 if (dump_enabled_p ())
2008 dump_printf_loc (MSG_NOTE, vect_location,
2009 "Loop contains SLP and non-SLP stmts\n");
2010 /* Both the vectorization factor and unroll factor have the form
2011 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2012 so they must have a common multiple. */
2013 vectorization_factor
2014 = force_common_multiple (vectorization_factor,
2015 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2018 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2019 if (dump_enabled_p ())
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "Updating vectorization factor to ");
2023 dump_dec (MSG_NOTE, vectorization_factor);
2024 dump_printf (MSG_NOTE, ".\n");
2028 /* Return true if STMT_INFO describes a double reduction phi and if
2029 the other phi in the reduction is also relevant for vectorization.
2030 This rejects cases such as:
2032 outer1:
2033 x_1 = PHI <x_3(outer2), ...>;
2036 inner:
2037 x_2 = ...;
2040 outer2:
2041 x_3 = PHI <x_2(inner)>;
2043 if nothing in x_2 or elsewhere makes x_1 relevant. */
2045 static bool
2046 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2048 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2049 return false;
2051 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2054 /* Function vect_analyze_loop_operations.
2056 Scan the loop stmts and make sure they are all vectorizable. */
2058 static opt_result
2059 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2063 int nbbs = loop->num_nodes;
2064 int i;
2065 stmt_vec_info stmt_info;
2066 bool need_to_vectorize = false;
2067 bool ok;
2069 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2071 auto_vec<stmt_info_for_cost> cost_vec;
2073 for (i = 0; i < nbbs; i++)
2075 basic_block bb = bbs[i];
2077 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2078 gsi_next (&si))
2080 gphi *phi = si.phi ();
2081 ok = true;
2083 stmt_info = loop_vinfo->lookup_stmt (phi);
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2086 (gimple *) phi);
2087 if (virtual_operand_p (gimple_phi_result (phi)))
2088 continue;
2090 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2091 (i.e., a phi in the tail of the outer-loop). */
2092 if (! is_loop_header_bb_p (bb))
2094 /* FORNOW: we currently don't support the case that these phis
2095 are not used in the outer loop (unless it is a double reduction,
2096 i.e., this phi is vect_reduction_def), because this case
2097 requires us to actually do something here.
2098 if (STMT_VINFO_LIVE_P (stmt_info)
2099 && !vect_active_double_reduction_p (stmt_info))
2100 return opt_result::failure_at (phi,
2101 "Unsupported loop-closed phi"
2102 " in outer-loop.\n");
2104 /* If PHI is used in the outer loop, we check that its operand
2105 is defined in the inner loop. */
2106 if (STMT_VINFO_RELEVANT_P (stmt_info))
2108 tree phi_op;
2110 if (gimple_phi_num_args (phi) != 1)
2111 return opt_result::failure_at (phi, "unsupported phi");
2113 phi_op = PHI_ARG_DEF (phi, 0);
2114 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2115 if (!op_def_info)
2116 return opt_result::failure_at (phi, "unsupported phi\n");
2118 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2119 && (STMT_VINFO_RELEVANT (op_def_info)
2120 != vect_used_in_outer_by_reduction))
2121 return opt_result::failure_at (phi, "unsupported phi\n");
2123 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2124 || (STMT_VINFO_DEF_TYPE (stmt_info)
2125 == vect_double_reduction_def))
2126 && !vectorizable_lc_phi (loop_vinfo,
2127 stmt_info, NULL, NULL))
2128 return opt_result::failure_at (phi, "unsupported phi\n");
2131 continue;
2134 gcc_assert (stmt_info);
2136 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2137 || STMT_VINFO_LIVE_P (stmt_info))
2138 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2139 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2140 /* A scalar-dependence cycle that we don't support. */
2141 return opt_result::failure_at (phi,
2142 "not vectorized:"
2143 " scalar dependence cycle.\n");
2145 if (STMT_VINFO_RELEVANT_P (stmt_info))
2147 need_to_vectorize = true;
2148 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2149 && ! PURE_SLP_STMT (stmt_info))
2150 ok = vectorizable_induction (loop_vinfo,
2151 stmt_info, NULL, NULL,
2152 &cost_vec);
2153 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2154 || (STMT_VINFO_DEF_TYPE (stmt_info)
2155 == vect_double_reduction_def)
2156 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2157 && ! PURE_SLP_STMT (stmt_info))
2158 ok = vectorizable_reduction (loop_vinfo,
2159 stmt_info, NULL, NULL, &cost_vec);
2160 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2161 == vect_first_order_recurrence)
2162 && ! PURE_SLP_STMT (stmt_info))
2163 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2164 &cost_vec);
2167 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2168 if (ok
2169 && STMT_VINFO_LIVE_P (stmt_info)
2170 && !PURE_SLP_STMT (stmt_info))
2171 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2172 -1, false, &cost_vec);
2174 if (!ok)
2175 return opt_result::failure_at (phi,
2176 "not vectorized: relevant phi not "
2177 "supported: %G",
2178 static_cast <gimple *> (phi));
2181 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2182 gsi_next (&si))
2184 gimple *stmt = gsi_stmt (si);
2185 if (!gimple_clobber_p (stmt)
2186 && !is_gimple_debug (stmt))
2188 opt_result res
2189 = vect_analyze_stmt (loop_vinfo,
2190 loop_vinfo->lookup_stmt (stmt),
2191 &need_to_vectorize,
2192 NULL, NULL, &cost_vec);
2193 if (!res)
2194 return res;
2197 } /* bbs */
2199 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2201 /* All operations in the loop are either irrelevant (they deal with loop
2202 control, or are dead), or only used outside the loop and can be moved
2203 out of the loop (e.g. invariants, inductions). The loop can be
2204 optimized away by scalar optimizations. We're better off not
2205 touching this loop. */
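/* For illustration only (hypothetical names, not from the original
   sources): a loop such as

       for (i = 0; i < n; i++)
         last = i + 1;

   contains only an induction whose final value is used after the loop, so
   there is nothing to vectorize and final value replacement can remove the
   loop entirely.  */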
2206 if (!need_to_vectorize)
2208 if (dump_enabled_p ())
2209 dump_printf_loc (MSG_NOTE, vect_location,
2210 "All the computation can be taken out of the loop.\n");
2211 return opt_result::failure_at
2212 (vect_location,
2213 "not vectorized: redundant loop. no profit to vectorize.\n");
2216 return opt_result::success ();
2219 /* Return true if we know that the iteration count is smaller than the
2220 vectorization factor. Return false if it isn't, or if we can't be sure
2221 either way. */
2223 static bool
2224 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2226 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2228 HOST_WIDE_INT max_niter;
2229 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2230 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2231 else
2232 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2234 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2235 return true;
2237 return false;
2240 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2241 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2242 definitely no, or -1 if it's worth retrying. */
2244 static int
2245 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2246 unsigned *suggested_unroll_factor)
2248 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2249 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2251 /* Only loops that can handle partially-populated vectors can have iteration
2252 counts less than the vectorization factor. */
2253 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2254 && vect_known_niters_smaller_than_vf (loop_vinfo))
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 "not vectorized: iteration count smaller than "
2259 "vectorization factor.\n");
2260 return 0;
2263 /* If we know the number of iterations we can do better, for the
2264 epilogue we can also decide whether the main loop leaves us
2265 with enough iterations, preferring a smaller vector epilogue that is
2266 then also possibly used for the case in which we skip the vector loop. */
2267 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2269 widest_int scalar_niters
2270 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2271 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2273 loop_vec_info orig_loop_vinfo
2274 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2275 unsigned lowest_vf
2276 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2277 int prolog_peeling = 0;
2278 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2279 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2280 if (prolog_peeling >= 0
2281 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2282 lowest_vf))
2284 unsigned gap
2285 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2286 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2287 % lowest_vf + gap);
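/* A worked example (illustrative only): with 100 scalar iterations, a main
   loop VF of 16, prologue peeling of 3 iterations and no peeling for gaps,
   the epilogue is left with (100 - 0 - 3) % 16 + 0 = 1 scalar iteration,
   which the single-iteration check below then rejects.  */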
2290 /* Reject vectorizing for a single scalar iteration, even if
2291 we could in principle implement that using partial vectors. */
2292 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2293 if (scalar_niters <= peeling_gap + 1)
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297 "not vectorized: loop only has a single "
2298 "scalar iteration.\n");
2299 return 0;
2302 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2304 /* Check that the loop processes at least one full vector. */
2305 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2306 if (known_lt (scalar_niters, vf))
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 "loop does not have enough iterations "
2311 "to support vectorization.\n");
2312 return 0;
2315 /* If we need to peel an extra epilogue iteration to handle data
2316 accesses with gaps, check that there are enough scalar iterations
2317 available.
2319 The check above is redundant with this one when peeling for gaps,
2320 but the distinction is useful for diagnostics. */
2321 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2322 && known_le (scalar_niters, vf))
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2326 "loop does not have enough iterations "
2327 "to support peeling for gaps.\n");
2328 return 0;
2333 /* If using the "very cheap" model, reject cases in which we'd keep
2334 a copy of the scalar code (even if we might be able to vectorize it). */
2335 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2336 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2337 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2338 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2340 if (dump_enabled_p ())
2341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2342 "some scalar iterations would need to be peeled\n");
2343 return 0;
2346 int min_profitable_iters, min_profitable_estimate;
2347 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2348 &min_profitable_estimate,
2349 suggested_unroll_factor);
2351 if (min_profitable_iters < 0)
2353 if (dump_enabled_p ())
2354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2355 "not vectorized: vectorization not profitable.\n");
2356 if (dump_enabled_p ())
2357 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2358 "not vectorized: vector version will never be "
2359 "profitable.\n");
2360 return -1;
2363 int min_scalar_loop_bound = (param_min_vect_loop_bound
2364 * assumed_vf);
2366 /* Use the cost model only if it is more conservative than the
2367 user-specified threshold. */
2368 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2369 min_profitable_iters);
2371 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2373 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2374 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2376 if (dump_enabled_p ())
2377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2378 "not vectorized: vectorization not profitable.\n");
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_NOTE, vect_location,
2381 "not vectorized: iteration count smaller than user "
2382 "specified loop bound parameter or minimum profitable "
2383 "iterations (whichever is more conservative).\n");
2384 return 0;
2387 /* The static profitability threshold min_profitable_estimate includes
2388 the cost of having to check at runtime whether the scalar loop
2389 should be used instead. If it turns out that we don't need or want
2390 such a check, the threshold we should use for the static estimate
2391 is simply the point at which the vector loop becomes more profitable
2392 than the scalar loop. */
2393 if (min_profitable_estimate > min_profitable_iters
2394 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2395 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2396 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2397 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2401 " choice between the scalar and vector loops\n");
2402 min_profitable_estimate = min_profitable_iters;
2405 /* If the vector loop needs multiple iterations to be beneficial then
2406 things are probably too close to call, and the conservative thing
2407 would be to stick with the scalar code. */
2408 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2409 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2411 if (dump_enabled_p ())
2412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2413 "one iteration of the vector loop would be"
2414 " more expensive than the equivalent number of"
2415 " iterations of the scalar loop\n");
2416 return 0;
2419 HOST_WIDE_INT estimated_niter;
2421 /* If we are vectorizing an epilogue then we know the maximum number of
2422 scalar iterations it will cover is at least one lower than the
2423 vectorization factor of the main loop. */
2424 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2425 estimated_niter
2426 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2427 else
2429 estimated_niter = estimated_stmt_executions_int (loop);
2430 if (estimated_niter == -1)
2431 estimated_niter = likely_max_stmt_executions_int (loop);
2433 if (estimated_niter != -1
2434 && ((unsigned HOST_WIDE_INT) estimated_niter
2435 < MAX (th, (unsigned) min_profitable_estimate)))
2437 if (dump_enabled_p ())
2438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2439 "not vectorized: estimated iteration count too "
2440 "small.\n");
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_NOTE, vect_location,
2443 "not vectorized: estimated iteration count smaller "
2444 "than specified loop bound parameter or minimum "
2445 "profitable iterations (whichever is more "
2446 "conservative).\n");
2447 return -1;
2450 return 1;
2453 static opt_result
2454 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2455 vec<data_reference_p> *datarefs,
2456 unsigned int *n_stmts)
2458 *n_stmts = 0;
2459 for (unsigned i = 0; i < loop->num_nodes; i++)
2460 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2461 !gsi_end_p (gsi); gsi_next (&gsi))
2463 gimple *stmt = gsi_stmt (gsi);
2464 if (is_gimple_debug (stmt))
2465 continue;
2466 ++(*n_stmts);
2467 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2468 NULL, 0);
2469 if (!res)
2471 if (is_gimple_call (stmt) && loop->safelen)
2473 tree fndecl = gimple_call_fndecl (stmt), op;
2474 if (fndecl == NULL_TREE
2475 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2477 fndecl = gimple_call_arg (stmt, 0);
2478 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2479 fndecl = TREE_OPERAND (fndecl, 0);
2480 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2482 if (fndecl != NULL_TREE)
2484 cgraph_node *node = cgraph_node::get (fndecl);
2485 if (node != NULL && node->simd_clones != NULL)
2487 unsigned int j, n = gimple_call_num_args (stmt);
2488 for (j = 0; j < n; j++)
2490 op = gimple_call_arg (stmt, j);
2491 if (DECL_P (op)
2492 || (REFERENCE_CLASS_P (op)
2493 && get_base_address (op)))
2494 break;
2496 op = gimple_call_lhs (stmt);
2497 /* Ignore #pragma omp declare simd functions
2498 if they don't have data references in the
2499 call stmt itself. */
2500 if (j == n
2501 && !(op
2502 && (DECL_P (op)
2503 || (REFERENCE_CLASS_P (op)
2504 && get_base_address (op)))))
2505 continue;
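/* An illustrative case (hypothetical declarations, not from the original
   sources): in a loop like

       #pragma omp declare simd
       extern int f (int);

       #pragma omp simd
       for (int i = 0; i < n; i++)
         a[i] = f (a[i]);

   the call to f carries no memory reference in the call statement itself
   (its argument and result are SSA names), so the otherwise unanalyzable
   call does not stop data-reference analysis here.  */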
2509 return res;
2511 /* If dependence analysis will give up due to the limit on the
2512 number of datarefs, stop here and fail fatally. */
2513 if (datarefs->length ()
2514 > (unsigned)param_loop_max_datarefs_for_datadeps)
2515 return opt_result::failure_at (stmt, "exceeded param "
2516 "loop-max-datarefs-for-datadeps\n");
2518 return opt_result::success ();
2521 /* Look for SLP-only access groups and turn each individual access into its own
2522 group. */
2523 static void
2524 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2526 unsigned int i;
2527 struct data_reference *dr;
2529 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2531 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2532 FOR_EACH_VEC_ELT (datarefs, i, dr)
2534 gcc_assert (DR_REF (dr));
2535 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2537 /* Check if the load is a part of an interleaving chain. */
2538 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2540 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2541 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2542 unsigned int group_size = DR_GROUP_SIZE (first_element);
2544 /* Check if this is an SLP-only group. */
2545 if (!STMT_SLP_TYPE (stmt_info)
2546 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2548 /* Dissolve the group. */
2549 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2551 stmt_vec_info vinfo = first_element;
2552 while (vinfo)
2554 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2555 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2556 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2557 DR_GROUP_SIZE (vinfo) = 1;
2558 if (STMT_VINFO_STRIDED_P (first_element)
2559 /* We cannot handle stores with gaps. */
2560 || DR_IS_WRITE (dr_info->dr))
2562 STMT_VINFO_STRIDED_P (vinfo) = true;
2563 DR_GROUP_GAP (vinfo) = 0;
2565 else
2566 DR_GROUP_GAP (vinfo) = group_size - 1;
2567 /* Duplicate and adjust alignment info; it needs to
2568 be present on each group leader, see dr_misalignment. */
2569 if (vinfo != first_element)
2571 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2572 dr_info2->target_alignment = dr_info->target_alignment;
2573 int misalignment = dr_info->misalignment;
2574 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2576 HOST_WIDE_INT diff
2577 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2578 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2579 unsigned HOST_WIDE_INT align_c
2580 = dr_info->target_alignment.to_constant ();
2581 misalignment = (misalignment + diff) % align_c;
2583 dr_info2->misalignment = misalignment;
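/* A worked example (illustrative only): if the group leader has a target
   alignment of 16 bytes and a known misalignment of 4, and this element
   starts 8 bytes further into the object, its misalignment becomes
   (4 + 8) % 16 = 12 after the adjustment above.  */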
2585 vinfo = next;
2592 /* Determine if operating on full vectors for LOOP_VINFO might leave
2593 some scalar iterations still to do. If so, decide how we should
2594 handle those scalar iterations. The possibilities are:
2596 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2597 In this case:
2599 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2600 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2601 LOOP_VINFO_PEELING_FOR_NITER == false
2603 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2604 to handle the remaining scalar iterations. In this case:
2606 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2607 LOOP_VINFO_PEELING_FOR_NITER == true
2609 There are two choices:
2611 (2a) Consider vectorizing the epilogue loop at the same VF as the
2612 main loop, but using partial vectors instead of full vectors.
2613 In this case:
2615 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2617 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2618 In this case:
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
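   As an illustrative example (not part of the original comment): for a loop
   with 1000 scalar iterations and a vectorization factor of 16, option (1)
   runs 63 partial-vector iterations with no epilogue, while option (2) runs
   62 full-vector iterations and leaves 1000 - 62 * 16 = 8 scalar iterations
   to an epilogue, which may itself be vectorized as in (2a) or (2b).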
2623 opt_result
2624 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2626 /* Determine whether there would be any scalar iterations left over. */
2627 bool need_peeling_or_partial_vectors_p
2628 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2630 /* Decide whether to vectorize the loop with partial vectors. */
2631 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2632 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2633 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2634 && need_peeling_or_partial_vectors_p)
2636 /* For partial-vector-usage=1, try to push the handling of partial
2637 vectors to the epilogue, with the main loop continuing to operate
2638 on full vectors.
2640 If we are unrolling we also do not want to use partial vectors. This
2641 is to avoid the overhead of generating multiple masks and also to
2642 avoid having to execute entire iterations of FALSE masked instructions
2643 when dealing with one or fewer full iterations.
2645 ??? We could then end up failing to use partial vectors if we
2646 decide to peel iterations into a prologue, and if the main loop
2647 then ends up processing fewer than VF iterations. */
2648 if ((param_vect_partial_vector_usage == 1
2649 || loop_vinfo->suggested_unroll_factor > 1)
2650 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2652 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2653 else
2654 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_NOTE, vect_location,
2659 "operating on %s vectors%s.\n",
2660 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2661 ? "partial" : "full",
2662 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2663 ? " for epilogue loop" : "");
2665 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2666 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2667 && need_peeling_or_partial_vectors_p);
2669 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2670 analysis, when we don't yet know whether the loop will be vectorized
2671 with partial vectors (for more details see tree-vect-loop-manip.cc).
2673 However, the SELECT_VL vectorization style should only be applied to
2674 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2675 the number of elements to be processed in each iteration.
2677 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2678 if the loop is not vectorized with partial vectors. */
2679 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2680 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2682 return opt_result::success ();
2685 /* Function vect_analyze_loop_2.
2687 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2688 analyses will record information in some members of LOOP_VINFO. FATAL
2689 indicates whether some analysis hit a fatal error. If a non-NULL pointer
2690 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2691 worked-out suggested unroll factor, while a NULL pointer indicates that
2692 the suggested unroll factor is being applied. SLP_DONE_FOR_SUGGESTED_UF
2693 holds the SLP decision made when the suggested unroll factor was worked
2694 out. */
2695 static opt_result
2696 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2697 unsigned *suggested_unroll_factor,
2698 bool& slp_done_for_suggested_uf)
2700 opt_result ok = opt_result::success ();
2701 int res;
2702 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2703 poly_uint64 min_vf = 2;
2704 loop_vec_info orig_loop_vinfo = NULL;
2706 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2707 loop_vec_info of the first vectorized loop. */
2708 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2709 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2710 else
2711 orig_loop_vinfo = loop_vinfo;
2712 gcc_assert (orig_loop_vinfo);
2714 /* The first group of checks is independent of the vector size. */
2715 fatal = true;
2717 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2718 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2719 return opt_result::failure_at (vect_location,
2720 "not vectorized: simd if(0)\n");
2722 /* Find all data references in the loop (which correspond to vdefs/vuses)
2723 and analyze their evolution in the loop. */
2725 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2727 /* Gather the data references and count stmts in the loop. */
2728 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2730 opt_result res
2731 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2732 &LOOP_VINFO_DATAREFS (loop_vinfo),
2733 &LOOP_VINFO_N_STMTS (loop_vinfo));
2734 if (!res)
2736 if (dump_enabled_p ())
2737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2738 "not vectorized: loop contains function "
2739 "calls or data references that cannot "
2740 "be analyzed\n");
2741 return res;
2743 loop_vinfo->shared->save_datarefs ();
2745 else
2746 loop_vinfo->shared->check_datarefs ();
2748 /* Analyze the data references and also adjust the minimal
2749 vectorization factor according to the loads and stores. */
2751 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2752 if (!ok)
2754 if (dump_enabled_p ())
2755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2756 "bad data references.\n");
2757 return ok;
2760 /* Check if we are applying unroll factor now. */
2761 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2762 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2764 /* If the SLP decision was false when the suggested unroll factor was
2765 worked out, and we are now applying that suggested unroll factor, we
2766 can simply skip all SLP-related analyses this time. */
2767 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2769 /* Classify all cross-iteration scalar data-flow cycles.
2770 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2771 vect_analyze_scalar_cycles (loop_vinfo, slp);
2773 vect_pattern_recog (loop_vinfo);
2775 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2777 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2778 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2780 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2781 if (!ok)
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2785 "bad data access.\n");
2786 return ok;
2789 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2791 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2792 if (!ok)
2794 if (dump_enabled_p ())
2795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2796 "unexpected pattern.\n");
2797 return ok;
2800 /* The rest of the analysis below depends on this in some way. */
2801 fatal = false;
2803 /* Analyze data dependences between the data-refs in the loop
2804 and adjust the maximum vectorization factor according to
2805 the dependences.
2806 FORNOW: fail at the first data dependence that we encounter. */
2808 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2809 if (!ok)
2811 if (dump_enabled_p ())
2812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2813 "bad data dependence.\n");
2814 return ok;
2816 if (max_vf != MAX_VECTORIZATION_FACTOR
2817 && maybe_lt (max_vf, min_vf))
2818 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2819 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2821 ok = vect_determine_vectorization_factor (loop_vinfo);
2822 if (!ok)
2824 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2826 "can't determine vectorization factor.\n");
2827 return ok;
2830 /* Compute the scalar iteration cost. */
2831 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2833 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2835 if (slp)
2837 /* Check the SLP opportunities in the loop, analyze and build
2838 SLP trees. */
2839 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2840 if (!ok)
2841 return ok;
2843 /* If there are any SLP instances mark them as pure_slp. */
2844 slp = vect_make_slp_decision (loop_vinfo);
2845 if (slp)
2847 /* Find stmts that need to be both vectorized and SLPed. */
2848 vect_detect_hybrid_slp (loop_vinfo);
2850 /* Update the vectorization factor based on the SLP decision. */
2851 vect_update_vf_for_slp (loop_vinfo);
2853 /* Optimize the SLP graph with the vectorization factor fixed. */
2854 vect_optimize_slp (loop_vinfo);
2856 /* Gather the loads reachable from the SLP graph entries. */
2857 vect_gather_slp_loads (loop_vinfo);
2861 bool saved_can_use_partial_vectors_p
2862 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2864 /* We don't expect to have to roll back to anything other than an empty
2865 set of rgroups. */
2866 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2868 /* This is the point where we can re-start analysis with SLP forced off. */
2869 start_over:
2871 /* Apply the suggested unrolling factor; this was determined by the backend
2872 during finish_cost the first time we ran the analysis for this
2873 vector mode. */
2874 if (applying_suggested_uf)
2875 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2877 /* Now the vectorization factor is final. */
2878 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2879 gcc_assert (known_ne (vectorization_factor, 0U));
2881 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2883 dump_printf_loc (MSG_NOTE, vect_location,
2884 "vectorization_factor = ");
2885 dump_dec (MSG_NOTE, vectorization_factor);
2886 dump_printf (MSG_NOTE, ", niters = %wd\n",
2887 LOOP_VINFO_INT_NITERS (loop_vinfo));
2890 if (max_vf != MAX_VECTORIZATION_FACTOR
2891 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2892 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2894 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2896 /* Analyze the alignment of the data-refs in the loop.
2897 Fail if a data reference is found that cannot be vectorized. */
2899 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2900 if (!ok)
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904 "bad data alignment.\n");
2905 return ok;
2908 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2909 It is important to call pruning after vect_analyze_data_ref_accesses,
2910 since we use grouping information gathered by interleaving analysis. */
2911 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2912 if (!ok)
2913 return ok;
2915 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2916 vectorization, since we do not want to add extra peeling or
2917 add versioning for alignment. */
2918 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2919 /* This pass will decide on using loop versioning and/or loop peeling in
2920 order to enhance the alignment of data references in the loop. */
2921 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2922 if (!ok)
2923 return ok;
2925 if (slp)
2927 /* Analyze operations in the SLP instances. Note this may
2928 remove unsupported SLP instances which makes the above
2929 SLP kind detection invalid. */
2930 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2931 vect_slp_analyze_operations (loop_vinfo);
2932 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2934 ok = opt_result::failure_at (vect_location,
2935 "unsupported SLP instances\n");
2936 goto again;
2939 /* Check whether any load in ALL SLP instances is possibly permuted. */
2940 slp_tree load_node, slp_root;
2941 unsigned i, x;
2942 slp_instance instance;
2943 bool can_use_lanes = true;
2944 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2946 slp_root = SLP_INSTANCE_TREE (instance);
2947 int group_size = SLP_TREE_LANES (slp_root);
2948 tree vectype = SLP_TREE_VECTYPE (slp_root);
2949 bool loads_permuted = false;
2950 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2952 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2953 continue;
2954 unsigned j;
2955 stmt_vec_info load_info;
2956 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2957 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2959 loads_permuted = true;
2960 break;
2964 /* If the loads and stores can be handled with load/store-lane
2965 instructions record it and move on to the next instance. */
2966 if (loads_permuted
2967 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2968 && vect_store_lanes_supported (vectype, group_size, false)
2969 != IFN_LAST)
2971 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2972 if (STMT_VINFO_GROUPED_ACCESS
2973 (SLP_TREE_REPRESENTATIVE (load_node)))
2975 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2976 (SLP_TREE_REPRESENTATIVE (load_node));
2977 /* Use SLP for strided accesses (or if we can't
2978 use load-lanes). */
2979 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2980 || vect_load_lanes_supported
2981 (STMT_VINFO_VECTYPE (stmt_vinfo),
2982 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2983 break;
2986 can_use_lanes
2987 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2989 if (can_use_lanes && dump_enabled_p ())
2990 dump_printf_loc (MSG_NOTE, vect_location,
2991 "SLP instance %p can use load/store-lanes\n",
2992 (void *) instance);
2994 else
2996 can_use_lanes = false;
2997 break;
3001 /* If all SLP instances can use load/store-lanes abort SLP and try again
3002 with SLP disabled. */
3003 if (can_use_lanes)
3005 ok = opt_result::failure_at (vect_location,
3006 "Built SLP cancelled: can use "
3007 "load/store-lanes\n");
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010 "Built SLP cancelled: all SLP instances support "
3011 "load/store-lanes\n");
3012 goto again;
3016 /* Dissolve SLP-only groups. */
3017 vect_dissolve_slp_only_groups (loop_vinfo);
3019 /* Scan all the remaining operations in the loop that are not subject
3020 to SLP and make sure they are vectorizable. */
3021 ok = vect_analyze_loop_operations (loop_vinfo);
3022 if (!ok)
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3026 "bad operation or unsupported loop bound.\n");
3027 return ok;
3030 /* For now, we don't expect to mix both masking and length approaches for one
3031 loop; disable partial vectors if both are recorded. */
3032 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3033 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3034 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3036 if (dump_enabled_p ())
3037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3038 "can't vectorize a loop with partial vectors"
3039 " because we don't expect to mix different"
3040 " approaches with partial vectors for the"
3041 " same loop.\n");
3042 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3045 /* If we still have the option of using partial vectors,
3046 check whether we can generate the necessary loop controls. */
3047 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3049 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3051 if (!vect_verify_full_masking (loop_vinfo)
3052 && !vect_verify_full_masking_avx512 (loop_vinfo))
3053 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3055 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3056 if (!vect_verify_loop_lens (loop_vinfo))
3057 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3060 /* If we're vectorizing a loop that uses length "controls" and
3061 can iterate more than once, we apply the decrementing IV approach
3062 in the loop control. */
3063 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3064 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3065 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3066 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3067 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3068 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3069 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3071 /* If a loop uses length controls and has a decrementing loop control IV,
3072 we will normally pass that IV through a MIN_EXPR to calculate the
3073 basis for the length controls. E.g. in a loop that processes one
3074 element per scalar iteration, the number of elements would be
3075 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3077 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3078 step, since only the final iteration of the vector loop can have
3079 inactive lanes.
3081 However, some targets have a dedicated instruction for calculating the
3082 preferred length, given the total number of elements that still need to
3083 be processed. This is encapsulated in the SELECT_VL internal function.
3085 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3086 to determine the basis for the length controls. However, unlike the
3087 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3088 lanes inactive in any iteration of the vector loop, not just the last
3089 iteration. This SELECT_VL approach therefore requires us to use pointer
3090 IVs with variable steps.
3092 Once we've decided how many elements should be processed by one
3093 iteration of the vector loop, we need to populate the rgroup controls.
3094 If a loop has multiple rgroups, we need to make sure that those rgroups
3095 "line up" (that is, they must be consistent about which elements are
3096 active and which aren't). This is done by vect_adjust_loop_lens_control.
3098 In principle, it would be possible to use vect_adjust_loop_lens_control
3099 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3100 However:
3102 (1) In practice, it only makes sense to use SELECT_VL when a vector
3103 operation will be controlled directly by the result. It is not
3104 worth using SELECT_VL if it would only be the input to other
3105 calculations.
3107 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3108 pointer IV will need N updates by a variable amount (N-1 updates
3109 within the iteration and 1 update to move to the next iteration).
3111 Because of this, we prefer to use the MIN_EXPR approach whenever there
3112 is more than one length control.
3114 In addition, SELECT_VL always operates to a granularity of 1 unit.
3115 If we wanted to use it to control an SLP operation on N consecutive
3116 elements, we would need to make the SELECT_VL inputs measure scalar
3117 iterations (rather than elements) and then multiply the SELECT_VL
3118 result by N. But using SELECT_VL this way is inefficient because
3119 of (1) above.
3121 Finally, we don't apply SELECT_VL on a single rgroup when both (1) and
3122 (2) are satisfied:
3124 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3125 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3127 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3128 we would then lose the benefits of the subsequent unroll optimizations.
3129 We prefer using the MIN_EXPR approach in this situation. */
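/* A hedged sketch (not part of the original comment) of the two styles in
   scalar pseudo-code; N, VF, len and process are hypothetical names.

   MIN_EXPR style - only the final iteration can be partial, so IVs can
   advance by the invariant step VF:

       for (i = 0; i < N; i += VF)
         {
           len = MIN (N - i, VF);
           process (i, len);
         }

   SELECT_VL style - the target may choose len < VF in any iteration, so
   IVs must advance by the variable len:

       for (i = 0; i < N; i += len)
         {
           len = SELECT_VL (N - i, VF);
           process (i, len);
         }  */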
3130 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3132 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3133 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3134 OPTIMIZE_FOR_SPEED)
3135 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3136 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3137 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3138 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3139 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3142 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3143 assuming that the loop will be used as a main loop. We will redo
3144 this analysis later if we instead decide to use the loop as an
3145 epilogue loop. */
3146 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3147 if (!ok)
3148 return ok;
3150 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3151 to be able to handle fewer than VF scalars, or needs to have a lower VF
3152 than the main loop. */
3153 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3154 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3156 poly_uint64 unscaled_vf
3157 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3158 orig_loop_vinfo->suggested_unroll_factor);
3159 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3160 return opt_result::failure_at (vect_location,
3161 "Vectorization factor too high for"
3162 " epilogue loop.\n");
3165 /* Check the costings of the loop make vectorizing worthwhile. */
3166 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3167 if (res < 0)
3169 ok = opt_result::failure_at (vect_location,
3170 "Loop costings may not be worthwhile.\n");
3171 goto again;
3173 if (!res)
3174 return opt_result::failure_at (vect_location,
3175 "Loop costings not worthwhile.\n");
3177 /* If an epilogue loop is required make sure we can create one. */
3178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3179 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3180 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3182 if (dump_enabled_p ())
3183 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3184 if (!vect_can_advance_ivs_p (loop_vinfo)
3185 || !slpeel_can_duplicate_loop_p (loop,
3186 LOOP_VINFO_IV_EXIT (loop_vinfo),
3187 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3189 ok = opt_result::failure_at (vect_location,
3190 "not vectorized: can't create required "
3191 "epilog loop\n");
3192 goto again;
3196 /* During peeling, we need to check if the number of loop iterations is
3197 enough for both the peeled prolog loop and the vector loop. This check
3198 can be merged along with threshold check of loop versioning, so
3199 increase threshold for this case if necessary.
3201 If we are analyzing an epilogue we still want to check what its
3202 versioning threshold would be. If we decide to vectorize the epilogues we
3203 will want to use the lowest versioning threshold of all epilogues and main
3204 loop. This will enable us to enter a vectorized epilogue even when
3205 versioning the loop. We can't simply check whether the epilogue requires
3206 versioning though since we may have skipped some versioning checks when
3207 analyzing the epilogue. For instance, checks for alias versioning will be
3208 skipped when dealing with epilogues as we assume we already checked them
3209 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3210 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3212 poly_uint64 niters_th = 0;
3213 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3215 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3217 /* Niters for peeled prolog loop. */
3218 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3220 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3221 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3222 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3224 else
3225 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3228 /* Niters for at least one iteration of vectorized loop. */
3229 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3230 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3231 /* One additional iteration because of peeling for gap. */
3232 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3233 niters_th += 1;
3235 /* Use the same condition as vect_transform_loop to decide when to use
3236 the cost to determine a versioning threshold. */
3237 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3238 && ordered_p (th, niters_th))
3239 niters_th = ordered_max (poly_uint64 (th), niters_th);
3241 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
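/* A worked example (illustrative only), assuming no loop masking for
   alignment and no partial vectors: with prologue peeling of 3 iterations,
   a vectorization factor of 16 and peeling for gaps, the code above gives
   niters_th = 3 + 16 + 1 = 20; when the runtime profitability check
   applies, the recorded threshold is the maximum of 20 and the cost-model
   threshold th.  */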
3244 gcc_assert (known_eq (vectorization_factor,
3245 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3247 slp_done_for_suggested_uf = slp;
3249 /* Ok to vectorize! */
3250 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3251 return opt_result::success ();
3253 again:
3254 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3255 gcc_assert (!ok);
3257 /* Try again with SLP forced off but if we didn't do any SLP there is
3258 no point in re-trying. */
3259 if (!slp)
3260 return ok;
3262 /* If the SLP decision was true when the suggested unroll factor was
3263 worked out, and we are now applying that suggested unroll factor, we
3264 don't need to retry any more. */
3265 if (applying_suggested_uf && slp_done_for_suggested_uf)
3266 return ok;
3268 /* If there are reduction chains re-trying will fail anyway. */
3269 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3270 return ok;
3272 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3273 via interleaving or lane instructions. */
3274 slp_instance instance;
3275 slp_tree node;
3276 unsigned i, j;
3277 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3279 stmt_vec_info vinfo;
3280 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3281 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3282 continue;
3283 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3284 unsigned int size = DR_GROUP_SIZE (vinfo);
3285 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3286 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3287 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3288 && ! vect_grouped_store_supported (vectype, size))
3289 return opt_result::failure_at (vinfo->stmt,
3290 "unsupported grouped store\n");
3291 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3293 vinfo = SLP_TREE_REPRESENTATIVE (node);
3294 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3296 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3297 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3298 size = DR_GROUP_SIZE (vinfo);
3299 vectype = STMT_VINFO_VECTYPE (vinfo);
3300 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3301 && ! vect_grouped_load_supported (vectype, single_element_p,
3302 size))
3303 return opt_result::failure_at (vinfo->stmt,
3304 "unsupported grouped load\n");
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_NOTE, vect_location,
3311 "re-trying with SLP disabled\n");
3313 /* Roll back state appropriately. No SLP this time. */
3314 slp = false;
3315 /* Restore vectorization factor as it were without SLP. */
3316 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3317 /* Free the SLP instances. */
3318 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3319 vect_free_slp_instance (instance);
3320 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3321 /* Reset SLP type to loop_vect on all stmts. */
3322 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3324 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3325 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3326 !gsi_end_p (si); gsi_next (&si))
3328 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3329 STMT_SLP_TYPE (stmt_info) = loop_vect;
3330 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3331 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3333 /* vectorizable_reduction adjusts reduction stmt def-types,
3334 restore them to that of the PHI. */
3335 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3336 = STMT_VINFO_DEF_TYPE (stmt_info);
3337 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3338 (STMT_VINFO_REDUC_DEF (stmt_info)))
3339 = STMT_VINFO_DEF_TYPE (stmt_info);
3342 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3343 !gsi_end_p (si); gsi_next (&si))
3345 if (is_gimple_debug (gsi_stmt (si)))
3346 continue;
3347 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3348 STMT_SLP_TYPE (stmt_info) = loop_vect;
3349 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3351 stmt_vec_info pattern_stmt_info
3352 = STMT_VINFO_RELATED_STMT (stmt_info);
3353 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3354 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3356 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3357 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3358 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3359 !gsi_end_p (pi); gsi_next (&pi))
3360 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3361 = loop_vect;
3365 /* Free optimized alias test DDRS. */
3366 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3367 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3368 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3369 /* Reset target cost data. */
3370 delete loop_vinfo->vector_costs;
3371 loop_vinfo->vector_costs = nullptr;
3372 /* Reset accumulated rgroup information. */
3373 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3374 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3375 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3376 /* Reset assorted flags. */
3377 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3378 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3379 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3380 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3381 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3382 = saved_can_use_partial_vectors_p;
3384 goto start_over;
3387 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3388 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3389 OLD_LOOP_VINFO is better unless something specifically indicates
3390 otherwise.
3392 Note that this deliberately isn't a partial order. */
3394 static bool
3395 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3396 loop_vec_info old_loop_vinfo)
3398 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3399 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3401 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3402 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3404 /* Always prefer a VF of loop->simdlen over any other VF. */
3405 if (loop->simdlen)
3407 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3408 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3409 if (new_simdlen_p != old_simdlen_p)
3410 return new_simdlen_p;
3413 const auto *old_costs = old_loop_vinfo->vector_costs;
3414 const auto *new_costs = new_loop_vinfo->vector_costs;
3415 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3416 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3418 return new_costs->better_main_loop_than_p (old_costs);
3421 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3422 true if we should. */
3424 static bool
3425 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3426 loop_vec_info old_loop_vinfo)
3428 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3429 return false;
3431 if (dump_enabled_p ())
3432 dump_printf_loc (MSG_NOTE, vect_location,
3433 "***** Preferring vector mode %s to vector mode %s\n",
3434 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3435 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3436 return true;
3439 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3440 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3441 MODE_I to the next mode useful to analyze.
3442 Return the loop_vinfo on success and wrapped null on failure. */
3444 static opt_loop_vec_info
3445 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3446 const vect_loop_form_info *loop_form_info,
3447 loop_vec_info main_loop_vinfo,
3448 const vector_modes &vector_modes, unsigned &mode_i,
3449 machine_mode &autodetected_vector_mode,
3450 bool &fatal)
3452 loop_vec_info loop_vinfo
3453 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3455 machine_mode vector_mode = vector_modes[mode_i];
3456 loop_vinfo->vector_mode = vector_mode;
3457 unsigned int suggested_unroll_factor = 1;
3458 bool slp_done_for_suggested_uf = false;
3460 /* Run the main analysis. */
3461 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3462 &suggested_unroll_factor,
3463 slp_done_for_suggested_uf);
3464 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_NOTE, vect_location,
3466 "***** Analysis %s with vector mode %s\n",
3467 res ? "succeeded" : "failed",
3468 GET_MODE_NAME (loop_vinfo->vector_mode));
3470 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3472 if (dump_enabled_p ())
3473 dump_printf_loc (MSG_NOTE, vect_location,
3474 "***** Re-trying analysis for unrolling"
3475 " with unroll factor %d and slp %s.\n",
3476 suggested_unroll_factor,
3477 slp_done_for_suggested_uf ? "on" : "off");
3478 loop_vec_info unroll_vinfo
3479 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3480 unroll_vinfo->vector_mode = vector_mode;
3481 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3482 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3483 slp_done_for_suggested_uf);
3484 if (new_res)
3486 delete loop_vinfo;
3487 loop_vinfo = unroll_vinfo;
3489 else
3490 delete unroll_vinfo;
3493 /* Remember the autodetected vector mode. */
3494 if (vector_mode == VOIDmode)
3495 autodetected_vector_mode = loop_vinfo->vector_mode;
3497 /* Advance mode_i, first skipping modes that would result in the
3498 same analysis result. */
3499 while (mode_i + 1 < vector_modes.length ()
3500 && vect_chooses_same_modes_p (loop_vinfo,
3501 vector_modes[mode_i + 1]))
3503 if (dump_enabled_p ())
3504 dump_printf_loc (MSG_NOTE, vect_location,
3505 "***** The result for vector mode %s would"
3506 " be the same\n",
3507 GET_MODE_NAME (vector_modes[mode_i + 1]));
3508 mode_i += 1;
3510 if (mode_i + 1 < vector_modes.length ()
3511 && VECTOR_MODE_P (autodetected_vector_mode)
3512 && (related_vector_mode (vector_modes[mode_i + 1],
3513 GET_MODE_INNER (autodetected_vector_mode))
3514 == autodetected_vector_mode)
3515 && (related_vector_mode (autodetected_vector_mode,
3516 GET_MODE_INNER (vector_modes[mode_i + 1]))
3517 == vector_modes[mode_i + 1]))
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "***** Skipping vector mode %s, which would"
3522 " repeat the analysis for %s\n",
3523 GET_MODE_NAME (vector_modes[mode_i + 1]),
3524 GET_MODE_NAME (autodetected_vector_mode));
3525 mode_i += 1;
3527 mode_i++;
3529 if (!res)
3531 delete loop_vinfo;
3532 if (fatal)
3533 gcc_checking_assert (main_loop_vinfo == NULL);
3534 return opt_loop_vec_info::propagate_failure (res);
3537 return opt_loop_vec_info::success (loop_vinfo);
3540 /* Function vect_analyze_loop.
3542 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3543 for it. The different analyses will record information in the
3544 loop_vec_info struct. */
3545 opt_loop_vec_info
3546 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3548 DUMP_VECT_SCOPE ("analyze_loop_nest");
3550 if (loop_outer (loop)
3551 && loop_vec_info_for_loop (loop_outer (loop))
3552 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3553 return opt_loop_vec_info::failure_at (vect_location,
3554 "outer-loop already vectorized.\n");
3556 if (!find_loop_nest (loop, &shared->loop_nest))
3557 return opt_loop_vec_info::failure_at
3558 (vect_location,
3559 "not vectorized: loop nest containing two or more consecutive inner"
3560 " loops cannot be vectorized\n");
3562 /* Analyze the loop form. */
3563 vect_loop_form_info loop_form_info;
3564 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3565 if (!res)
3567 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3569 "bad loop form.\n");
3570 return opt_loop_vec_info::propagate_failure (res);
3572 if (!integer_onep (loop_form_info.assumptions))
3574 /* We consider to vectorize this loop by versioning it under
3575 some assumptions. In order to do this, we need to clear
3576 existing information computed by scev and niter analyzer. */
3577 scev_reset_htab ();
3578 free_numbers_of_iterations_estimates (loop);
3579 /* Also set a flag for this loop so that the following scev and niter
3580 analyses are done under the assumptions. */
3581 loop_constraint_set (loop, LOOP_C_FINITE);
3583 else
3584 /* Clear the existing niter information to make sure the nonwrapping flag
3585 will be calculated and set properly. */
3586 free_numbers_of_iterations_estimates (loop);
3588 auto_vector_modes vector_modes;
3589 /* Autodetect first vector size we try. */
3590 vector_modes.safe_push (VOIDmode);
3591 unsigned int autovec_flags
3592 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3593 loop->simdlen != 0);
3594 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3595 && !unlimited_cost_model (loop));
3596 machine_mode autodetected_vector_mode = VOIDmode;
3597 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3598 unsigned int mode_i = 0;
3599 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3601 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3602 a mode has not been analyzed. */
3603 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3604 for (unsigned i = 0; i < vector_modes.length (); ++i)
3605 cached_vf_per_mode.safe_push (0);
3607 /* First determine the main loop vectorization mode, either the first
3608 one that works, starting with auto-detecting the vector mode and then
3609 following the targets order of preference, or the one with the
3610 lowest cost if pick_lowest_cost_p. */
3611 while (1)
3613 bool fatal;
3614 unsigned int last_mode_i = mode_i;
3615 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3616 failed. */
3617 cached_vf_per_mode[last_mode_i] = -1;
3618 opt_loop_vec_info loop_vinfo
3619 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3620 NULL, vector_modes, mode_i,
3621 autodetected_vector_mode, fatal);
3622 if (fatal)
3623 break;
3625 if (loop_vinfo)
3627 /* Analysis has been successful, so update the VF value. The
3628 VF should always be a multiple of unroll_factor and we want to
3629 capture the original VF here. */
3630 cached_vf_per_mode[last_mode_i]
3631 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3632 loop_vinfo->suggested_unroll_factor);
3633 /* Once we hit the desired simdlen for the first time,
3634 discard any previous attempts. */
3635 if (simdlen
3636 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3638 delete first_loop_vinfo;
3639 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3640 simdlen = 0;
3642 else if (pick_lowest_cost_p
3643 && first_loop_vinfo
3644 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3646 /* Pick loop_vinfo over first_loop_vinfo. */
3647 delete first_loop_vinfo;
3648 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3650 if (first_loop_vinfo == NULL)
3651 first_loop_vinfo = loop_vinfo;
3652 else
3654 delete loop_vinfo;
3655 loop_vinfo = opt_loop_vec_info::success (NULL);
3658 /* Commit to first_loop_vinfo if we have no reason to try
3659 alternatives. */
3660 if (!simdlen && !pick_lowest_cost_p)
3661 break;
3663 if (mode_i == vector_modes.length ()
3664 || autodetected_vector_mode == VOIDmode)
3665 break;
3667 /* Try the next biggest vector size. */
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_NOTE, vect_location,
3670 "***** Re-trying analysis with vector mode %s\n",
3671 GET_MODE_NAME (vector_modes[mode_i]));
3673 if (!first_loop_vinfo)
3674 return opt_loop_vec_info::propagate_failure (res);
3676 if (dump_enabled_p ())
3677 dump_printf_loc (MSG_NOTE, vect_location,
3678 "***** Choosing vector mode %s\n",
3679 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3681 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3682 enabled, SIMDUID is not set, it is the innermost loop and we have
3683 either already found the loop's SIMDLEN or there was no SIMDLEN to
3684 begin with.
3685 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3686 bool vect_epilogues = (!simdlen
3687 && loop->inner == NULL
3688 && param_vect_epilogues_nomask
3689 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3690 /* No code motion support for multiple epilogues so for now
3691 this is not supported when there are multiple exits. */
3692 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3693 && !loop->simduid);
3694 if (!vect_epilogues)
3695 return first_loop_vinfo;
3697 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3698 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3700 /* For epilogues start the analysis from the first mode. The motivation
3701 behind starting from the beginning comes from cases where the VECTOR_MODES
3702 array may contain length-agnostic and length-specific modes. Their
3703 ordering is not guaranteed, so we could end up picking a mode for the main
3704 loop that is after the epilogue's optimal mode. */
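/* Purely as a hypothetical illustration: if the target's mode array were
   { VNx4SI, V16QI, V8QI } and the length-agnostic VNx4SI mode had been
   chosen for the main loop, the fixed-length V16QI entry earlier in the
   array could still be the best epilogue choice, hence the rescan from
   index 0 below.  */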
3705 vector_modes[0] = autodetected_vector_mode;
3706 mode_i = 0;
3708 bool supports_partial_vectors =
3709 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3710 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3712 while (1)
3714 /* If the target does not support partial vectors we can shorten the
3715 number of modes to analyze for the epilogue as we know we can't pick a
3716 mode that would lead to a VF at least as big as the
3717 FIRST_VINFO_VF. */
3718 if (!supports_partial_vectors
3719 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3721 mode_i++;
3722 if (mode_i == vector_modes.length ())
3723 break;
3724 continue;
3727 if (dump_enabled_p ())
3728 dump_printf_loc (MSG_NOTE, vect_location,
3729 "***** Re-trying epilogue analysis with vector "
3730 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3732 bool fatal;
3733 opt_loop_vec_info loop_vinfo
3734 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3735 first_loop_vinfo,
3736 vector_modes, mode_i,
3737 autodetected_vector_mode, fatal);
3738 if (fatal)
3739 break;
3741 if (loop_vinfo)
3743 if (pick_lowest_cost_p)
3745 /* Keep trying to roll back vectorization attempts while the
3746 loop_vec_infos they produced were worse than this one. */
3747 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3748 while (!vinfos.is_empty ()
3749 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3751 gcc_assert (vect_epilogues);
3752 delete vinfos.pop ();
3755 /* For now only allow one epilogue loop. */
3756 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3758 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3759 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3760 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3761 || maybe_ne (lowest_th, 0U));
3762 /* Keep track of the known smallest versioning
3763 threshold. */
3764 if (ordered_p (lowest_th, th))
3765 lowest_th = ordered_min (lowest_th, th);
3767 else
3769 delete loop_vinfo;
3770 loop_vinfo = opt_loop_vec_info::success (NULL);
3773 /* For now only allow one epilogue loop, but allow
3774 pick_lowest_cost_p to replace it, so commit to the
3775 first epilogue if we have no reason to try alternatives. */
3776 if (!pick_lowest_cost_p)
3777 break;
3780 if (mode_i == vector_modes.length ())
3781 break;
3785 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3787 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3788 if (dump_enabled_p ())
3789 dump_printf_loc (MSG_NOTE, vect_location,
3790 "***** Choosing epilogue vector mode %s\n",
3791 GET_MODE_NAME
3792 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3795 return first_loop_vinfo;
3798 /* Return true if there is an in-order reduction function for CODE, storing
3799 it in *REDUC_FN if so. */
3801 static bool
3802 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3804 /* We support MINUS_EXPR by negating the operand. This also preserves an
3805 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3806 (-0.0) = -0.0. */
3807 if (code == PLUS_EXPR || code == MINUS_EXPR)
3809 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3810 return true;
3812 return false;
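/* For illustration: with the mapping above, a scalar loop such as
   "s -= a[i]" is handled as the in-order sum "s += -a[i]", so both
   PLUS_EXPR and MINUS_EXPR reductions end up using IFN_FOLD_LEFT_PLUS.  */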
3815 /* Function reduction_fn_for_scalar_code
3817 Input:
3818 CODE - tree_code of a reduction operation.
3820 Output:
3821 REDUC_FN - the corresponding internal function to be used to reduce the
3822 vector of partial results into a single scalar result, or IFN_LAST
3823 if the operation is a supported reduction operation, but does not have
3824 such an internal function.
3826 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3828 bool
3829 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3831 if (code.is_tree_code ())
3832 switch (tree_code (code))
3834 case MAX_EXPR:
3835 *reduc_fn = IFN_REDUC_MAX;
3836 return true;
3838 case MIN_EXPR:
3839 *reduc_fn = IFN_REDUC_MIN;
3840 return true;
3842 case PLUS_EXPR:
3843 *reduc_fn = IFN_REDUC_PLUS;
3844 return true;
3846 case BIT_AND_EXPR:
3847 *reduc_fn = IFN_REDUC_AND;
3848 return true;
3850 case BIT_IOR_EXPR:
3851 *reduc_fn = IFN_REDUC_IOR;
3852 return true;
3854 case BIT_XOR_EXPR:
3855 *reduc_fn = IFN_REDUC_XOR;
3856 return true;
3858 case MULT_EXPR:
3859 case MINUS_EXPR:
3860 *reduc_fn = IFN_LAST;
3861 return true;
3863 default:
3864 return false;
3866 else
3867 switch (combined_fn (code))
3869 CASE_CFN_FMAX:
3870 *reduc_fn = IFN_REDUC_FMAX;
3871 return true;
3873 CASE_CFN_FMIN:
3874 *reduc_fn = IFN_REDUC_FMIN;
3875 return true;
3877 default:
3878 return false;
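/* Illustration only: a scalar reduction such as "m = MAX (m, a[i])"
   reduces its vector of partial results with IFN_REDUC_MAX, while
   MULT_EXPR is accepted above but reports IFN_LAST, so its epilogue
   must be generated without a single reduction internal function.  */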
3882 /* If there is a neutral value X such that a reduction would not be affected
3883 by the introduction of additional X elements, return that X, otherwise
3884 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3885 of the scalar elements. If the reduction has just a single initial value
3886 then INITIAL_VALUE is that value, otherwise it is null.
3887 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3888 In that case no signed zero is returned. */
3890 tree
3891 neutral_op_for_reduction (tree scalar_type, code_helper code,
3892 tree initial_value, bool as_initial)
3894 if (code.is_tree_code ())
3895 switch (tree_code (code))
3897 case DOT_PROD_EXPR:
3898 case SAD_EXPR:
3899 case MINUS_EXPR:
3900 case BIT_IOR_EXPR:
3901 case BIT_XOR_EXPR:
3902 return build_zero_cst (scalar_type);
3903 case WIDEN_SUM_EXPR:
3904 case PLUS_EXPR:
3905 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3906 return build_real (scalar_type, dconstm0);
3907 else
3908 return build_zero_cst (scalar_type);
3910 case MULT_EXPR:
3911 return build_one_cst (scalar_type);
3913 case BIT_AND_EXPR:
3914 return build_all_ones_cst (scalar_type);
3916 case MAX_EXPR:
3917 case MIN_EXPR:
3918 return initial_value;
3920 default:
3921 return NULL_TREE;
3923 else
3924 switch (combined_fn (code))
3926 CASE_CFN_FMIN:
3927 CASE_CFN_FMAX:
3928 return initial_value;
3930 default:
3931 return NULL_TREE;
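/* Illustration only: padding a MULT_EXPR reduction with 1, or a
   BIT_AND_EXPR reduction with all-ones, leaves the result unchanged;
   MIN_EXPR/MAX_EXPR and FMIN/FMAX have no constant neutral element,
   which is why the initial value itself is returned for them.  */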
3935 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3936 STMT is printed with a message MSG. */
3938 static void
3939 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3941 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3944 /* Return true if we need an in-order reduction for operation CODE
3945 on type TYPE. */
3948 bool
3949 needs_fold_left_reduction_p (tree type, code_helper code)
3951 /* CHECKME: check for !flag_finite_math_only too? */
3952 if (SCALAR_FLOAT_TYPE_P (type))
3954 if (code.is_tree_code ())
3955 switch (tree_code (code))
3957 case MIN_EXPR:
3958 case MAX_EXPR:
3959 return false;
3961 default:
3962 return !flag_associative_math;
3964 else
3965 switch (combined_fn (code))
3967 CASE_CFN_FMIN:
3968 CASE_CFN_FMAX:
3969 return false;
3971 default:
3972 return !flag_associative_math;
3976 if (INTEGRAL_TYPE_P (type))
3977 return (!code.is_tree_code ()
3978 || !operation_no_trapping_overflow (type, tree_code (code)));
3980 if (SAT_FIXED_POINT_TYPE_P (type))
3981 return true;
3983 return false;
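/* Illustration only: a float accumulation "s += a[i]" compiled without
   -fassociative-math must be reduced in order because FP addition is not
   associative, whereas FMIN/FMAX and MIN_EXPR/MAX_EXPR reductions may be
   reordered freely, matching the switches above.  */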
3986 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3987 has a handled computation expression. Store the main reduction
3988 operation in *CODE. */
3990 static bool
3991 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3992 tree loop_arg, code_helper *code,
3993 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3995 auto_bitmap visited;
3996 tree lookfor = PHI_RESULT (phi);
3997 ssa_op_iter curri;
3998 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3999 while (USE_FROM_PTR (curr) != loop_arg)
4000 curr = op_iter_next_use (&curri);
4001 curri.i = curri.numops;
4004 path.safe_push (std::make_pair (curri, curr));
4005 tree use = USE_FROM_PTR (curr);
4006 if (use == lookfor)
4007 break;
4008 gimple *def = SSA_NAME_DEF_STMT (use);
4009 if (gimple_nop_p (def)
4010 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4012 pop:
4015 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4016 curri = x.first;
4017 curr = x.second;
4019 curr = op_iter_next_use (&curri);
4020 /* Skip already visited or non-SSA operands (from iterating
4021 over PHI args). */
4022 while (curr != NULL_USE_OPERAND_P
4023 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4024 || ! bitmap_set_bit (visited,
4025 SSA_NAME_VERSION
4026 (USE_FROM_PTR (curr)))));
4028 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4029 if (curr == NULL_USE_OPERAND_P)
4030 break;
4032 else
4034 if (gimple_code (def) == GIMPLE_PHI)
4035 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4036 else
4037 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4038 while (curr != NULL_USE_OPERAND_P
4039 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4040 || ! bitmap_set_bit (visited,
4041 SSA_NAME_VERSION
4042 (USE_FROM_PTR (curr)))))
4043 curr = op_iter_next_use (&curri);
4044 if (curr == NULL_USE_OPERAND_P)
4045 goto pop;
4048 while (1);
4049 if (dump_file && (dump_flags & TDF_DETAILS))
4051 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4052 unsigned i;
4053 std::pair<ssa_op_iter, use_operand_p> *x;
4054 FOR_EACH_VEC_ELT (path, i, x)
4055 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4056 dump_printf (MSG_NOTE, "\n");
4059 /* Check whether the reduction path detected is valid. */
4060 bool fail = path.length () == 0;
4061 bool neg = false;
4062 int sign = -1;
4063 *code = ERROR_MARK;
4064 for (unsigned i = 1; i < path.length (); ++i)
4066 gimple *use_stmt = USE_STMT (path[i].second);
4067 gimple_match_op op;
4068 if (!gimple_extract_op (use_stmt, &op))
4070 fail = true;
4071 break;
4073 unsigned int opi = op.num_ops;
4074 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4076 /* The following makes sure we can compute the operand index
4077 easily, plus it mostly disallows chaining via COND_EXPR condition
4078 operands. */
4079 for (opi = 0; opi < op.num_ops; ++opi)
4080 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4081 break;
4083 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4085 for (opi = 0; opi < op.num_ops; ++opi)
4086 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4087 break;
4089 if (opi == op.num_ops)
4091 fail = true;
4092 break;
4094 op.code = canonicalize_code (op.code, op.type);
4095 if (op.code == MINUS_EXPR)
4097 op.code = PLUS_EXPR;
4098 /* Track whether we negate the reduction value each iteration. */
4099 if (op.ops[1] == op.ops[opi])
4100 neg = ! neg;
4102 if (CONVERT_EXPR_CODE_P (op.code)
4103 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4105 else if (*code == ERROR_MARK)
4107 *code = op.code;
4108 sign = TYPE_SIGN (op.type);
4110 else if (op.code != *code)
4112 fail = true;
4113 break;
4115 else if ((op.code == MIN_EXPR
4116 || op.code == MAX_EXPR)
4117 && sign != TYPE_SIGN (op.type))
4119 fail = true;
4120 break;
4122 /* Check that the op is used on only a single stmt. For the
4123 non-value-changing tail and the last stmt allow out-of-loop uses.
4124 ??? We could relax this and handle arbitrary live stmts by
4125 forcing a scalar epilogue for example. */
4126 imm_use_iterator imm_iter;
4127 use_operand_p use_p;
4128 gimple *op_use_stmt;
4129 unsigned cnt = 0;
4130 bool cond_fn_p = op.code.is_internal_fn ()
4131 && (conditional_internal_fn_code (internal_fn (op.code))
4132 != ERROR_MARK);
4134 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4136 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4137 op1 twice (once as definition, once as else) in the same operation.
4138 Allow this. */
4139 if (cond_fn_p && op_use_stmt == use_stmt)
4141 gcall *call = as_a<gcall *> (use_stmt);
4142 unsigned else_pos
4143 = internal_fn_else_index (internal_fn (op.code));
4145 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4147 if (j == else_pos)
4148 continue;
4149 if (gimple_call_arg (call, j) == op.ops[opi])
4150 cnt++;
4153 else if (!is_gimple_debug (op_use_stmt)
4154 && (*code != ERROR_MARK
4155 || flow_bb_inside_loop_p (loop,
4156 gimple_bb (op_use_stmt))))
4157 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4158 cnt++;
4161 if (cnt != 1)
4163 fail = true;
4164 break;
4167 return ! fail && ! neg && *code != ERROR_MARK;
4170 bool
4171 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4172 tree loop_arg, enum tree_code code)
4174 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4175 code_helper code_;
4176 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4177 && code_ == code);
4182 /* Function vect_is_simple_reduction
4184 (1) Detect a cross-iteration def-use cycle that represents a simple
4185 reduction computation. We look for the following pattern:
4187 loop_header:
4188 a1 = phi < a0, a2 >
4189 a3 = ...
4190 a2 = operation (a3, a1)
4194 a3 = ...
4195 loop_header:
4196 a1 = phi < a0, a2 >
4197 a2 = operation (a3, a1)
4199 such that:
4200 1. operation is commutative and associative and it is safe to
4201 change the order of the computation
4202 2. no uses for a2 in the loop (a2 is used out of the loop)
4203 3. no uses of a1 in the loop besides the reduction operation
4204 4. no uses of a1 outside the loop.
4206 Conditions 1,4 are tested here.
4207 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4209 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4210 nested cycles.
4212 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4213 reductions:
4215 a1 = phi < a0, a2 >
4216 inner loop (def of a3)
4217 a2 = phi < a3 >
4219 (4) Detect condition expressions, i.e.:
4220 for (int i = 0; i < N; i++)
4221 if (a[i] < val)
4222 ret_val = a[i];
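     For illustration (hypothetical source, matching pattern (1) above):
       for (i = 0; i < N; i++)
         sum += a[i];
     gives a1 = phi <a0, a2> and a2 = a1 + a[i] in the loop header,
     which the code below classifies as a simple reduction.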
4226 static stmt_vec_info
4227 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4228 bool *double_reduc, bool *reduc_chain_p, bool slp)
4230 gphi *phi = as_a <gphi *> (phi_info->stmt);
4231 gimple *phi_use_stmt = NULL;
4232 imm_use_iterator imm_iter;
4233 use_operand_p use_p;
4235 *double_reduc = false;
4236 *reduc_chain_p = false;
4237 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4239 tree phi_name = PHI_RESULT (phi);
4240 /* ??? If there are no uses of the PHI result the inner loop reduction
4241 won't be detected as possibly double-reduction by vectorizable_reduction
4242 because that tries to walk the PHI arg from the preheader edge which
4243 can be constant. See PR60382. */
4244 if (has_zero_uses (phi_name))
4245 return NULL;
4246 class loop *loop = (gimple_bb (phi))->loop_father;
4247 unsigned nphi_def_loop_uses = 0;
4248 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4250 gimple *use_stmt = USE_STMT (use_p);
4251 if (is_gimple_debug (use_stmt))
4252 continue;
4254 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4256 if (dump_enabled_p ())
4257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4258 "intermediate value used outside loop.\n");
4260 return NULL;
4263 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4264 op1 twice (once as definition, once as else) in the same operation.
4265 Only count it as one. */
4266 if (use_stmt != phi_use_stmt)
4268 nphi_def_loop_uses++;
4269 phi_use_stmt = use_stmt;
4273 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4274 if (TREE_CODE (latch_def) != SSA_NAME)
4276 if (dump_enabled_p ())
4277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4278 "reduction: not ssa_name: %T\n", latch_def);
4279 return NULL;
4282 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4283 if (!def_stmt_info
4284 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4285 return NULL;
4287 bool nested_in_vect_loop
4288 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4289 unsigned nlatch_def_loop_uses = 0;
4290 auto_vec<gphi *, 3> lcphis;
4291 bool inner_loop_of_double_reduc = false;
4292 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4294 gimple *use_stmt = USE_STMT (use_p);
4295 if (is_gimple_debug (use_stmt))
4296 continue;
4297 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4298 nlatch_def_loop_uses++;
4299 else
4301 /* We can have more than one loop-closed PHI. */
4302 lcphis.safe_push (as_a <gphi *> (use_stmt));
4303 if (nested_in_vect_loop
4304 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4305 == vect_double_reduction_def))
4306 inner_loop_of_double_reduc = true;
4310 /* If we are vectorizing an inner reduction, we execute it in the
4311 original order only when we are not dealing with a double
4312 reduction. */
4313 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4315 if (dump_enabled_p ())
4316 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4317 "detected nested cycle: ");
4318 return def_stmt_info;
4321 /* When the inner loop of a double reduction ends up with more than
4322 one loop-closed PHI we have failed to classify alternate such
4323 PHIs as double reduction, leading to wrong code. See PR103237. */
4324 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4326 if (dump_enabled_p ())
4327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4328 "unhandle double reduction\n");
4329 return NULL;
4332 /* If this isn't a nested cycle or if the nested cycle reduction value
4333 is used outside of the inner loop we cannot handle uses of the reduction
4334 value. */
4335 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4337 if (dump_enabled_p ())
4338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4339 "reduction used in loop.\n");
4340 return NULL;
4343 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4344 defined in the inner loop. */
4345 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4347 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4348 if (gimple_phi_num_args (def_stmt) != 1
4349 || TREE_CODE (op1) != SSA_NAME)
4351 if (dump_enabled_p ())
4352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4353 "unsupported phi node definition.\n");
4355 return NULL;
4358 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4359 and the latch definition op1. */
4360 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4361 if (gimple_bb (def1)
4362 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4363 && loop->inner
4364 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4365 && (is_gimple_assign (def1) || is_gimple_call (def1))
4366 && is_a <gphi *> (phi_use_stmt)
4367 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4368 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4369 loop_latch_edge (loop->inner))))
4371 if (dump_enabled_p ())
4372 report_vect_op (MSG_NOTE, def_stmt,
4373 "detected double reduction: ");
4375 *double_reduc = true;
4376 return def_stmt_info;
4379 return NULL;
4382 /* Look for the expression computing latch_def from the loop PHI result. */
4383 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4384 code_helper code;
4385 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4386 path))
4388 STMT_VINFO_REDUC_CODE (phi_info) = code;
4389 if (code == COND_EXPR && !nested_in_vect_loop)
4390 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4392 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4393 reduction chain for which the additional restriction is that
4394 all operations in the chain are the same. */
4395 auto_vec<stmt_vec_info, 8> reduc_chain;
4396 unsigned i;
4397 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4398 for (i = path.length () - 1; i >= 1; --i)
4400 gimple *stmt = USE_STMT (path[i].second);
4401 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4402 gimple_match_op op;
4403 if (!gimple_extract_op (stmt, &op))
4404 gcc_unreachable ();
4405 if (gassign *assign = dyn_cast<gassign *> (stmt))
4406 STMT_VINFO_REDUC_IDX (stmt_info)
4407 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4408 else
4410 gcall *call = as_a<gcall *> (stmt);
4411 STMT_VINFO_REDUC_IDX (stmt_info)
4412 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4414 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4415 && (i == 1 || i == path.length () - 1));
4416 if ((op.code != code && !leading_conversion)
4417 /* We can only handle the final value in epilogue
4418 generation for reduction chains. */
4419 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4420 is_slp_reduc = false;
4421 /* For reduction chains we support trailing/leading
4422 conversions. We do not store those in the actual chain. */
4423 if (leading_conversion)
4424 continue;
4425 reduc_chain.safe_push (stmt_info);
4427 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4429 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4431 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4432 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4434 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4435 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4437 /* Save the chain for further analysis in SLP detection. */
4438 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4439 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4441 *reduc_chain_p = true;
4442 if (dump_enabled_p ())
4443 dump_printf_loc (MSG_NOTE, vect_location,
4444 "reduction: detected reduction chain\n");
4446 else if (dump_enabled_p ())
4447 dump_printf_loc (MSG_NOTE, vect_location,
4448 "reduction: detected reduction\n");
4450 return def_stmt_info;
4453 if (dump_enabled_p ())
4454 dump_printf_loc (MSG_NOTE, vect_location,
4455 "reduction: unknown pattern\n");
4457 return NULL;
4460 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4461 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4462 or -1 if not known. */
4464 static int
4465 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4467 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4468 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "cost model: epilogue peel iters set to vf/2 "
4473 "because loop iterations are unknown .\n");
4474 return assumed_vf / 2;
4476 else
4478 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4479 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4480 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4481 /* If we need to peel for gaps but the epilogue otherwise requires no
4482 peeling, we have to peel VF iterations. */
4483 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4484 peel_iters_epilogue = assumed_vf;
4485 return peel_iters_epilogue;
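/* Worked example with made-up numbers: niters = 23, assumed_vf = 8 and
   peel_iters_prologue = 3 give (23 - 3) % 8 = 4 epilogue iterations; if
   the loop also peels for gaps and that remainder had been 0, a full 8
   iterations would be peeled instead.  */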
4489 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4491 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4492 int *peel_iters_epilogue,
4493 stmt_vector_for_cost *scalar_cost_vec,
4494 stmt_vector_for_cost *prologue_cost_vec,
4495 stmt_vector_for_cost *epilogue_cost_vec)
4497 int retval = 0;
4499 *peel_iters_epilogue
4500 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4502 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4504 /* If peeled iterations are known but number of scalar loop
4505 iterations are unknown, count a taken branch per peeled loop. */
4506 if (peel_iters_prologue > 0)
4507 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4508 vect_prologue);
4509 if (*peel_iters_epilogue > 0)
4510 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4511 vect_epilogue);
4514 stmt_info_for_cost *si;
4515 int j;
4516 if (peel_iters_prologue)
4517 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4518 retval += record_stmt_cost (prologue_cost_vec,
4519 si->count * peel_iters_prologue,
4520 si->kind, si->stmt_info, si->misalign,
4521 vect_prologue);
4522 if (*peel_iters_epilogue)
4523 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4524 retval += record_stmt_cost (epilogue_cost_vec,
4525 si->count * *peel_iters_epilogue,
4526 si->kind, si->stmt_info, si->misalign,
4527 vect_epilogue);
4529 return retval;
4532 /* Function vect_estimate_min_profitable_iters
4534 Return the number of iterations required for the vector version of the
4535 loop to be profitable relative to the cost of the scalar version of the
4536 loop.
4538 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4539 of iterations for vectorization. -1 value means loop vectorization
4540 is not profitable. This returned value may be used for dynamic
4541 profitability check.
4543 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4544 for static check against estimated number of iterations. */
4546 static void
4547 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4548 int *ret_min_profitable_niters,
4549 int *ret_min_profitable_estimate,
4550 unsigned *suggested_unroll_factor)
4552 int min_profitable_iters;
4553 int min_profitable_estimate;
4554 int peel_iters_prologue;
4555 int peel_iters_epilogue;
4556 unsigned vec_inside_cost = 0;
4557 int vec_outside_cost = 0;
4558 unsigned vec_prologue_cost = 0;
4559 unsigned vec_epilogue_cost = 0;
4560 int scalar_single_iter_cost = 0;
4561 int scalar_outside_cost = 0;
4562 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4563 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4564 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4566 /* Cost model disabled. */
4567 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4569 if (dump_enabled_p ())
4570 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4571 *ret_min_profitable_niters = 0;
4572 *ret_min_profitable_estimate = 0;
4573 return;
4576 /* Requires loop versioning tests to handle misalignment. */
4577 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4579 /* FIXME: Make cost depend on complexity of individual check. */
4580 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4581 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4582 if (dump_enabled_p ())
4583 dump_printf (MSG_NOTE,
4584 "cost model: Adding cost of checks for loop "
4585 "versioning to treat misalignment.\n");
4588 /* Requires loop versioning with alias checks. */
4589 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4591 /* FIXME: Make cost depend on complexity of individual check. */
4592 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4593 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4594 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4595 if (len)
4596 /* Count LEN - 1 ANDs and LEN comparisons. */
4597 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4598 scalar_stmt, vect_prologue);
4599 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4600 if (len)
4602 /* Count LEN - 1 ANDs and LEN comparisons. */
4603 unsigned int nstmts = len * 2 - 1;
4604 /* +1 for each bias that needs adding. */
4605 for (unsigned int i = 0; i < len; ++i)
4606 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4607 nstmts += 1;
4608 (void) add_stmt_cost (target_cost_data, nstmts,
4609 scalar_stmt, vect_prologue);
4611 if (dump_enabled_p ())
4612 dump_printf (MSG_NOTE,
4613 "cost model: Adding cost of checks for loop "
4614 "versioning aliasing.\n");
4617 /* Requires loop versioning with niter checks. */
4618 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4620 /* FIXME: Make cost depend on complexity of individual check. */
4621 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4622 NULL, NULL, NULL_TREE, 0, vect_prologue);
4623 if (dump_enabled_p ())
4624 dump_printf (MSG_NOTE,
4625 "cost model: Adding cost of checks for loop "
4626 "versioning niters.\n");
4629 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4630 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4631 vect_prologue);
4633 /* Count statements in scalar loop. Using this as scalar cost for a single
4634 iteration for now.
4636 TODO: Add outer loop support.
4638 TODO: Consider assigning different costs to different scalar
4639 statements. */
4641 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4643 /* Add additional cost for the peeled instructions in prologue and epilogue
4644 loop. (For fully-masked loops there will be no peeling.)
4646 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4647 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4649 TODO: Build an expression that represents peel_iters for prologue and
4650 epilogue to be used in a run-time test. */
4652 bool prologue_need_br_taken_cost = false;
4653 bool prologue_need_br_not_taken_cost = false;
4655 /* Calculate peel_iters_prologue. */
4656 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4657 peel_iters_prologue = 0;
4658 else if (npeel < 0)
4660 peel_iters_prologue = assumed_vf / 2;
4661 if (dump_enabled_p ())
4662 dump_printf (MSG_NOTE, "cost model: "
4663 "prologue peel iters set to vf/2.\n");
4665 /* If peeled iterations are unknown, count a taken branch and a not taken
4666 branch per peeled loop. Even if scalar loop iterations are known,
4667 vector iterations are not known since peeled prologue iterations are
4668 not known. Hence guards remain the same. */
4669 prologue_need_br_taken_cost = true;
4670 prologue_need_br_not_taken_cost = true;
4672 else
4674 peel_iters_prologue = npeel;
4675 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4676 /* If peeled iterations are known but number of scalar loop
4677 iterations are unknown, count a taken branch per peeled loop. */
4678 prologue_need_br_taken_cost = true;
4681 bool epilogue_need_br_taken_cost = false;
4682 bool epilogue_need_br_not_taken_cost = false;
4684 /* Calculate peel_iters_epilogue. */
4685 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4686 /* We need to peel exactly one iteration for gaps. */
4687 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4688 else if (npeel < 0)
4690 /* If peeling for alignment is unknown, the loop bound of the main
4691 loop becomes unknown. */
4692 peel_iters_epilogue = assumed_vf / 2;
4693 if (dump_enabled_p ())
4694 dump_printf (MSG_NOTE, "cost model: "
4695 "epilogue peel iters set to vf/2 because "
4696 "peeling for alignment is unknown.\n");
4698 /* See the same reason above in peel_iters_prologue calculation. */
4699 epilogue_need_br_taken_cost = true;
4700 epilogue_need_br_not_taken_cost = true;
4702 else
4704 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4705 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4706 /* If peeled iterations are known but number of scalar loop
4707 iterations are unknown, count a taken branch per peeled loop. */
4708 epilogue_need_br_taken_cost = true;
4711 stmt_info_for_cost *si;
4712 int j;
4713 /* Add costs associated with peel_iters_prologue. */
4714 if (peel_iters_prologue)
4715 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4717 (void) add_stmt_cost (target_cost_data,
4718 si->count * peel_iters_prologue, si->kind,
4719 si->stmt_info, si->node, si->vectype,
4720 si->misalign, vect_prologue);
4723 /* Add costs associated with peel_iters_epilogue. */
4724 if (peel_iters_epilogue)
4725 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4727 (void) add_stmt_cost (target_cost_data,
4728 si->count * peel_iters_epilogue, si->kind,
4729 si->stmt_info, si->node, si->vectype,
4730 si->misalign, vect_epilogue);
4733 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4735 if (prologue_need_br_taken_cost)
4736 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4737 vect_prologue);
4739 if (prologue_need_br_not_taken_cost)
4740 (void) add_stmt_cost (target_cost_data, 1,
4741 cond_branch_not_taken, vect_prologue);
4743 if (epilogue_need_br_taken_cost)
4744 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4745 vect_epilogue);
4747 if (epilogue_need_br_not_taken_cost)
4748 (void) add_stmt_cost (target_cost_data, 1,
4749 cond_branch_not_taken, vect_epilogue);
4751 /* Take care of special costs for rgroup controls of partial vectors. */
4752 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4753 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4754 == vect_partial_vectors_avx512))
4756 /* Calculate how many masks we need to generate. */
4757 unsigned int num_masks = 0;
4758 bool need_saturation = false;
4759 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4760 if (rgm.type)
4762 unsigned nvectors = rgm.factor;
4763 num_masks += nvectors;
4764 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4765 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4766 need_saturation = true;
4769 /* ??? The target isn't able to identify the costs below as
4770 producing masks so it cannot penalize cases where we'd run
4771 out of mask registers for example. */
4773 /* ??? We are also failing to account for smaller vector masks
4774 we generate by splitting larger masks in vect_get_loop_mask. */
4776 /* In the worst case, we need to generate each mask in the prologue
4777 and in the loop body. We need one splat per group and one
4778 compare per mask.
4780 Sometimes the prologue mask will fold to a constant,
4781 so the actual prologue cost might be smaller. However, it's
4782 simpler and safer to use the worst-case cost; if this ends up
4783 being the tie-breaker between vectorizing or not, then it's
4784 probably better not to vectorize. */
4785 (void) add_stmt_cost (target_cost_data,
4786 num_masks
4787 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4788 vector_stmt, NULL, NULL, NULL_TREE, 0,
4789 vect_prologue);
4790 (void) add_stmt_cost (target_cost_data,
4791 num_masks
4792 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4793 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4795 /* When we need saturation we need it both in the prologue and
4796 the epilogue. */
4797 if (need_saturation)
4799 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4800 NULL, NULL, NULL_TREE, 0, vect_prologue);
4801 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4802 NULL, NULL, NULL_TREE, 0, vect_body);
4805 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4806 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4807 == vect_partial_vectors_while_ult))
4809 /* Calculate how many masks we need to generate. */
4810 unsigned int num_masks = 0;
4811 rgroup_controls *rgm;
4812 unsigned int num_vectors_m1;
4813 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4814 num_vectors_m1, rgm)
4815 if (rgm->type)
4816 num_masks += num_vectors_m1 + 1;
4817 gcc_assert (num_masks > 0);
4819 /* In the worst case, we need to generate each mask in the prologue
4820 and in the loop body. One of the loop body mask instructions
4821 replaces the comparison in the scalar loop, and since we don't
4822 count the scalar comparison against the scalar body, we shouldn't
4823 count that vector instruction against the vector body either.
4825 Sometimes we can use unpacks instead of generating prologue
4826 masks and sometimes the prologue mask will fold to a constant,
4827 so the actual prologue cost might be smaller. However, it's
4828 simpler and safer to use the worst-case cost; if this ends up
4829 being the tie-breaker between vectorizing or not, then it's
4830 probably better not to vectorize. */
4831 (void) add_stmt_cost (target_cost_data, num_masks,
4832 vector_stmt, NULL, NULL, NULL_TREE, 0,
4833 vect_prologue);
4834 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4835 vector_stmt, NULL, NULL, NULL_TREE, 0,
4836 vect_body);
4838 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4840 /* Referring to the functions vect_set_loop_condition_partial_vectors
4841 and vect_set_loop_controls_directly, we need to generate each
4842 length in the prologue and in the loop body if required. Although
4843 there are some possible optimizations, we consider the worst case
4844 here. */
4846 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4847 signed char partial_load_store_bias
4848 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4849 bool need_iterate_p
4850 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4851 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4853 /* Calculate how many statements to be added. */
4854 unsigned int prologue_stmts = 0;
4855 unsigned int body_stmts = 0;
4857 rgroup_controls *rgc;
4858 unsigned int num_vectors_m1;
4859 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4860 if (rgc->type)
4862 /* May need one SHIFT for nitems_total computation. */
4863 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4864 if (nitems != 1 && !niters_known_p)
4865 prologue_stmts += 1;
4867 /* May need one MAX and one MINUS for wrap around. */
4868 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4869 prologue_stmts += 2;
4871 /* Need one MAX and one MINUS for each batch limit except for
4872 the first one. */
4873 prologue_stmts += num_vectors_m1 * 2;
4875 unsigned int num_vectors = num_vectors_m1 + 1;
4877 /* Need to set up lengths in prologue, only one MIN required
4878 for each since start index is zero. */
4879 prologue_stmts += num_vectors;
4881 /* If we have a non-zero partial load bias, we need one PLUS
4882 to adjust the load length. */
4883 if (partial_load_store_bias != 0)
4884 body_stmts += 1;
4886 unsigned int length_update_cost = 0;
4887 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4888 /* For the decrement IV style, each length needs only a single
4889 SELECT_VL or MIN to calculate the number of elements to be
4890 processed in the current iteration. */
4891 length_update_cost = 1;
4892 else
4893 /* For the increment IV style, each length may need two MINs and one
4894 MINUS to update the lengths in the body for the next iteration. */
4895 length_update_cost = 3;
4897 if (need_iterate_p)
4898 body_stmts += length_update_cost * num_vectors;
4901 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4902 scalar_stmt, vect_prologue);
4903 (void) add_stmt_cost (target_cost_data, body_stmts,
4904 scalar_stmt, vect_body);
4907 /* FORNOW: The scalar outside cost is incremented in one of the
4908 following ways:
4910 1. The vectorizer checks for alignment and aliasing and generates
4911 a condition that allows dynamic vectorization. A cost model
4912 check is ANDED with the versioning condition. Hence scalar code
4913 path now has the added cost of the versioning check.
4915 if (cost > th & versioning_check)
4916 jmp to vector code
4918 Hence run-time scalar is incremented by not-taken branch cost.
4920 2. The vectorizer then checks if a prologue is required. If the
4921 cost model check was not done before during versioning, it has to
4922 be done before the prologue check.
4924 if (cost <= th)
4925 prologue = scalar_iters
4926 if (prologue == 0)
4927 jmp to vector code
4928 else
4929 execute prologue
4930 if (prologue == num_iters)
4931 go to exit
4933 Hence the run-time scalar cost is incremented by a taken branch,
4934 plus a not-taken branch, plus a taken branch cost.
4936 3. The vectorizer then checks if an epilogue is required. If the
4937 cost model check was not done before during prologue check, it
4938 has to be done with the epilogue check.
4940 if (prologue == 0)
4941 jmp to vector code
4942 else
4943 execute prologue
4944 if (prologue == num_iters)
4945 go to exit
4946 vector code:
4947 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4948 jmp to epilogue
4950 Hence the run-time scalar cost should be incremented by 2 taken
4951 branches.
4953 TODO: The back end may reorder the BBS's differently and reverse
4954 conditions/branch directions. Change the estimates below to
4955 something more reasonable. */
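/* Illustration of the cases above: with versioning the scalar path pays
   one not-taken branch for the combined cost-model check; without
   versioning it pays two taken branches plus, when peeling for alignment
   is unknown, one additional not-taken branch, mirroring the code below.  */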
4957 /* If the number of iterations is known and we do not do versioning, we can
4958 decide whether to vectorize at compile time. Hence the scalar version
4959 does not carry cost model guard costs. */
4960 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4961 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4963 /* Cost model check occurs at versioning. */
4964 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4965 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4966 else
4968 /* Cost model check occurs at prologue generation. */
4969 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4970 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4971 + vect_get_stmt_cost (cond_branch_not_taken);
4972 /* Cost model check occurs at epilogue generation. */
4973 else
4974 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4978 /* Complete the target-specific cost calculations. */
4979 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4980 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4981 suggested_unroll_factor);
4983 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4984 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4985 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4986 *suggested_unroll_factor,
4987 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4989 if (dump_enabled_p ())
4990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4991 "can't unroll as unrolled vectorization factor larger"
4992 " than maximum vectorization factor: "
4993 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4994 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4995 *suggested_unroll_factor = 1;
4998 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5000 if (dump_enabled_p ())
5002 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5003 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5004 vec_inside_cost);
5005 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5006 vec_prologue_cost);
5007 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5008 vec_epilogue_cost);
5009 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5010 scalar_single_iter_cost);
5011 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5012 scalar_outside_cost);
5013 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5014 vec_outside_cost);
5015 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5016 peel_iters_prologue);
5017 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5018 peel_iters_epilogue);
5021 /* Calculate number of iterations required to make the vector version
5022 profitable, relative to the loop bodies only. The following condition
5023 must hold true:
5024 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5025 where
5026 SIC = scalar iteration cost, VIC = vector iteration cost,
5027 VOC = vector outside cost, VF = vectorization factor,
5028 NPEEL = prologue iterations + epilogue iterations,
5029 SOC = scalar outside cost for run time cost model check. */
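/* Illustration with made-up costs: SIC = 4, VIC = 6 and VF = 4 give a
   saving of SIC * VF - VIC = 16 - 6 = 10 units for every vector
   iteration, which is the saving_per_viter computed below.  */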
5031 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5032 - vec_inside_cost);
5033 if (saving_per_viter <= 0)
5035 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5036 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5037 "vectorization did not happen for a simd loop");
5039 if (dump_enabled_p ())
5040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5041 "cost model: the vector iteration cost = %d "
5042 "divided by the scalar iteration cost = %d "
5043 "is greater or equal to the vectorization factor = %d"
5044 ".\n",
5045 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5046 *ret_min_profitable_niters = -1;
5047 *ret_min_profitable_estimate = -1;
5048 return;
5051 /* ??? The "if" arm is written to handle all cases; see below for what
5052 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5053 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5055 /* Rewriting the condition above in terms of the number of
5056 vector iterations (vniters) rather than the number of
5057 scalar iterations (niters) gives:
5059 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5061 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5063 For integer N, X and Y when X > 0:
5065 N * X > Y <==> N >= (Y /[floor] X) + 1. */
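/* Continuing the made-up example: with saving_per_viter = 10 and an
   outside_overhead of 16, at least 16 / 10 + 1 = 2 vector iterations are
   needed before vectorization starts to pay off.  */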
5066 int outside_overhead = (vec_outside_cost
5067 - scalar_single_iter_cost * peel_iters_prologue
5068 - scalar_single_iter_cost * peel_iters_epilogue
5069 - scalar_outside_cost);
5070 /* We're only interested in cases that require at least one
5071 vector iteration. */
5072 int min_vec_niters = 1;
5073 if (outside_overhead > 0)
5074 min_vec_niters = outside_overhead / saving_per_viter + 1;
5076 if (dump_enabled_p ())
5077 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5078 min_vec_niters);
5080 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5082 /* Now that we know the minimum number of vector iterations,
5083 find the minimum niters for which the scalar cost is larger:
5085 SIC * niters > VIC * vniters + VOC - SOC
5087 We know that the minimum niters is no more than
5088 vniters * VF + NPEEL, but it might be (and often is) less
5089 than that if a partial vector iteration is cheaper than the
5090 equivalent scalar code. */
5091 int threshold = (vec_inside_cost * min_vec_niters
5092 + vec_outside_cost
5093 - scalar_outside_cost);
5094 if (threshold <= 0)
5095 min_profitable_iters = 1;
5096 else
5097 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5099 else
5100 /* Convert the number of vector iterations into a number of
5101 scalar iterations. */
5102 min_profitable_iters = (min_vec_niters * assumed_vf
5103 + peel_iters_prologue
5104 + peel_iters_epilogue);
5106 else
5108 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5109 * assumed_vf
5110 - vec_inside_cost * peel_iters_prologue
5111 - vec_inside_cost * peel_iters_epilogue);
5112 if (min_profitable_iters <= 0)
5113 min_profitable_iters = 0;
5114 else
5116 min_profitable_iters /= saving_per_viter;
5118 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5119 <= (((int) vec_inside_cost * min_profitable_iters)
5120 + (((int) vec_outside_cost - scalar_outside_cost)
5121 * assumed_vf)))
5122 min_profitable_iters++;
5126 if (dump_enabled_p ())
5127 dump_printf (MSG_NOTE,
5128 " Calculated minimum iters for profitability: %d\n",
5129 min_profitable_iters);
5131 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5132 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5133 /* We want the vectorized loop to execute at least once. */
5134 min_profitable_iters = assumed_vf + peel_iters_prologue;
5135 else if (min_profitable_iters < peel_iters_prologue)
5136 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5137 vectorized loop executes at least once. */
5138 min_profitable_iters = peel_iters_prologue;
5140 if (dump_enabled_p ())
5141 dump_printf_loc (MSG_NOTE, vect_location,
5142 " Runtime profitability threshold = %d\n",
5143 min_profitable_iters);
5145 *ret_min_profitable_niters = min_profitable_iters;
5147 /* Calculate number of iterations required to make the vector version
5148 profitable, relative to the loop bodies only.
5150 The non-vectorized variant costs SIC * niters and it must win over the vector
5151 variant on the expected loop trip count. The following condition must hold true:
5152 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
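  /* With the same made-up numbers as in the example above (SIC = 4,
     VIC = 6, VF = 4, NPEEL = 0, VOC = 20) and SOC = 8, this condition
     becomes 4 * niters > 6 * (niters / 4) + 20 + 8, i.e. roughly
     2.5 * niters > 28, giving an estimate threshold of about 12
     iterations, versus about 5 for the runtime threshold above where
     SOC sits on the scalar side of the inequality.  */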
5154 if (vec_outside_cost <= 0)
5155 min_profitable_estimate = 0;
5156 /* ??? This "else if" arm is written to handle all cases; see below for
5157 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5158 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5160 /* This is a repeat of the code above, but with + SOC rather
5161 than - SOC. */
5162 int outside_overhead = (vec_outside_cost
5163 - scalar_single_iter_cost * peel_iters_prologue
5164 - scalar_single_iter_cost * peel_iters_epilogue
5165 + scalar_outside_cost);
5166 int min_vec_niters = 1;
5167 if (outside_overhead > 0)
5168 min_vec_niters = outside_overhead / saving_per_viter + 1;
5170 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5172 int threshold = (vec_inside_cost * min_vec_niters
5173 + vec_outside_cost
5174 + scalar_outside_cost);
5175 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5177 else
5178 min_profitable_estimate = (min_vec_niters * assumed_vf
5179 + peel_iters_prologue
5180 + peel_iters_epilogue);
5182 else
5184 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5185 * assumed_vf
5186 - vec_inside_cost * peel_iters_prologue
5187 - vec_inside_cost * peel_iters_epilogue)
5188 / ((scalar_single_iter_cost * assumed_vf)
5189 - vec_inside_cost);
5191 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5192 if (dump_enabled_p ())
5193 dump_printf_loc (MSG_NOTE, vect_location,
5194 " Static estimate profitability threshold = %d\n",
5195 min_profitable_estimate);
5197 *ret_min_profitable_estimate = min_profitable_estimate;
5200 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5201 vector elements (not bits) for a vector with NELT elements. */
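/* For example (illustration only), with OFFSET = 2 and NELT = 8 the
   encoded series expands to the selector {2, 3, 4, 5, 6, 7, 8, 9};
   indices 8 and 9 pick elements from the second vec_perm operand
   (typically a zero vector), supplying the shifted-in elements.  */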
5202 static void
5203 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5204 vec_perm_builder *sel)
5206 /* The encoding is a single stepped pattern. Any wrap-around is handled
5207 by vec_perm_indices. */
5208 sel->new_vector (nelt, 1, 3);
5209 for (unsigned int i = 0; i < 3; i++)
5210 sel->quick_push (i + offset);
5213 /* Checks whether the target supports whole-vector shifts for vectors of mode
5214 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5215 it supports vec_perm_const with masks for all necessary shift amounts. */
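/* For example (illustration only), for an 8-element vector mode without
   vec_shr support this checks vec_perm_const for the shift selectors
   with OFFSET 4, 2 and 1, the same shift amounts that the shift-based
   reduction epilogue further below steps through.  */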
5216 static bool
5217 have_whole_vector_shift (machine_mode mode)
5219 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5220 return true;
5222 /* Variable-length vectors should be handled via the optab. */
5223 unsigned int nelt;
5224 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5225 return false;
5227 vec_perm_builder sel;
5228 vec_perm_indices indices;
5229 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5231 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5232 indices.new_vector (sel, 2, nelt);
5233 if (!can_vec_perm_const_p (mode, mode, indices, false))
5234 return false;
5236 return true;
5239 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5240 multiplication operands have differing signs and (b) we intend
5241 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5242 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5244 static bool
5245 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5246 stmt_vec_info stmt_info)
5248 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5249 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5250 return false;
5252 tree rhs1 = gimple_assign_rhs1 (assign);
5253 tree rhs2 = gimple_assign_rhs2 (assign);
5254 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5255 return false;
5257 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5258 gcc_assert (reduc_info->is_reduc_info);
5259 return !directly_supported_p (DOT_PROD_EXPR,
5260 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5261 optab_vector_mixed_sign);
5264 /* TODO: There is a close dependency between the vect_model_*_cost and
5265 vectorizable_* functions. Design this better to avoid maintenance issues. */
5267 /* Function vect_model_reduction_cost.
5269 Models cost for a reduction operation, including the vector ops
5270 generated within the strip-mine loop in some cases, the initial
5271 definition before the loop, and the epilogue code that must be generated. */
5273 static void
5274 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5275 stmt_vec_info stmt_info, internal_fn reduc_fn,
5276 vect_reduction_type reduction_type,
5277 int ncopies, stmt_vector_for_cost *cost_vec)
5279 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5280 tree vectype;
5281 machine_mode mode;
5282 class loop *loop = NULL;
5284 if (loop_vinfo)
5285 loop = LOOP_VINFO_LOOP (loop_vinfo);
5287 /* Condition reductions generate two reductions in the loop. */
5288 if (reduction_type == COND_REDUCTION)
5289 ncopies *= 2;
5291 vectype = STMT_VINFO_VECTYPE (stmt_info);
5292 mode = TYPE_MODE (vectype);
5293 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5295 gimple_match_op op;
5296 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5297 gcc_unreachable ();
5299 bool emulated_mixed_dot_prod
5300 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5301 if (reduction_type == EXTRACT_LAST_REDUCTION)
5302 /* No extra instructions are needed in the prologue. The loop body
5303 operations are costed in vectorizable_condition. */
5304 inside_cost = 0;
5305 else if (reduction_type == FOLD_LEFT_REDUCTION)
5307 /* No extra instructions needed in the prologue. */
5308 prologue_cost = 0;
5310 if (reduc_fn != IFN_LAST)
5311 /* Count one reduction-like operation per vector. */
5312 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5313 stmt_info, 0, vect_body);
5314 else
5316 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5317 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5318 inside_cost = record_stmt_cost (cost_vec, nelements,
5319 vec_to_scalar, stmt_info, 0,
5320 vect_body);
5321 inside_cost += record_stmt_cost (cost_vec, nelements,
5322 scalar_stmt, stmt_info, 0,
5323 vect_body);
5326 else
5328 /* Add in the cost of the initial definitions. */
5329 int prologue_stmts;
5330 if (reduction_type == COND_REDUCTION)
5331 /* For cond reductions we have four vectors: initial index, step,
5332 initial result of the data reduction, initial value of the index
5333 reduction. */
5334 prologue_stmts = 4;
5335 else if (emulated_mixed_dot_prod)
5336 /* We need the initial reduction value and two invariants:
5337 one that contains the minimum signed value and one that
5338 contains half of its negative. */
5339 prologue_stmts = 3;
5340 else
5341 prologue_stmts = 1;
5342 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5343 scalar_to_vec, stmt_info, 0,
5344 vect_prologue);
5347 /* Determine cost of epilogue code.
5349 We have a reduction operator that will reduce the vector in one statement.
5350 Also requires scalar extract. */
5352 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5354 if (reduc_fn != IFN_LAST)
5356 if (reduction_type == COND_REDUCTION)
5358 /* An EQ stmt and a COND_EXPR stmt. */
5359 epilogue_cost += record_stmt_cost (cost_vec, 2,
5360 vector_stmt, stmt_info, 0,
5361 vect_epilogue);
5362 /* Reduction of the max index and a reduction of the found
5363 values. */
5364 epilogue_cost += record_stmt_cost (cost_vec, 2,
5365 vec_to_scalar, stmt_info, 0,
5366 vect_epilogue);
5367 /* A broadcast of the max value. */
5368 epilogue_cost += record_stmt_cost (cost_vec, 1,
5369 scalar_to_vec, stmt_info, 0,
5370 vect_epilogue);
5372 else
5374 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5375 stmt_info, 0, vect_epilogue);
5376 epilogue_cost += record_stmt_cost (cost_vec, 1,
5377 vec_to_scalar, stmt_info, 0,
5378 vect_epilogue);
5381 else if (reduction_type == COND_REDUCTION)
5383 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5384 /* Extraction of scalar elements. */
5385 epilogue_cost += record_stmt_cost (cost_vec,
5386 2 * estimated_nunits,
5387 vec_to_scalar, stmt_info, 0,
5388 vect_epilogue);
5389 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5390 epilogue_cost += record_stmt_cost (cost_vec,
5391 2 * estimated_nunits - 3,
5392 scalar_stmt, stmt_info, 0,
5393 vect_epilogue);
5395 else if (reduction_type == EXTRACT_LAST_REDUCTION
5396 || reduction_type == FOLD_LEFT_REDUCTION)
5397 /* No extra instructions are needed in the epilogue. */
5399 else
5401 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5402 tree bitsize = TYPE_SIZE (op.type);
5403 int element_bitsize = tree_to_uhwi (bitsize);
5404 int nelements = vec_size_in_bits / element_bitsize;
5406 if (op.code == COND_EXPR)
5407 op.code = MAX_EXPR;
5409 /* We have a whole vector shift available. */
5410 if (VECTOR_MODE_P (mode)
5411 && directly_supported_p (op.code, vectype)
5412 && have_whole_vector_shift (mode))
5414 /* Final reduction via vector shifts and the reduction operator.
5415 Also requires scalar extract. */
5416 epilogue_cost += record_stmt_cost (cost_vec,
5417 exact_log2 (nelements) * 2,
5418 vector_stmt, stmt_info, 0,
5419 vect_epilogue);
5420 epilogue_cost += record_stmt_cost (cost_vec, 1,
5421 vec_to_scalar, stmt_info, 0,
5422 vect_epilogue);
5424 else
5425 /* Use extracts and reduction op for final reduction. For N
5426 elements, we have N extracts and N-1 reduction ops. */
5427 epilogue_cost += record_stmt_cost (cost_vec,
5428 nelements + nelements - 1,
5429 vector_stmt, stmt_info, 0,
5430 vect_epilogue);
5434 if (dump_enabled_p ())
5435 dump_printf (MSG_NOTE,
5436 "vect_model_reduction_cost: inside_cost = %d, "
5437 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5438 prologue_cost, epilogue_cost);
5441 /* SEQ is a sequence of instructions that initialize the reduction
5442 described by REDUC_INFO. Emit them in the appropriate place. */
5444 static void
5445 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5446 stmt_vec_info reduc_info, gimple *seq)
5448 if (reduc_info->reused_accumulator)
5450 /* When reusing an accumulator from the main loop, we only need
5451 initialization instructions if the main loop can be skipped.
5452 In that case, emit the initialization instructions at the end
5453 of the guard block that does the skip. */
5454 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5455 gcc_assert (skip_edge);
5456 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5457 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5459 else
5461 /* The normal case: emit the initialization instructions on the
5462 preheader edge. */
5463 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5464 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5468 /* Function get_initial_def_for_reduction
5470 Input:
5471 REDUC_INFO - the info_for_reduction
5472 INIT_VAL - the initial value of the reduction variable
5473 NEUTRAL_OP - a value that has no effect on the reduction, as per
5474 neutral_op_for_reduction
5476 Output:
5477 Return a vector variable, initialized according to the operation that
5478 STMT_VINFO performs. This vector will be used as the initial value
5479 of the vector of partial results.
5481 The value we need is a vector in which element 0 has value INIT_VAL
5482 and every other element has value NEUTRAL_OP. */
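/* For example (illustration only), for a signed integer add reduction
   with INIT_VAL 5 and NEUTRAL_OP 0 on a 4-element vector this yields
   {5, 0, 0, 0}; for MIN/MAX reductions the neutral value is INIT_VAL
   itself, so the result degenerates to the splat case handled first
   in the function body.  */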
5484 static tree
5485 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5486 stmt_vec_info reduc_info,
5487 tree init_val, tree neutral_op)
5489 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5490 tree scalar_type = TREE_TYPE (init_val);
5491 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5492 tree init_def;
5493 gimple_seq stmts = NULL;
5495 gcc_assert (vectype);
5497 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5498 || SCALAR_FLOAT_TYPE_P (scalar_type));
5500 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5501 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5503 if (operand_equal_p (init_val, neutral_op))
5505 /* If both elements are equal then the vector described above is
5506 just a splat. */
5507 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5508 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5510 else
5512 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5513 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5514 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5516 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5517 element 0. */
5518 init_def = gimple_build_vector_from_val (&stmts, vectype,
5519 neutral_op);
5520 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5521 vectype, init_def, init_val);
5523 else
5525 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5526 tree_vector_builder elts (vectype, 1, 2);
5527 elts.quick_push (init_val);
5528 elts.quick_push (neutral_op);
5529 init_def = gimple_build_vector (&stmts, &elts);
5533 if (stmts)
5534 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5535 return init_def;
5538 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5539 which performs a reduction involving GROUP_SIZE scalar statements.
5540 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5541 is nonnull, introducing extra elements of that value will not change the
5542 result. */
5544 static void
5545 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5546 stmt_vec_info reduc_info,
5547 vec<tree> *vec_oprnds,
5548 unsigned int number_of_vectors,
5549 unsigned int group_size, tree neutral_op)
5551 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5552 unsigned HOST_WIDE_INT nunits;
5553 unsigned j, number_of_places_left_in_vector;
5554 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5555 unsigned int i;
5557 gcc_assert (group_size == initial_values.length () || neutral_op);
5559 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5560 created vectors. It is greater than 1 if unrolling is performed.
5562 For example, we have two scalar operands, s1 and s2 (e.g., group of
5563 strided accesses of size two), while NUNITS is four (i.e., four scalars
5564 of this type can be packed in a vector). The output vector will contain
5565 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5566 will be 2).
5568 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5569 vectors containing the operands.
5571 For example, NUNITS is four as before, and the group size is 8
5572 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5573 {s5, s6, s7, s8}. */
5575 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5576 nunits = group_size;
5578 number_of_places_left_in_vector = nunits;
5579 bool constant_p = true;
5580 tree_vector_builder elts (vector_type, nunits, 1);
5581 elts.quick_grow (nunits);
5582 gimple_seq ctor_seq = NULL;
5583 for (j = 0; j < nunits * number_of_vectors; ++j)
5585 tree op;
5586 i = j % group_size;
5588 /* Get the def before the loop. In a reduction chain we have only
5589 one initial value; otherwise we have as many as there are PHIs in the group. */
5590 if (i >= initial_values.length () || (j > i && neutral_op))
5591 op = neutral_op;
5592 else
5593 op = initial_values[i];
5595 /* Create 'vect_ = {op0,op1,...,opn}'. */
5596 number_of_places_left_in_vector--;
5597 elts[nunits - number_of_places_left_in_vector - 1] = op;
5598 if (!CONSTANT_CLASS_P (op))
5599 constant_p = false;
5601 if (number_of_places_left_in_vector == 0)
5603 tree init;
5604 if (constant_p && !neutral_op
5605 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5606 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5607 /* Build the vector directly from ELTS. */
5608 init = gimple_build_vector (&ctor_seq, &elts);
5609 else if (neutral_op)
5611 /* Build a vector of the neutral value and shift the
5612 other elements into place. */
5613 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5614 neutral_op);
5615 int k = nunits;
5616 while (k > 0 && elts[k - 1] == neutral_op)
5617 k -= 1;
5618 while (k > 0)
5620 k -= 1;
5621 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5622 vector_type, init, elts[k]);
5625 else
5627 /* First time round, duplicate ELTS to fill the
5628 required number of vectors. */
5629 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5630 elts, number_of_vectors, *vec_oprnds);
5631 break;
5633 vec_oprnds->quick_push (init);
5635 number_of_places_left_in_vector = nunits;
5636 elts.new_vector (vector_type, nunits, 1);
5637 elts.quick_grow (nunits);
5638 constant_p = true;
5641 if (ctor_seq != NULL)
5642 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5645 /* For a statement STMT_INFO taking part in a reduction operation return
5646 the stmt_vec_info the meta information is stored on. */
5648 stmt_vec_info
5649 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5651 stmt_info = vect_orig_stmt (stmt_info);
5652 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5653 if (!is_a <gphi *> (stmt_info->stmt)
5654 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5655 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5656 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5657 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5659 if (gimple_phi_num_args (phi) == 1)
5660 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5662 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5664 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5665 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5666 stmt_info = info;
5668 return stmt_info;
5671 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5672 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5673 return false. */
5675 static bool
5676 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5677 stmt_vec_info reduc_info)
5679 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5680 if (!main_loop_vinfo)
5681 return false;
5683 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5684 return false;
5686 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5687 auto_vec<tree, 16> main_loop_results (num_phis);
5688 auto_vec<tree, 16> initial_values (num_phis);
5689 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5691 /* The epilogue loop can be entered either from the main loop or
5692 from an earlier guard block. */
5693 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5694 for (tree incoming_value : reduc_info->reduc_initial_values)
5696 /* Look for:
5698 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5699 INITIAL_VALUE(guard block)>. */
5700 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5702 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5703 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5705 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5706 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5708 main_loop_results.quick_push (from_main_loop);
5709 initial_values.quick_push (from_skip);
5712 else
5713 /* The main loop dominates the epilogue loop. */
5714 main_loop_results.splice (reduc_info->reduc_initial_values);
5716 /* See if the main loop has the kind of accumulator we need. */
5717 vect_reusable_accumulator *accumulator
5718 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5719 if (!accumulator
5720 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5721 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5722 accumulator->reduc_info->reduc_scalar_results.begin ()))
5723 return false;
5725 /* Handle the case where we can reduce wider vectors to narrower ones. */
5726 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5727 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5728 unsigned HOST_WIDE_INT m;
5729 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5730 TYPE_VECTOR_SUBPARTS (vectype), &m))
5731 return false;
5732 /* Check the intermediate vector types and operations are available. */
5733 tree prev_vectype = old_vectype;
5734 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5735 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5737 intermediate_nunits = exact_div (intermediate_nunits, 2);
5738 tree intermediate_vectype = get_related_vectype_for_scalar_type
5739 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5740 if (!intermediate_vectype
5741 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5742 intermediate_vectype)
5743 || !can_vec_extract (TYPE_MODE (prev_vectype),
5744 TYPE_MODE (intermediate_vectype)))
5745 return false;
5746 prev_vectype = intermediate_vectype;
5749 /* Non-SLP reductions might apply an adjustment after the reduction
5750 operation, in order to simplify the initialization of the accumulator.
5751 If the epilogue loop carries on from where the main loop left off,
5752 it should apply the same adjustment to the final reduction result.
5754 If the epilogue loop can also be entered directly (rather than via
5755 the main loop), we need to be able to handle that case in the same way,
5756 with the same adjustment. (In principle we could add a PHI node
5757 to select the correct adjustment, but in practice that shouldn't be
5758 necessary.) */
5759 tree main_adjustment
5760 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5761 if (loop_vinfo->main_loop_edge && main_adjustment)
5763 gcc_assert (num_phis == 1);
5764 tree initial_value = initial_values[0];
5765 /* Check that we can use INITIAL_VALUE as the adjustment and
5766 initialize the accumulator with a neutral value instead. */
5767 if (!operand_equal_p (initial_value, main_adjustment))
5768 return false;
5769 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5770 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5771 code, initial_value);
5773 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5774 reduc_info->reduc_initial_values.truncate (0);
5775 reduc_info->reduc_initial_values.splice (initial_values);
5776 reduc_info->reused_accumulator = accumulator;
5777 return true;
5780 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5781 CODE, emitting any new statements into SEQ. Returns a vector def of VECTYPE. */
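/* For example (illustration only), reducing a 16-element accumulator to
   a 4-element VECTYPE with a PLUS_EXPR CODE takes two halving steps,
   16 -> 8 -> 4; each step extracts the low and high halves (directly,
   or via an integer-mode view-convert when no direct sub-vector extract
   exists) and adds them.  */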
5783 static tree
5784 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5785 gimple_seq *seq)
5787 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5788 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5789 tree stype = TREE_TYPE (vectype);
5790 tree new_temp = vec_def;
5791 while (nunits > nunits1)
5793 nunits /= 2;
5794 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5795 stype, nunits);
5796 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5798 /* The target has to make sure we support lowpart/highpart
5799 extraction, either via direct vector extract or through
5800 integer mode punning. */
5801 tree dst1, dst2;
5802 gimple *epilog_stmt;
5803 if (convert_optab_handler (vec_extract_optab,
5804 TYPE_MODE (TREE_TYPE (new_temp)),
5805 TYPE_MODE (vectype1))
5806 != CODE_FOR_nothing)
5808 /* Extract sub-vectors directly once vec_extract becomes
5809 a conversion optab. */
5810 dst1 = make_ssa_name (vectype1);
5811 epilog_stmt
5812 = gimple_build_assign (dst1, BIT_FIELD_REF,
5813 build3 (BIT_FIELD_REF, vectype1,
5814 new_temp, TYPE_SIZE (vectype1),
5815 bitsize_int (0)));
5816 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5817 dst2 = make_ssa_name (vectype1);
5818 epilog_stmt
5819 = gimple_build_assign (dst2, BIT_FIELD_REF,
5820 build3 (BIT_FIELD_REF, vectype1,
5821 new_temp, TYPE_SIZE (vectype1),
5822 bitsize_int (bitsize)));
5823 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5825 else
5827 /* Extract via punning to appropriately sized integer mode
5828 vector. */
5829 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5830 tree etype = build_vector_type (eltype, 2);
5831 gcc_assert (convert_optab_handler (vec_extract_optab,
5832 TYPE_MODE (etype),
5833 TYPE_MODE (eltype))
5834 != CODE_FOR_nothing);
5835 tree tem = make_ssa_name (etype);
5836 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5837 build1 (VIEW_CONVERT_EXPR,
5838 etype, new_temp));
5839 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5840 new_temp = tem;
5841 tem = make_ssa_name (eltype);
5842 epilog_stmt
5843 = gimple_build_assign (tem, BIT_FIELD_REF,
5844 build3 (BIT_FIELD_REF, eltype,
5845 new_temp, TYPE_SIZE (eltype),
5846 bitsize_int (0)));
5847 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5848 dst1 = make_ssa_name (vectype1);
5849 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5850 build1 (VIEW_CONVERT_EXPR,
5851 vectype1, tem));
5852 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5853 tem = make_ssa_name (eltype);
5854 epilog_stmt
5855 = gimple_build_assign (tem, BIT_FIELD_REF,
5856 build3 (BIT_FIELD_REF, eltype,
5857 new_temp, TYPE_SIZE (eltype),
5858 bitsize_int (bitsize)));
5859 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5860 dst2 = make_ssa_name (vectype1);
5861 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5862 build1 (VIEW_CONVERT_EXPR,
5863 vectype1, tem));
5864 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5867 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5870 return new_temp;
5873 /* Retrieves the defining statement to be used for a reduction.
5874 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5875 the reduction definitions. */
5877 tree
5878 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5879 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5880 vec <gimple *> &vec_stmts)
5882 tree def;
5884 if (slp_node)
5886 if (!main_exit_p)
5887 slp_node = slp_node_instance->reduc_phis;
5888 def = vect_get_slp_vect_def (slp_node, i);
5890 else
5892 if (!main_exit_p)
5893 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5894 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5895 def = gimple_get_lhs (vec_stmts[0]);
5898 return def;
5901 /* Function vect_create_epilog_for_reduction
5903 Create code at the loop-epilog to finalize the result of a reduction
5904 computation.
5906 STMT_INFO is the scalar reduction stmt that is being vectorized.
5907 SLP_NODE is an SLP node containing a group of reduction statements. The
5908 first one in this group is STMT_INFO.
5909 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
5910 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5911 (counting from 0)
5912 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5913 exit this edge is always the main loop exit.
5915 This function:
5916 1. Completes the reduction def-use cycles.
5917 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5918 by calling the function specified by REDUC_FN if available, or by
5919 other means (whole-vector shifts or a scalar loop).
5920 The function also creates a new phi node at the loop exit to preserve
5921 loop-closed form, as illustrated below.
5923 The flow at the entry to this function:
5925 loop:
5926 vec_def = phi <vec_init, null> # REDUCTION_PHI
5927 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5928 s_loop = scalar_stmt # (scalar) STMT_INFO
5929 loop_exit:
5930 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5931 use <s_out0>
5932 use <s_out0>
5934 The above is transformed by this function into:
5936 loop:
5937 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5938 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5939 s_loop = scalar_stmt # (scalar) STMT_INFO
5940 loop_exit:
5941 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5942 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5943 v_out2 = reduce <v_out1>
5944 s_out3 = extract_field <v_out2, 0>
5945 s_out4 = adjust_result <s_out3>
5946 use <s_out4>
5947 use <s_out4>
5950 static void
5951 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5952 stmt_vec_info stmt_info,
5953 slp_tree slp_node,
5954 slp_instance slp_node_instance,
5955 edge loop_exit)
5957 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5958 gcc_assert (reduc_info->is_reduc_info);
5959 /* For double reductions we need to get at the inner loop reduction
5960 stmt which has the meta info attached. Our stmt_info is that of the
5961 loop-closed PHI of the inner loop which we remember as
5962 def for the reduction PHI generation. */
5963 bool double_reduc = false;
5964 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5965 stmt_vec_info rdef_info = stmt_info;
5966 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5968 gcc_assert (!slp_node);
5969 double_reduc = true;
5970 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5971 (stmt_info->stmt, 0));
5972 stmt_info = vect_stmt_to_vectorize (stmt_info);
5974 gphi *reduc_def_stmt
5975 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5976 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5977 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5978 tree vectype;
5979 machine_mode mode;
5980 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5981 basic_block exit_bb;
5982 tree scalar_dest;
5983 tree scalar_type;
5984 gimple *new_phi = NULL, *phi = NULL;
5985 gimple_stmt_iterator exit_gsi;
5986 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5987 gimple *epilog_stmt = NULL;
5988 gimple *exit_phi;
5989 tree bitsize;
5990 tree def;
5991 tree orig_name, scalar_result;
5992 imm_use_iterator imm_iter, phi_imm_iter;
5993 use_operand_p use_p, phi_use_p;
5994 gimple *use_stmt;
5995 auto_vec<tree> reduc_inputs;
5996 int j, i;
5997 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5998 unsigned int group_size = 1, k;
5999 auto_vec<gimple *> phis;
6000 /* SLP reduction without reduction chain, e.g.,
6001 # a1 = phi <a2, a0>
6002 # b1 = phi <b2, b0>
6003 a2 = operation (a1)
6004 b2 = operation (b1) */
6005 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6006 bool direct_slp_reduc;
6007 tree induction_index = NULL_TREE;
6009 if (slp_node)
6010 group_size = SLP_TREE_LANES (slp_node);
6012 if (nested_in_vect_loop_p (loop, stmt_info))
6014 outer_loop = loop;
6015 loop = loop->inner;
6016 gcc_assert (!slp_node && double_reduc);
6019 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6020 gcc_assert (vectype);
6021 mode = TYPE_MODE (vectype);
6023 tree induc_val = NULL_TREE;
6024 tree adjustment_def = NULL;
6025 if (slp_node)
6027 else
6029 /* Optimize: for induction condition reduction, if we can't use zero
6030 for induc_val, use initial_def. */
6031 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6032 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6033 else if (double_reduc)
6035 else
6036 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6039 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6040 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6041 if (slp_reduc)
6042 /* All statements produce live-out values. */
6043 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6044 else if (slp_node)
6046 /* The last statement in the reduction chain produces the live-out
6047 value. Note SLP optimization can shuffle scalar stmts to
6048 optimize permutations so we have to search for the last stmt. */
6049 for (k = 0; k < group_size; ++k)
6050 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6052 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6053 break;
6057 unsigned vec_num;
6058 int ncopies;
6059 if (slp_node)
6061 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6062 ncopies = 1;
6064 else
6066 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6067 vec_num = 1;
6068 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6071 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6072 which is updated with the current index of the loop for every match of
6073 the original loop's cond_expr (VEC_STMT). This results in a vector
6074 containing the last time the condition passed for that vector lane.
6075 The first match will be a 1 to allow 0 to be used for non-matching
6076 indexes. If there are no matches at all then the vector will be all
6077 zeroes.
6079 PR92772: This algorithm is broken for architectures that support
6080 masked vectors, but do not provide fold_extract_last. */
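  /* An illustrative example (made-up lanes): with VF = 4 and two vector
     iterations the IV vector is {1, 2, 3, 4} and then {5, 6, 7, 8}.
     If the condition holds for lane 1 in both iterations and for lane 3
     only in the first, the resulting index vector is {0, 6, 0, 4};
     lanes whose condition never matched keep 0.  */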
6081 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6083 auto_vec<std::pair<tree, bool>, 2> ccompares;
6084 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6085 cond_info = vect_stmt_to_vectorize (cond_info);
6086 while (cond_info != reduc_info)
6088 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6090 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6091 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6092 ccompares.safe_push
6093 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6094 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6096 cond_info
6097 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6098 1 + STMT_VINFO_REDUC_IDX
6099 (cond_info)));
6100 cond_info = vect_stmt_to_vectorize (cond_info);
6102 gcc_assert (ccompares.length () != 0);
6104 tree indx_before_incr, indx_after_incr;
6105 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6106 int scalar_precision
6107 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6108 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6109 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6110 (TYPE_MODE (vectype), cr_index_scalar_type,
6111 TYPE_VECTOR_SUBPARTS (vectype));
6113 /* First we create a simple vector induction variable which starts
6114 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6115 vector size (STEP). */
6117 /* Create a {1,2,3,...} vector. */
6118 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6120 /* Create a vector of the step value. */
6121 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6122 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6124 /* Create an induction variable. */
6125 gimple_stmt_iterator incr_gsi;
6126 bool insert_after;
6127 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6128 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6129 insert_after, &indx_before_incr, &indx_after_incr);
6131 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6132 filled with zeros (VEC_ZERO). */
6134 /* Create a vector of 0s. */
6135 tree zero = build_zero_cst (cr_index_scalar_type);
6136 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6138 /* Create a vector phi node. */
6139 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6140 new_phi = create_phi_node (new_phi_tree, loop->header);
6141 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6142 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6144 /* Now take the condition from the loop's original cond_exprs
6145 and produce a new cond_expr (INDEX_COND_EXPR) which for
6146 every match uses values from the induction variable
6147 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6148 (NEW_PHI_TREE).
6149 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6150 the new cond_expr (INDEX_COND_EXPR). */
6151 gimple_seq stmts = NULL;
6152 for (int i = ccompares.length () - 1; i != -1; --i)
6154 tree ccompare = ccompares[i].first;
6155 if (ccompares[i].second)
6156 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6157 cr_index_vector_type,
6158 ccompare,
6159 indx_before_incr, new_phi_tree);
6160 else
6161 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6162 cr_index_vector_type,
6163 ccompare,
6164 new_phi_tree, indx_before_incr);
6166 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6168 /* Update the phi with the vec cond. */
6169 induction_index = new_phi_tree;
6170 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6171 loop_latch_edge (loop), UNKNOWN_LOCATION);
6174 /* 2. Create epilog code.
6175 The reduction epilog code operates across the elements of the vector
6176 of partial results computed by the vectorized loop.
6177 The reduction epilog code consists of:
6179 step 1: compute the scalar result in a vector (v_out2)
6180 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6181 step 3: adjust the scalar result (s_out3) if needed.
6183 Step 1 can be accomplished using one of the following three schemes:
6184 (scheme 1) using reduc_fn, if available.
6185 (scheme 2) using whole-vector shifts, if available.
6186 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6187 combined.
6189 The overall epilog code looks like this:
6191 s_out0 = phi <s_loop> # original EXIT_PHI
6192 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6193 v_out2 = reduce <v_out1> # step 1
6194 s_out3 = extract_field <v_out2, 0> # step 2
6195 s_out4 = adjust_result <s_out3> # step 3
6197 (step 3 is optional, and steps 1 and 2 may be combined).
6198 Lastly, the uses of s_out0 are replaced by s_out4. */
6201 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6202 v_out1 = phi <VECT_DEF>
6203 Store them in NEW_PHIS. */
6204 if (double_reduc)
6205 loop = outer_loop;
6206 /* We need to reduce values in all exits. */
6207 exit_bb = loop_exit->dest;
6208 exit_gsi = gsi_after_labels (exit_bb);
6209 reduc_inputs.create (slp_node ? vec_num : ncopies);
6210 vec <gimple *> vec_stmts = vNULL;
6211 for (unsigned i = 0; i < vec_num; i++)
6213 gimple_seq stmts = NULL;
6214 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6215 main_exit_p, i, vec_stmts);
6216 for (j = 0; j < ncopies; j++)
6218 tree new_def = copy_ssa_name (def);
6219 phi = create_phi_node (new_def, exit_bb);
6220 if (j)
6221 def = gimple_get_lhs (vec_stmts[j]);
6222 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6223 new_def = gimple_convert (&stmts, vectype, new_def);
6224 reduc_inputs.quick_push (new_def);
6226 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6229 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6230 (i.e. when reduc_fn is not available) and in the final adjustment
6231 code (if needed). Also get the original scalar reduction variable as
6232 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6233 represents a reduction pattern), the tree-code and scalar-def are
6234 taken from the original stmt that the pattern-stmt (STMT) replaces.
6235 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6236 are taken from STMT. */
6238 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6239 if (orig_stmt_info != stmt_info)
6241 /* Reduction pattern */
6242 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6243 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6246 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6247 scalar_type = TREE_TYPE (scalar_dest);
6248 scalar_results.truncate (0);
6249 scalar_results.reserve_exact (group_size);
6250 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6251 bitsize = TYPE_SIZE (scalar_type);
6253 /* True if we should implement SLP_REDUC using native reduction operations
6254 instead of scalar operations. */
6255 direct_slp_reduc = (reduc_fn != IFN_LAST
6256 && slp_reduc
6257 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6259 /* In case of reduction chain, e.g.,
6260 # a1 = phi <a3, a0>
6261 a2 = operation (a1)
6262 a3 = operation (a2),
6264 we may end up with more than one vector result. Here we reduce them
6265 to one vector.
6267 The same is true for a SLP reduction, e.g.,
6268 # a1 = phi <a2, a0>
6269 # b1 = phi <b2, b0>
6270 a2 = operation (a1)
6271 b2 = operation (b1),
6273 where we can end up with more than one vector as well. We can
6274 easily accumulate vectors when the number of vector elements is
6275 a multiple of the SLP group size.
6277 The same is true if we couldn't use a single def-use cycle. */
6278 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6279 || direct_slp_reduc
6280 || (slp_reduc
6281 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6282 || ncopies > 1)
6284 gimple_seq stmts = NULL;
6285 tree single_input = reduc_inputs[0];
6286 for (k = 1; k < reduc_inputs.length (); k++)
6287 single_input = gimple_build (&stmts, code, vectype,
6288 single_input, reduc_inputs[k]);
6289 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6291 reduc_inputs.truncate (0);
6292 reduc_inputs.safe_push (single_input);
6295 tree orig_reduc_input = reduc_inputs[0];
6297 /* If this loop is an epilogue loop that can be skipped after the
6298 main loop, we can only share a reduction operation between the
6299 main loop and the epilogue if we put it at the target of the
6300 skip edge.
6302 We can still reuse accumulators if this check fails. Doing so has
6303 the minor(?) benefit of making the epilogue loop's scalar result
6304 independent of the main loop's scalar result. */
6305 bool unify_with_main_loop_p = false;
6306 if (reduc_info->reused_accumulator
6307 && loop_vinfo->skip_this_loop_edge
6308 && single_succ_p (exit_bb)
6309 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6311 unify_with_main_loop_p = true;
6313 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6314 reduc_inputs[0] = make_ssa_name (vectype);
6315 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6316 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6317 UNKNOWN_LOCATION);
6318 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6319 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6320 exit_gsi = gsi_after_labels (reduc_block);
6323 /* Shouldn't be used beyond this point. */
6324 exit_bb = nullptr;
6326 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6327 && reduc_fn != IFN_LAST)
6329 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6330 various data values where the condition matched and another vector
6331 (INDUCTION_INDEX) containing all the indexes of those matches. We
6332 need to extract the last matching index (which will be the index with
6333 highest value) and use this to index into the data vector.
6334 For the case where there were no matches, the data vector will contain
6335 all default values and the index vector will be all zeros. */
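    /* Continuing the illustrative example above: with INDUCTION_INDEX
       {0, 6, 0, 4}, the IFN_REDUC_MAX below yields 6, the comparison
       produces the mask {false, true, false, false}, the VEC_COND keeps
       only the data value in lane 1 (all other lanes become 0), and the
       final unsigned MAX reduction extracts that surviving value.  */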
6337 /* Get various versions of the type of the vector of indexes. */
6338 tree index_vec_type = TREE_TYPE (induction_index);
6339 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6340 tree index_scalar_type = TREE_TYPE (index_vec_type);
6341 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6343 /* Get an unsigned integer version of the type of the data vector. */
6344 int scalar_precision
6345 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6346 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6347 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6348 vectype);
6350 /* First we need to create a vector (ZERO_VEC) of zeros and another
6351 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6352 can create using a MAX reduction and then expanding.
6353 In the case where the loop never made any matches, the max index will
6354 be zero. */
6356 /* Vector of {0, 0, 0,...}. */
6357 tree zero_vec = build_zero_cst (vectype);
6359 /* Find maximum value from the vector of found indexes. */
6360 tree max_index = make_ssa_name (index_scalar_type);
6361 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6362 1, induction_index);
6363 gimple_call_set_lhs (max_index_stmt, max_index);
6364 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6366 /* Vector of {max_index, max_index, max_index,...}. */
6367 tree max_index_vec = make_ssa_name (index_vec_type);
6368 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6369 max_index);
6370 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6371 max_index_vec_rhs);
6372 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6374 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6375 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6376 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6377 otherwise. Only one value should match, resulting in a vector
6378 (VEC_COND) with one data value and the rest zeros.
6379 In the case where the loop never made any matches, every index will
6380 match, resulting in a vector with all data values (which will all be
6381 the default value). */
6383 /* Compare the max index vector to the vector of found indexes to find
6384 the position of the max value. */
6385 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6386 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6387 induction_index,
6388 max_index_vec);
6389 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6391 /* Use the compare to choose either values from the data vector or
6392 zero. */
6393 tree vec_cond = make_ssa_name (vectype);
6394 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6395 vec_compare,
6396 reduc_inputs[0],
6397 zero_vec);
6398 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6400 /* Finally we need to extract the data value from the vector (VEC_COND)
6401 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6402 reduction, but because this doesn't exist, we can use a MAX reduction
6403 instead. The data value might be signed or a float so we need to cast
6404 it first.
6405 In the case where the loop never made any matches, the data values are
6406 all identical, and so will reduce down correctly. */
6408 /* Make the matched data values unsigned. */
6409 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6410 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6411 vec_cond);
6412 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6413 VIEW_CONVERT_EXPR,
6414 vec_cond_cast_rhs);
6415 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6417 /* Reduce down to a scalar value. */
6418 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6419 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6420 1, vec_cond_cast);
6421 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6422 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6424 /* Convert the reduced value back to the result type and set as the
6425 result. */
6426 gimple_seq stmts = NULL;
6427 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6428 data_reduc);
6429 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6430 scalar_results.safe_push (new_temp);
6432 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6433 && reduc_fn == IFN_LAST)
6435 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6436 idx = 0;
6437 idx_val = induction_index[0];
6438 val = data_reduc[0];
6439 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6440 if (induction_index[i] > idx_val)
6441 val = data_reduc[i], idx_val = induction_index[i];
6442 return val; */
6444 tree data_eltype = TREE_TYPE (vectype);
6445 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6446 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6447 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6448 /* Enforced by vectorizable_reduction, which ensures we have target
6449 support before allowing a conditional reduction on variable-length
6450 vectors. */
6451 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6452 tree idx_val = NULL_TREE, val = NULL_TREE;
6453 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6455 tree old_idx_val = idx_val;
6456 tree old_val = val;
6457 idx_val = make_ssa_name (idx_eltype);
6458 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6459 build3 (BIT_FIELD_REF, idx_eltype,
6460 induction_index,
6461 bitsize_int (el_size),
6462 bitsize_int (off)));
6463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6464 val = make_ssa_name (data_eltype);
6465 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6466 build3 (BIT_FIELD_REF,
6467 data_eltype,
6468 reduc_inputs[0],
6469 bitsize_int (el_size),
6470 bitsize_int (off)));
6471 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6472 if (off != 0)
6474 tree new_idx_val = idx_val;
6475 if (off != v_size - el_size)
6477 new_idx_val = make_ssa_name (idx_eltype);
6478 epilog_stmt = gimple_build_assign (new_idx_val,
6479 MAX_EXPR, idx_val,
6480 old_idx_val);
6481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6483 tree cond = make_ssa_name (boolean_type_node);
6484 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6485 idx_val, old_idx_val);
6486 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6487 tree new_val = make_ssa_name (data_eltype);
6488 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6489 cond, val, old_val);
6490 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6491 idx_val = new_idx_val;
6492 val = new_val;
6495 /* Convert the reduced value back to the result type and set as the
6496 result. */
6497 gimple_seq stmts = NULL;
6498 val = gimple_convert (&stmts, scalar_type, val);
6499 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6500 scalar_results.safe_push (val);
6503 /* 2.3 Create the reduction code, using one of the three schemes described
6504 above. In SLP we simply need to extract all the elements from the
6505 vector (without reducing them), so we use scalar shifts. */
6506 else if (reduc_fn != IFN_LAST && !slp_reduc)
6508 tree tmp;
6509 tree vec_elem_type;
6511 /* Case 1: Create:
6512 v_out2 = reduc_expr <v_out1> */
6514 if (dump_enabled_p ())
6515 dump_printf_loc (MSG_NOTE, vect_location,
6516 "Reduce using direct vector reduction.\n");
6518 gimple_seq stmts = NULL;
6519 vec_elem_type = TREE_TYPE (vectype);
6520 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6521 vec_elem_type, reduc_inputs[0]);
6522 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6523 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6525 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6526 && induc_val)
6528 /* Earlier we set the initial value to be a vector of induc_val
6529 values. Check the result and, if it is induc_val, replace it
6530 with the original initial value, unless induc_val is
6531 the same as initial_def already. */
6532 tree zcompare = make_ssa_name (boolean_type_node);
6533 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6534 new_temp, induc_val);
6535 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6536 tree initial_def = reduc_info->reduc_initial_values[0];
6537 tmp = make_ssa_name (new_scalar_dest);
6538 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6539 initial_def, new_temp);
6540 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6541 new_temp = tmp;
6544 scalar_results.safe_push (new_temp);
6546 else if (direct_slp_reduc)
6548 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6549 with the elements for other SLP statements replaced with the
6550 neutral value. We can then do a normal reduction on each vector. */
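    /* An illustrative example (fixed-length shown for clarity): with
       GROUP_SIZE = 2 and a 4-lane partial-result vector {a0, b0, a1, b1},
       the loop below builds {a0, neutral, a1, neutral} for the first SLP
       statement and {neutral, b0, neutral, b1} for the second, then
       reduces each with REDUC_FN to get the two scalar results.  */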
6552 /* Enforced by vectorizable_reduction. */
6553 gcc_assert (reduc_inputs.length () == 1);
6554 gcc_assert (pow2p_hwi (group_size));
6556 gimple_seq seq = NULL;
6558 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6559 and the same element size as VECTYPE. */
6560 tree index = build_index_vector (vectype, 0, 1);
6561 tree index_type = TREE_TYPE (index);
6562 tree index_elt_type = TREE_TYPE (index_type);
6563 tree mask_type = truth_type_for (index_type);
6565 /* Create a vector that, for each element, identifies which of
6566 the REDUC_GROUP_SIZE results should use it. */
6567 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6568 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6569 build_vector_from_val (index_type, index_mask));
6571 /* Get a neutral vector value. This is simply a splat of the neutral
6572 scalar value if we have one, otherwise the initial scalar value
6573 is itself a neutral value. */
6574 tree vector_identity = NULL_TREE;
6575 tree neutral_op = NULL_TREE;
6576 if (slp_node)
6578 tree initial_value = NULL_TREE;
6579 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6580 initial_value = reduc_info->reduc_initial_values[0];
6581 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6582 initial_value, false);
6584 if (neutral_op)
6585 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6586 neutral_op);
6587 for (unsigned int i = 0; i < group_size; ++i)
6589 /* If there's no universal neutral value, we can use the
6590 initial scalar value from the original PHI. This is used
6591 for MIN and MAX reduction, for example. */
6592 if (!neutral_op)
6594 tree scalar_value = reduc_info->reduc_initial_values[i];
6595 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6596 scalar_value);
6597 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6598 scalar_value);
6601 /* Calculate the equivalent of:
6603 sel[j] = (index[j] == i);
6605 which selects the elements of REDUC_INPUTS[0] that should
6606 be included in the result. */
6607 tree compare_val = build_int_cst (index_elt_type, i);
6608 compare_val = build_vector_from_val (index_type, compare_val);
6609 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6610 index, compare_val);
6612 /* Calculate the equivalent of:
6614 vec = sel ? reduc_inputs[0] : vector_identity;
6616 VEC is now suitable for a full vector reduction. */
6617 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6618 sel, reduc_inputs[0], vector_identity);
6620 /* Do the reduction and convert it to the appropriate type. */
6621 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6622 TREE_TYPE (vectype), vec);
6623 scalar = gimple_convert (&seq, scalar_type, scalar);
6624 scalar_results.safe_push (scalar);
6626 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6628 else
6630 bool reduce_with_shift;
6631 tree vec_temp;
6633 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6635 /* See if the target wants to do the final (shift) reduction
6636 in a vector mode of smaller size and first reduce upper/lower
6637 halves against each other. */
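/* Illustrative example (whether this triggers is entirely
   target-specific): a V8SI input might first be reduced to V4SI by
   combining its upper and lower halves, and the shift-based reduction
   below then operates on the narrower vector.  */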
6638 enum machine_mode mode1 = mode;
6639 tree stype = TREE_TYPE (vectype);
6640 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6641 unsigned nunits1 = nunits;
6642 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6643 && reduc_inputs.length () == 1)
6645 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6646 /* For SLP reductions we have to make sure lanes match up, but
6647 since we're doing an individual-element final reduction, reducing
6648 the vector width here is even more important.
6649 ??? We can also separate lanes with permutes; for the common
6650 case of a power-of-two group size, odd/even extracts would work. */
6651 if (slp_reduc && nunits != nunits1)
6653 nunits1 = least_common_multiple (nunits1, group_size);
6654 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6657 if (!slp_reduc
6658 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6659 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6661 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6662 stype, nunits1);
6663 reduce_with_shift = have_whole_vector_shift (mode1);
6664 if (!VECTOR_MODE_P (mode1)
6665 || !directly_supported_p (code, vectype1))
6666 reduce_with_shift = false;
6668 /* First reduce the vector to the desired vector size on which we
6669 should do the shift reduction, by combining upper and lower halves. */
6670 gimple_seq stmts = NULL;
6671 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6672 code, &stmts);
6673 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6674 reduc_inputs[0] = new_temp;
6676 if (reduce_with_shift && !slp_reduc)
6678 int element_bitsize = tree_to_uhwi (bitsize);
6679 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6680 for variable-length vectors and also requires direct target support
6681 for loop reductions. */
6682 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6683 int nelements = vec_size_in_bits / element_bitsize;
6684 vec_perm_builder sel;
6685 vec_perm_indices indices;
6687 int elt_offset;
6689 tree zero_vec = build_zero_cst (vectype1);
6690 /* Case 2: Create:
6691 for (offset = nelements/2; offset >= 1; offset/=2)
6693 Create: va' = vec_shift <va, offset>
6694 Create: va = vop <va, va'>
6695 } */
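/* Worked example (hypothetical four-element vector, PLUS reduction;
   only element 0 matters for the final extract below):
     va  = { a, b, c, d }
     va' = vec_shift <va, 2>  = { c, d, 0, 0 }
     va  = va' + va           = { a+c, b+d, ... }
     va' = vec_shift <va, 1>  = { b+d, ..., 0 }
     va  = va' + va           = { a+b+c+d, ... }  */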
6697 tree rhs;
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_NOTE, vect_location,
6701 "Reduce using vector shifts\n");
6703 gimple_seq stmts = NULL;
6704 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6705 for (elt_offset = nelements / 2;
6706 elt_offset >= 1;
6707 elt_offset /= 2)
6709 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6710 indices.new_vector (sel, 2, nelements);
6711 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6712 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6713 new_temp, zero_vec, mask);
6714 new_temp = gimple_build (&stmts, code,
6715 vectype1, new_name, new_temp);
6717 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6719 /* 2.4 Extract the final scalar result. Create:
6720 s_out3 = extract_field <v_out2, bitpos> */
6722 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_NOTE, vect_location,
6724 "extract scalar result\n");
6726 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6727 bitsize, bitsize_zero_node);
6728 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6729 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6730 gimple_assign_set_lhs (epilog_stmt, new_temp);
6731 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6732 scalar_results.safe_push (new_temp);
6734 else
6736 /* Case 3: Create:
6737 s = extract_field <v_out2, 0>
6738 for (offset = element_size;
6739 offset < vector_size;
6740 offset += element_size;)
6742 Create: s' = extract_field <v_out2, offset>
6743 Create: s = op <s, s'> // For non SLP cases
6744 } */
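/* Worked example (hypothetical four-element vector v, non-SLP):
     s = extract_field <v, 0>
     s = s op extract_field <v, element_size>
     s = s op extract_field <v, 2*element_size>
     s = s op extract_field <v, 3*element_size>
   For SLP the extracted values are collected in SCALAR_RESULTS instead
   and only combined when unrolling makes that necessary.  */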
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_NOTE, vect_location,
6748 "Reduce using scalar code.\n");
6750 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6751 int element_bitsize = tree_to_uhwi (bitsize);
6752 tree compute_type = TREE_TYPE (vectype);
6753 gimple_seq stmts = NULL;
6754 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6756 int bit_offset;
6757 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6758 vec_temp, bitsize, bitsize_zero_node);
6760 /* In SLP we don't need to apply the reduction operation, so we
6761 just collect the s' values in SCALAR_RESULTS. */
6762 if (slp_reduc)
6763 scalar_results.safe_push (new_temp);
6765 for (bit_offset = element_bitsize;
6766 bit_offset < vec_size_in_bits;
6767 bit_offset += element_bitsize)
6769 tree bitpos = bitsize_int (bit_offset);
6770 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6771 compute_type, vec_temp,
6772 bitsize, bitpos);
6773 if (slp_reduc)
6775 /* In SLP we don't need to apply the reduction operation, so
6776 we just collect the s' values in SCALAR_RESULTS. */
6777 new_temp = new_name;
6778 scalar_results.safe_push (new_name);
6780 else
6781 new_temp = gimple_build (&stmts, code, compute_type,
6782 new_name, new_temp);
6786 /* The only case where we need to reduce scalar results in SLP is
6787 unrolling. If the size of SCALAR_RESULTS is greater than
6788 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6789 REDUC_GROUP_SIZE. */
6790 if (slp_reduc)
6792 tree res, first_res, new_res;
6794 /* Reduce multiple scalar results in case of SLP unrolling. */
6795 for (j = group_size; scalar_results.iterate (j, &res);
6796 j++)
6798 first_res = scalar_results[j % group_size];
6799 new_res = gimple_build (&stmts, code, compute_type,
6800 first_res, res);
6801 scalar_results[j % group_size] = new_res;
6803 scalar_results.truncate (group_size);
6804 for (k = 0; k < group_size; k++)
6805 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6806 scalar_results[k]);
6808 else
6810 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6811 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6812 scalar_results.safe_push (new_temp);
6815 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6818 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6819 && induc_val)
6821 /* Earlier we set the initial value to be a vector of induc_val
6822 values. Check the result and if it is induc_val then replace
6823 it with the original initial value, unless induc_val is
6824 the same as initial_def already. */
6825 tree zcompare = make_ssa_name (boolean_type_node);
6826 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6827 induc_val);
6828 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6829 tree initial_def = reduc_info->reduc_initial_values[0];
6830 tree tmp = make_ssa_name (new_scalar_dest);
6831 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6832 initial_def, new_temp);
6833 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6834 scalar_results[0] = tmp;
6838 /* 2.5 Adjust the final result by the initial value of the reduction
6839 variable. (When such adjustment is not needed, then
6840 'adjustment_def' is zero). For example, if code is PLUS we create:
6841 new_temp = loop_exit_def + adjustment_def */
6843 if (adjustment_def)
6845 gcc_assert (!slp_reduc);
6846 gimple_seq stmts = NULL;
6847 if (double_reduc)
6849 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6850 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6851 new_temp = gimple_build (&stmts, code, vectype,
6852 reduc_inputs[0], adjustment_def);
6854 else
6856 new_temp = scalar_results[0];
6857 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6858 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6859 adjustment_def);
6860 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6861 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6862 new_temp, adjustment_def);
6863 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6866 epilog_stmt = gimple_seq_last_stmt (stmts);
6867 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6868 scalar_results[0] = new_temp;
6871 /* Record this operation if it could be reused by the epilogue loop. */
6872 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6873 && reduc_inputs.length () == 1)
6874 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6875 { orig_reduc_input, reduc_info });
6877 if (double_reduc)
6878 loop = outer_loop;
6880 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6881 phis with new adjusted scalar results, i.e., replace use <s_out0>
6882 with use <s_out4>.
6884 Transform:
6885 loop_exit:
6886 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6887 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6888 v_out2 = reduce <v_out1>
6889 s_out3 = extract_field <v_out2, 0>
6890 s_out4 = adjust_result <s_out3>
6891 use <s_out0>
6892 use <s_out0>
6894 into:
6896 loop_exit:
6897 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6898 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6899 v_out2 = reduce <v_out1>
6900 s_out3 = extract_field <v_out2, 0>
6901 s_out4 = adjust_result <s_out3>
6902 use <s_out4>
6903 use <s_out4> */
6905 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6906 for (k = 0; k < live_out_stmts.size (); k++)
6908 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6909 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6911 phis.create (3);
6912 /* Find the loop-closed-use at the loop exit of the original scalar
6913 result. (The reduction result is expected to have two immediate uses,
6914 one at the latch block, and one at the loop exit). For double
6915 reductions we are looking for exit phis of the outer loop. */
6916 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6918 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6920 if (!is_gimple_debug (USE_STMT (use_p)))
6921 phis.safe_push (USE_STMT (use_p));
6923 else
6925 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6927 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6929 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6931 if (!flow_bb_inside_loop_p (loop,
6932 gimple_bb (USE_STMT (phi_use_p)))
6933 && !is_gimple_debug (USE_STMT (phi_use_p)))
6934 phis.safe_push (USE_STMT (phi_use_p));
6940 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6942 /* Replace the uses: */
6943 orig_name = PHI_RESULT (exit_phi);
6945 /* Look for a single use at the target of the skip edge. */
6946 if (unify_with_main_loop_p)
6948 use_operand_p use_p;
6949 gimple *user;
6950 if (!single_imm_use (orig_name, &use_p, &user))
6951 gcc_unreachable ();
6952 orig_name = gimple_get_lhs (user);
6955 scalar_result = scalar_results[k];
6956 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6958 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6959 SET_USE (use_p, scalar_result);
6960 update_stmt (use_stmt);
6964 phis.release ();
6968 /* Return a vector of type VECTYPE that is equal to the vector select
6969 operation "MASK ? VEC : IDENTITY". Insert the select statements
6970 before GSI. */
6972 static tree
6973 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6974 tree vec, tree identity)
6976 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6977 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6978 mask, vec, identity);
6979 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6980 return cond;
6983 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6984 order, starting with LHS. Insert the extraction statements before GSI and
6985 associate the new scalar SSA names with variable SCALAR_DEST.
6986 If MASK is nonzero, mask the input and then operate on it unconditionally.
6987 Return the SSA name for the result. */
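/* A sketch of the expansion below for a hypothetical four-element
   VECTOR_RHS { v0, v1, v2, v3 }:

     lhs = (((LHS CODE v0) CODE v1) CODE v2) CODE v3

   where each element is extracted with a BIT_FIELD_REF.  */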
6989 static tree
6990 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6991 tree_code code, tree lhs, tree vector_rhs,
6992 tree mask)
6994 tree vectype = TREE_TYPE (vector_rhs);
6995 tree scalar_type = TREE_TYPE (vectype);
6996 tree bitsize = TYPE_SIZE (scalar_type);
6997 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6998 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7000 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7001 to perform an unconditional element-wise reduction of it. */
7002 if (mask)
7004 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7005 "masked_vector_rhs");
7006 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7007 false);
7008 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7009 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7010 mask, vector_rhs, vector_identity);
7011 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7012 vector_rhs = masked_vector_rhs;
7015 for (unsigned HOST_WIDE_INT bit_offset = 0;
7016 bit_offset < vec_size_in_bits;
7017 bit_offset += element_bitsize)
7019 tree bitpos = bitsize_int (bit_offset);
7020 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7021 bitsize, bitpos);
7023 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7024 rhs = make_ssa_name (scalar_dest, stmt);
7025 gimple_assign_set_lhs (stmt, rhs);
7026 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7028 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7029 tree new_name = make_ssa_name (scalar_dest, stmt);
7030 gimple_assign_set_lhs (stmt, new_name);
7031 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7032 lhs = new_name;
7034 return lhs;
7037 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7038 type of the vector input. */
7040 static internal_fn
7041 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7043 internal_fn mask_reduc_fn;
7044 internal_fn mask_len_reduc_fn;
7046 switch (reduc_fn)
7048 case IFN_FOLD_LEFT_PLUS:
7049 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7050 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7051 break;
7053 default:
7054 return IFN_LAST;
7057 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7058 OPTIMIZE_FOR_SPEED))
7059 return mask_reduc_fn;
7060 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7061 OPTIMIZE_FOR_SPEED))
7062 return mask_len_reduc_fn;
7063 return IFN_LAST;
7066 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7067 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7068 statement. CODE is the operation performed by STMT_INFO and OPS are
7069 its scalar operands. REDUC_INDEX is the index of the operand in
7070 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7071 implements in-order reduction, or IFN_LAST if we should open-code it.
7072 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7073 that should be used to control the operation in a fully-masked loop. */
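/* For illustration (hypothetical source loop): an in-order reduction
   such as

     double res = init;
     for (int i = 0; i < n; ++i)
       res += a[i];

   is vectorized by feeding each vector of A elements together with the
   scalar accumulator into a fold-left operation, so the floating-point
   additions happen in the original left-to-right order.  */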
7075 static bool
7076 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7077 stmt_vec_info stmt_info,
7078 gimple_stmt_iterator *gsi,
7079 gimple **vec_stmt, slp_tree slp_node,
7080 gimple *reduc_def_stmt,
7081 code_helper code, internal_fn reduc_fn,
7082 tree *ops, int num_ops, tree vectype_in,
7083 int reduc_index, vec_loop_masks *masks,
7084 vec_loop_lens *lens)
7086 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7087 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7088 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7090 int ncopies;
7091 if (slp_node)
7092 ncopies = 1;
7093 else
7094 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7096 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7097 gcc_assert (ncopies == 1);
7099 bool is_cond_op = false;
7100 if (!code.is_tree_code ())
7102 code = conditional_internal_fn_code (internal_fn (code));
7103 gcc_assert (code != ERROR_MARK);
7104 is_cond_op = true;
7107 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7109 if (slp_node)
7111 if (is_cond_op)
7113 if (dump_enabled_p ())
7114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7115 "fold-left reduction on SLP not supported.\n");
7116 return false;
7119 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7120 TYPE_VECTOR_SUBPARTS (vectype_in)));
7123 /* The operands either come from a binary operation or an IFN_COND operation.
7124 The former is a gimple assign with binary rhs and the latter is a
7125 gimple call with four arguments. */
7126 gcc_assert (num_ops == 2 || num_ops == 4);
7127 tree op0, opmask;
7128 if (!is_cond_op)
7129 op0 = ops[1 - reduc_index];
7130 else
7132 op0 = ops[2 + (1 - reduc_index)];
7133 opmask = ops[0];
7134 gcc_assert (!slp_node);
7137 int group_size = 1;
7138 stmt_vec_info scalar_dest_def_info;
7139 auto_vec<tree> vec_oprnds0, vec_opmask;
7140 if (slp_node)
7142 auto_vec<vec<tree> > vec_defs (2);
7143 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7144 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7145 vec_defs[0].release ();
7146 vec_defs[1].release ();
7147 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7148 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7150 else
7152 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7153 op0, &vec_oprnds0);
7154 scalar_dest_def_info = stmt_info;
7156 /* For an IFN_COND_OP we also need the vector mask operand. */
7157 if (is_cond_op)
7158 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7159 opmask, &vec_opmask);
7162 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7163 tree scalar_dest = gimple_get_lhs (sdef);
7164 tree scalar_type = TREE_TYPE (scalar_dest);
7165 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7167 int vec_num = vec_oprnds0.length ();
7168 gcc_assert (vec_num == 1 || slp_node);
7169 tree vec_elem_type = TREE_TYPE (vectype_out);
7170 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7172 tree vector_identity = NULL_TREE;
7173 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7175 vector_identity = build_zero_cst (vectype_out);
7176 if (!HONOR_SIGNED_ZEROS (vectype_out))
7178 else
7180 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7181 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7182 vector_identity);
7186 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7187 int i;
7188 tree def0;
7189 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7191 gimple *new_stmt;
7192 tree mask = NULL_TREE;
7193 tree len = NULL_TREE;
7194 tree bias = NULL_TREE;
7195 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7196 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7197 else if (is_cond_op)
7198 mask = vec_opmask[0];
7199 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7201 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7202 i, 1);
7203 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7204 bias = build_int_cst (intQI_type_node, biasval);
7205 if (!is_cond_op)
7206 mask = build_minus_one_cst (truth_type_for (vectype_in));
7209 /* Handle MINUS by adding the negative. */
7210 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7212 tree negated = make_ssa_name (vectype_out);
7213 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7214 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7215 def0 = negated;
7218 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7219 && mask && mask_reduc_fn == IFN_LAST)
7220 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7221 vector_identity);
7223 /* On the first iteration the input is simply the scalar phi
7224 result, and for subsequent iterations it is the output of
7225 the preceding operation. */
7226 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7228 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7229 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7230 def0, mask, len, bias);
7231 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7232 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7233 def0, mask);
7234 else
7235 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7236 def0);
7237 /* For chained SLP reductions the output of the previous reduction
7238 operation serves as the input of the next. For the final statement
7239 the output cannot be a temporary - we reuse the original
7240 scalar destination of the last statement. */
7241 if (i != vec_num - 1)
7243 gimple_set_lhs (new_stmt, scalar_dest_var);
7244 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7245 gimple_set_lhs (new_stmt, reduc_var);
7248 else
7250 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7251 tree_code (code), reduc_var, def0,
7252 mask);
7253 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7254 /* Remove the statement, so that we can use the same code paths
7255 as for statements that we've just created. */
7256 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7257 gsi_remove (&tmp_gsi, true);
7260 if (i == vec_num - 1)
7262 gimple_set_lhs (new_stmt, scalar_dest);
7263 vect_finish_replace_stmt (loop_vinfo,
7264 scalar_dest_def_info,
7265 new_stmt);
7267 else
7268 vect_finish_stmt_generation (loop_vinfo,
7269 scalar_dest_def_info,
7270 new_stmt, gsi);
7272 if (slp_node)
7273 slp_node->push_vec_def (new_stmt);
7274 else
7276 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7277 *vec_stmt = new_stmt;
7281 return true;
7284 /* Function is_nonwrapping_integer_induction.
7286 Check that STMT_VINFO (which is part of loop LOOP) both increments
7287 and does not cause overflow. */
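/* For example (hypothetical numbers): with base 10, step 3 and at most
   100 iterations the largest value reached is 10 + 3 * 100 = 310, which
   needs fewer bits than a 32-bit induction variable provides, so the
   induction cannot wrap.  */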
7289 static bool
7290 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7292 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7293 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7294 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7295 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7296 widest_int ni, max_loop_value, lhs_max;
7297 wi::overflow_type overflow = wi::OVF_NONE;
7299 /* Make sure the loop is integer based. */
7300 if (TREE_CODE (base) != INTEGER_CST
7301 || TREE_CODE (step) != INTEGER_CST)
7302 return false;
7304 /* Check that the max size of the loop will not wrap. */
7306 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7307 return true;
7309 if (! max_stmt_executions (loop, &ni))
7310 return false;
7312 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7313 &overflow);
7314 if (overflow)
7315 return false;
7317 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7318 TYPE_SIGN (lhs_type), &overflow);
7319 if (overflow)
7320 return false;
7322 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7323 <= TYPE_PRECISION (lhs_type));
7326 /* Check if masking can be supported by inserting a conditional expression.
7327 CODE is the code for the operation. COND_FN is the conditional internal
7328 function, if it exists. VECTYPE_IN is the type of the vector input. */
7329 static bool
7330 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7331 tree vectype_in)
7333 if (cond_fn != IFN_LAST
7334 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7335 OPTIMIZE_FOR_SPEED))
7336 return false;
7338 if (code.is_tree_code ())
7339 switch (tree_code (code))
7341 case DOT_PROD_EXPR:
7342 case SAD_EXPR:
7343 return true;
7345 default:
7346 break;
7348 return false;
7351 /* Insert a conditional expression to enable masked vectorization. CODE is the
7352 code for the operation. VOP is the array of operands. MASK is the loop
7353 mask. GSI is a statement iterator used to place the new conditional
7354 expression. */
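/* For example (illustrative): for DOT_PROD_EXPR the masked-off lanes of
   operand 1 are replaced with zero, so they contribute nothing to the
   accumulator; for SAD_EXPR they are replaced with operand 0, so the
   absolute difference in those lanes is zero.  */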
7355 static void
7356 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7357 gimple_stmt_iterator *gsi)
7359 switch (tree_code (code))
7361 case DOT_PROD_EXPR:
7363 tree vectype = TREE_TYPE (vop[1]);
7364 tree zero = build_zero_cst (vectype);
7365 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7366 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7367 mask, vop[1], zero);
7368 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7369 vop[1] = masked_op1;
7370 break;
7373 case SAD_EXPR:
7375 tree vectype = TREE_TYPE (vop[1]);
7376 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7377 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7378 mask, vop[1], vop[0]);
7379 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7380 vop[1] = masked_op1;
7381 break;
7384 default:
7385 gcc_unreachable ();
7389 /* Function vectorizable_reduction.
7391 Check if STMT_INFO performs a reduction operation that can be vectorized.
7392 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7393 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7394 Return true if STMT_INFO is vectorizable in this way.
7396 This function also handles reduction idioms (patterns) that have been
7397 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7398 may be of this form:
7399 X = pattern_expr (arg0, arg1, ..., X)
7400 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7401 sequence that had been detected and replaced by the pattern-stmt
7402 (STMT_INFO).
7404 This function also handles reduction of condition expressions, for example:
7405 for (int i = 0; i < N; i++)
7406 if (a[i] < value)
7407 last = a[i];
7408 This is handled by vectorising the loop and creating an additional vector
7409 containing the loop indexes for which "a[i] < value" was true. In the
7410 function epilogue this is reduced to a single max value and then used to
7411 index into the vector of results.
7413 In some cases of reduction patterns, the type of the reduction variable X is
7414 different than the type of the other arguments of STMT_INFO.
7415 In such cases, the vectype that is used when transforming STMT_INFO into
7416 a vector stmt is different than the vectype that is used to determine the
7417 vectorization factor, because it consists of a different number of elements
7418 than the actual number of elements that are being operated upon in parallel.
7420 For example, consider an accumulation of shorts into an int accumulator.
7421 On some targets it's possible to vectorize this pattern operating on 8
7422 shorts at a time (hence, the vectype for purposes of determining the
7423 vectorization factor should be V8HI); on the other hand, the vectype that
7424 is used to create the vector form is actually V4SI (the type of the result).
7426 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7427 indicates what is the actual level of parallelism (V8HI in the example), so
7428 that the right vectorization factor would be derived. This vectype
7429 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7430 be used to create the vectorized stmt. The right vectype for the vectorized
7431 stmt is obtained from the type of the result X:
7432 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7434 This means that, contrary to "regular" reductions (or "regular" stmts in
7435 general), the following equation:
7436 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7437 does *NOT* necessarily hold for reduction patterns. */
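/* A small sketch of the shorts-into-int example above (hypothetical
   source code):

     short s[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += s[i];    // recognized as sum = widen_sum <s[i], sum>

   Here STMT_VINFO_VECTYPE might be V8HI (eight shorts per vector
   iteration, which determines the vectorization factor), while the
   vectorized statement itself produces a V4SI accumulator, the vectype
   derived from the type of the result.  */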
7439 bool
7440 vectorizable_reduction (loop_vec_info loop_vinfo,
7441 stmt_vec_info stmt_info, slp_tree slp_node,
7442 slp_instance slp_node_instance,
7443 stmt_vector_for_cost *cost_vec)
7445 tree vectype_in = NULL_TREE;
7446 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7447 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7448 stmt_vec_info cond_stmt_vinfo = NULL;
7449 int i;
7450 int ncopies;
7451 bool single_defuse_cycle = false;
7452 bool nested_cycle = false;
7453 bool double_reduc = false;
7454 int vec_num;
7455 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7456 tree cond_reduc_val = NULL_TREE;
7458 /* Make sure it was already recognized as a reduction computation. */
7459 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7460 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7461 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7462 return false;
7464 /* The stmt we store reduction analysis meta on. */
7465 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7466 reduc_info->is_reduc_info = true;
7468 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7470 if (is_a <gphi *> (stmt_info->stmt))
7472 if (slp_node)
7474 /* We eventually need to set a vector type on invariant
7475 arguments. */
7476 unsigned j;
7477 slp_tree child;
7478 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7479 if (!vect_maybe_update_slp_op_vectype
7480 (child, SLP_TREE_VECTYPE (slp_node)))
7482 if (dump_enabled_p ())
7483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7484 "incompatible vector types for "
7485 "invariants\n");
7486 return false;
7489 /* Analysis for double-reduction is done on the outer
7490 loop PHI, nested cycles have no further restrictions. */
7491 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7493 else
7494 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7495 return true;
7498 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7499 stmt_vec_info phi_info = stmt_info;
7500 if (!is_a <gphi *> (stmt_info->stmt))
7502 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7503 return true;
7505 if (slp_node)
7507 slp_node_instance->reduc_phis = slp_node;
7508 /* ??? We're leaving slp_node to point to the PHIs; we only
7509 need it to get at the number of vector stmts, which wasn't
7510 yet initialized for the instance root. */
7512 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7514 use_operand_p use_p;
7515 gimple *use_stmt;
7516 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7517 &use_p, &use_stmt);
7518 gcc_assert (res);
7519 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7522 /* PHIs should not participate in patterns. */
7523 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7524 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7526 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7527 and compute the reduction chain length. Discover the real
7528 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7529 tree reduc_def
7530 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7531 loop_latch_edge
7532 (gimple_bb (reduc_def_phi)->loop_father));
7533 unsigned reduc_chain_length = 0;
7534 bool only_slp_reduc_chain = true;
7535 stmt_info = NULL;
7536 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7537 while (reduc_def != PHI_RESULT (reduc_def_phi))
7539 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7540 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7541 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7543 if (dump_enabled_p ())
7544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7545 "reduction chain broken by patterns.\n");
7546 return false;
7548 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7549 only_slp_reduc_chain = false;
7550 /* For epilogue generation live members of the chain need
7551 to point back to the PHI via their original stmt for
7552 info_for_reduction to work. For SLP we need to look at
7553 all lanes here - even though we will only vectorize from
7554 the SLP node with live lane zero, the other live lanes also
7555 need to be identified as part of a reduction to be able
7556 to skip code generation for them. */
7557 if (slp_for_stmt_info)
7559 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7560 if (STMT_VINFO_LIVE_P (s))
7561 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7563 else if (STMT_VINFO_LIVE_P (vdef))
7564 STMT_VINFO_REDUC_DEF (def) = phi_info;
7565 gimple_match_op op;
7566 if (!gimple_extract_op (vdef->stmt, &op))
7568 if (dump_enabled_p ())
7569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570 "reduction chain includes unsupported"
7571 " statement type.\n");
7572 return false;
7574 if (CONVERT_EXPR_CODE_P (op.code))
7576 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7578 if (dump_enabled_p ())
7579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7580 "conversion in the reduction chain.\n");
7581 return false;
7584 else if (!stmt_info)
7585 /* First non-conversion stmt. */
7586 stmt_info = vdef;
7587 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7588 reduc_chain_length++;
7589 if (!stmt_info && slp_node)
7590 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7592 /* PHIs should not participate in patterns. */
7593 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7595 if (nested_in_vect_loop_p (loop, stmt_info))
7597 loop = loop->inner;
7598 nested_cycle = true;
7601 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7602 element. */
7603 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7605 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7606 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7608 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7609 gcc_assert (slp_node
7610 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7612 /* 1. Is vectorizable reduction? */
7613 /* Not supportable if the reduction variable is used in the loop, unless
7614 it's a reduction chain. */
7615 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7616 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7617 return false;
7619 /* Reductions that are not used even in an enclosing outer-loop,
7620 are expected to be "live" (used out of the loop). */
7621 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7622 && !STMT_VINFO_LIVE_P (stmt_info))
7623 return false;
7625 /* 2. Has this been recognized as a reduction pattern?
7627 Check if STMT represents a pattern that has been recognized
7628 in earlier analysis stages. For stmts that represent a pattern,
7629 the STMT_VINFO_RELATED_STMT field records the last stmt in
7630 the original sequence that constitutes the pattern. */
7632 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7633 if (orig_stmt_info)
7635 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7636 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7639 /* 3. Check the operands of the operation. The first operands are defined
7640 inside the loop body. The last operand is the reduction variable,
7641 which is defined by the loop-header-phi. */
7643 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7644 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7645 gimple_match_op op;
7646 if (!gimple_extract_op (stmt_info->stmt, &op))
7647 gcc_unreachable ();
7648 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7649 || op.code == WIDEN_SUM_EXPR
7650 || op.code == SAD_EXPR);
7652 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7653 && !SCALAR_FLOAT_TYPE_P (op.type))
7654 return false;
7656 /* Do not try to vectorize bit-precision reductions. */
7657 if (!type_has_mode_precision_p (op.type))
7658 return false;
7660 /* For lane-reducing ops we're reducing the number of reduction PHIs
7661 which means the only use of that may be in the lane-reducing operation. */
7662 if (lane_reduc_code_p
7663 && reduc_chain_length != 1
7664 && !only_slp_reduc_chain)
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "lane-reducing reduction with extra stmts.\n");
7669 return false;
7672 /* All uses but the last are expected to be defined in the loop.
7673 The last use is the reduction variable. In case of nested cycle this
7674 assumption is not true: we use reduc_index to record the index of the
7675 reduction variable. */
7676 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7677 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7678 /* We need to skip an extra operand for COND_EXPRs with embedded
7679 comparison. */
7680 unsigned opno_adjust = 0;
7681 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7682 opno_adjust = 1;
7683 for (i = 0; i < (int) op.num_ops; i++)
7685 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7686 if (i == 0 && op.code == COND_EXPR)
7687 continue;
7689 stmt_vec_info def_stmt_info;
7690 enum vect_def_type dt;
7691 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7692 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7693 &vectype_op[i], &def_stmt_info))
7695 if (dump_enabled_p ())
7696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7697 "use not simple.\n");
7698 return false;
7700 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7701 continue;
7703 /* For an IFN_COND_OP we might hit the reduction definition operand
7704 twice (once as definition, once as else). */
7705 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7706 continue;
7708 /* There should be only one cycle def in the stmt, the one
7709 leading to reduc_def. */
7710 if (VECTORIZABLE_CYCLE_DEF (dt))
7711 return false;
7713 if (!vectype_op[i])
7714 vectype_op[i]
7715 = get_vectype_for_scalar_type (loop_vinfo,
7716 TREE_TYPE (op.ops[i]), slp_op[i]);
7718 /* To properly compute ncopies we are interested in the widest
7719 non-reduction input type in case we're looking at a widening
7720 accumulation that we later handle in vect_transform_reduction. */
7721 if (lane_reduc_code_p
7722 && vectype_op[i]
7723 && (!vectype_in
7724 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7725 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7726 vectype_in = vectype_op[i];
7728 if (op.code == COND_EXPR)
7730 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7731 if (dt == vect_constant_def)
7733 cond_reduc_dt = dt;
7734 cond_reduc_val = op.ops[i];
7736 if (dt == vect_induction_def
7737 && def_stmt_info
7738 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7740 cond_reduc_dt = dt;
7741 cond_stmt_vinfo = def_stmt_info;
7745 if (!vectype_in)
7746 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7747 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7749 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7750 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7751 /* If we have a condition reduction, see if we can simplify it further. */
7752 if (v_reduc_type == COND_REDUCTION)
7754 if (slp_node)
7755 return false;
7757 /* When the condition uses the reduction value in the condition, fail. */
7758 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7760 if (dump_enabled_p ())
7761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7762 "condition depends on previous iteration\n");
7763 return false;
7766 if (reduc_chain_length == 1
7767 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7768 OPTIMIZE_FOR_SPEED)
7769 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7770 vectype_in,
7771 OPTIMIZE_FOR_SPEED)))
7773 if (dump_enabled_p ())
7774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7775 "optimizing condition reduction with"
7776 " FOLD_EXTRACT_LAST.\n");
7777 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7779 else if (cond_reduc_dt == vect_induction_def)
7781 tree base
7782 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7785 gcc_assert (TREE_CODE (base) == INTEGER_CST
7786 && TREE_CODE (step) == INTEGER_CST);
7787 cond_reduc_val = NULL_TREE;
7788 enum tree_code cond_reduc_op_code = ERROR_MARK;
7789 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7790 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7792 /* Find a suitable value: below base for MAX_EXPR, above base for
7793 MIN_EXPR; punt for now if base is the minimum value of the type
7794 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7795 else if (tree_int_cst_sgn (step) == -1)
7797 cond_reduc_op_code = MIN_EXPR;
7798 if (tree_int_cst_sgn (base) == -1)
7799 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7800 else if (tree_int_cst_lt (base,
7801 TYPE_MAX_VALUE (TREE_TYPE (base))))
7802 cond_reduc_val
7803 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7805 else
7807 cond_reduc_op_code = MAX_EXPR;
7808 if (tree_int_cst_sgn (base) == 1)
7809 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7810 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7811 base))
7812 cond_reduc_val
7813 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7815 if (cond_reduc_val)
7817 if (dump_enabled_p ())
7818 dump_printf_loc (MSG_NOTE, vect_location,
7819 "condition expression based on "
7820 "integer induction.\n");
7821 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7822 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7823 = cond_reduc_val;
7824 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7827 else if (cond_reduc_dt == vect_constant_def)
7829 enum vect_def_type cond_initial_dt;
7830 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7831 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7832 if (cond_initial_dt == vect_constant_def
7833 && types_compatible_p (TREE_TYPE (cond_initial_val),
7834 TREE_TYPE (cond_reduc_val)))
7836 tree e = fold_binary (LE_EXPR, boolean_type_node,
7837 cond_initial_val, cond_reduc_val);
7838 if (e && (integer_onep (e) || integer_zerop (e)))
7840 if (dump_enabled_p ())
7841 dump_printf_loc (MSG_NOTE, vect_location,
7842 "condition expression based on "
7843 "compile time constant.\n");
7844 /* Record reduction code at analysis stage. */
7845 STMT_VINFO_REDUC_CODE (reduc_info)
7846 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7847 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7853 if (STMT_VINFO_LIVE_P (phi_info))
7854 return false;
7856 if (slp_node)
7857 ncopies = 1;
7858 else
7859 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7861 gcc_assert (ncopies >= 1);
7863 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7865 if (nested_cycle)
7867 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7868 == vect_double_reduction_def);
7869 double_reduc = true;
7872 /* 4.2. Check support for the epilog operation.
7874 If STMT represents a reduction pattern, then the type of the
7875 reduction variable may be different than the type of the rest
7876 of the arguments. For example, consider the case of accumulation
7877 of shorts into an int accumulator; The original code:
7878 S1: int_a = (int) short_a;
7879 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7881 was replaced with:
7882 STMT: int_acc = widen_sum <short_a, int_acc>
7884 This means that:
7885 1. The tree-code that is used to create the vector operation in the
7886 epilog code (that reduces the partial results) is not the
7887 tree-code of STMT, but is rather the tree-code of the original
7888 stmt from the pattern that STMT is replacing. I.e, in the example
7889 above we want to use 'widen_sum' in the loop, but 'plus' in the
7890 epilog.
7891 2. The type (mode) we use to check available target support
7892 for the vector operation to be created in the *epilog*, is
7893 determined by the type of the reduction variable (in the example
7894 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7895 However the type (mode) we use to check available target support
7896 for the vector operation to be created *inside the loop*, is
7897 determined by the type of the other arguments to STMT (in the
7898 example we'd check this: optab_handler (widen_sum_optab,
7899 vect_short_mode)).
7901 This is contrary to "regular" reductions, in which the types of all
7902 the arguments are the same as the type of the reduction variable.
7903 For "regular" reductions we can therefore use the same vector type
7904 (and also the same tree-code) when generating the epilog code and
7905 when generating the code inside the loop. */
7907 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7909 /* The conversion might have created a conditional operation like
7910 IFN_COND_ADD already. If so, use the corresponding tree code for the following checks. */
7911 if (orig_code.is_internal_fn ())
7913 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7914 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7917 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7919 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7920 if (reduction_type == TREE_CODE_REDUCTION)
7922 /* Check whether it's ok to change the order of the computation.
7923 Generally, when vectorizing a reduction we change the order of the
7924 computation. This may change the behavior of the program in some
7925 cases, so we need to check that this is ok. One exception is when
7926 vectorizing an outer-loop: the inner-loop is executed sequentially,
7927 and therefore vectorizing reductions in the inner-loop during
7928 outer-loop vectorization is safe. Likewise, when we are vectorizing
7929 a series of reductions using SLP and the VF is one, the reductions
7930 are performed in scalar order. */
7931 if (slp_node
7932 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7933 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7935 else if (needs_fold_left_reduction_p (op.type, orig_code))
7937 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7938 is not directly used in stmt. */
7939 if (!only_slp_reduc_chain
7940 && reduc_chain_length != 1)
7942 if (dump_enabled_p ())
7943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7944 "in-order reduction chain without SLP.\n");
7945 return false;
7947 STMT_VINFO_REDUC_TYPE (reduc_info)
7948 = reduction_type = FOLD_LEFT_REDUCTION;
7950 else if (!commutative_binary_op_p (orig_code, op.type)
7951 || !associative_binary_op_p (orig_code, op.type))
7953 if (dump_enabled_p ())
7954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7955 "reduction: not commutative/associative\n");
7956 return false;
7960 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7961 && ncopies > 1)
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "multiple types in double reduction or condition "
7966 "reduction or fold-left reduction.\n");
7967 return false;
7970 internal_fn reduc_fn = IFN_LAST;
7971 if (reduction_type == TREE_CODE_REDUCTION
7972 || reduction_type == FOLD_LEFT_REDUCTION
7973 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7974 || reduction_type == CONST_COND_REDUCTION)
7976 if (reduction_type == FOLD_LEFT_REDUCTION
7977 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7978 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7980 if (reduc_fn != IFN_LAST
7981 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7982 OPTIMIZE_FOR_SPEED))
7984 if (dump_enabled_p ())
7985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7986 "reduc op not supported by target.\n");
7988 reduc_fn = IFN_LAST;
7991 else
7993 if (!nested_cycle || double_reduc)
7995 if (dump_enabled_p ())
7996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7997 "no reduc code for scalar code.\n");
7999 return false;
8003 else if (reduction_type == COND_REDUCTION)
8005 int scalar_precision
8006 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8007 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8008 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8009 vectype_out);
8011 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8012 OPTIMIZE_FOR_SPEED))
8013 reduc_fn = IFN_REDUC_MAX;
8015 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8017 if (reduction_type != EXTRACT_LAST_REDUCTION
8018 && (!nested_cycle || double_reduc)
8019 && reduc_fn == IFN_LAST
8020 && !nunits_out.is_constant ())
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8024 "missing target support for reduction on"
8025 " variable-length vectors.\n");
8026 return false;
8029 /* For SLP reductions, see if there is a neutral value we can use. */
8030 tree neutral_op = NULL_TREE;
8031 if (slp_node)
8033 tree initial_value = NULL_TREE;
8034 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8035 initial_value = vect_phi_initial_value (reduc_def_phi);
8036 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8037 orig_code, initial_value);
8040 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8042 /* We can't support in-order reductions of code such as this:
8044 for (int i = 0; i < n1; ++i)
8045 for (int j = 0; j < n2; ++j)
8046 l += a[j];
8048 since GCC effectively transforms the loop when vectorizing:
8050 for (int i = 0; i < n1 / VF; ++i)
8051 for (int j = 0; j < n2; ++j)
8052 for (int k = 0; k < VF; ++k)
8053 l += a[j];
8055 which is a reassociation of the original operation. */
8056 if (dump_enabled_p ())
8057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8058 "in-order double reduction not supported.\n");
8060 return false;
8063 if (reduction_type == FOLD_LEFT_REDUCTION
8064 && slp_node
8065 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8067 /* We cannot use in-order reductions in this case because there is
8068 an implicit reassociation of the operations involved. */
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8071 "in-order unchained SLP reductions not supported.\n");
8072 return false;
8075 /* For double reductions, and for SLP reductions with a neutral value,
8076 we construct a variable-length initial vector by loading a vector
8077 full of the neutral value and then shift-and-inserting the start
8078 values into the low-numbered elements. */
8079 if ((double_reduc || neutral_op)
8080 && !nunits_out.is_constant ()
8081 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8082 vectype_out, OPTIMIZE_FOR_SPEED))
8084 if (dump_enabled_p ())
8085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8086 "reduction on variable-length vectors requires"
8087 " target support for a vector-shift-and-insert"
8088 " operation.\n");
8089 return false;
8092 /* Check extra constraints for variable-length unchained SLP reductions. */
8093 if (slp_node
8094 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8095 && !nunits_out.is_constant ())
8097 /* We checked above that we could build the initial vector when
8098 there's a neutral element value. Check here for the case in
8099 which each SLP statement has its own initial value and in which
8100 that value needs to be repeated for every instance of the
8101 statement within the initial vector. */
8102 unsigned int group_size = SLP_TREE_LANES (slp_node);
8103 if (!neutral_op
8104 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8105 TREE_TYPE (vectype_out)))
8107 if (dump_enabled_p ())
8108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8109 "unsupported form of SLP reduction for"
8110 " variable-length vectors: cannot build"
8111 " initial vector.\n");
8112 return false;
8114 /* The epilogue code relies on the number of elements being a multiple
8115 of the group size. The duplicate-and-interleave approach to setting
8116 up the initial vector does too. */
8117 if (!multiple_p (nunits_out, group_size))
8119 if (dump_enabled_p ())
8120 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8121 "unsupported form of SLP reduction for"
8122 " variable-length vectors: the vector size"
8123 " is not a multiple of the number of results.\n");
8124 return false;
8128 if (reduction_type == COND_REDUCTION)
8130 widest_int ni;
8132 if (! max_loop_iterations (loop, &ni))
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_NOTE, vect_location,
8136 "loop count not known, cannot create cond "
8137 "reduction.\n");
8138 return false;
8140 /* Convert backedges to iterations. */
8141 ni += 1;
8143 /* The additional index will be the same type as the condition. Check
8144 that the loop can fit into this less one (because we'll use up the
8145 zero slot for when there are no matches). */
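/* For example (hypothetical): if the reduction operates on an 8-bit
   type, the index elements are unsigned 8-bit values with a maximum of
   255, so a loop with 255 or more iterations is rejected here.  */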
8146 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8147 if (wi::geu_p (ni, wi::to_widest (max_index)))
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_NOTE, vect_location,
8151 "loop size is greater than data size.\n");
8152 return false;
8156 /* In case the vectorization factor (VF) is bigger than the number
8157 of elements that we can fit in a vectype (nunits), we have to generate
8158 more than one vector stmt, i.e. we need to "unroll" the
8159 vector stmt by a factor VF/nunits. For more details see documentation
8160 in vectorizable_operation. */
8162 /* If the reduction is used in an outer loop we need to generate
8163 VF intermediate results, like so (e.g. for ncopies=2):
8164 r0 = phi (init, r0)
8165 r1 = phi (init, r1)
8166 r0 = x0 + r0;
8167 r1 = x1 + r1;
8168 (i.e. we generate VF results in 2 registers).
8169 In this case we have a separate def-use cycle for each copy, and therefore
8170 for each copy we get the vector def for the reduction variable from the
8171 respective phi node created for this copy.
8173 Otherwise (the reduction is unused in the loop nest), we can combine
8174 together intermediate results, like so (e.g. for ncopies=2):
8175 r = phi (init, r)
8176 r = x0 + r;
8177 r = x1 + r;
8178 (i.e. we generate VF/2 results in a single register).
8179 In this case for each copy we get the vector def for the reduction variable
8180 from the vectorized reduction operation generated in the previous iteration.
8182 This only works when we see both the reduction PHI and its only consumer
8183 in vectorizable_reduction and there are no intermediate stmts
8184 participating. When unrolling we want each unrolled iteration to have its
8185 own reduction accumulator since one of the main goals of unrolling a
8186 reduction is to reduce the aggregate loop-carried latency. */
8187 if (ncopies > 1
8188 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8189 && reduc_chain_length == 1
8190 && loop_vinfo->suggested_unroll_factor == 1)
8191 single_defuse_cycle = true;
8193 if (single_defuse_cycle || lane_reduc_code_p)
8195 gcc_assert (op.code != COND_EXPR);
8197 /* 4. Supportable by target? */
8198 bool ok = true;
8200 /* 4.1. check support for the operation in the loop
8202 This isn't necessary for the lane reduction codes, since they
8203 can only be produced by pattern matching, and it's up to the
8204 pattern matcher to test for support. The main reason for
8205 specifically skipping this step is to avoid rechecking whether
8206 mixed-sign dot-products can be implemented using signed
8207 dot-products. */
8208 machine_mode vec_mode = TYPE_MODE (vectype_in);
8209 if (!lane_reduc_code_p
8210 && !directly_supported_p (op.code, vectype_in, optab_vector))
8212 if (dump_enabled_p ())
8213 dump_printf (MSG_NOTE, "op not supported by target.\n");
8214 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8215 || !vect_can_vectorize_without_simd_p (op.code))
8216 ok = false;
8217 else
8218 if (dump_enabled_p ())
8219 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8222 if (vect_emulated_vector_p (vectype_in)
8223 && !vect_can_vectorize_without_simd_p (op.code))
8225 if (dump_enabled_p ())
8226 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8227 return false;
8230 /* Lane-reducing operations have to go through vect_transform_reduction.
8231 For the other cases try without the single-cycle optimization. */
8232 if (!ok)
8234 if (lane_reduc_code_p)
8235 return false;
8236 else
8237 single_defuse_cycle = false;
8240 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8242 /* If the reduction stmt is one of the patterns that have lane
8243 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8244 if ((ncopies > 1 && ! single_defuse_cycle)
8245 && lane_reduc_code_p)
8247 if (dump_enabled_p ())
8248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8249 "multi def-use cycle not possible for lane-reducing "
8250 "reduction operation\n");
8251 return false;
8254 if (slp_node
8255 && !(!single_defuse_cycle
8256 && !lane_reduc_code_p
8257 && reduction_type != FOLD_LEFT_REDUCTION))
8258 for (i = 0; i < (int) op.num_ops; i++)
8259 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8261 if (dump_enabled_p ())
8262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8263 "incompatible vector types for invariants\n");
8264 return false;
8267 if (slp_node)
8268 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8269 else
8270 vec_num = 1;
8272 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8273 reduction_type, ncopies, cost_vec);
8274 /* Cost the reduction op inside the loop if transformed via
8275 vect_transform_reduction. Otherwise this is costed by the
8276 separate vectorizable_* routines. */
8277 if (single_defuse_cycle || lane_reduc_code_p)
8279 int factor = 1;
8280 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8281 /* Three dot-products and a subtraction. */
8282 factor = 4;
8283 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8284 stmt_info, 0, vect_body);
8287 if (dump_enabled_p ()
8288 && reduction_type == FOLD_LEFT_REDUCTION)
8289 dump_printf_loc (MSG_NOTE, vect_location,
8290 "using an in-order (fold-left) reduction.\n");
8291 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8292 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8293 reductions go through their own vectorizable_* routines. */
8294 if (!single_defuse_cycle
8295 && !lane_reduc_code_p
8296 && reduction_type != FOLD_LEFT_REDUCTION)
8298 stmt_vec_info tem
8299 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8300 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8302 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8303 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8305 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8306 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8308 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8310 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8311 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8312 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8314 if (reduction_type != FOLD_LEFT_REDUCTION
8315 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8316 && (cond_fn == IFN_LAST
8317 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8318 OPTIMIZE_FOR_SPEED)))
8320 if (dump_enabled_p ())
8321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8322 "can't operate on partial vectors because"
8323 " no conditional operation is available.\n");
8324 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8326 else if (reduction_type == FOLD_LEFT_REDUCTION
8327 && reduc_fn == IFN_LAST
8328 && !expand_vec_cond_expr_p (vectype_in,
8329 truth_type_for (vectype_in),
8330 SSA_NAME))
8332 if (dump_enabled_p ())
8333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8334 "can't operate on partial vectors because"
8335 " no conditional operation is available.\n");
8336 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8338 else if (reduction_type == FOLD_LEFT_REDUCTION
8339 && internal_fn_mask_index (reduc_fn) == -1
8340 && FLOAT_TYPE_P (vectype_in)
8341 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8343 if (dump_enabled_p ())
8344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8345 "can't operate on partial vectors because"
8346 " signed zeros cannot be preserved.\n");
8347 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8349 else
8351 internal_fn mask_reduc_fn
8352 = get_masked_reduction_fn (reduc_fn, vectype_in);
8354 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8355 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8356 vectype_in, 1);
8357 else
8358 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8359 vectype_in, NULL);
8362 return true;
8365 /* STMT_INFO is a dot-product reduction whose multiplication operands
8366 have different signs. Emit a sequence to emulate the operation
8367 using a series of signed DOT_PROD_EXPRs and return the last
8368 statement generated. VEC_DEST is the result of the vector operation
8369 and VOP lists its inputs. */
8371 static gassign *
8372 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8373 gimple_stmt_iterator *gsi, tree vec_dest,
8374 tree vop[3])
8376 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8377 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8378 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8379 gimple *new_stmt;
8381 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8382 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8383 std::swap (vop[0], vop[1]);
8385 /* Convert all inputs to signed types. */
8386 for (int i = 0; i < 3; ++i)
8387 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8389 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8390 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8391 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8392 vop[i] = tmp;
8395 /* In the comments below we assume 8-bit inputs for simplicity,
8396 but the approach works for any full integer type. */
8398 /* Create a vector of -128. */
8399 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8400 tree min_narrow = build_vector_from_val (narrow_vectype,
8401 min_narrow_elttype);
8403 /* Create a vector of 64. */
8404 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8405 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8406 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8408 /* Emit: SUB_RES = VOP[0] - 128. */
8409 tree sub_res = make_ssa_name (narrow_vectype);
8410 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8411 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8413 /* Emit:
8415 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8416 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8417 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8419 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8420 Doing the two 64 * y steps first allows more time to compute SUB_RES. */
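/* A quick numeric check of the identity above with hypothetical 8-bit
   values that are not taken from the surrounding code: for x = 200
   (unsigned) and y = -3 (signed),
     x * y = -600,
     (x - 128) * y = 72 * -3 = -216,
     64 * y + 64 * y = -192 + -192 = -384,
   and -216 + -384 = -600. */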
8421 tree stage1 = make_ssa_name (wide_vectype);
8422 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8423 vop[1], half_narrow, vop[2]);
8424 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8426 tree stage2 = make_ssa_name (wide_vectype);
8427 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8428 vop[1], half_narrow, stage1);
8429 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8431 tree stage3 = make_ssa_name (wide_vectype);
8432 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8433 sub_res, vop[1], stage2);
8434 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8436 /* Convert STAGE3 to the reduction type. */
8437 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8440 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8441 value. */
8443 bool
8444 vect_transform_reduction (loop_vec_info loop_vinfo,
8445 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8446 gimple **vec_stmt, slp_tree slp_node)
8448 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8449 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8450 int i;
8451 int ncopies;
8452 int vec_num;
8454 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8455 gcc_assert (reduc_info->is_reduc_info);
8457 if (nested_in_vect_loop_p (loop, stmt_info))
8459 loop = loop->inner;
8460 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8463 gimple_match_op op;
8464 if (!gimple_extract_op (stmt_info->stmt, &op))
8465 gcc_unreachable ();
8467 /* All uses but the last are expected to be defined in the loop.
8468 The last use is the reduction variable. In case of nested cycle this
8469 assumption is not true: we use reduc_index to record the index of the
8470 reduction variable. */
8471 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8472 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8473 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8474 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8476 if (slp_node)
8478 ncopies = 1;
8479 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8481 else
8483 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8484 vec_num = 1;
8487 code_helper code = canonicalize_code (op.code, op.type);
8488 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8490 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8491 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8492 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8494 /* Transform. */
8495 tree new_temp = NULL_TREE;
8496 auto_vec<tree> vec_oprnds0;
8497 auto_vec<tree> vec_oprnds1;
8498 auto_vec<tree> vec_oprnds2;
8499 tree def0;
8501 if (dump_enabled_p ())
8502 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8504 /* FORNOW: Multiple types are not supported for condition. */
8505 if (code == COND_EXPR)
8506 gcc_assert (ncopies == 1);
8508 /* A binary COND_OP reduction must have the same definition and else
8509 value. */
8510 bool cond_fn_p = code.is_internal_fn ()
8511 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8512 if (cond_fn_p)
8514 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8515 || code == IFN_COND_MUL || code == IFN_COND_AND
8516 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8517 gcc_assert (op.num_ops == 4
8518 && (op.ops[reduc_index]
8519 == op.ops[internal_fn_else_index ((internal_fn) code)]));
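/* For illustration (a hypothetical GIMPLE form, not taken from a testcase),
   a masked accumulation r += x is represented as
     r_1 = .COND_ADD (mask, r_0, x, r_0);
   i.e. the reduction operand and the else value are the same SSA name,
   which is what the assert above verifies. */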
8522 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8524 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8525 if (reduction_type == FOLD_LEFT_REDUCTION)
8527 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8528 gcc_assert (code.is_tree_code () || cond_fn_p);
8529 return vectorize_fold_left_reduction
8530 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8531 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8532 reduc_index, masks, lens);
8535 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8536 gcc_assert (single_defuse_cycle
8537 || code == DOT_PROD_EXPR
8538 || code == WIDEN_SUM_EXPR
8539 || code == SAD_EXPR);
8541 /* Create the destination vector */
8542 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8543 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8545 /* Get NCOPIES vector definitions for all operands except the reduction
8546 definition. */
8547 if (!cond_fn_p)
8549 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8550 single_defuse_cycle && reduc_index == 0
8551 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8552 single_defuse_cycle && reduc_index == 1
8553 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8554 op.num_ops == 3
8555 && !(single_defuse_cycle && reduc_index == 2)
8556 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8558 else
8560 /* For a conditional operation pass the truth type as mask
8561 vectype. */
8562 gcc_assert (single_defuse_cycle
8563 && (reduc_index == 1 || reduc_index == 2));
8564 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8565 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8566 reduc_index == 1 ? NULL_TREE : op.ops[1],
8567 NULL_TREE, &vec_oprnds1,
8568 reduc_index == 2 ? NULL_TREE : op.ops[2],
8569 NULL_TREE, &vec_oprnds2);
8572 /* For single def-use cycles get one copy of the vectorized reduction
8573 definition. */
8574 if (single_defuse_cycle)
8576 gcc_assert (!slp_node);
8577 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8578 op.ops[reduc_index],
8579 reduc_index == 0 ? &vec_oprnds0
8580 : (reduc_index == 1 ? &vec_oprnds1
8581 : &vec_oprnds2));
8584 bool emulated_mixed_dot_prod
8585 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8586 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8588 gimple *new_stmt;
8589 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8590 if (masked_loop_p && !mask_by_cond_expr)
8592 /* No conditional ifns have been defined for dot-product yet. */
8593 gcc_assert (code != DOT_PROD_EXPR);
8595 /* Make sure that the reduction accumulator is vop[0]. */
8596 if (reduc_index == 1)
8598 gcc_assert (commutative_binary_op_p (code, op.type));
8599 std::swap (vop[0], vop[1]);
8601 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8602 vec_num * ncopies, vectype_in, i);
8603 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8604 vop[0], vop[1], vop[0]);
8605 new_temp = make_ssa_name (vec_dest, call);
8606 gimple_call_set_lhs (call, new_temp);
8607 gimple_call_set_nothrow (call, true);
8608 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8609 new_stmt = call;
8611 else
8613 if (op.num_ops >= 3)
8614 vop[2] = vec_oprnds2[i];
8616 if (masked_loop_p && mask_by_cond_expr)
8618 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8619 vec_num * ncopies, vectype_in, i);
8620 build_vect_cond_expr (code, vop, mask, gsi);
8623 if (emulated_mixed_dot_prod)
8624 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8625 vec_dest, vop);
8627 else if (code.is_internal_fn () && !cond_fn_p)
8628 new_stmt = gimple_build_call_internal (internal_fn (code),
8629 op.num_ops,
8630 vop[0], vop[1], vop[2]);
8631 else if (code.is_internal_fn () && cond_fn_p)
8632 new_stmt = gimple_build_call_internal (internal_fn (code),
8633 op.num_ops,
8634 vop[0], vop[1], vop[2],
8635 vop[1]);
8636 else
8637 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8638 vop[0], vop[1], vop[2]);
8639 new_temp = make_ssa_name (vec_dest, new_stmt);
8640 gimple_set_lhs (new_stmt, new_temp);
8641 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8644 if (slp_node)
8645 slp_node->push_vec_def (new_stmt);
8646 else if (single_defuse_cycle
8647 && i < ncopies - 1)
8649 if (reduc_index == 0)
8650 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8651 else if (reduc_index == 1)
8652 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8653 else if (reduc_index == 2)
8654 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8656 else
8657 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8660 if (!slp_node)
8661 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8663 return true;
8666 /* Transform phase of a cycle PHI. */
8668 bool
8669 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8670 stmt_vec_info stmt_info, gimple **vec_stmt,
8671 slp_tree slp_node, slp_instance slp_node_instance)
8673 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8674 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8675 int i;
8676 int ncopies;
8677 int j;
8678 bool nested_cycle = false;
8679 int vec_num;
8681 if (nested_in_vect_loop_p (loop, stmt_info))
8683 loop = loop->inner;
8684 nested_cycle = true;
8687 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8688 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8689 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8690 gcc_assert (reduc_info->is_reduc_info);
8692 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8693 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8694 /* Leave the scalar phi in place. */
8695 return true;
8697 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8698 /* For a nested cycle we do not fill the above. */
8699 if (!vectype_in)
8700 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8701 gcc_assert (vectype_in);
8703 if (slp_node)
8705 /* The size vect_schedule_slp_instance computes is off for us. */
8706 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8707 * SLP_TREE_LANES (slp_node), vectype_in);
8708 ncopies = 1;
8710 else
8712 vec_num = 1;
8713 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8716 /* Check whether we should use a single PHI node and accumulate
8717 vectors to one before the backedge. */
8718 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8719 ncopies = 1;
8721 /* Create the destination vector */
8722 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8723 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8724 vectype_out);
8726 /* Get the loop-entry arguments. */
8727 tree vec_initial_def = NULL_TREE;
8728 auto_vec<tree> vec_initial_defs;
8729 if (slp_node)
8731 vec_initial_defs.reserve (vec_num);
8732 if (nested_cycle)
8734 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8735 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8736 &vec_initial_defs);
8738 else
8740 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8741 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8742 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8744 unsigned int num_phis = stmts.length ();
8745 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8746 num_phis = 1;
8747 initial_values.reserve (num_phis);
8748 for (unsigned int i = 0; i < num_phis; ++i)
8750 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8751 initial_values.quick_push (vect_phi_initial_value (this_phi));
8753 if (vec_num == 1)
8754 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8755 if (!initial_values.is_empty ())
8757 tree initial_value
8758 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8759 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8760 tree neutral_op
8761 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8762 code, initial_value);
8763 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8764 &vec_initial_defs, vec_num,
8765 stmts.length (), neutral_op);
8769 else
8771 /* Get at the scalar def before the loop, that defines the initial
8772 value of the reduction variable. */
8773 tree initial_def = vect_phi_initial_value (phi);
8774 reduc_info->reduc_initial_values.safe_push (initial_def);
8775 /* Optimize: if for REDUC_MAX the initial_def is smaller than the base
8776 and we can't use zero for induc_val, use initial_def. Similarly
8777 for REDUC_MIN when initial_def is larger than the base. */
8778 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8780 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8781 if (TREE_CODE (initial_def) == INTEGER_CST
8782 && !integer_zerop (induc_val)
8783 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8784 && tree_int_cst_lt (initial_def, induc_val))
8785 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8786 && tree_int_cst_lt (induc_val, initial_def))))
8788 induc_val = initial_def;
8789 /* Communicate to the epilogue generation that we used the
8790 initial_def. */
8791 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8793 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8795 else if (nested_cycle)
8797 /* Do not use an adjustment def as that case is not supported
8798 correctly if ncopies is not one. */
8799 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8800 ncopies, initial_def,
8801 &vec_initial_defs);
8803 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8804 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8805 /* Fill the initial vector with the initial scalar value. */
8806 vec_initial_def
8807 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8808 initial_def, initial_def);
8809 else
8811 if (ncopies == 1)
8812 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8813 if (!reduc_info->reduc_initial_values.is_empty ())
8815 initial_def = reduc_info->reduc_initial_values[0];
8816 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8817 tree neutral_op
8818 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8819 code, initial_def);
8820 gcc_assert (neutral_op);
8821 /* Try to simplify the vector initialization by applying an
8822 adjustment after the reduction has been performed. */
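/* For example (with hypothetical values): a sum reduction whose scalar
   initial value is 10 can start the vector accumulator at the neutral
   value {0, 0, ...} and add the 10 back as an epilogue adjustment after
   the final reduction. */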
8823 if (!reduc_info->reused_accumulator
8824 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8825 && !operand_equal_p (neutral_op, initial_def))
8827 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8828 = initial_def;
8829 initial_def = neutral_op;
8831 vec_initial_def
8832 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8833 initial_def, neutral_op);
8838 if (vec_initial_def)
8840 vec_initial_defs.create (ncopies);
8841 for (i = 0; i < ncopies; ++i)
8842 vec_initial_defs.quick_push (vec_initial_def);
8845 if (auto *accumulator = reduc_info->reused_accumulator)
8847 tree def = accumulator->reduc_input;
8848 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8850 unsigned int nreduc;
8851 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8852 (TREE_TYPE (def)),
8853 TYPE_VECTOR_SUBPARTS (vectype_out),
8854 &nreduc);
8855 gcc_assert (res);
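/* As a hypothetical example: if the main loop accumulated in a V8SI
   vector and this epilogue uses V4SI, nreduc is 2 and the code below
   first reduces the V8SI accumulator to a single V4SI vector. */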
8856 gimple_seq stmts = NULL;
8857 /* Reduce the single vector to a smaller one. */
8858 if (nreduc != 1)
8860 /* Perform the reduction in the appropriate type. */
8861 tree rvectype = vectype_out;
8862 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8863 TREE_TYPE (TREE_TYPE (def))))
8864 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8865 TYPE_VECTOR_SUBPARTS
8866 (vectype_out));
8867 def = vect_create_partial_epilog (def, rvectype,
8868 STMT_VINFO_REDUC_CODE
8869 (reduc_info),
8870 &stmts);
8872 /* The epilogue loop might use a different vector mode, like
8873 VNx2DI vs. V2DI. */
8874 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8876 tree reduc_type = build_vector_type_for_mode
8877 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8878 def = gimple_convert (&stmts, reduc_type, def);
8880 /* Adjust the input so we pick up the partially reduced value
8881 for the skip edge in vect_create_epilog_for_reduction. */
8882 accumulator->reduc_input = def;
8883 /* And the reduction could be carried out using a different sign. */
8884 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8885 def = gimple_convert (&stmts, vectype_out, def);
8886 if (loop_vinfo->main_loop_edge)
8888 /* While we'd like to insert on the edge, doing so would split
8889 blocks and disturb bookkeeping, and we will also eventually
8890 need this on the skip edge. Rely on sinking to
8891 fix up the optimal placement and insert in the pred. */
8892 gimple_stmt_iterator gsi
8893 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8894 /* Insert before a cond that eventually skips the
8895 epilogue. */
8896 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8897 gsi_prev (&gsi);
8898 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8900 else
8901 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8902 stmts);
8904 if (loop_vinfo->main_loop_edge)
8905 vec_initial_defs[0]
8906 = vect_get_main_loop_result (loop_vinfo, def,
8907 vec_initial_defs[0]);
8908 else
8909 vec_initial_defs.safe_push (def);
8912 /* Generate the reduction PHIs upfront. */
8913 for (i = 0; i < vec_num; i++)
8915 tree vec_init_def = vec_initial_defs[i];
8916 for (j = 0; j < ncopies; j++)
8918 /* Create the reduction-phi that defines the reduction
8919 operand. */
8920 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8922 /* Set the loop-entry arg of the reduction-phi. */
8923 if (j != 0 && nested_cycle)
8924 vec_init_def = vec_initial_defs[j];
8925 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8926 UNKNOWN_LOCATION);
8928 /* The loop-latch arg is set in epilogue processing. */
8930 if (slp_node)
8931 slp_node->push_vec_def (new_phi);
8932 else
8934 if (j == 0)
8935 *vec_stmt = new_phi;
8936 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8941 return true;
8944 /* Vectorizes LC PHIs. */
8946 bool
8947 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8948 stmt_vec_info stmt_info, gimple **vec_stmt,
8949 slp_tree slp_node)
8951 if (!loop_vinfo
8952 || !is_a <gphi *> (stmt_info->stmt)
8953 || gimple_phi_num_args (stmt_info->stmt) != 1)
8954 return false;
8956 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8957 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8958 return false;
8960 if (!vec_stmt) /* transformation not required. */
8962 /* Deal with copies from externs or constants that are disguised as
8963 loop-closed PHI nodes (PR97886). */
8964 if (slp_node
8965 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8966 SLP_TREE_VECTYPE (slp_node)))
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8970 "incompatible vector types for invariants\n");
8971 return false;
8973 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8974 return true;
8977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8978 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8979 basic_block bb = gimple_bb (stmt_info->stmt);
8980 edge e = single_pred_edge (bb);
8981 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8982 auto_vec<tree> vec_oprnds;
8983 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8984 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8985 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8986 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8988 /* Create the vectorized LC PHI node. */
8989 gphi *new_phi = create_phi_node (vec_dest, bb);
8990 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8991 if (slp_node)
8992 slp_node->push_vec_def (new_phi);
8993 else
8994 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8996 if (!slp_node)
8997 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8999 return true;
9002 /* Vectorizes PHIs. */
9004 bool
9005 vectorizable_phi (vec_info *,
9006 stmt_vec_info stmt_info, gimple **vec_stmt,
9007 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9009 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9010 return false;
9012 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9013 return false;
9015 tree vectype = SLP_TREE_VECTYPE (slp_node);
9017 if (!vec_stmt) /* transformation not required. */
9019 slp_tree child;
9020 unsigned i;
9021 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9022 if (!child)
9024 if (dump_enabled_p ())
9025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9026 "PHI node with unvectorized backedge def\n");
9027 return false;
9029 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9031 if (dump_enabled_p ())
9032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9033 "incompatible vector types for invariants\n");
9034 return false;
9036 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9037 && !useless_type_conversion_p (vectype,
9038 SLP_TREE_VECTYPE (child)))
9040 /* With bools we can have mask and non-mask precision vectors
9041 or different non-mask precisions. While pattern recog is
9042 supposed to guarantee consistency here, bugs in it can cause
9043 mismatches (PR103489 and PR103800 for example).
9044 Deal with them here instead of ICEing later. */
9045 if (dump_enabled_p ())
9046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9047 "incompatible vector type setup from "
9048 "bool pattern detection\n");
9049 return false;
9052 /* For single-argument PHIs assume coalescing which means zero cost
9053 for the scalar and the vector PHIs. This avoids artificially
9054 favoring the vector path (but may pessimize it in some cases). */
9055 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9056 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9057 vector_stmt, stmt_info, vectype, 0, vect_body);
9058 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9059 return true;
9062 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9063 basic_block bb = gimple_bb (stmt_info->stmt);
9064 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9065 auto_vec<gphi *> new_phis;
9066 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9068 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9070 /* Skip not yet vectorized defs. */
9071 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9072 && SLP_TREE_VEC_DEFS (child).is_empty ())
9073 continue;
9075 auto_vec<tree> vec_oprnds;
9076 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9077 if (!new_phis.exists ())
9079 new_phis.create (vec_oprnds.length ());
9080 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9082 /* Create the vectorized PHI node. */
9083 new_phis.quick_push (create_phi_node (vec_dest, bb));
9084 slp_node->push_vec_def (new_phis[j]);
9087 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9088 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9089 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9091 /* We should have at least one already vectorized child. */
9092 gcc_assert (new_phis.exists ());
9094 return true;
9097 /* Vectorizes first order recurrences. An overview of the transformation
9098 is described below. Suppose we have the following loop.
9100 int t = 0;
9101 for (int i = 0; i < n; ++i)
9103 b[i] = a[i] - t;
9104 t = a[i];
9107 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9108 looks (simplified) like:
9110 scalar.preheader:
9111 init = 0;
9113 scalar.body:
9114 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9115 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9116 _1 = a[i]
9117 b[i] = _1 - _2
9118 if (i < n) goto scalar.body
9120 In this example, _2 is a recurrence because its value depends on the
9121 previous iteration. We vectorize this as (VF = 4)
9123 vector.preheader:
9124 vect_init = vect_cst(..., ..., ..., 0)
9126 vector.body
9127 i = PHI <0(vector.preheader), i+4(vector.body)>
9128 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9129 vect_2 = a[i, i+1, i+2, i+3];
9130 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9131 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9132 if (..) goto vector.body
9134 In this function, vectorizable_recurr, we code generate both the
9135 vector PHI node and the permute since those together compute the
9136 vectorized value of the scalar PHI. We do not yet have the
9137 backedge value to fill in there nor into the vec_perm. Those
9138 are filled in maybe_set_vectorized_backedge_value and
9139 vect_schedule_scc.
9141 TODO: Since the scalar loop does not have a use of the recurrence
9142 outside of the loop, the natural way to implement peeling via
9143 vectorizing the live value doesn't work. For now peeling of loops
9144 with a recurrence is not implemented. For SLP the supported cases
9145 are restricted to those requiring a single vector recurrence PHI. */
9147 bool
9148 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9149 gimple **vec_stmt, slp_tree slp_node,
9150 stmt_vector_for_cost *cost_vec)
9152 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9153 return false;
9155 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9157 /* So far we only support first-order recurrence auto-vectorization. */
9158 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9159 return false;
9161 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9162 unsigned ncopies;
9163 if (slp_node)
9164 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9165 else
9166 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9167 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9168 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9169 /* We need to be able to make progress with a single vector. */
9170 if (maybe_gt (dist * 2, nunits))
9172 if (dump_enabled_p ())
9173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9174 "first order recurrence exceeds half of "
9175 "a vector\n");
9176 return false;
9179 /* First-order recurrence autovectorization needs to handle permutation
9180 with indices = [nunits-1, nunits, nunits+1, ...]. */
9181 vec_perm_builder sel (nunits, 1, 3);
9182 for (int i = 0; i < 3; ++i)
9183 sel.quick_push (nunits - dist + i);
9184 vec_perm_indices indices (sel, 2, nunits);
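/* For instance, with a hypothetical four-lane vector and dist == 2 (two SLP
   lanes), the series above expands to the indices { 2, 3, 4, 5 }: the last
   two lanes of the previous vector followed by the first two lanes of the
   current one, matching the { 3, 4, 5, 6 } example for dist == 1 in the
   function comment. */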
9186 if (!vec_stmt) /* transformation not required. */
9188 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9189 indices))
9190 return false;
9192 if (slp_node)
9194 /* We eventually need to set a vector type on invariant
9195 arguments. */
9196 unsigned j;
9197 slp_tree child;
9198 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9199 if (!vect_maybe_update_slp_op_vectype
9200 (child, SLP_TREE_VECTYPE (slp_node)))
9202 if (dump_enabled_p ())
9203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9204 "incompatible vector types for "
9205 "invariants\n");
9206 return false;
9209 /* The recurrence costs the initialization vector and one permute
9210 for each copy. */
9211 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9212 stmt_info, 0, vect_prologue);
9213 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9214 stmt_info, 0, vect_body);
9215 if (dump_enabled_p ())
9216 dump_printf_loc (MSG_NOTE, vect_location,
9217 "vectorizable_recurr: inside_cost = %d, "
9218 "prologue_cost = %d .\n", inside_cost,
9219 prologue_cost);
9221 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9222 return true;
9225 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9226 basic_block bb = gimple_bb (phi);
9227 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9228 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9230 gimple_seq stmts = NULL;
9231 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9232 gsi_insert_seq_on_edge_immediate (pe, stmts);
9234 tree vec_init = build_vector_from_val (vectype, preheader);
9235 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9237 /* Create the vectorized first-order PHI node. */
9238 tree vec_dest = vect_get_new_vect_var (vectype,
9239 vect_simple_var, "vec_recur_");
9240 gphi *new_phi = create_phi_node (vec_dest, bb);
9241 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9243 /* Insert shuffles for the first-order recurrence autovectorization.
9244 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9245 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9247 /* Insert the required permute after the latch definition. The
9248 second and later operands are tentative and will be updated when we have
9249 vectorized the latch definition. */
9250 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9251 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9252 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9253 gsi_next (&gsi2);
9255 for (unsigned i = 0; i < ncopies; ++i)
9257 vec_dest = make_ssa_name (vectype);
9258 gassign *vperm
9259 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9260 i == 0 ? gimple_phi_result (new_phi) : NULL,
9261 NULL, perm);
9262 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9264 if (slp_node)
9265 slp_node->push_vec_def (vperm);
9266 else
9267 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9270 if (!slp_node)
9271 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9272 return true;
9275 /* Return true if VECTYPE represents a vector that requires lowering
9276 by the vector lowering pass. */
9278 bool
9279 vect_emulated_vector_p (tree vectype)
9281 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9282 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9283 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9286 /* Return true if we can emulate CODE on an integer mode representation
9287 of a vector. */
9289 bool
9290 vect_can_vectorize_without_simd_p (tree_code code)
9292 switch (code)
9294 case PLUS_EXPR:
9295 case MINUS_EXPR:
9296 case NEGATE_EXPR:
9297 case BIT_AND_EXPR:
9298 case BIT_IOR_EXPR:
9299 case BIT_XOR_EXPR:
9300 case BIT_NOT_EXPR:
9301 return true;
9303 default:
9304 return false;
9308 /* Likewise, but taking a code_helper. */
9310 bool
9311 vect_can_vectorize_without_simd_p (code_helper code)
9313 return (code.is_tree_code ()
9314 && vect_can_vectorize_without_simd_p (tree_code (code)));
9317 /* Create vector init for vectorized iv. */
9318 static tree
9319 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9320 tree step_expr, poly_uint64 nunits,
9321 tree vectype,
9322 enum vect_induction_op_type induction_type)
9324 unsigned HOST_WIDE_INT const_nunits;
9325 tree vec_shift, vec_init, new_name;
9326 unsigned i;
9327 tree itype = TREE_TYPE (vectype);
9329 /* iv_loop is the loop to be vectorized. Create the nonlinear analogue of
9330 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
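/* As an illustration with four lanes (a hypothetical width, not implied by
   the code), the cases below produce:
     shr/shl:  [X >> 0*S, X >> 1*S, X >> 2*S, X >> 3*S]   (resp. <<)
     neg:      [X, -X, X, -X]
     mul:      [X, X*S, X*S^2, X*S^3].  */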
9331 new_name = gimple_convert (stmts, itype, init_expr);
9332 switch (induction_type)
9334 case vect_step_op_shr:
9335 case vect_step_op_shl:
9336 /* Build the Initial value from shift_expr. */
9337 vec_init = gimple_build_vector_from_val (stmts,
9338 vectype,
9339 new_name);
9340 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9341 build_zero_cst (itype), step_expr);
9342 vec_init = gimple_build (stmts,
9343 (induction_type == vect_step_op_shr
9344 ? RSHIFT_EXPR : LSHIFT_EXPR),
9345 vectype, vec_init, vec_shift);
9346 break;
9348 case vect_step_op_neg:
9350 vec_init = gimple_build_vector_from_val (stmts,
9351 vectype,
9352 new_name);
9353 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9354 vectype, vec_init);
9355 /* The encoding has 2 interleaved stepped patterns. */
9356 vec_perm_builder sel (nunits, 2, 3);
9357 sel.quick_grow (6);
9358 for (i = 0; i < 3; i++)
9360 sel[2 * i] = i;
9361 sel[2 * i + 1] = i + nunits;
9363 vec_perm_indices indices (sel, 2, nunits);
9364 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9365 fail when vec_init is a const vector. In that situation vec_perm is not
9366 really needed. */
9367 tree perm_mask_even
9368 = vect_gen_perm_mask_any (vectype, indices);
9369 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9370 vectype,
9371 vec_init, vec_neg,
9372 perm_mask_even);
9374 break;
9376 case vect_step_op_mul:
9378 /* Use an unsigned mult to avoid undefined behavior from signed integer overflow. */
9379 gcc_assert (nunits.is_constant (&const_nunits));
9380 tree utype = unsigned_type_for (itype);
9381 tree uvectype = build_vector_type (utype,
9382 TYPE_VECTOR_SUBPARTS (vectype));
9383 new_name = gimple_convert (stmts, utype, new_name);
9384 vec_init = gimple_build_vector_from_val (stmts,
9385 uvectype,
9386 new_name);
9387 tree_vector_builder elts (uvectype, const_nunits, 1);
9388 tree elt_step = build_one_cst (utype);
9390 elts.quick_push (elt_step);
9391 for (i = 1; i < const_nunits; i++)
9393 /* Create: elt_step = elt_step * step_expr. */
9394 elt_step = gimple_build (stmts, MULT_EXPR,
9395 utype, elt_step, step_expr);
9396 elts.quick_push (elt_step);
9398 /* Create a vector from [new_name_0, new_name_1, ...,
9399 new_name_nunits-1]. */
9400 tree vec_mul = gimple_build_vector (stmts, &elts);
9401 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9402 vec_init, vec_mul);
9403 vec_init = gimple_convert (stmts, vectype, vec_init);
9405 break;
9407 default:
9408 gcc_unreachable ();
9411 return vec_init;
9414 /* Peel init_expr by skip_niters for induction_type. */
9415 tree
9416 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9417 tree skip_niters, tree step_expr,
9418 enum vect_induction_op_type induction_type)
9420 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9421 tree type = TREE_TYPE (init_expr);
9422 unsigned prec = TYPE_PRECISION (type);
9423 switch (induction_type)
9425 case vect_step_op_neg:
9426 if (TREE_INT_CST_LOW (skip_niters) % 2)
9427 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9428 /* else no change. */
9429 break;
9431 case vect_step_op_shr:
9432 case vect_step_op_shl:
9433 skip_niters = gimple_convert (stmts, type, skip_niters);
9434 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9435 /* When the shift amount >= precision, we need to avoid UD.
9436 In the original loop there is no UD, and according to the semantics,
9437 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
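/* E.g. (hypothetical values) for an unsigned 8-bit shr induction with
   step 3 and skip_niters 4 the accumulated shift is 12 >= 8, so the
   peeled initial value is simply 0. */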
9438 if (!tree_fits_uhwi_p (step_expr)
9439 || tree_to_uhwi (step_expr) >= prec)
9441 if (induction_type == vect_step_op_shl
9442 || TYPE_UNSIGNED (type))
9443 init_expr = build_zero_cst (type);
9444 else
9445 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9446 init_expr,
9447 wide_int_to_tree (type, prec - 1));
9449 else
9450 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9451 ? RSHIFT_EXPR : LSHIFT_EXPR),
9452 type, init_expr, step_expr);
9453 break;
9455 case vect_step_op_mul:
9457 tree utype = unsigned_type_for (type);
9458 init_expr = gimple_convert (stmts, utype, init_expr);
9459 wide_int skipn = wi::to_wide (skip_niters);
9460 wide_int begin = wi::to_wide (step_expr);
9461 auto_mpz base, exp, mod, res;
9462 wi::to_mpz (begin, base, TYPE_SIGN (type));
9463 wi::to_mpz (skipn, exp, UNSIGNED);
9464 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9465 mpz_powm (res, base, exp, mod);
9466 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9467 tree mult_expr = wide_int_to_tree (utype, begin);
9468 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9469 init_expr, mult_expr);
9470 init_expr = gimple_convert (stmts, type, init_expr);
9472 break;
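/* To illustrate the modular exponentiation above with hypothetical
   values: for an 8-bit mult induction with init 3, step 5 and
   skip_niters 2 it computes 5^2 mod 2^8 = 25, so the peeled initial
   value becomes 3 * 25 = 75. */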
9474 default:
9475 gcc_unreachable ();
9478 return init_expr;
9481 /* Create vector step for vectorized iv. */
9482 static tree
9483 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9484 poly_uint64 vf,
9485 enum vect_induction_op_type induction_type)
9487 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9488 tree new_name = NULL;
9489 /* Step should be pow (step, vf) for mult induction. */
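/* E.g. a hypothetical scalar step of 3 with VF 4 gives a vector step of
   3^4 = 81 (truncated to the precision of the step type). */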
9490 if (induction_type == vect_step_op_mul)
9492 gcc_assert (vf.is_constant ());
9493 wide_int begin = wi::to_wide (step_expr);
9495 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9496 begin = wi::mul (begin, wi::to_wide (step_expr));
9498 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9500 else if (induction_type == vect_step_op_neg)
9501 /* Do nothing. */
9503 else
9504 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9505 expr, step_expr);
9506 return new_name;
9509 static tree
9510 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9511 stmt_vec_info stmt_info,
9512 tree new_name, tree vectype,
9513 enum vect_induction_op_type induction_type)
9515 /* No step is needed for neg induction. */
9516 if (induction_type == vect_step_op_neg)
9517 return NULL;
9519 tree t = unshare_expr (new_name);
9520 gcc_assert (CONSTANT_CLASS_P (new_name)
9521 || TREE_CODE (new_name) == SSA_NAME);
9522 tree new_vec = build_vector_from_val (vectype, t);
9523 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9524 new_vec, vectype, NULL);
9525 return vec_step;
9528 /* Update the vectorized iv with vec_step; induc_def is the init. */
9529 static tree
9530 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9531 tree induc_def, tree vec_step,
9532 enum vect_induction_op_type induction_type)
9534 tree vec_def = induc_def;
9535 switch (induction_type)
9537 case vect_step_op_mul:
9539 /* Use an unsigned mult to avoid undefined behavior from signed integer overflow. */
9540 tree uvectype
9541 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9542 TYPE_VECTOR_SUBPARTS (vectype));
9543 vec_def = gimple_convert (stmts, uvectype, vec_def);
9544 vec_step = gimple_convert (stmts, uvectype, vec_step);
9545 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9546 vec_def, vec_step);
9547 vec_def = gimple_convert (stmts, vectype, vec_def);
9549 break;
9551 case vect_step_op_shr:
9552 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9553 vec_def, vec_step);
9554 break;
9556 case vect_step_op_shl:
9557 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9558 vec_def, vec_step);
9559 break;
9560 case vect_step_op_neg:
9561 vec_def = induc_def;
9562 /* Do nothing. */
9563 break;
9564 default:
9565 gcc_unreachable ();
9568 return vec_def;
9572 /* Function vectorizable_induction
9574 Check if STMT_INFO performs a nonlinear induction computation that can be
9575 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9576 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9577 basic block.
9578 Return true if STMT_INFO is vectorizable in this way. */
9580 static bool
9581 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9582 stmt_vec_info stmt_info,
9583 gimple **vec_stmt, slp_tree slp_node,
9584 stmt_vector_for_cost *cost_vec)
9586 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9587 unsigned ncopies;
9588 bool nested_in_vect_loop = false;
9589 class loop *iv_loop;
9590 tree vec_def;
9591 edge pe = loop_preheader_edge (loop);
9592 basic_block new_bb;
9593 tree vec_init, vec_step;
9594 tree new_name;
9595 gimple *new_stmt;
9596 gphi *induction_phi;
9597 tree induc_def, vec_dest;
9598 tree init_expr, step_expr;
9599 tree niters_skip;
9600 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9601 unsigned i;
9602 gimple_stmt_iterator si;
9604 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9606 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9607 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9608 enum vect_induction_op_type induction_type
9609 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9611 gcc_assert (induction_type > vect_step_op_add);
9613 if (slp_node)
9614 ncopies = 1;
9615 else
9616 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9617 gcc_assert (ncopies >= 1);
9619 /* FORNOW. Only handle nonlinear induction in the same loop. */
9620 if (nested_in_vect_loop_p (loop, stmt_info))
9622 if (dump_enabled_p ())
9623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9624 "nonlinear induction in nested loop.\n");
9625 return false;
9628 iv_loop = loop;
9629 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9631 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9632 update for each iv and a permutation to generate the wanted vector iv. */
9633 if (slp_node)
9635 if (dump_enabled_p ())
9636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9637 "SLP induction not supported for nonlinear"
9638 " induction.\n");
9639 return false;
9642 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9644 if (dump_enabled_p ())
9645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9646 "floating point nonlinear induction vectorization"
9647 " not supported.\n");
9648 return false;
9651 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9652 init_expr = vect_phi_initial_value (phi);
9653 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9654 && TREE_CODE (step_expr) == INTEGER_CST);
9655 /* step_expr should have the same type as init_expr:
9656 e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9657 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9659 if (TREE_CODE (init_expr) == INTEGER_CST)
9660 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9661 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9663 /* INIT_EXPR could be a bit_field, bail out in that case. */
9664 if (dump_enabled_p ())
9665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9666 "nonlinear induction vectorization failed:"
9667 " component type of vectype is not a nop conversion"
9668 " from type of init_expr.\n");
9669 return false;
9672 switch (induction_type)
9674 case vect_step_op_neg:
9675 if (TREE_CODE (init_expr) != INTEGER_CST
9676 && TREE_CODE (init_expr) != REAL_CST)
9678 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9679 if (!directly_supported_p (NEGATE_EXPR, vectype))
9680 return false;
9682 /* The encoding has 2 interleaved stepped patterns. */
9683 vec_perm_builder sel (nunits, 2, 3);
9684 machine_mode mode = TYPE_MODE (vectype);
9685 sel.quick_grow (6);
9686 for (i = 0; i < 3; i++)
9688 sel[i * 2] = i;
9689 sel[i * 2 + 1] = i + nunits;
9691 vec_perm_indices indices (sel, 2, nunits);
9692 if (!can_vec_perm_const_p (mode, mode, indices))
9693 return false;
9695 break;
9697 case vect_step_op_mul:
9699 /* Check for backend support of MULT_EXPR. */
9700 if (!directly_supported_p (MULT_EXPR, vectype))
9701 return false;
9703 /* ??? How do we construct the vector step for a variable-length
9704 vector: [ 1, step, pow (step, 2), pow (step, 3), .. ]? */
9705 if (!vf.is_constant ())
9706 return false;
9708 break;
9710 case vect_step_op_shr:
9711 /* Check for backend support of RSHIFT_EXPR. */
9712 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9713 return false;
9715 /* Don't shift more than type precision to avoid UD. */
9716 if (!tree_fits_uhwi_p (step_expr)
9717 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9718 TYPE_PRECISION (TREE_TYPE (init_expr))))
9719 return false;
9720 break;
9722 case vect_step_op_shl:
9723 /* Check for backend support of LSHIFT_EXPR. */
9724 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9725 return false;
9727 /* Don't shift more than type precision to avoid UD. */
9728 if (!tree_fits_uhwi_p (step_expr)
9729 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9730 TYPE_PRECISION (TREE_TYPE (init_expr))))
9731 return false;
9733 break;
9735 default:
9736 gcc_unreachable ();
9739 if (!vec_stmt) /* transformation not required. */
9741 unsigned inside_cost = 0, prologue_cost = 0;
9742 /* loop cost for vec_loop. */
9744 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9745 stmt_info, 0, vect_body);
9747 /* Neg induction doesn't have any inside_cost. */
9749 if (induction_type == vect_step_op_neg)
9750 inside_cost = 0;
9752 /* prologue cost for vec_init and vec_step. */
9753 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9754 stmt_info, 0, vect_prologue);
9756 if (dump_enabled_p ())
9757 dump_printf_loc (MSG_NOTE, vect_location,
9758 "vect_model_induction_cost: inside_cost = %d, "
9759 "prologue_cost = %d. \n", inside_cost,
9760 prologue_cost);
9762 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9763 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9764 return true;
9767 /* Transform. */
9769 /* Compute a vector variable, initialized with the first VF values of
9770 the induction variable. E.g., for an iv with IV_PHI='X' and
9771 evolution S, for a vector of 4 units, we want to compute:
9772 [X, X + S, X + 2*S, X + 3*S]. */
9774 if (dump_enabled_p ())
9775 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9777 pe = loop_preheader_edge (iv_loop);
9778 /* Find the first insertion point in the BB. */
9779 basic_block bb = gimple_bb (phi);
9780 si = gsi_after_labels (bb);
9782 gimple_seq stmts = NULL;
9784 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9785 /* If we are using the loop mask to "peel" for alignment then we need
9786 to adjust the start value here. */
9787 if (niters_skip != NULL_TREE)
9788 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9789 step_expr, induction_type);
9791 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9792 step_expr, nunits, vectype,
9793 induction_type);
9794 if (stmts)
9796 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9797 gcc_assert (!new_bb);
9800 stmts = NULL;
9801 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9802 vf, induction_type);
9803 if (stmts)
9805 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9806 gcc_assert (!new_bb);
9809 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9810 new_name, vectype,
9811 induction_type);
9812 /* Create the following def-use cycle:
9813 loop prolog:
9814 vec_init = ...
9815 vec_step = ...
9816 loop:
9817 vec_iv = PHI <vec_init, vec_loop>
9819 STMT
9821 vec_loop = vec_iv OP vec_step; (OP is the nonlinear update) */
9823 /* Create the induction-phi that defines the induction-operand. */
9824 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9825 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9826 induc_def = PHI_RESULT (induction_phi);
9828 /* Create the iv update inside the loop. */
9829 stmts = NULL;
9830 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9831 induc_def, vec_step,
9832 induction_type);
9834 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9835 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9837 /* Set the arguments of the phi node: */
9838 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9839 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9840 UNKNOWN_LOCATION);
9842 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9843 *vec_stmt = induction_phi;
9845 /* In case the vectorization factor (VF) is bigger than the number
9846 of elements that we can fit in a vectype (nunits), we have to generate
9847 more than one vector stmt, i.e. we need to "unroll" the
9848 vector stmt by a factor VF/nunits. For more details see documentation
9849 in vectorizable_operation. */
9851 if (ncopies > 1)
9853 stmts = NULL;
9854 /* FORNOW. This restriction should be relaxed. */
9855 gcc_assert (!nested_in_vect_loop);
9857 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9858 nunits, induction_type);
9860 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9861 new_name, vectype,
9862 induction_type);
9863 vec_def = induc_def;
9864 for (i = 1; i < ncopies; i++)
9866 /* vec_i = vec_prev OP vec_step. */
9867 stmts = NULL;
9868 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9869 vec_def, vec_step,
9870 induction_type);
9871 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9872 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9873 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9877 if (dump_enabled_p ())
9878 dump_printf_loc (MSG_NOTE, vect_location,
9879 "transform induction: created def-use cycle: %G%G",
9880 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9882 return true;
9885 /* Function vectorizable_induction
9887 Check if STMT_INFO performs an induction computation that can be vectorized.
9888 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9889 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9890 Return true if STMT_INFO is vectorizable in this way. */
9892 bool
9893 vectorizable_induction (loop_vec_info loop_vinfo,
9894 stmt_vec_info stmt_info,
9895 gimple **vec_stmt, slp_tree slp_node,
9896 stmt_vector_for_cost *cost_vec)
9898 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9899 unsigned ncopies;
9900 bool nested_in_vect_loop = false;
9901 class loop *iv_loop;
9902 tree vec_def;
9903 edge pe = loop_preheader_edge (loop);
9904 basic_block new_bb;
9905 tree new_vec, vec_init, vec_step, t;
9906 tree new_name;
9907 gimple *new_stmt;
9908 gphi *induction_phi;
9909 tree induc_def, vec_dest;
9910 tree init_expr, step_expr;
9911 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9912 unsigned i;
9913 tree expr;
9914 gimple_stmt_iterator si;
9915 enum vect_induction_op_type induction_type
9916 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9918 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9919 if (!phi)
9920 return false;
9922 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9923 return false;
9925 /* Make sure it was recognized as induction computation. */
9926 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9927 return false;
9929 /* Handle nonlinear induction in a separate place. */
9930 if (induction_type != vect_step_op_add)
9931 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9932 vec_stmt, slp_node, cost_vec);
9934 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9935 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9937 if (slp_node)
9938 ncopies = 1;
9939 else
9940 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9941 gcc_assert (ncopies >= 1);
9943 /* FORNOW. These restrictions should be relaxed. */
9944 if (nested_in_vect_loop_p (loop, stmt_info))
9946 imm_use_iterator imm_iter;
9947 use_operand_p use_p;
9948 gimple *exit_phi;
9949 edge latch_e;
9950 tree loop_arg;
9952 if (ncopies > 1)
9954 if (dump_enabled_p ())
9955 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9956 "multiple types in nested loop.\n");
9957 return false;
9960 exit_phi = NULL;
9961 latch_e = loop_latch_edge (loop->inner);
9962 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9963 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9965 gimple *use_stmt = USE_STMT (use_p);
9966 if (is_gimple_debug (use_stmt))
9967 continue;
9969 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9971 exit_phi = use_stmt;
9972 break;
9975 if (exit_phi)
9977 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9978 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9979 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9981 if (dump_enabled_p ())
9982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9983 "inner-loop induction only used outside "
9984 "of the outer vectorized loop.\n");
9985 return false;
9989 nested_in_vect_loop = true;
9990 iv_loop = loop->inner;
9992 else
9993 iv_loop = loop;
9994 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9996 if (slp_node && !nunits.is_constant ())
9998 /* The current SLP code creates the step value element-by-element. */
9999 if (dump_enabled_p ())
10000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10001 "SLP induction not supported for variable-length"
10002 " vectors.\n");
10003 return false;
10006 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10008 if (dump_enabled_p ())
10009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10010 "floating point induction vectorization disabled\n");
10011 return false;
10014 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10015 gcc_assert (step_expr != NULL_TREE);
10016 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10018 /* Check for backend support of PLUS/MINUS_EXPR. */
10019 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10020 || !directly_supported_p (MINUS_EXPR, step_vectype))
10021 return false;
10023 if (!vec_stmt) /* transformation not required. */
10025 unsigned inside_cost = 0, prologue_cost = 0;
10026 if (slp_node)
10028 /* We eventually need to set a vector type on invariant
10029 arguments. */
10030 unsigned j;
10031 slp_tree child;
10032 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10033 if (!vect_maybe_update_slp_op_vectype
10034 (child, SLP_TREE_VECTYPE (slp_node)))
10036 if (dump_enabled_p ())
10037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10038 "incompatible vector types for "
10039 "invariants\n");
10040 return false;
10042 /* loop cost for vec_loop. */
10043 inside_cost
10044 = record_stmt_cost (cost_vec,
10045 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10046 vector_stmt, stmt_info, 0, vect_body);
10047 /* prologue cost for vec_init (if not nested) and step. */
10048 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10049 scalar_to_vec,
10050 stmt_info, 0, vect_prologue);
10052 else /* if (!slp_node) */
10054 /* loop cost for vec_loop. */
10055 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10056 stmt_info, 0, vect_body);
10057 /* prologue cost for vec_init and vec_step. */
10058 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10059 stmt_info, 0, vect_prologue);
10061 if (dump_enabled_p ())
10062 dump_printf_loc (MSG_NOTE, vect_location,
10063 "vect_model_induction_cost: inside_cost = %d, "
10064 "prologue_cost = %d .\n", inside_cost,
10065 prologue_cost);
10067 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10068 DUMP_VECT_SCOPE ("vectorizable_induction");
10069 return true;
10072 /* Transform. */
10074 /* Compute a vector variable, initialized with the first VF values of
10075 the induction variable. E.g., for an iv with IV_PHI='X' and
10076 evolution S, for a vector of 4 units, we want to compute:
10077 [X, X + S, X + 2*S, X + 3*S]. */
10079 if (dump_enabled_p ())
10080 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10082 pe = loop_preheader_edge (iv_loop);
10083 /* Find the first insertion point in the BB. */
10084 basic_block bb = gimple_bb (phi);
10085 si = gsi_after_labels (bb);
10087 /* For SLP induction we have to generate several IVs as for example
10088 with group size 3 we need
10089 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10090 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10091 if (slp_node)
10093 /* Enforced above. */
10094 unsigned int const_nunits = nunits.to_constant ();
10096 /* The initial values are vectorized, but any lanes > group_size
10097 need adjustment. */
10098 slp_tree init_node
10099 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10101 /* Gather steps. Since we do not vectorize inductions as
10102 cycles we have to reconstruct the step from SCEV data. */
10103 unsigned group_size = SLP_TREE_LANES (slp_node);
10104 tree *steps = XALLOCAVEC (tree, group_size);
10105 tree *inits = XALLOCAVEC (tree, group_size);
10106 stmt_vec_info phi_info;
10107 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10109 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10110 if (!init_node)
10111 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10112 pe->dest_idx);
10115 /* Now generate the IVs. */
10116 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10117 gcc_assert ((const_nunits * nvects) % group_size == 0);
10118 unsigned nivs;
10119 if (nested_in_vect_loop)
10120 nivs = nvects;
10121 else
10123 /* Compute the number of distinct IVs we need. First reduce
10124 group_size if it is a multiple of const_nunits so we get
10125 one IV for a group_size of 4 but const_nunits 2. */
10126 unsigned group_sizep = group_size;
10127 if (group_sizep % const_nunits == 0)
10128 group_sizep = group_sizep / const_nunits;
10129 nivs = least_common_multiple (group_sizep,
10130 const_nunits) / const_nunits;
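/* E.g. for the group size 3, const_nunits 4 case above this gives
   group_sizep = 3 and nivs = lcm (3, 4) / 4 = 3 distinct IVs, while
   group_size 4 with const_nunits 2 reduces group_sizep to 2 and
   nivs = lcm (2, 2) / 2 = 1.  */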
10132 tree stept = TREE_TYPE (step_vectype);
10133 tree lupdate_mul = NULL_TREE;
10134 if (!nested_in_vect_loop)
10136 /* The number of iterations covered in one vector iteration. */
10137 unsigned lup_mul = (nvects * const_nunits) / group_size;
10138 lupdate_mul
10139 = build_vector_from_val (step_vectype,
10140 SCALAR_FLOAT_TYPE_P (stept)
10141 ? build_real_from_wide (stept, lup_mul,
10142 UNSIGNED)
10143 : build_int_cstu (stept, lup_mul));
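/* Continuing the group size 3, const_nunits 4 example above with
   nvects 3, one vector iteration covers lup_mul = (3 * 4) / 3 = 4
   scalar iterations of the group, so each IV is advanced by four
   times its scalar step in the latch update below.  */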
10145 tree peel_mul = NULL_TREE;
10146 gimple_seq init_stmts = NULL;
10147 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10149 if (SCALAR_FLOAT_TYPE_P (stept))
10150 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10151 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10152 else
10153 peel_mul = gimple_convert (&init_stmts, stept,
10154 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10155 peel_mul = gimple_build_vector_from_val (&init_stmts,
10156 step_vectype, peel_mul);
10158 unsigned ivn;
10159 auto_vec<tree> vec_steps;
10160 for (ivn = 0; ivn < nivs; ++ivn)
10162 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10163 tree_vector_builder init_elts (vectype, const_nunits, 1);
10164 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10165 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10167 /* The scalar steps of the IVs. */
10168 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10169 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10170 step_elts.quick_push (elt);
10171 if (!init_node)
10173 /* The scalar inits of the IVs if not vectorized. */
10174 elt = inits[(ivn*const_nunits + eltn) % group_size];
10175 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10176 TREE_TYPE (elt)))
10177 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10178 TREE_TYPE (vectype), elt);
10179 init_elts.quick_push (elt);
10181 /* The number of steps to add to the initial values. */
10182 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10183 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10184 ? build_real_from_wide (stept,
10185 mul_elt, UNSIGNED)
10186 : build_int_cstu (stept, mul_elt));
10188 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10189 vec_steps.safe_push (vec_step);
10190 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10191 if (peel_mul)
10192 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10193 step_mul, peel_mul);
10194 if (!init_node)
10195 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10197 /* Create the induction-phi that defines the induction-operand. */
10198 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10199 "vec_iv_");
10200 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10201 induc_def = PHI_RESULT (induction_phi);
10203 /* Create the iv update inside the loop */
10204 tree up = vec_step;
10205 if (lupdate_mul)
10206 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10207 vec_step, lupdate_mul);
10208 gimple_seq stmts = NULL;
10209 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10210 vec_def = gimple_build (&stmts,
10211 PLUS_EXPR, step_vectype, vec_def, up);
10212 vec_def = gimple_convert (&stmts, vectype, vec_def);
10213 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10214 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10215 UNKNOWN_LOCATION);
10217 if (init_node)
10218 vec_init = vect_get_slp_vect_def (init_node, ivn);
10219 if (!nested_in_vect_loop
10220 && !integer_zerop (step_mul))
10222 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10223 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10224 vec_step, step_mul);
10225 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10226 vec_def, up);
10227 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10230 /* Set the arguments of the phi node: */
10231 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10233 slp_node->push_vec_def (induction_phi);
10235 if (!nested_in_vect_loop)
10237 /* Fill up to the number of vectors we need for the whole group. */
10238 nivs = least_common_multiple (group_size,
10239 const_nunits) / const_nunits;
10240 vec_steps.reserve (nivs-ivn);
10241 for (; ivn < nivs; ++ivn)
10243 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10244 vec_steps.quick_push (vec_steps[0]);
10248 /* Re-use IVs when we can. We are generating further vector
10249 stmts by adding VF' * stride to the IVs generated above. */
10250 if (ivn < nvects)
10252 unsigned vfp
10253 = least_common_multiple (group_size, const_nunits) / group_size;
10254 tree lupdate_mul
10255 = build_vector_from_val (step_vectype,
10256 SCALAR_FLOAT_TYPE_P (stept)
10257 ? build_real_from_wide (stept,
10258 vfp, UNSIGNED)
10259 : build_int_cstu (stept, vfp));
10260 for (; ivn < nvects; ++ivn)
10262 gimple *iv
10263 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10264 tree def = gimple_get_lhs (iv);
10265 if (ivn < 2*nivs)
10266 vec_steps[ivn - nivs]
10267 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10268 vec_steps[ivn - nivs], lupdate_mul);
10269 gimple_seq stmts = NULL;
10270 def = gimple_convert (&stmts, step_vectype, def);
10271 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10272 def, vec_steps[ivn % nivs]);
10273 def = gimple_convert (&stmts, vectype, def);
10274 if (gimple_code (iv) == GIMPLE_PHI)
10275 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10276 else
10278 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10279 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10281 slp_node->push_vec_def (def);
10285 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10286 gcc_assert (!new_bb);
10288 return true;
10291 init_expr = vect_phi_initial_value (phi);
10293 gimple_seq stmts = NULL;
10294 if (!nested_in_vect_loop)
10296 /* Convert the initial value to the IV update type. */
10297 tree new_type = TREE_TYPE (step_expr);
10298 init_expr = gimple_convert (&stmts, new_type, init_expr);
10300 /* If we are using the loop mask to "peel" for alignment then we need
10301 to adjust the start value here. */
10302 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10303 if (skip_niters != NULL_TREE)
10305 if (FLOAT_TYPE_P (vectype))
10306 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10307 skip_niters);
10308 else
10309 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10310 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10311 skip_niters, step_expr);
10312 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10313 init_expr, skip_step);
10317 if (stmts)
10319 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10320 gcc_assert (!new_bb);
10323 /* Create the vector that holds the initial_value of the induction. */
10324 if (nested_in_vect_loop)
10326 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10327 been created during vectorization of previous stmts. We obtain it
10328 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10329 auto_vec<tree> vec_inits;
10330 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10331 init_expr, &vec_inits);
10332 vec_init = vec_inits[0];
10333 /* If the initial value is not of proper type, convert it. */
10334 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10336 new_stmt
10337 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10338 vect_simple_var,
10339 "vec_iv_"),
10340 VIEW_CONVERT_EXPR,
10341 build1 (VIEW_CONVERT_EXPR, vectype,
10342 vec_init));
10343 vec_init = gimple_assign_lhs (new_stmt);
10344 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10345 new_stmt);
10346 gcc_assert (!new_bb);
10349 else
10351 /* iv_loop is the loop to be vectorized. Create:
10352 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10353 stmts = NULL;
10354 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10356 unsigned HOST_WIDE_INT const_nunits;
10357 if (nunits.is_constant (&const_nunits))
10359 tree_vector_builder elts (step_vectype, const_nunits, 1);
10360 elts.quick_push (new_name);
10361 for (i = 1; i < const_nunits; i++)
10363 /* Create: new_name_i = new_name + step_expr */
10364 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10365 new_name, step_expr);
10366 elts.quick_push (new_name);
10368 /* Create a vector from [new_name_0, new_name_1, ...,
10369 new_name_nunits-1] */
10370 vec_init = gimple_build_vector (&stmts, &elts);
10372 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10373 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10374 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10375 new_name, step_expr);
10376 else
10378 /* Build:
10379 [base, base, base, ...]
10380 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10381 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10382 gcc_assert (flag_associative_math);
10383 tree index = build_index_vector (step_vectype, 0, 1);
10384 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10385 new_name);
10386 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10387 step_expr);
10388 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10389 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10390 vec_init, step_vec);
10391 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10392 vec_init, base_vec);
10394 vec_init = gimple_convert (&stmts, vectype, vec_init);
10396 if (stmts)
10398 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10399 gcc_assert (!new_bb);
10404 /* Create the vector that holds the step of the induction. */
10405 gimple_stmt_iterator *step_iv_si = NULL;
10406 if (nested_in_vect_loop)
10407 /* iv_loop is nested in the loop to be vectorized. Generate:
10408 vec_step = [S, S, S, S] */
10409 new_name = step_expr;
10410 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10412 /* When we're using the loop_len produced by SELECT_VL, the non-final
10413 iterations do not always process VF elements. So instead of vectorizing
10414 the induction variable update as
10416 _21 = vect_vec_iv_.6_22 + { VF, ... };
10418 we should generate:
10420 _35 = .SELECT_VL (ivtmp_33, VF);
10421 vect_cst__22 = [vec_duplicate_expr] _35;
10422 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10423 gcc_assert (!slp_node);
10424 gimple_seq seq = NULL;
10425 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10426 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10427 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10428 unshare_expr (len)),
10429 &seq, true, NULL_TREE);
10430 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10431 step_expr);
10432 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10433 step_iv_si = &si;
10435 else
10437 /* iv_loop is the loop to be vectorized. Generate:
10438 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10439 gimple_seq seq = NULL;
10440 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10442 expr = build_int_cst (integer_type_node, vf);
10443 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10445 else
10446 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10447 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10448 expr, step_expr);
10449 if (seq)
10451 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10452 gcc_assert (!new_bb);
10456 t = unshare_expr (new_name);
10457 gcc_assert (CONSTANT_CLASS_P (new_name)
10458 || TREE_CODE (new_name) == SSA_NAME);
10459 new_vec = build_vector_from_val (step_vectype, t);
10460 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10461 new_vec, step_vectype, step_iv_si);
10464 /* Create the following def-use cycle:
10465 loop prolog:
10466 vec_init = ...
10467 vec_step = ...
10468 loop:
10469 vec_iv = PHI <vec_init, vec_loop>
10471 STMT
10473 vec_loop = vec_iv + vec_step; */
10475 /* Create the induction-phi that defines the induction-operand. */
10476 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10477 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10478 induc_def = PHI_RESULT (induction_phi);
10480 /* Create the iv update inside the loop */
10481 stmts = NULL;
10482 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10483 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10484 vec_def = gimple_convert (&stmts, vectype, vec_def);
10485 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10486 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10488 /* Set the arguments of the phi node: */
10489 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10490 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10491 UNKNOWN_LOCATION);
10493 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10494 *vec_stmt = induction_phi;
10496 /* In case that vectorization factor (VF) is bigger than the number
10497 of elements that we can fit in a vectype (nunits), we have to generate
10498 more than one vector stmt - i.e - we need to "unroll" the
10499 vector stmt by a factor VF/nunits. For more details see documentation
10500 in vectorizable_operation. */
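/* E.g. with VF 8 and a 4-element vectype, ncopies is 2: the PHI result
   is the first copy, the second copy is vec_iv + 4*S, and the value
   vec_iv + 8*S becomes the latch definition.  */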
10502 if (ncopies > 1)
10504 gimple_seq seq = NULL;
10505 /* FORNOW. This restriction should be relaxed. */
10506 gcc_assert (!nested_in_vect_loop);
10507 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10508 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10510 /* Create the vector that holds the step of the induction. */
10511 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10513 expr = build_int_cst (integer_type_node, nunits);
10514 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10516 else
10517 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10518 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10519 expr, step_expr);
10520 if (seq)
10522 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10523 gcc_assert (!new_bb);
10526 t = unshare_expr (new_name);
10527 gcc_assert (CONSTANT_CLASS_P (new_name)
10528 || TREE_CODE (new_name) == SSA_NAME);
10529 new_vec = build_vector_from_val (step_vectype, t);
10530 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10531 new_vec, step_vectype, NULL);
10533 vec_def = induc_def;
10534 for (i = 1; i < ncopies + 1; i++)
10536 /* vec_i = vec_prev + vec_step */
10537 gimple_seq stmts = NULL;
10538 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10539 vec_def = gimple_build (&stmts,
10540 PLUS_EXPR, step_vectype, vec_def, vec_step);
10541 vec_def = gimple_convert (&stmts, vectype, vec_def);
10543 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10544 if (i < ncopies)
10546 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10547 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10549 else
10551 /* vec_1 = vec_iv + (VF/n * S)
10552 vec_2 = vec_1 + (VF/n * S)
10554 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10556 vec_n is used as vec_loop to save the large step register and
10557 related operations. */
10558 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10559 UNKNOWN_LOCATION);
10564 if (dump_enabled_p ())
10565 dump_printf_loc (MSG_NOTE, vect_location,
10566 "transform induction: created def-use cycle: %G%G",
10567 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10569 return true;
10572 /* Function vectorizable_live_operation_1.
10574 helper function for vectorizable_live_operation. */
10576 tree
10577 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10578 stmt_vec_info stmt_info, basic_block exit_bb,
10579 tree vectype, int ncopies, slp_tree slp_node,
10580 tree bitsize, tree bitstart, tree vec_lhs,
10581 tree lhs_type, bool restart_loop,
10582 gimple_stmt_iterator *exit_gsi)
10584 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10586 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10587 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10588 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10589 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10591 gimple_seq stmts = NULL;
10592 tree new_tree;
10593 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10595 /* Emit:
10597 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10599 where VEC_LHS is the vectorized live-out result and LEN is
10600 the loop length for the final iteration. */
10601 gcc_assert (ncopies == 1 && !slp_node);
10602 gimple_seq tem = NULL;
10603 gimple_stmt_iterator gsi = gsi_last (tem);
10604 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10605 &LOOP_VINFO_LENS (loop_vinfo),
10606 1, vectype, 0, 0);
10608 /* BIAS - 1. */
10609 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10610 tree bias_minus_one
10611 = int_const_binop (MINUS_EXPR,
10612 build_int_cst (TREE_TYPE (len), biasval),
10613 build_one_cst (TREE_TYPE (len)));
10615 /* LAST_INDEX = LEN + (BIAS - 1). */
10616 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10617 len, bias_minus_one);
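/* E.g. with a zero partial-load/store bias and LEN = 3 active elements
   in the final iteration, LAST_INDEX = 3 + (0 - 1) = 2, i.e. the last
   active lane of VEC_LHS.  */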
10619 /* This would need to extract the first active lane instead, but it is
10620 unclear how that interacts with the LEN handling. At the moment we
10621 should not get here since there is no LEN support for early breaks,
10622 but guard this so there is no incorrect codegen. */
10623 gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10625 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10626 tree scalar_res
10627 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10628 vec_lhs_phi, last_index);
10630 /* Convert the extracted vector element to the scalar type. */
10631 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10633 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10635 /* Emit:
10637 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10639 where VEC_LHS is the vectorized live-out result and MASK is
10640 the loop mask for the final iteration. */
10641 gcc_assert (!slp_node);
10642 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10643 gimple_seq tem = NULL;
10644 gimple_stmt_iterator gsi = gsi_last (tem);
10645 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10646 &LOOP_VINFO_MASKS (loop_vinfo),
10647 1, vectype, 0);
10648 tree scalar_res;
10650 /* For an inverted control flow with early breaks we want EXTRACT_FIRST
10651 instead of EXTRACT_LAST. Emulate by reversing the vector and mask. */
10652 if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10654 /* First create the permuted mask. */
10655 tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
10656 tree perm_dest = copy_ssa_name (mask);
10657 gimple *perm_stmt
10658 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
10659 mask, perm_mask);
10660 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10661 &gsi);
10662 mask = perm_dest;
10664 /* Then permute the vector contents. */
10665 tree perm_elem = perm_mask_for_reverse (vectype);
10666 perm_dest = copy_ssa_name (vec_lhs_phi);
10667 perm_stmt
10668 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
10669 vec_lhs_phi, perm_elem);
10670 vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
10671 &gsi);
10672 vec_lhs_phi = perm_dest;
10675 gimple_seq_add_seq (&stmts, tem);
10677 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10678 mask, vec_lhs_phi);
10680 /* Convert the extracted vector element to the scalar type. */
10681 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10683 else
10685 tree bftype = TREE_TYPE (vectype);
10686 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10687 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10688 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10689 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10690 &stmts, true, NULL_TREE);
10693 *exit_gsi = gsi_after_labels (exit_bb);
10694 if (stmts)
10695 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10697 return new_tree;
10700 /* Find the edge that's the final one in the path from SRC to DEST and
10701 return it. At most one forwarder edge may lie between SRC and DEST. */
10703 static edge
10704 find_connected_edge (edge src, basic_block dest)
10706 if (src->dest == dest)
10707 return src;
10709 return find_edge (src->dest, dest);
10712 /* Function vectorizable_live_operation.
10714 STMT_INFO computes a value that is used outside the loop. Check if
10715 it can be supported. */
10717 bool
10718 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10719 slp_tree slp_node, slp_instance slp_node_instance,
10720 int slp_index, bool vec_stmt_p,
10721 stmt_vector_for_cost *cost_vec)
10723 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10724 imm_use_iterator imm_iter;
10725 tree lhs, lhs_type, bitsize;
10726 tree vectype = (slp_node
10727 ? SLP_TREE_VECTYPE (slp_node)
10728 : STMT_VINFO_VECTYPE (stmt_info));
10729 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10730 int ncopies;
10731 gimple *use_stmt;
10732 use_operand_p use_p;
10733 auto_vec<tree> vec_oprnds;
10734 int vec_entry = 0;
10735 poly_uint64 vec_index = 0;
10737 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10738 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10740 /* If a stmt of a reduction is live, vectorize it via
10741 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10742 validity so just trigger the transform here. */
10743 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10745 if (!vec_stmt_p)
10746 return true;
10747 if (slp_node)
10749 /* For reduction chains the meta-info is attached to
10750 the group leader. */
10751 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10752 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10753 /* For SLP reductions we vectorize the epilogue for
10754 all involved stmts together. */
10755 else if (slp_index != 0)
10756 return true;
10758 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10759 gcc_assert (reduc_info->is_reduc_info);
10760 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10761 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10762 return true;
10764 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10765 slp_node_instance,
10766 LOOP_VINFO_IV_EXIT (loop_vinfo));
10768 /* If early break we only have to materialize the reduction on the merge
10769 block, but we have to find an alternate exit first. */
10770 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10772 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10773 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10775 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10776 slp_node, slp_node_instance,
10777 exit);
10778 break;
10782 return true;
10785 /* If STMT is not relevant and it is a simple assignment and its inputs are
10786 invariant then it can remain in place, unvectorized. The original last
10787 scalar value that it computes will be used. */
10788 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10790 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10791 if (dump_enabled_p ())
10792 dump_printf_loc (MSG_NOTE, vect_location,
10793 "statement is simple and uses invariant. Leaving in "
10794 "place.\n");
10795 return true;
10798 if (slp_node)
10799 ncopies = 1;
10800 else
10801 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10803 if (slp_node)
10805 gcc_assert (slp_index >= 0);
10807 /* Get the last occurrence of the scalar index from the concatenation of
10808 all the slp vectors. Calculate which slp vector it is and the index
10809 within. */
10810 int num_scalar = SLP_TREE_LANES (slp_node);
10811 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10812 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
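/* E.g. for 3 scalar lanes spread over 2 vectors of 4 units, the last
   occurrences live in lanes 5, 6 and 7 of the concatenation; slp_index 1
   gives pos = 8 - 3 + 1 = 6, i.e. vector 1, lane 2.  */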
10814 /* Calculate which vector contains the result, and which lane of
10815 that vector we need. */
10816 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10818 if (dump_enabled_p ())
10819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10820 "Cannot determine which vector holds the"
10821 " final result.\n");
10822 return false;
10826 if (!vec_stmt_p)
10828 /* No transformation required. */
10829 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10831 if (slp_node)
10833 if (dump_enabled_p ())
10834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10835 "can't operate on partial vectors "
10836 "because an SLP statement is live after "
10837 "the loop.\n");
10838 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10840 else if (ncopies > 1)
10842 if (dump_enabled_p ())
10843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10844 "can't operate on partial vectors "
10845 "because ncopies is greater than 1.\n");
10846 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10848 else
10850 gcc_assert (ncopies == 1 && !slp_node);
10851 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10852 OPTIMIZE_FOR_SPEED))
10853 vect_record_loop_mask (loop_vinfo,
10854 &LOOP_VINFO_MASKS (loop_vinfo),
10855 1, vectype, NULL);
10856 else if (can_vec_extract_var_idx_p (
10857 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10858 vect_record_loop_len (loop_vinfo,
10859 &LOOP_VINFO_LENS (loop_vinfo),
10860 1, vectype, 1);
10861 else
10863 if (dump_enabled_p ())
10864 dump_printf_loc (
10865 MSG_MISSED_OPTIMIZATION, vect_location,
10866 "can't operate on partial vectors "
10867 "because the target doesn't support extract "
10868 "last reduction.\n");
10869 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10873 /* ??? Enable for loop costing as well. */
10874 if (!loop_vinfo)
10875 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10876 0, vect_epilogue);
10877 return true;
10880 /* Use the lhs of the original scalar statement. */
10881 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10882 if (dump_enabled_p ())
10883 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10884 "stmt %G", stmt);
10886 lhs = gimple_get_lhs (stmt);
10887 lhs_type = TREE_TYPE (lhs);
10889 bitsize = vector_element_bits_tree (vectype);
10891 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10892 tree vec_lhs, vec_lhs0, bitstart;
10893 gimple *vec_stmt, *vec_stmt0;
10894 if (slp_node)
10896 gcc_assert (!loop_vinfo
10897 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10898 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10900 /* Get the correct slp vectorized stmt. */
10901 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10902 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10904 /* In case we need to early break vectorize also get the first stmt. */
10905 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10906 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10908 /* Get entry to use. */
10909 bitstart = bitsize_int (vec_index);
10910 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10912 else
10914 /* For multiple copies, get the last copy. */
10915 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10916 vec_lhs = gimple_get_lhs (vec_stmt);
10918 /* In case we need to early break vectorize also get the first stmt. */
10919 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10920 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10922 /* Get the last lane in the vector. */
10923 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
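/* E.g. for a vector of four 32-bit elements this is 32 * 3 = 96, so the
   BIT_FIELD_REF lane extraction reads bits [96, 127].  */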
10926 if (loop_vinfo)
10928 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10929 PHI requirement; insert one phi node for it. It looks like:
10930 loop;
10932 # lhs' = PHI <lhs>
10934 loop;
10936 # vec_lhs' = PHI <vec_lhs>
10937 new_tree = lane_extract <vec_lhs', ...>;
10938 lhs' = new_tree; */
10940 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10941 /* Check if we have a loop where the chosen exit is not the main exit;
10942 in these cases, for an early break, we restart the iteration the vector
10943 code was performing. For the live values we then want the value at the
10944 start of that iteration rather than at the end. */
10945 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10946 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10947 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10948 if (!is_gimple_debug (use_stmt)
10949 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10950 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10952 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10953 phi_arg_index_from_use (use_p));
10954 bool main_exit_edge = e == main_e
10955 || find_connected_edge (main_e, e->src);
10957 /* Early exits have a merge block; we want the merge block itself,
10958 so use ->src. For the main exit the merge block is the
10959 destination. */
10960 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10961 tree tmp_vec_lhs = vec_lhs;
10962 tree tmp_bitstart = bitstart;
10964 /* For early exit where the exit is not in the BB that leads
10965 to the latch then we're restarting the iteration in the
10966 scalar loop. So get the first live value. */
10967 restart_loop = restart_loop || !main_exit_edge;
10968 if (restart_loop
10969 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10971 tmp_vec_lhs = vec_lhs0;
10972 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10975 gimple_stmt_iterator exit_gsi;
10976 tree new_tree
10977 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10978 dest, vectype, ncopies,
10979 slp_node, bitsize,
10980 tmp_bitstart, tmp_vec_lhs,
10981 lhs_type, restart_loop,
10982 &exit_gsi);
10984 if (gimple_phi_num_args (use_stmt) == 1)
10986 auto gsi = gsi_for_stmt (use_stmt);
10987 remove_phi_node (&gsi, false);
10988 tree lhs_phi = gimple_phi_result (use_stmt);
10989 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10990 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10992 else
10993 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
10996 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10997 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10998 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11000 else
11002 /* For basic-block vectorization simply insert the lane-extraction. */
11003 tree bftype = TREE_TYPE (vectype);
11004 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11005 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11006 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11007 vec_lhs, bitsize, bitstart);
11008 gimple_seq stmts = NULL;
11009 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11010 &stmts, true, NULL_TREE);
11011 if (TREE_CODE (new_tree) == SSA_NAME
11012 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11013 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11014 if (is_a <gphi *> (vec_stmt))
11016 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11017 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11019 else
11021 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11022 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11025 /* Replace use of lhs with newly computed result. If the use stmt is a
11026 single arg PHI, just replace all uses of PHI result. It's necessary
11027 because the LC-SSA PHI defining lhs may appear before the newly inserted stmt. */
11028 use_operand_p use_p;
11029 stmt_vec_info use_stmt_info;
11030 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11031 if (!is_gimple_debug (use_stmt)
11032 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11033 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11035 /* ??? This can happen when the live lane ends up being
11036 rooted in a vector construction code-generated by an
11037 external SLP node (and code-generation for that already
11038 happened). See gcc.dg/vect/bb-slp-47.c.
11039 Doing this is what would happen if that vector CTOR
11040 were not code-generated yet so it is not too bad.
11041 ??? In fact we'd likely want to avoid this situation
11042 in the first place. */
11043 if (TREE_CODE (new_tree) == SSA_NAME
11044 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11045 && gimple_code (use_stmt) != GIMPLE_PHI
11046 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11047 use_stmt))
11049 if (dump_enabled_p ())
11050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11051 "Using original scalar computation for "
11052 "live lane because use preceeds vector "
11053 "def\n");
11054 continue;
11056 /* ??? It can also happen that we end up pulling a def into
11057 a loop where replacing out-of-loop uses would require
11058 a new LC SSA PHI node. Retain the original scalar in
11059 those cases as well. PR98064. */
11060 if (TREE_CODE (new_tree) == SSA_NAME
11061 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11062 && (gimple_bb (use_stmt)->loop_father
11063 != gimple_bb (vec_stmt)->loop_father)
11064 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11065 gimple_bb (use_stmt)->loop_father))
11067 if (dump_enabled_p ())
11068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11069 "Using original scalar computation for "
11070 "live lane because there is an out-of-loop "
11071 "definition for it\n");
11072 continue;
11074 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11075 SET_USE (use_p, new_tree);
11076 update_stmt (use_stmt);
11080 return true;
11083 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11085 static void
11086 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11088 ssa_op_iter op_iter;
11089 imm_use_iterator imm_iter;
11090 def_operand_p def_p;
11091 gimple *ustmt;
11093 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11095 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11097 basic_block bb;
11099 if (!is_gimple_debug (ustmt))
11100 continue;
11102 bb = gimple_bb (ustmt);
11104 if (!flow_bb_inside_loop_p (loop, bb))
11106 if (gimple_debug_bind_p (ustmt))
11108 if (dump_enabled_p ())
11109 dump_printf_loc (MSG_NOTE, vect_location,
11110 "killing debug use\n");
11112 gimple_debug_bind_reset_value (ustmt);
11113 update_stmt (ustmt);
11115 else
11116 gcc_unreachable ();
11122 /* Given loop represented by LOOP_VINFO, return true if computation of
11123 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11124 otherwise. */
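/* For example, if NITERS has a 32-bit unsigned type and NITERSM1 is
   0xffffffff, then NITERSM1 + 1 wraps around to 0; the computation
   overflows and false is returned.  */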
11126 static bool
11127 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11129 /* Constant case. */
11130 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11132 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11133 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11135 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11136 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11137 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11138 return true;
11141 widest_int max;
11142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11143 /* Check the upper bound of loop niters. */
11144 if (get_max_loop_iterations (loop, &max))
11146 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11147 signop sgn = TYPE_SIGN (type);
11148 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11149 if (max < type_max)
11150 return true;
11152 return false;
11155 /* Return a mask type with half the number of elements as OLD_TYPE,
11156 given that it should have mode NEW_MODE. */
11158 tree
11159 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11161 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11162 return build_truth_vector_type_for_mode (nunits, new_mode);
11165 /* Return a mask type with twice as many elements as OLD_TYPE,
11166 given that it should have mode NEW_MODE. */
11168 tree
11169 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11171 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11172 return build_truth_vector_type_for_mode (nunits, new_mode);
11175 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11176 contain a sequence of NVECTORS masks that each control a vector of type
11177 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11178 these vector masks with the vector version of SCALAR_MASK. */
11180 void
11181 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11182 unsigned int nvectors, tree vectype, tree scalar_mask)
11184 gcc_assert (nvectors != 0);
11186 if (scalar_mask)
11188 scalar_cond_masked_key cond (scalar_mask, nvectors);
11189 loop_vinfo->scalar_cond_masked_set.add (cond);
11192 masks->mask_set.add (std::make_pair (vectype, nvectors));
11195 /* Given a complete set of masks MASKS, extract mask number INDEX
11196 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11197 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11199 See the comment above vec_loop_masks for more details about the mask
11200 arrangement. */
11202 tree
11203 vect_get_loop_mask (loop_vec_info loop_vinfo,
11204 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11205 unsigned int nvectors, tree vectype, unsigned int index)
11207 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11208 == vect_partial_vectors_while_ult)
11210 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11211 tree mask_type = rgm->type;
11213 /* Populate the rgroup's mask array, if this is the first time we've
11214 used it. */
11215 if (rgm->controls.is_empty ())
11217 rgm->controls.safe_grow_cleared (nvectors, true);
11218 for (unsigned int i = 0; i < nvectors; ++i)
11220 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11221 /* Provide a dummy definition until the real one is available. */
11222 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11223 rgm->controls[i] = mask;
11227 tree mask = rgm->controls[index];
11228 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11229 TYPE_VECTOR_SUBPARTS (vectype)))
11231 /* A loop mask for data type X can be reused for data type Y
11232 if X has N times more elements than Y and if Y's elements
11233 are N times bigger than X's. In this case each sequence
11234 of N elements in the loop mask will be all-zero or all-one.
11235 We can then view-convert the mask so that each sequence of
11236 N elements is replaced by a single element. */
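/* For example, a mask recorded for 16 x 8-bit data can be reused for
   8 x 16-bit data: each pair of adjacent mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below folds every such pair
   into a single element of the 8-element mask type.  */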
11237 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11238 TYPE_VECTOR_SUBPARTS (vectype)));
11239 gimple_seq seq = NULL;
11240 mask_type = truth_type_for (vectype);
11241 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11242 if (seq)
11243 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11245 return mask;
11247 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11248 == vect_partial_vectors_avx512)
11250 /* The number of scalars per iteration and the number of vectors are
11251 both compile-time constants. */
11252 unsigned int nscalars_per_iter
11253 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11254 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11256 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11258 /* The stored nV is dependent on the mask type produced. */
11259 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11260 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11261 == rgm->factor);
11262 nvectors = rgm->factor;
11264 /* Populate the rgroup's mask array, if this is the first time we've
11265 used it. */
11266 if (rgm->controls.is_empty ())
11268 rgm->controls.safe_grow_cleared (nvectors, true);
11269 for (unsigned int i = 0; i < nvectors; ++i)
11271 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11272 /* Provide a dummy definition until the real one is available. */
11273 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11274 rgm->controls[i] = mask;
11277 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11278 TYPE_VECTOR_SUBPARTS (vectype)))
11279 return rgm->controls[index];
11281 /* Split the vector if needed. Since we are dealing with integer mode
11282 masks with AVX512 we can operate on the integer representation,
11283 performing the selection with whole-vector shifts. */
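/* For example, with a 16-element rgm->type and a 4-element VECTYPE,
   factor is 4; INDEX 6 selects control 1 (vi) and sub-part 2 (vpart),
   so the 16-bit integer view of that control is shifted right by
   4 * 2 = 8 bits and then narrowed to the result mask's integer mode,
   leaving lanes 8..11 in the low bits.  */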
11284 unsigned HOST_WIDE_INT factor;
11285 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11286 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11287 gcc_assert (ok);
11288 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11289 tree mask_type = truth_type_for (vectype);
11290 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11291 unsigned vi = index / factor;
11292 unsigned vpart = index % factor;
11293 tree vec = rgm->controls[vi];
11294 gimple_seq seq = NULL;
11295 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11296 lang_hooks.types.type_for_mode
11297 (TYPE_MODE (rgm->type), 1), vec);
11298 /* For integer mode masks simply shift the right bits into position. */
11299 if (vpart != 0)
11300 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11301 build_int_cst (integer_type_node,
11302 (TYPE_VECTOR_SUBPARTS (vectype)
11303 * vpart)));
11304 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11305 (TYPE_MODE (mask_type), 1), vec);
11306 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11307 if (seq)
11308 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11309 return vec;
11311 else
11312 gcc_unreachable ();
11315 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11316 lengths for controlling an operation on VECTYPE. The operation splits
11317 each element of VECTYPE into FACTOR separate subelements, measuring the
11318 length as a number of these subelements. */
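/* For instance, an access to 4-byte elements that the target can only
   length-control as VnQI would use FACTOR 4; a recorded length of 8
   subelements then covers two elements of VECTYPE.  */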
11320 void
11321 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11322 unsigned int nvectors, tree vectype, unsigned int factor)
11324 gcc_assert (nvectors != 0);
11325 if (lens->length () < nvectors)
11326 lens->safe_grow_cleared (nvectors, true);
11327 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11329 /* The number of scalars per iteration, the bytes a scalar occupies and
11330 the number of vectors are all compile-time constants. */
11331 unsigned int nscalars_per_iter
11332 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11333 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11335 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11337 /* For now, we only support cases in which all loads and stores fall back
11338 to VnQI or none do. */
11339 gcc_assert (!rgl->max_nscalars_per_iter
11340 || (rgl->factor == 1 && factor == 1)
11341 || (rgl->max_nscalars_per_iter * rgl->factor
11342 == nscalars_per_iter * factor));
11343 rgl->max_nscalars_per_iter = nscalars_per_iter;
11344 rgl->type = vectype;
11345 rgl->factor = factor;
11349 /* Given a complete set of lengths LENS, extract length number INDEX
11350 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11351 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11352 multiplied by the number of elements that should be processed.
11353 Insert any set-up statements before GSI. */
11355 tree
11356 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11357 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11358 unsigned int index, unsigned int factor)
11360 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11361 bool use_bias_adjusted_len =
11362 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11364 /* Populate the rgroup's len array, if this is the first time we've
11365 used it. */
11366 if (rgl->controls.is_empty ())
11368 rgl->controls.safe_grow_cleared (nvectors, true);
11369 for (unsigned int i = 0; i < nvectors; ++i)
11371 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11372 gcc_assert (len_type != NULL_TREE);
11374 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11376 /* Provide a dummy definition until the real one is available. */
11377 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11378 rgl->controls[i] = len;
11380 if (use_bias_adjusted_len)
11382 gcc_assert (i == 0);
11383 tree adjusted_len =
11384 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11385 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11386 rgl->bias_adjusted_ctrl = adjusted_len;
11391 if (use_bias_adjusted_len)
11392 return rgl->bias_adjusted_ctrl;
11394 tree loop_len = rgl->controls[index];
11395 if (rgl->factor == 1 && factor == 1)
11397 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11398 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11399 if (maybe_ne (nunits1, nunits2))
11401 /* A loop len for data type X can be reused for data type Y
11402 if X has N times more elements than Y and if Y's elements
11403 are N times bigger than X's. */
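/* For example, a length computed for 16 x 8-bit elements can serve a
   request for 8 x 16-bit elements by dividing it by
   factor = 16 / 8 = 2 below.  */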
11404 gcc_assert (multiple_p (nunits1, nunits2));
11405 factor = exact_div (nunits1, nunits2).to_constant ();
11406 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11407 gimple_seq seq = NULL;
11408 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11409 build_int_cst (iv_type, factor));
11410 if (seq)
11411 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11414 return loop_len;
11417 /* Scale profiling counters by estimation for LOOP which is vectorized
11418 by factor VF.
11419 If FLAT is true, the loop we started with had unrealistically flat
11420 profile. */
11422 static void
11423 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11425 /* For flat profiles do not scale down proportionally by VF and only
11426 cap by known iteration count bounds. */
11427 if (flat)
11429 if (dump_file && (dump_flags & TDF_DETAILS))
11430 fprintf (dump_file,
11431 "Vectorized loop profile seems flat; not scaling iteration "
11432 "count down by the vectorization factor %i\n", vf);
11433 scale_loop_profile (loop, profile_probability::always (),
11434 get_likely_max_loop_iterations_int (loop));
11435 return;
11437 /* The loop body executes VF times fewer iterations and the exit is taken VF times more often. */
11438 profile_count entry_count = loop_preheader_edge (loop)->count ();
11440 /* If we have unreliable loop profile avoid dropping entry
11441 count below header count. This can happen since the loop
11442 has unrealistically low trip counts. */
11443 while (vf > 1
11444 && loop->header->count > entry_count
11445 && loop->header->count < entry_count * vf)
11447 if (dump_file && (dump_flags & TDF_DETAILS))
11448 fprintf (dump_file,
11449 "Vectorization factor %i seems too large for profile "
11450 "prevoiusly believed to be consistent; reducing.\n", vf);
11451 vf /= 2;
11454 if (entry_count.nonzero_p ())
11455 set_edge_probability_and_rescale_others
11456 (exit_e,
11457 entry_count.probability_in (loop->header->count / vf));
11458 /* Avoid producing very large exit probability when we do not have
11459 sensible profile. */
11460 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11461 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11462 loop->latch->count = single_pred_edge (loop->latch)->count ();
11464 scale_loop_profile (loop, profile_probability::always () / vf,
11465 get_likely_max_loop_iterations_int (loop));
11468 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11469 latch edge values originally defined by it. */
11471 static void
11472 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11473 stmt_vec_info def_stmt_info)
11475 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11476 if (!def || TREE_CODE (def) != SSA_NAME)
11477 return;
11478 stmt_vec_info phi_info;
11479 imm_use_iterator iter;
11480 use_operand_p use_p;
11481 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11483 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11484 if (!phi)
11485 continue;
11486 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11487 && (phi_info = loop_vinfo->lookup_stmt (phi))
11488 && STMT_VINFO_RELEVANT_P (phi_info)))
11489 continue;
11490 loop_p loop = gimple_bb (phi)->loop_father;
11491 edge e = loop_latch_edge (loop);
11492 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11493 continue;
11495 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11496 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11497 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11499 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11500 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11501 gcc_assert (phi_defs.length () == latch_defs.length ());
11502 for (unsigned i = 0; i < phi_defs.length (); ++i)
11503 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11504 gimple_get_lhs (latch_defs[i]), e,
11505 gimple_phi_arg_location (phi, e->dest_idx));
11507 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11509 /* For first order recurrences we have to update both uses of
11510 the latch definition, the one in the PHI node and the one
11511 in the generated VEC_PERM_EXPR. */
11512 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11513 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11514 gcc_assert (phi_defs.length () == latch_defs.length ());
11515 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11516 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11517 for (unsigned i = 0; i < phi_defs.length (); ++i)
11519 gassign *perm = as_a <gassign *> (phi_defs[i]);
11520 if (i > 0)
11521 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11522 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11523 update_stmt (perm);
11525 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11526 gimple_phi_arg_location (phi, e->dest_idx));
11531 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11532 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11533 stmt_vec_info. */
11535 static bool
11536 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11537 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11539 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11540 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11542 if (dump_enabled_p ())
11543 dump_printf_loc (MSG_NOTE, vect_location,
11544 "------>vectorizing statement: %G", stmt_info->stmt);
11546 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11547 vect_loop_kill_debug_uses (loop, stmt_info);
11549 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11550 && !STMT_VINFO_LIVE_P (stmt_info))
11552 if (is_gimple_call (stmt_info->stmt)
11553 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11555 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11556 *seen_store = stmt_info;
11557 return false;
11559 return false;
11562 if (STMT_VINFO_VECTYPE (stmt_info))
11564 poly_uint64 nunits
11565 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11566 if (!STMT_SLP_TYPE (stmt_info)
11567 && maybe_ne (nunits, vf)
11568 && dump_enabled_p ())
11569 /* For SLP VF is set according to unrolling factor, and not
11570 to vector size, hence for SLP this print is not valid. */
11571 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11574 /* Pure SLP statements have already been vectorized. We still need
11575 to apply loop vectorization to hybrid SLP statements. */
11576 if (PURE_SLP_STMT (stmt_info))
11577 return false;
11579 if (dump_enabled_p ())
11580 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11582 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11583 *seen_store = stmt_info;
11585 return true;
11588 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11589 in the hash_map with its corresponding values. */
11591 static tree
11592 find_in_mapping (tree t, void *context)
11594 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11596 tree *value = mapping->get (t);
11597 return value ? *value : t;
11600 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11601 original loop that has now been vectorized.
11603 The inits of the data_references need to be advanced with the number of
11604 iterations of the main loop. This has been computed in vect_do_peeling and
11605 is stored in parameter ADVANCE. We first restore the data_references
11606 initial offset with the values recorded in ORIG_DRS_INIT.
11608 Since the loop_vec_info of this EPILOGUE was constructed for the original
11609 loop, its stmt_vec_infos all point to the original statements. These need
11610 to be updated to point to their corresponding copies as well as the SSA_NAMES
11611 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11613 The data_references' connections also need to be updated: their
11614 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11615 stmt_vec_infos, their statements need to point to their corresponding copies,
11616 and if they are gather loads or scatter stores their references need to be
11617 updated to point to the corresponding copies. Finally we set
11618 'base_misaligned' to false, as we have already peeled for alignment in the
11619 prologue of the main loop. */
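/* As an illustrative sketch (SSA names invented for the example): if the
   main loop contains the statement

     _5 = _4 + 1;

   and its copy in the EPILOGUE is

     _25 = _24 + 1;

   then MAPPING records _5 -> _25, and any pattern statement, related
   statement or gather/scatter DR_REF in the epilogue's loop_vec_info
   that still mentions _5 is rewritten below via find_in_mapping.  */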
11621 static void
11622 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11624 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11625 auto_vec<gimple *> stmt_worklist;
11626 hash_map<tree,tree> mapping;
11627 gimple *orig_stmt, *new_stmt;
11628 gimple_stmt_iterator epilogue_gsi;
11629 gphi_iterator epilogue_phi_gsi;
11630 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11631 basic_block *epilogue_bbs = get_loop_body (epilogue);
11632 unsigned i;
11634 free (LOOP_VINFO_BBS (epilogue_vinfo));
11635 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11637 /* Advance data_reference's with the number of iterations of the previous
11638 loop and its prologue. */
11639 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11642 /* The EPILOGUE loop is a copy of the original loop so they share the same
11643 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11644 point to the copied statements. We also create a mapping from each LHS in
11645 the original loop to the corresponding LHS in the EPILOGUE and create worklists
11646 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11647 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11649 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11650 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11652 new_stmt = epilogue_phi_gsi.phi ();
11654 gcc_assert (gimple_uid (new_stmt) > 0);
11655 stmt_vinfo
11656 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11658 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11659 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11661 mapping.put (gimple_phi_result (orig_stmt),
11662 gimple_phi_result (new_stmt));
11663 /* PHI nodes can not have patterns or related statements. */
11664 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11665 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11668 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11669 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11671 new_stmt = gsi_stmt (epilogue_gsi);
11672 if (is_gimple_debug (new_stmt))
11673 continue;
11675 gcc_assert (gimple_uid (new_stmt) > 0);
11676 stmt_vinfo
11677 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11679 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11680 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11682 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11683 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11685 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11687 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11688 for (gimple_stmt_iterator gsi = gsi_start (seq);
11689 !gsi_end_p (gsi); gsi_next (&gsi))
11690 stmt_worklist.safe_push (gsi_stmt (gsi));
11693 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11694 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11696 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11697 stmt_worklist.safe_push (stmt);
11698 /* Set BB such that the assert in
11699 'get_initial_def_for_reduction' is able to determine that
11700 the BB of the related stmt is inside this loop. */
11701 gimple_set_bb (stmt,
11702 gimple_bb (new_stmt));
11703 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11704 gcc_assert (related_vinfo == NULL
11705 || related_vinfo == stmt_vinfo);
11710 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11711 using the original main loop and thus need to be updated to refer to the
11712 cloned variables used in the epilogue. */
11713 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11715 gimple *stmt = stmt_worklist[i];
11716 tree *new_op;
11718 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11720 tree op = gimple_op (stmt, j);
11721 if ((new_op = mapping.get(op)))
11722 gimple_set_op (stmt, j, *new_op);
11723 else
11725 /* PR92429: The last argument of simplify_replace_tree disables
11726 folding when replacing arguments. This is required as
11727 otherwise you might end up with different statements than the
11728 ones analyzed in vect_analyze_loop, leading to different
11729 vectorization. */
11730 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11731 &find_in_mapping, &mapping, false);
11732 gimple_set_op (stmt, j, op);
11737 struct data_reference *dr;
11738 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11739 FOR_EACH_VEC_ELT (datarefs, i, dr)
11741 orig_stmt = DR_STMT (dr);
11742 gcc_assert (gimple_uid (orig_stmt) > 0);
11743 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11744 /* Data references for gather loads and scatter stores do not use the
11745 updated offset we set using ADVANCE. Instead we have to make sure the
11746 reference in each data reference points to the corresponding copy of
11747 the original in the epilogue. Make sure to update both
11748 gather/scatters recognized by dataref analysis and also other
11749 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11750 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11751 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11752 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11754 DR_REF (dr)
11755 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11756 &find_in_mapping, &mapping);
11757 DR_BASE_ADDRESS (dr)
11758 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11759 &find_in_mapping, &mapping);
11761 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11762 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11763 /* The vector size of the epilogue is smaller than that of the main loop
11764 so the required alignment is either the same or lower. The dr will
11765 therefore by definition be aligned. */
11766 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11769 epilogue_vinfo->shared->datarefs_copy.release ();
11770 epilogue_vinfo->shared->save_datarefs ();
11773 /* When vectorizing early break statements, instructions that happen before
11774 the early break in the current BB need to be moved to after the early
11775 break. This function deals with that and assumes that any validity
11776 checks have already been performed.
11778 After the statements have been moved, the loads recorded in
11779 LOOP_VINFO_EARLY_BRK_VUSES are updated with their new reaching VUSE.
11780 The statements are inserted at the start of LOOP_VINFO_EARLY_BRK_DEST_BB. */
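/* As an illustrative sketch, in

     for (i = 0; i < n; i++)
       {
         a[i] = b[i];
         if (b[i] == x)
           break;
       }

   the store to a[i] sits before the early exit and is one of the
   statements moved below.  */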
11782 static void
11783 move_early_exit_stmts (loop_vec_info loop_vinfo)
11785 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11787 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11788 return;
11790 /* Move all stmts that need moving. */
11791 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11792 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11794 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11796 /* Check to see if the statement is still required for vectorization or has been
11797 elided. */
11798 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11799 if (!stmt_info)
11800 continue;
11802 if (dump_enabled_p ())
11803 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11805 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11806 gsi_move_before (&stmt_gsi, &dest_gsi);
11807 gsi_prev (&dest_gsi);
11810 /* Update all the stmts with their new reaching VUSES. */
11811 tree vuse
11812 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11813 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11815 if (dump_enabled_p ())
11816 dump_printf_loc (MSG_NOTE, vect_location,
11817 "updating vuse to %T for load %G", vuse, p);
11818 gimple_set_vuse (p, vuse);
11819 update_stmt (p);
11823 /* Function vect_transform_loop.
11825 The analysis phase has determined that the loop is vectorizable.
11826 Vectorize the loop - create vectorized stmts to replace the scalar
11827 stmts in the loop, and update the loop exit condition.
11828 Returns the scalar epilogue loop, if any. */
11830 class loop *
11831 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11833 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11834 class loop *epilogue = NULL;
11835 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11836 int nbbs = loop->num_nodes;
11837 int i;
11838 tree niters_vector = NULL_TREE;
11839 tree step_vector = NULL_TREE;
11840 tree niters_vector_mult_vf = NULL_TREE;
11841 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11842 unsigned int lowest_vf = constant_lower_bound (vf);
11843 gimple *stmt;
11844 bool check_profitability = false;
11845 unsigned int th;
11846 bool flat = maybe_flat_loop_profile (loop);
11848 DUMP_VECT_SCOPE ("vec_transform_loop");
11850 loop_vinfo->shared->check_datarefs ();
11852 /* Use the more conservative vectorization threshold. If the number
11853 of iterations is constant, assume the cost check has been performed
11854 by our caller. If the threshold makes all loops that run at least
11855 the (estimated) vectorization factor number of times profitable,
11856 checking is pointless, too. */
11857 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11858 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11860 if (dump_enabled_p ())
11861 dump_printf_loc (MSG_NOTE, vect_location,
11862 "Profitability threshold is %d loop iterations.\n",
11863 th);
11864 check_profitability = true;
11867 /* Make sure there exists a single-predecessor exit bb. Do this before
11868 versioning. */
11869 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11870 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11872 split_loop_exit_edge (e, true);
11873 if (dump_enabled_p ())
11874 dump_printf (MSG_NOTE, "split exit edge\n");
11877 /* Version the loop first, if required, so the profitability check
11878 comes first. */
11880 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11882 class loop *sloop
11883 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11884 sloop->force_vectorize = false;
11885 check_profitability = false;
11888 /* Make sure there exists a single-predecessor exit bb also on the
11889 scalar loop copy. Do this after versioning but before peeling
11890 so the CFG structure is fine for both the scalar and the if-converted
11891 loop, and slpeel_duplicate_current_defs_from_edges sees matched
11892 loop-closed PHI nodes on the exit. */
11893 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11895 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11896 if (! single_pred_p (e->dest))
11898 split_loop_exit_edge (e, true);
11899 if (dump_enabled_p ())
11900 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11904 tree niters = vect_build_loop_niters (loop_vinfo);
11905 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11906 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11907 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11908 tree advance;
11909 drs_init_vec orig_drs_init;
11911 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11912 &step_vector, &niters_vector_mult_vf, th,
11913 check_profitability, niters_no_overflow,
11914 &advance);
11915 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11916 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11918 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11919 block after the loop exit. We need to scale all of that. */
11920 basic_block preheader
11921 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11922 preheader->count
11923 = preheader->count.apply_probability
11924 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11925 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11926 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11927 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11928 = preheader->count;
11931 if (niters_vector == NULL_TREE)
11933 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11934 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11935 && known_eq (lowest_vf, vf))
11937 niters_vector
11938 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11939 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11940 step_vector = build_one_cst (TREE_TYPE (niters));
11942 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11943 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11944 &step_vector, niters_no_overflow);
11945 else
11946 /* vect_do_peeling subtracted the number of peeled prologue
11947 iterations from LOOP_VINFO_NITERS. */
11948 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11949 &niters_vector, &step_vector,
11950 niters_no_overflow);
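/* Illustrative example (assuming no prologue peeling and no partial
   vectors): with a compile-time NITERS of 17 and a constant VF of 4,
   the first branch above sets niters_vector = 17 / 4 = 4 and
   step_vector = 1, leaving the remaining scalar iteration to the
   epilogue loop.  */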
11953 /* 1) Make sure the loop header has exactly two entries
11954 2) Make sure we have a preheader basic block. */
11956 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11958 split_edge (loop_preheader_edge (loop));
11960 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11961 /* This will deal with any possible peeling. */
11962 vect_prepare_for_masked_peels (loop_vinfo);
11964 /* Handle any code motion that we need to for early-break vectorization after
11965 we've done peeling but just before we start vectorizing. */
11966 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11967 move_early_exit_stmts (loop_vinfo);
11969 /* Schedule the SLP instances first, then handle loop vectorization
11970 below. */
11971 if (!loop_vinfo->slp_instances.is_empty ())
11973 DUMP_VECT_SCOPE ("scheduling SLP instances");
11974 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11977 /* FORNOW: the vectorizer supports only loops whose body consists
11978 of one basic block (header + empty latch). When the vectorizer
11979 supports more involved loop forms, the order in which the BBs are
11980 traversed needs to be reconsidered. */
11982 for (i = 0; i < nbbs; i++)
11984 basic_block bb = bbs[i];
11985 stmt_vec_info stmt_info;
11987 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11988 gsi_next (&si))
11990 gphi *phi = si.phi ();
11991 if (dump_enabled_p ())
11992 dump_printf_loc (MSG_NOTE, vect_location,
11993 "------>vectorizing phi: %G", (gimple *) phi);
11994 stmt_info = loop_vinfo->lookup_stmt (phi);
11995 if (!stmt_info)
11996 continue;
11998 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11999 vect_loop_kill_debug_uses (loop, stmt_info);
12001 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12002 && !STMT_VINFO_LIVE_P (stmt_info))
12003 continue;
12005 if (STMT_VINFO_VECTYPE (stmt_info)
12006 && (maybe_ne
12007 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12008 && dump_enabled_p ())
12009 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12011 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12012 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12013 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12014 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12015 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12017 && ! PURE_SLP_STMT (stmt_info))
12019 if (dump_enabled_p ())
12020 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12021 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12025 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12026 gsi_next (&si))
12028 gphi *phi = si.phi ();
12029 stmt_info = loop_vinfo->lookup_stmt (phi);
12030 if (!stmt_info)
12031 continue;
12033 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12034 && !STMT_VINFO_LIVE_P (stmt_info))
12035 continue;
12037 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12038 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12039 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12040 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12041 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12042 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12043 && ! PURE_SLP_STMT (stmt_info))
12044 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12047 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12048 !gsi_end_p (si);)
12050 stmt = gsi_stmt (si);
12051 /* During vectorization remove existing clobber stmts. */
12052 if (gimple_clobber_p (stmt))
12054 unlink_stmt_vdef (stmt);
12055 gsi_remove (&si, true);
12056 release_defs (stmt);
12058 else
12060 /* Ignore vector stmts created in the outer loop. */
12061 stmt_info = loop_vinfo->lookup_stmt (stmt);
12063 /* vector stmts created in the outer-loop during vectorization of
12064 stmts in an inner-loop may not have a stmt_info, and do not
12065 need to be vectorized. */
12066 stmt_vec_info seen_store = NULL;
12067 if (stmt_info)
12069 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12071 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12072 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12073 !gsi_end_p (subsi); gsi_next (&subsi))
12075 stmt_vec_info pat_stmt_info
12076 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12077 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12078 &si, &seen_store);
12080 stmt_vec_info pat_stmt_info
12081 = STMT_VINFO_RELATED_STMT (stmt_info);
12082 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12083 &si, &seen_store))
12084 maybe_set_vectorized_backedge_value (loop_vinfo,
12085 pat_stmt_info);
12087 else
12089 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12090 &seen_store))
12091 maybe_set_vectorized_backedge_value (loop_vinfo,
12092 stmt_info);
12095 gsi_next (&si);
12096 if (seen_store)
12098 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12099 /* Interleaving. The vectorization of the
12100 interleaving chain has been completed -
12101 free all the stores in the chain. */
12102 vect_remove_stores (loop_vinfo,
12103 DR_GROUP_FIRST_ELEMENT (seen_store));
12104 else
12105 /* Free the attached stmt_vec_info and remove the stmt. */
12106 loop_vinfo->remove_stmt (stmt_info);
12111 /* Stub out scalar statements that must not survive vectorization.
12112 Doing this here helps with grouped statements, or statements that
12113 are involved in patterns. */
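/* For instance (illustrative GIMPLE), a leftover scalar

     _7 = .MASK_LOAD (_3, 32B, mask_5);

   whose value must not survive vectorization is replaced below by the
   plain assignment _7 = 0, and a scalar conditional internal function
   call is replaced by its "else" argument.  */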
12114 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12115 !gsi_end_p (gsi); gsi_next (&gsi))
12117 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12118 if (!call || !gimple_call_internal_p (call))
12119 continue;
12120 internal_fn ifn = gimple_call_internal_fn (call);
12121 if (ifn == IFN_MASK_LOAD)
12123 tree lhs = gimple_get_lhs (call);
12124 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12126 tree zero = build_zero_cst (TREE_TYPE (lhs));
12127 gimple *new_stmt = gimple_build_assign (lhs, zero);
12128 gsi_replace (&gsi, new_stmt, true);
12131 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12133 tree lhs = gimple_get_lhs (call);
12134 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12136 tree else_arg
12137 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12138 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12139 gsi_replace (&gsi, new_stmt, true);
12143 } /* BBs in loop */
12145 /* The vectorization factor is always > 1, so if we use an IV increment of 1
12146 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12147 if (integer_onep (step_vector))
12148 niters_no_overflow = true;
12149 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12150 niters_vector, step_vector, niters_vector_mult_vf,
12151 !niters_no_overflow);
12153 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12155 /* True if the final iteration might not handle a full vector's
12156 worth of scalar iterations. */
12157 bool final_iter_may_be_partial
12158 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12159 /* The minimum number of iterations performed by the epilogue. This
12160 is 1 when peeling for gaps because we always need a final scalar
12161 iteration. */
12162 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12163 /* +1 to convert latch counts to loop iteration counts,
12164 -min_epilogue_iters to remove iterations that cannot be performed
12165 by the vector code. */
12166 int bias_for_lowest = 1 - min_epilogue_iters;
12167 int bias_for_assumed = bias_for_lowest;
12168 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12169 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12171 /* When the amount of peeling is known at compile time, the first
12172 iteration will have exactly alignment_npeels active elements.
12173 In the worst case it will have at least one. */
12174 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12175 bias_for_lowest += lowest_vf - min_first_active;
12176 bias_for_assumed += assumed_vf - min_first_active;
12178 /* In these calculations the "- 1" converts loop iteration counts
12179 back to latch counts. */
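/* A minimal worked example (illustrative numbers only): with
   lowest_vf == 4, no peeling for gaps and no partial vectors,
   bias_for_lowest == 1, so a scalar latch bound of 11 (at most 12
   iterations) becomes floor ((11 + 1) / 4) - 1 == 2 latch iterations
   of the vector loop.  */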
12180 if (loop->any_upper_bound)
12182 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12183 loop->nb_iterations_upper_bound
12184 = (final_iter_may_be_partial
12185 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12186 lowest_vf) - 1
12187 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12188 lowest_vf) - 1);
12189 if (main_vinfo
12190 /* Both peeling for alignment and peeling for gaps can end up
12191 with the scalar epilogue running for more than VF-1 iterations. */
12192 && !main_vinfo->peeling_for_alignment
12193 && !main_vinfo->peeling_for_gaps)
12195 unsigned int bound;
12196 poly_uint64 main_iters
12197 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12198 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12199 main_iters
12200 = upper_bound (main_iters,
12201 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12202 if (can_div_away_from_zero_p (main_iters,
12203 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12204 &bound))
12205 loop->nb_iterations_upper_bound
12206 = wi::umin ((bound_wide_int) (bound - 1),
12207 loop->nb_iterations_upper_bound);
12210 if (loop->any_likely_upper_bound)
12211 loop->nb_iterations_likely_upper_bound
12212 = (final_iter_may_be_partial
12213 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12214 + bias_for_lowest, lowest_vf) - 1
12215 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12216 + bias_for_lowest, lowest_vf) - 1);
12217 if (loop->any_estimate)
12218 loop->nb_iterations_estimate
12219 = (final_iter_may_be_partial
12220 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12221 assumed_vf) - 1
12222 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12223 assumed_vf) - 1);
12224 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12225 assumed_vf, flat);
12227 if (dump_enabled_p ())
12229 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12231 dump_printf_loc (MSG_NOTE, vect_location,
12232 "LOOP VECTORIZED\n");
12233 if (loop->inner)
12234 dump_printf_loc (MSG_NOTE, vect_location,
12235 "OUTER LOOP VECTORIZED\n");
12236 dump_printf (MSG_NOTE, "\n");
12238 else
12239 dump_printf_loc (MSG_NOTE, vect_location,
12240 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12241 GET_MODE_NAME (loop_vinfo->vector_mode));
12244 /* Loops vectorized with a variable factor won't benefit from
12245 unrolling/peeling. */
12246 if (!vf.is_constant ())
12248 loop->unroll = 1;
12249 if (dump_enabled_p ())
12250 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12251 " variable-length vectorization factor\n");
12253 /* Free SLP instances here because otherwise stmt reference counting
12254 won't work. */
12255 slp_instance instance;
12256 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12257 vect_free_slp_instance (instance);
12258 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12259 /* Clear the safelen field since its value is invalid after vectorization,
12260 as the vectorized loop can have loop-carried dependencies. */
12261 loop->safelen = 0;
12263 if (epilogue)
12265 update_epilogue_loop_vinfo (epilogue, advance);
12267 epilogue->simduid = loop->simduid;
12268 epilogue->force_vectorize = loop->force_vectorize;
12269 epilogue->dont_vectorize = false;
12272 return epilogue;
12275 /* The code below tries to perform a simple optimization - revert
12276 if-conversion for masked stores, i.e. if the mask of a store is zero,
12277 do not perform it, and, if possible, skip the producers of the stored values too.
12278 For example,
12279 for (i=0; i<n; i++)
12280 if (c[i])
12282 p1[i] += 1;
12283 p2[i] = p3[i] +2;
12285 this transformation will produce the following semi-hammock:
12287 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12289 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12290 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12291 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12292 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12293 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12294 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12298 void
12299 optimize_mask_stores (class loop *loop)
12301 basic_block *bbs = get_loop_body (loop);
12302 unsigned nbbs = loop->num_nodes;
12303 unsigned i;
12304 basic_block bb;
12305 class loop *bb_loop;
12306 gimple_stmt_iterator gsi;
12307 gimple *stmt;
12308 auto_vec<gimple *> worklist;
12309 auto_purge_vect_location sentinel;
12311 vect_location = find_loop_location (loop);
12312 /* Pick up all masked stores in loop if any. */
12313 for (i = 0; i < nbbs; i++)
12315 bb = bbs[i];
12316 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12317 gsi_next (&gsi))
12319 stmt = gsi_stmt (gsi);
12320 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12321 worklist.safe_push (stmt);
12325 free (bbs);
12326 if (worklist.is_empty ())
12327 return;
12329 /* Loop has masked stores. */
12330 while (!worklist.is_empty ())
12332 gimple *last, *last_store;
12333 edge e, efalse;
12334 tree mask;
12335 basic_block store_bb, join_bb;
12336 gimple_stmt_iterator gsi_to;
12337 tree vdef, new_vdef;
12338 gphi *phi;
12339 tree vectype;
12340 tree zero;
12342 last = worklist.pop ();
12343 mask = gimple_call_arg (last, 2);
12344 bb = gimple_bb (last);
12345 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12346 the same loop as if_bb. That loop can be different from LOOP when a
12347 two-level loop nest is vectorized and the mask_store belongs to the
12348 inner one. */
12349 e = split_block (bb, last);
12350 bb_loop = bb->loop_father;
12351 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12352 join_bb = e->dest;
12353 store_bb = create_empty_bb (bb);
12354 add_bb_to_loop (store_bb, bb_loop);
12355 e->flags = EDGE_TRUE_VALUE;
12356 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12357 /* Put STORE_BB on the likely path. */
12358 efalse->probability = profile_probability::likely ();
12359 e->probability = efalse->probability.invert ();
12360 store_bb->count = efalse->count ();
12361 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12362 if (dom_info_available_p (CDI_DOMINATORS))
12363 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12364 if (dump_enabled_p ())
12365 dump_printf_loc (MSG_NOTE, vect_location,
12366 "Create new block %d to sink mask stores.",
12367 store_bb->index);
12368 /* Create vector comparison with boolean result. */
12369 vectype = TREE_TYPE (mask);
12370 zero = build_zero_cst (vectype);
12371 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12372 gsi = gsi_last_bb (bb);
12373 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12374 /* Create a new PHI node for the vdef of the last masked store:
12375 .MEM_2 = VDEF <.MEM_1>
12376 will be converted to
12377 .MEM_3 = VDEF <.MEM_1>
12378 and a new PHI node will be created in the join bb:
12379 .MEM_2 = PHI <.MEM_1, .MEM_3>
12381 vdef = gimple_vdef (last);
12382 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12383 gimple_set_vdef (last, new_vdef);
12384 phi = create_phi_node (vdef, join_bb);
12385 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12387 /* Put all masked stores with the same mask into STORE_BB if possible. */
12388 while (true)
12390 gimple_stmt_iterator gsi_from;
12391 gimple *stmt1 = NULL;
12393 /* Move masked store to STORE_BB. */
12394 last_store = last;
12395 gsi = gsi_for_stmt (last);
12396 gsi_from = gsi;
12397 /* Shift GSI to the previous stmt for further traversal. */
12398 gsi_prev (&gsi);
12399 gsi_to = gsi_start_bb (store_bb);
12400 gsi_move_before (&gsi_from, &gsi_to);
12401 /* Set GSI_TO to the start of the now non-empty block. */
12402 gsi_to = gsi_start_bb (store_bb);
12403 if (dump_enabled_p ())
12404 dump_printf_loc (MSG_NOTE, vect_location,
12405 "Move stmt to created bb\n%G", last);
12406 /* Move all stored value producers if possible. */
12407 while (!gsi_end_p (gsi))
12409 tree lhs;
12410 imm_use_iterator imm_iter;
12411 use_operand_p use_p;
12412 bool res;
12414 /* Skip debug statements. */
12415 if (is_gimple_debug (gsi_stmt (gsi)))
12417 gsi_prev (&gsi);
12418 continue;
12420 stmt1 = gsi_stmt (gsi);
12421 /* Do not consider statements writing to memory or having
12422 a volatile operand. */
12423 if (gimple_vdef (stmt1)
12424 || gimple_has_volatile_ops (stmt1))
12425 break;
12426 gsi_from = gsi;
12427 gsi_prev (&gsi);
12428 lhs = gimple_get_lhs (stmt1);
12429 if (!lhs)
12430 break;
12432 /* The LHS of a vectorized stmt must be an SSA_NAME. */
12433 if (TREE_CODE (lhs) != SSA_NAME)
12434 break;
12436 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12438 /* Remove dead scalar statement. */
12439 if (has_zero_uses (lhs))
12441 gsi_remove (&gsi_from, true);
12442 continue;
12446 /* Check that LHS does not have uses outside of STORE_BB. */
12447 res = true;
12448 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12450 gimple *use_stmt;
12451 use_stmt = USE_STMT (use_p);
12452 if (is_gimple_debug (use_stmt))
12453 continue;
12454 if (gimple_bb (use_stmt) != store_bb)
12456 res = false;
12457 break;
12460 if (!res)
12461 break;
12463 if (gimple_vuse (stmt1)
12464 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12465 break;
12467 /* Can move STMT1 to STORE_BB. */
12468 if (dump_enabled_p ())
12469 dump_printf_loc (MSG_NOTE, vect_location,
12470 "Move stmt to created bb\n%G", stmt1);
12471 gsi_move_before (&gsi_from, &gsi_to);
12472 /* Shift GSI_TO for further insertion. */
12473 gsi_prev (&gsi_to);
12475 /* Put other masked stores with the same mask into STORE_BB. */
12476 if (worklist.is_empty ()
12477 || gimple_call_arg (worklist.last (), 2) != mask
12478 || worklist.last () != stmt1)
12479 break;
12480 last = worklist.pop ();
12482 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12486 /* Decide whether it is possible to use a zero-based induction variable
12487 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12488 the value that the induction variable must be able to hold in order
12489 to ensure that the rgroups eventually have no active vector elements.
12490 Return -1 otherwise. */
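/* For example (illustrative numbers): with a constant VF of 8 (so
   max_vf == 8), no skipped iterations and no peeling for alignment,
   a maximum latch count of 100 gives
   iv_limit = (100 & -8) + 8 == 104.  */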
12492 widest_int
12493 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12495 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12496 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12497 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12499 /* Calculate the value that the induction variable must be able
12500 to hit in order to ensure that we end the loop with an all-false mask.
12501 This involves adding the maximum number of inactive trailing scalar
12502 iterations. */
12503 widest_int iv_limit = -1;
12504 if (max_loop_iterations (loop, &iv_limit))
12506 if (niters_skip)
12508 /* Add the maximum number of skipped iterations to the
12509 maximum iteration count. */
12510 if (TREE_CODE (niters_skip) == INTEGER_CST)
12511 iv_limit += wi::to_widest (niters_skip);
12512 else
12513 iv_limit += max_vf - 1;
12515 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12516 /* Make a conservatively-correct assumption. */
12517 iv_limit += max_vf - 1;
12519 /* IV_LIMIT is the maximum number of latch iterations, which is also
12520 the maximum in-range IV value. Round this value down to the previous
12521 vector alignment boundary and then add an extra full iteration. */
12522 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12523 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12525 return iv_limit;
12528 /* For the given rgroup_controls RGC, check whether an induction variable
12529 would ever hit a value that produces a set of all-false masks or zero
12530 lengths before wrapping around. Return true if it's possible to wrap
12531 around before hitting the desirable value, otherwise return false. */
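/* For example (illustrative numbers): with iv_limit == 300 and
   nitems == 1, the IV must be able to reach 300, which needs 9 bits,
   so an 8-bit compare type might wrap (return true) while a 16-bit
   one cannot (return false).  */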
12533 bool
12534 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12536 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12538 if (iv_limit == -1)
12539 return true;
12541 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12542 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12543 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12545 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12546 return true;
12548 return false;