gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
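   For instance (illustrative only): accesses such as a[i] or *(p + i)
   that advance by one element per iteration have the simple consecutive
   pattern meant here, whereas strided accesses such as a[2*i] or
   indirect accesses such as a[b[i]] do not.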
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
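     As an illustrative sketch only (mirroring the check described above;
     the exact code in the vectorizable_* routines differs):

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         ... no target support, so the addition cannot be vectorized ...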
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements
264 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
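/* Illustrative example (not from the original sources): for an IV such as

     p_1 = PHI <p_0 (preheader), p_2 (latch)>
     ...
     p_2 = p_1 + 4;

   the scalar evolution of p_1 is the chrec {p_0, +, 4}, so the routine
   below would return init = p_0 and step = 4.  */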
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
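/* Illustrative examples of such inductions (not from the original sources):

     x = x0;                        y = y0;
     for (i = 0; i < n; i++)        for (i = 0; i < n; i++)
       { a[i] = x; x = x * 3; }       { b[i] = y; y = -y; }

   The left loop is classified as vect_step_op_mul with step 3; the right
   loop as vect_step_op_neg with the fake step -1.  */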
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
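/* Illustrative example of a first-order recurrence (not from the original
   sources):

     t = 0;
     for (i = 0; i < n; i++)
       { b[i] = a[i] - t; t = a[i]; }

   Each iteration reads the value of t produced by the previous iteration,
   but t is not a reduction.  */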
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be some
585 subsequent SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
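  /* Illustrative outer-loop example (not from the original sources):

       for (i = 0; i < n; i++)       <-- loop being vectorized
         {
           s = 0;
           for (j = 0; j < m; j++)   <-- executed sequentially
             s += a[i][j];
           b[i] = s;
         }

     When the outer loop is vectorized, the inner-loop reduction into s
     must keep its original (sequential) evaluation order.  */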
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit condition. */
857 static gcond *
858 vect_get_loop_niters (class loop *loop, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 edge exit = single_exit (loop);
862 class tree_niter_desc niter_desc;
863 tree niter_assumptions, niter, may_be_zero;
864 gcond *cond = get_loop_exit_condition (loop);
866 *assumptions = boolean_true_node;
867 *number_of_iterationsm1 = chrec_dont_know;
868 *number_of_iterations = chrec_dont_know;
869 DUMP_VECT_SCOPE ("get_loop_niters");
871 if (!exit)
872 return cond;
874 may_be_zero = NULL_TREE;
875 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
876 || chrec_contains_undetermined (niter_desc.niter))
877 return cond;
879 niter_assumptions = niter_desc.assumptions;
880 may_be_zero = niter_desc.may_be_zero;
881 niter = niter_desc.niter;
883 if (may_be_zero && integer_zerop (may_be_zero))
884 may_be_zero = NULL_TREE;
886 if (may_be_zero)
888 if (COMPARISON_CLASS_P (may_be_zero))
890 /* Try to combine may_be_zero with assumptions, this can simplify
891 computation of niter expression. */
892 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
893 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
894 niter_assumptions,
895 fold_build1 (TRUTH_NOT_EXPR,
896 boolean_type_node,
897 may_be_zero));
898 else
899 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
900 build_int_cst (TREE_TYPE (niter), 0),
901 rewrite_to_non_trapping_overflow (niter));
903 may_be_zero = NULL_TREE;
905 else if (integer_nonzerop (may_be_zero))
907 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
908 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
909 return cond;
911 else
912 return cond;
915 *assumptions = niter_assumptions;
916 *number_of_iterationsm1 = niter;
918 /* We want the number of loop header executions which is the number
919 of latch executions plus one.
920 ??? For UINT_MAX latch executions this number overflows to zero
921 for loops like do { n++; } while (n != 0); */
922 if (niter && !chrec_contains_undetermined (niter))
923 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
924 build_int_cst (TREE_TYPE (niter), 1));
925 *number_of_iterations = niter;
927 return cond;
930 /* Function bb_in_loop_p
932 Used as predicate for dfs order traversal of the loop bbs. */
934 static bool
935 bb_in_loop_p (const_basic_block bb, const void *data)
937 const class loop *const loop = (const class loop *)data;
938 if (flow_bb_inside_loop_p (loop, bb))
939 return true;
940 return false;
944 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
945 stmt_vec_info structs for all the stmts in LOOP_IN. */
947 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
948 : vec_info (vec_info::loop, shared),
949 loop (loop_in),
950 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
951 num_itersm1 (NULL_TREE),
952 num_iters (NULL_TREE),
953 num_iters_unchanged (NULL_TREE),
954 num_iters_assumptions (NULL_TREE),
955 vector_costs (nullptr),
956 scalar_costs (nullptr),
957 th (0),
958 versioning_threshold (0),
959 vectorization_factor (0),
960 main_loop_edge (nullptr),
961 skip_main_loop_edge (nullptr),
962 skip_this_loop_edge (nullptr),
963 reusable_accumulators (),
964 suggested_unroll_factor (1),
965 max_vectorization_factor (0),
966 mask_skip_niters (NULL_TREE),
967 rgroup_compare_type (NULL_TREE),
968 simd_if_cond (NULL_TREE),
969 partial_vector_style (vect_partial_vectors_none),
970 unaligned_dr (NULL),
971 peeling_for_alignment (0),
972 ptr_mask (0),
973 ivexpr_map (NULL),
974 scan_map (NULL),
975 slp_unrolling_factor (1),
976 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
977 vectorizable (false),
978 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
979 using_partial_vectors_p (false),
980 using_decrementing_iv_p (false),
981 using_select_vl_p (false),
982 epil_using_partial_vectors_p (false),
983 partial_load_store_bias (0),
984 peeling_for_gaps (false),
985 peeling_for_niter (false),
986 no_data_dependencies (false),
987 has_mask_store (false),
988 scalar_loop_scaling (profile_probability::uninitialized ()),
989 scalar_loop (NULL),
990 orig_loop_info (NULL)
992 /* CHECKME: We want to visit all BBs before their successors (except for
993 latch blocks, for which this assertion wouldn't hold). In the simple
994 case of the loop forms we allow, a dfs order of the BBs would be the
995 same as reversed postorder traversal, so we are safe. */
997 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
998 bbs, loop->num_nodes, loop);
999 gcc_assert (nbbs == loop->num_nodes);
1001 for (unsigned int i = 0; i < nbbs; i++)
1003 basic_block bb = bbs[i];
1004 gimple_stmt_iterator si;
1006 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1008 gimple *phi = gsi_stmt (si);
1009 gimple_set_uid (phi, 0);
1010 add_stmt (phi);
1013 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1015 gimple *stmt = gsi_stmt (si);
1016 gimple_set_uid (stmt, 0);
1017 if (is_gimple_debug (stmt))
1018 continue;
1019 add_stmt (stmt);
1020 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1021 third argument is the #pragma omp simd if (x) condition: when it is 0,
1022 the loop shouldn't be vectorized; when it is a non-zero constant, it
1023 should be vectorized normally; otherwise the loop is versioned, with the
1024 vectorized copy taken if the condition is non-zero at runtime. */
1025 if (loop_in->simduid
1026 && is_gimple_call (stmt)
1027 && gimple_call_internal_p (stmt)
1028 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1029 && gimple_call_num_args (stmt) >= 3
1030 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1031 && (loop_in->simduid
1032 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1034 tree arg = gimple_call_arg (stmt, 2);
1035 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1036 simd_if_cond = arg;
1037 else
1038 gcc_assert (integer_nonzerop (arg));
1043 epilogue_vinfos.create (6);
1046 /* Free all levels of rgroup CONTROLS. */
1048 void
1049 release_vec_loop_controls (vec<rgroup_controls> *controls)
1051 rgroup_controls *rgc;
1052 unsigned int i;
1053 FOR_EACH_VEC_ELT (*controls, i, rgc)
1054 rgc->controls.release ();
1055 controls->release ();
1058 /* Free all memory used by the _loop_vec_info, as well as all the
1059 stmt_vec_info structs of all the stmts in the loop. */
1061 _loop_vec_info::~_loop_vec_info ()
1063 free (bbs);
1065 release_vec_loop_controls (&masks.rgc_vec);
1066 release_vec_loop_controls (&lens);
1067 delete ivexpr_map;
1068 delete scan_map;
1069 epilogue_vinfos.release ();
1070 delete scalar_costs;
1071 delete vector_costs;
1073 /* When we release an epilogue vinfo that we do not intend to use,
1074 avoid clearing AUX of the main loop, which should continue to
1075 point to the main loop vinfo since otherwise we'd leak it. */
1076 if (loop->aux == this)
1077 loop->aux = NULL;
1080 /* Return an invariant or register for EXPR and emit necessary
1081 computations in the LOOP_VINFO loop preheader. */
1083 tree
1084 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1086 if (is_gimple_reg (expr)
1087 || is_gimple_min_invariant (expr))
1088 return expr;
1090 if (! loop_vinfo->ivexpr_map)
1091 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1092 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1093 if (! cached)
1095 gimple_seq stmts = NULL;
1096 cached = force_gimple_operand (unshare_expr (expr),
1097 &stmts, true, NULL_TREE);
1098 if (stmts)
1100 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1101 gsi_insert_seq_on_edge_immediate (e, stmts);
1104 return cached;
1107 /* Return true if we can use CMP_TYPE as the comparison type to produce
1108 all masks required to mask LOOP_VINFO. */
1110 static bool
1111 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1113 rgroup_controls *rgm;
1114 unsigned int i;
1115 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1116 if (rgm->type != NULL_TREE
1117 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1118 cmp_type, rgm->type,
1119 OPTIMIZE_FOR_SPEED))
1120 return false;
1121 return true;
1124 /* Calculate the maximum number of scalars per iteration for every
1125 rgroup in LOOP_VINFO. */
1127 static unsigned int
1128 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1130 unsigned int res = 1;
1131 unsigned int i;
1132 rgroup_controls *rgm;
1133 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1134 res = MAX (res, rgm->max_nscalars_per_iter);
1135 return res;
1138 /* Calculate the minimum precision necessary to represent:
1140 MAX_NITERS * FACTOR
1142 as an unsigned integer, where MAX_NITERS is the maximum number of
1143 loop header iterations for the original scalar form of LOOP_VINFO. */
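/* Illustrative example (not from the original sources): with
   MAX_NITERS = 1000 and FACTOR = 4 the product is 4000, which needs
   wi::min_precision (4000, UNSIGNED) = 12 bits.  */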
1145 static unsigned
1146 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1148 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1150 /* Get the maximum number of iterations that is representable
1151 in the counter type. */
1152 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1153 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1155 /* Get a more refined estimate for the number of iterations. */
1156 widest_int max_back_edges;
1157 if (max_loop_iterations (loop, &max_back_edges))
1158 max_ni = wi::smin (max_ni, max_back_edges + 1);
1160 /* Work out how many bits we need to represent the limit. */
1161 return wi::min_precision (max_ni * factor, UNSIGNED);
1164 /* True if the loop needs peeling or partial vectors when vectorized. */
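/* Illustrative example (not from the original sources): with a known
   niter of 100 and a vectorization factor of 8, 100 is not a multiple
   of 8, so the remaining 4 iterations need either an epilogue loop or
   partial vectors.  */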
1166 static bool
1167 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1169 unsigned HOST_WIDE_INT const_vf;
1170 HOST_WIDE_INT max_niter
1171 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1173 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1174 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1175 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1176 (loop_vinfo));
1178 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1179 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1181 /* Work out the (constant) number of iterations that need to be
1182 peeled for reasons other than niters. */
1183 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1184 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1185 peel_niter += 1;
1186 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1187 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1188 return true;
1190 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1191 /* ??? When peeling for gaps but not alignment, we could
1192 try to check whether the (variable) niters is known to be
1193 VF * N + 1. That's something of a niche case though. */
1194 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1195 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1196 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1197 < (unsigned) exact_log2 (const_vf))
1198 /* In case of versioning, check if the maximum number of
1199 iterations is greater than th. If they are identical,
1200 the epilogue is unnecessary. */
1201 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1202 || ((unsigned HOST_WIDE_INT) max_niter
1203 > (th / const_vf) * const_vf))))
1204 return true;
1206 return false;
1209 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1210 whether we can actually generate the masks required. Return true if so,
1211 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
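/* Illustrative example (not from the original sources): with a
   vectorization factor of 4, the mask for the vector iteration whose
   first scalar index is i is conceptually

     { i+0 < niters, i+1 < niters, i+2 < niters, i+3 < niters }

   which is what the IFN_WHILE_ULT internal function computes.  */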
1213 static bool
1214 vect_verify_full_masking (loop_vec_info loop_vinfo)
1216 unsigned int min_ni_width;
1218 /* Use a normal loop if there are no statements that need masking.
1219 This only happens in rare degenerate cases: it means that the loop
1220 has no loads, no stores, and no live-out values. */
1221 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1222 return false;
1224 /* Produce the rgroup controls. */
1225 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1227 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1228 tree vectype = mask.first;
1229 unsigned nvectors = mask.second;
1231 if (masks->rgc_vec.length () < nvectors)
1232 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1233 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1234 /* The number of scalars per iteration and the number of vectors are
1235 both compile-time constants. */
1236 unsigned int nscalars_per_iter
1237 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1238 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
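      /* For example (illustrative only): nvectors = 2 masks of a
         16-element vector type with VF = 16 give nscalars_per_iter
         = 2 * 16 / 16 = 2.  */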
1240 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1242 rgm->max_nscalars_per_iter = nscalars_per_iter;
1243 rgm->type = truth_type_for (vectype);
1244 rgm->factor = 1;
1248 unsigned int max_nscalars_per_iter
1249 = vect_get_max_nscalars_per_iter (loop_vinfo);
1251 /* Work out how many bits we need to represent the limit. */
1252 min_ni_width
1253 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1255 /* Find a scalar mode for which WHILE_ULT is supported. */
1256 opt_scalar_int_mode cmp_mode_iter;
1257 tree cmp_type = NULL_TREE;
1258 tree iv_type = NULL_TREE;
1259 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1260 unsigned int iv_precision = UINT_MAX;
1262 if (iv_limit != -1)
1263 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1264 UNSIGNED);
1266 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1268 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1269 if (cmp_bits >= min_ni_width
1270 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1272 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1273 if (this_type
1274 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1276 /* Although we could stop as soon as we find a valid mode,
1277 there are at least two reasons why that's not always the
1278 best choice:
1280 - An IV that's Pmode or wider is more likely to be reusable
1281 in address calculations than an IV that's narrower than
1282 Pmode.
1284 - Doing the comparison in IV_PRECISION or wider allows
1285 a natural 0-based IV, whereas using a narrower comparison
1286 type requires mitigations against wrap-around.
1288 Conversely, if the IV limit is variable, doing the comparison
1289 in a wider type than the original type can introduce
1290 unnecessary extensions, so picking the widest valid mode
1291 is not always a good choice either.
1293 Here we prefer the first IV type that's Pmode or wider,
1294 and the first comparison type that's IV_PRECISION or wider.
1295 (The comparison type must be no wider than the IV type,
1296 to avoid extensions in the vector loop.)
1298 ??? We might want to try continuing beyond Pmode for ILP32
1299 targets if CMP_BITS < IV_PRECISION. */
1300 iv_type = this_type;
1301 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1302 cmp_type = this_type;
1303 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1304 break;
1309 if (!cmp_type)
1311 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1312 return false;
1315 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1316 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1317 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1318 return true;
1321 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1322 whether we can actually generate AVX512 style masks. Return true if so,
1323 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1325 static bool
1326 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1328 /* Produce a differently organized rgc_vec and check differently
1329 whether we can produce the masks. */
1331 /* Use a normal loop if there are no statements that need masking.
1332 This only happens in rare degenerate cases: it means that the loop
1333 has no loads, no stores, and no live-out values. */
1334 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1335 return false;
1337 /* For the decrementing IV we need to represent all values in
1338 [0, niter + niter_skip] where niter_skip is the elements we
1339 skip in the first iteration for prologue peeling. */
1340 tree iv_type = NULL_TREE;
1341 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1342 unsigned int iv_precision = UINT_MAX;
1343 if (iv_limit != -1)
1344 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1346 /* First compute the type for the IV we use to track the remaining
1347 scalar iterations. */
1348 opt_scalar_int_mode cmp_mode_iter;
1349 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1351 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1352 if (cmp_bits >= iv_precision
1353 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1355 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1356 if (iv_type)
1357 break;
1360 if (!iv_type)
1361 return false;
1363 /* Produce the rgroup controls. */
1364 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1366 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1367 tree vectype = mask.first;
1368 unsigned nvectors = mask.second;
1370 /* The number of scalars per iteration and the number of vectors are
1371 both compile-time constants. */
1372 unsigned int nscalars_per_iter
1373 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1374 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1376 /* We index the rgroup_controls vector by nscalars_per_iter,
1377 which we keep constant, and instead have a varying nvectors,
1378 remembering the vector mask with the fewest nvectors. */
1379 if (masks->rgc_vec.length () < nscalars_per_iter)
1380 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1381 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1383 if (!rgm->type || rgm->factor > nvectors)
1385 rgm->type = truth_type_for (vectype);
1386 rgm->compare_type = NULL_TREE;
1387 rgm->max_nscalars_per_iter = nscalars_per_iter;
1388 rgm->factor = nvectors;
1389 rgm->bias_adjusted_ctrl = NULL_TREE;
1393 /* There is no fixed compare type we are going to use but we have to
1394 be able to get at one for each mask group. */
1395 unsigned int min_ni_width
1396 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1398 bool ok = true;
1399 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1401 tree mask_type = rgc.type;
1402 if (!mask_type)
1403 continue;
1405 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1407 ok = false;
1408 break;
1411 /* If iv_type is usable as compare type use that - we can elide the
1412 saturation in that case. */
1413 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1415 tree cmp_vectype
1416 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1417 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1418 rgc.compare_type = cmp_vectype;
1420 if (!rgc.compare_type)
1421 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1423 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1424 if (cmp_bits >= min_ni_width
1425 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1427 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1428 if (!cmp_type)
1429 continue;
1431 /* Check whether we can produce the mask with cmp_type. */
1432 tree cmp_vectype
1433 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1434 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1436 rgc.compare_type = cmp_vectype;
1437 break;
1441 if (!rgc.compare_type)
1443 ok = false;
1444 break;
1447 if (!ok)
1449 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1450 return false;
1453 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1454 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1455 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1456 return true;
1459 /* Check whether we can use vector accesses with length based on precision
1460 comparison. So far, to keep it simple, we only allow the case that the
1461 precision of the target-supported length is larger than the precision
1462 required by the loop niters. */
1464 static bool
1465 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1467 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1468 return false;
1470 machine_mode len_load_mode, len_store_mode;
1471 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1472 .exists (&len_load_mode))
1473 return false;
1474 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1475 .exists (&len_store_mode))
1476 return false;
1478 signed char partial_load_bias = internal_len_load_store_bias
1479 (IFN_LEN_LOAD, len_load_mode);
1481 signed char partial_store_bias = internal_len_load_store_bias
1482 (IFN_LEN_STORE, len_store_mode);
1484 gcc_assert (partial_load_bias == partial_store_bias);
1486 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1487 return false;
1489 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1490 len_loads with a length of zero. In order to avoid that we prohibit
1491 more than one loop length here. */
1492 if (partial_load_bias == -1
1493 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1494 return false;
1496 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1498 unsigned int max_nitems_per_iter = 1;
1499 unsigned int i;
1500 rgroup_controls *rgl;
1501 /* Find the maximum number of items per iteration for every rgroup. */
1502 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1504 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1505 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1508 /* Work out how many bits we need to represent the length limit. */
1509 unsigned int min_ni_prec
1510 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1512 /* Now use the maximum of the precisions below for one suitable IV type:
1513 - the IV's natural precision
1514 - the precision needed to hold: the maximum number of scalar
1515 iterations multiplied by the scale factor (min_ni_prec above)
1516 - the Pmode precision
1518 If min_ni_prec is less than the precision of the current niters,
1519 we prefer to still use the niters type. Prefer to use Pmode and
1520 a wider IV to avoid narrow conversions. */
1522 unsigned int ni_prec
1523 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1524 min_ni_prec = MAX (min_ni_prec, ni_prec);
1525 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1527 tree iv_type = NULL_TREE;
1528 opt_scalar_int_mode tmode_iter;
1529 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1531 scalar_mode tmode = tmode_iter.require ();
1532 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1534 /* ??? Do we really want to construct one IV whose precision exceeds
1535 BITS_PER_WORD? */
1536 if (tbits > BITS_PER_WORD)
1537 break;
1539 /* Find the first available standard integral type. */
1540 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1542 iv_type = build_nonstandard_integer_type (tbits, true);
1543 break;
1547 if (!iv_type)
1549 if (dump_enabled_p ())
1550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1551 "can't vectorize with length-based partial vectors"
1552 " because there is no suitable iv type.\n");
1553 return false;
1556 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1557 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1558 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1560 return true;
1563 /* Calculate the cost of one scalar iteration of the loop. */
1564 static void
1565 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1567 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1568 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1569 int nbbs = loop->num_nodes, factor;
1570 int innerloop_iters, i;
1572 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1574 /* Gather costs for statements in the scalar loop. */
1576 /* FORNOW. */
1577 innerloop_iters = 1;
1578 if (loop->inner)
1579 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1581 for (i = 0; i < nbbs; i++)
1583 gimple_stmt_iterator si;
1584 basic_block bb = bbs[i];
1586 if (bb->loop_father == loop->inner)
1587 factor = innerloop_iters;
1588 else
1589 factor = 1;
1591 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1593 gimple *stmt = gsi_stmt (si);
1594 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1596 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1597 continue;
1599 /* Skip stmts that are not vectorized inside the loop. */
1600 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1601 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1602 && (!STMT_VINFO_LIVE_P (vstmt_info)
1603 || !VECTORIZABLE_CYCLE_DEF
1604 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1605 continue;
1607 vect_cost_for_stmt kind;
1608 if (STMT_VINFO_DATA_REF (stmt_info))
1610 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1611 kind = scalar_load;
1612 else
1613 kind = scalar_store;
1615 else if (vect_nop_conversion_p (stmt_info))
1616 continue;
1617 else
1618 kind = scalar_stmt;
1620 /* We are using vect_prologue here to avoid scaling twice
1621 by the inner loop factor. */
1622 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1623 factor, kind, stmt_info, 0, vect_prologue);
1627 /* Now accumulate cost. */
1628 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1629 add_stmt_costs (loop_vinfo->scalar_costs,
1630 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1631 loop_vinfo->scalar_costs->finish_cost (nullptr);
1635 /* Function vect_analyze_loop_form.
1637 Verify that certain CFG restrictions hold, including:
1638 - the loop has a pre-header
1639 - the loop has a single entry and exit
1640 - the loop exit condition is simple enough
1641 - the number of iterations can be analyzed, i.e., it is a countable loop.
1642 The niter may only be analyzable under some assumptions. */
1644 opt_result
1645 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1647 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1649 /* Different restrictions apply when we are considering an inner-most loop,
1650 vs. an outer (nested) loop.
1651 (FORNOW. May want to relax some of these restrictions in the future). */
1653 info->inner_loop_cond = NULL;
1654 if (!loop->inner)
1656 /* Inner-most loop. We currently require that the number of BBs is
1657 exactly 2 (the header and latch). Vectorizable inner-most loops
1658 look like this:
1660 (pre-header)
1662 header <--------+
1663 | | |
1664 | +--> latch --+
1666 (exit-bb) */
1668 if (loop->num_nodes != 2)
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized:"
1671 " control flow in loop.\n");
1673 if (empty_block_p (loop->header))
1674 return opt_result::failure_at (vect_location,
1675 "not vectorized: empty loop.\n");
1677 else
1679 class loop *innerloop = loop->inner;
1680 edge entryedge;
1682 /* Nested loop. We currently require that the loop is doubly-nested,
1683 contains a single inner loop, and the number of BBs is exactly 5.
1684 Vectorizable outer-loops look like this:
1686 (pre-header)
1688 header <---+
1690 inner-loop |
1692 tail ------+
1694 (exit-bb)
1696 The inner-loop has the properties expected of inner-most loops
1697 as described above. */
1699 if ((loop->inner)->inner || (loop->inner)->next)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " multiple nested loops.\n");
1704 if (loop->num_nodes != 5)
1705 return opt_result::failure_at (vect_location,
1706 "not vectorized:"
1707 " control flow in loop.\n");
1709 entryedge = loop_preheader_edge (innerloop);
1710 if (entryedge->src != loop->header
1711 || !single_exit (innerloop)
1712 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1713 return opt_result::failure_at (vect_location,
1714 "not vectorized:"
1715 " unsupported outerloop form.\n");
1717 /* Analyze the inner-loop. */
1718 vect_loop_form_info inner;
1719 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1720 if (!res)
1722 if (dump_enabled_p ())
1723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724 "not vectorized: Bad inner loop.\n");
1725 return res;
1728 /* Don't support analyzing niter under assumptions for inner
1729 loop. */
1730 if (!integer_onep (inner.assumptions))
1731 return opt_result::failure_at (vect_location,
1732 "not vectorized: Bad inner loop.\n");
1734 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1735 return opt_result::failure_at (vect_location,
1736 "not vectorized: inner-loop count not"
1737 " invariant.\n");
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "Considering outer-loop vectorization.\n");
1742 info->inner_loop_cond = inner.loop_cond;
1745 if (!single_exit (loop))
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized: multiple exits.\n");
1748 if (EDGE_COUNT (loop->header->preds) != 2)
1749 return opt_result::failure_at (vect_location,
1750 "not vectorized:"
1751 " too many incoming edges.\n");
1753 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1754 that the loop is represented as a do-while (with a proper if-guard
1755 before the loop if needed), where the loop header contains all the
1756 executable statements, and the latch is empty. */
1757 if (!empty_block_p (loop->latch)
1758 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1759 return opt_result::failure_at (vect_location,
1760 "not vectorized: latch block not empty.\n");
1762 /* Make sure the exit is not abnormal. */
1763 edge e = single_exit (loop);
1764 if (e->flags & EDGE_ABNORMAL)
1765 return opt_result::failure_at (vect_location,
1766 "not vectorized:"
1767 " abnormal loop exit edge.\n");
1769 info->loop_cond
1770 = vect_get_loop_niters (loop, &info->assumptions,
1771 &info->number_of_iterations,
1772 &info->number_of_iterationsm1);
1773 if (!info->loop_cond)
1774 return opt_result::failure_at
1775 (vect_location,
1776 "not vectorized: complicated exit condition.\n");
1778 if (integer_zerop (info->assumptions)
1779 || !info->number_of_iterations
1780 || chrec_contains_undetermined (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations cannot be computed.\n");
1785 if (integer_zerop (info->number_of_iterations))
1786 return opt_result::failure_at
1787 (info->loop_cond,
1788 "not vectorized: number of iterations = 0.\n");
1790 if (!(tree_fits_shwi_p (info->number_of_iterations)
1791 && tree_to_shwi (info->number_of_iterations) > 0))
1793 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_NOTE, vect_location,
1796 "Symbolic number of iterations is ");
1797 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1798 dump_printf (MSG_NOTE, "\n");
1802 return opt_result::success ();
1805 /* Create a loop_vec_info for LOOP with SHARED and the
1806 vect_analyze_loop_form result. */
1808 loop_vec_info
1809 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1810 const vect_loop_form_info *info,
1811 loop_vec_info main_loop_info)
1813 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1814 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1815 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1816 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1817 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1818 /* Also record the assumptions for versioning. */
1819 if (!integer_onep (info->assumptions) && !main_loop_info)
1820 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1822 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1823 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 if (info->inner_loop_cond)
1826 stmt_vec_info inner_loop_cond_info
1827 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1828 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1829 /* If we have an estimate on the number of iterations of the inner
1830 loop, use that to limit the scale for costing; otherwise use
1831 --param vect-inner-loop-cost-factor literally. */
1832 widest_int nit;
1833 if (estimated_stmt_executions (loop->inner, &nit))
1834 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1835 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1838 return loop_vinfo;
1843 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1844 statements, update the vectorization factor. */
1846 static void
1847 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1849 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1850 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1851 int nbbs = loop->num_nodes;
1852 poly_uint64 vectorization_factor;
1853 int i;
1855 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1857 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1858 gcc_assert (known_ne (vectorization_factor, 0U));
1860 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1861 vectorization factor of the loop is the unrolling factor required by
1862 the SLP instances. If that unrolling factor is 1, we say that we
1863 perform pure SLP on the loop - cross-iteration parallelism is not
1864 exploited. */
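/* For illustration only (assumed numbers): with V4SI vectors, an SLP
   instance grouping four scalar stmts per iteration needs no unrolling
   (factor 1, pure SLP), whereas a group of two stmts needs an unrolling
   factor of 2 so that two iterations fill each vector.  */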
1865 bool only_slp_in_loop = true;
1866 for (i = 0; i < nbbs; i++)
1868 basic_block bb = bbs[i];
1869 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1870 gsi_next (&si))
1872 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1873 if (!stmt_info)
1874 continue;
1875 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1876 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1877 && !PURE_SLP_STMT (stmt_info))
1878 /* STMT needs both SLP and loop-based vectorization. */
1879 only_slp_in_loop = false;
1881 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1882 gsi_next (&si))
1884 if (is_gimple_debug (gsi_stmt (si)))
1885 continue;
1886 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1887 stmt_info = vect_stmt_to_vectorize (stmt_info);
1888 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1889 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1890 && !PURE_SLP_STMT (stmt_info))
1891 /* STMT needs both SLP and loop-based vectorization. */
1892 only_slp_in_loop = false;
1896 if (only_slp_in_loop)
1898 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Loop contains only SLP stmts\n");
1901 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1903 else
1905 if (dump_enabled_p ())
1906 dump_printf_loc (MSG_NOTE, vect_location,
1907 "Loop contains SLP and non-SLP stmts\n");
1908 /* Both the vectorization factor and unroll factor have the form
1909 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1910 so they must have a common multiple. */
1911 vectorization_factor
1912 = force_common_multiple (vectorization_factor,
1913 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1916 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1917 if (dump_enabled_p ())
1919 dump_printf_loc (MSG_NOTE, vect_location,
1920 "Updating vectorization factor to ");
1921 dump_dec (MSG_NOTE, vectorization_factor);
1922 dump_printf (MSG_NOTE, ".\n");
1926 /* Return true if STMT_INFO describes a double reduction phi and if
1927 the other phi in the reduction is also relevant for vectorization.
1928 This rejects cases such as:
1930 outer1:
1931 x_1 = PHI <x_3(outer2), ...>;
1934 inner:
1935 x_2 = ...;
1938 outer2:
1939 x_3 = PHI <x_2(inner)>;
1941 if nothing in x_2 or elsewhere makes x_1 relevant. */
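/* For illustration only (an assumed source-level example): a nested summation

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         s += a[i][j];

   creates such a pair of phis for S; the function below only treats the pair
   as an active double reduction if the outer phi is itself relevant.  */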
1943 static bool
1944 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1946 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1947 return false;
1949 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1952 /* Function vect_analyze_loop_operations.
1954 Scan the loop stmts and make sure they are all vectorizable. */
1956 static opt_result
1957 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1959 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1960 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1961 int nbbs = loop->num_nodes;
1962 int i;
1963 stmt_vec_info stmt_info;
1964 bool need_to_vectorize = false;
1965 bool ok;
1967 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1969 auto_vec<stmt_info_for_cost> cost_vec;
1971 for (i = 0; i < nbbs; i++)
1973 basic_block bb = bbs[i];
1975 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1976 gsi_next (&si))
1978 gphi *phi = si.phi ();
1979 ok = true;
1981 stmt_info = loop_vinfo->lookup_stmt (phi);
1982 if (dump_enabled_p ())
1983 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1984 (gimple *) phi);
1985 if (virtual_operand_p (gimple_phi_result (phi)))
1986 continue;
1988 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1989 (i.e., a phi in the tail of the outer-loop). */
1990 if (! is_loop_header_bb_p (bb))
1992 /* FORNOW: we currently don't support the case that these phis
1993 are not used in the outerloop (unless it is double reduction,
1994 i.e., this phi is vect_reduction_def), because this case
1995 would require us to actually do something here. */
1996 if (STMT_VINFO_LIVE_P (stmt_info)
1997 && !vect_active_double_reduction_p (stmt_info))
1998 return opt_result::failure_at (phi,
1999 "Unsupported loop-closed phi"
2000 " in outer-loop.\n");
2002 /* If PHI is used in the outer loop, we check that its operand
2003 is defined in the inner loop. */
2004 if (STMT_VINFO_RELEVANT_P (stmt_info))
2006 tree phi_op;
2008 if (gimple_phi_num_args (phi) != 1)
2009 return opt_result::failure_at (phi, "unsupported phi");
2011 phi_op = PHI_ARG_DEF (phi, 0);
2012 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2013 if (!op_def_info)
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2016 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2017 && (STMT_VINFO_RELEVANT (op_def_info)
2018 != vect_used_in_outer_by_reduction))
2019 return opt_result::failure_at (phi, "unsupported phi\n");
2021 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2022 || (STMT_VINFO_DEF_TYPE (stmt_info)
2023 == vect_double_reduction_def))
2024 && !vectorizable_lc_phi (loop_vinfo,
2025 stmt_info, NULL, NULL))
2026 return opt_result::failure_at (phi, "unsupported phi\n");
2029 continue;
2032 gcc_assert (stmt_info);
2034 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2035 || STMT_VINFO_LIVE_P (stmt_info))
2036 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2037 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2038 /* A scalar-dependence cycle that we don't support. */
2039 return opt_result::failure_at (phi,
2040 "not vectorized:"
2041 " scalar dependence cycle.\n");
2043 if (STMT_VINFO_RELEVANT_P (stmt_info))
2045 need_to_vectorize = true;
2046 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2047 && ! PURE_SLP_STMT (stmt_info))
2048 ok = vectorizable_induction (loop_vinfo,
2049 stmt_info, NULL, NULL,
2050 &cost_vec);
2051 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2052 || (STMT_VINFO_DEF_TYPE (stmt_info)
2053 == vect_double_reduction_def)
2054 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_reduction (loop_vinfo,
2057 stmt_info, NULL, NULL, &cost_vec);
2058 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2059 == vect_first_order_recurrence)
2060 && ! PURE_SLP_STMT (stmt_info))
2061 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2062 &cost_vec);
2065 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2066 if (ok
2067 && STMT_VINFO_LIVE_P (stmt_info)
2068 && !PURE_SLP_STMT (stmt_info))
2069 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2070 -1, false, &cost_vec);
2072 if (!ok)
2073 return opt_result::failure_at (phi,
2074 "not vectorized: relevant phi not "
2075 "supported: %G",
2076 static_cast <gimple *> (phi));
2079 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2080 gsi_next (&si))
2082 gimple *stmt = gsi_stmt (si);
2083 if (!gimple_clobber_p (stmt)
2084 && !is_gimple_debug (stmt))
2086 opt_result res
2087 = vect_analyze_stmt (loop_vinfo,
2088 loop_vinfo->lookup_stmt (stmt),
2089 &need_to_vectorize,
2090 NULL, NULL, &cost_vec);
2091 if (!res)
2092 return res;
2095 } /* bbs */
2097 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2099 /* All operations in the loop are either irrelevant (they deal with loop
2100 control, or are dead), or only used outside the loop and can be moved
2101 out of the loop (e.g. invariants, inductions). The loop can be
2102 optimized away by scalar optimizations. We're better off not
2103 touching this loop. */
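/* For illustration only (an assumed example): in

     for (i = 0; i < n; i++)
       t = x + y;
     use (t);

   the only computation is loop-invariant and only its final value is used,
   so there is nothing left that actually needs to be vectorized.  */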
2104 if (!need_to_vectorize)
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_NOTE, vect_location,
2108 "All the computation can be taken out of the loop.\n");
2109 return opt_result::failure_at
2110 (vect_location,
2111 "not vectorized: redundant loop. no profit to vectorize.\n");
2114 return opt_result::success ();
2117 /* Return true if we know that the iteration count is smaller than the
2118 vectorization factor. Return false if it isn't, or if we can't be sure
2119 either way. */
2121 static bool
2122 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2124 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2126 HOST_WIDE_INT max_niter;
2127 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2128 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2129 else
2130 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2132 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2133 return true;
2135 return false;
2138 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2139 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2140 definitely no, or -1 if it's worth retrying. */
2142 static int
2143 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2144 unsigned *suggested_unroll_factor)
2146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2147 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2149 /* Only loops that can handle partially-populated vectors can have iteration
2150 counts less than the vectorization factor. */
2151 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2152 && vect_known_niters_smaller_than_vf (loop_vinfo))
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156 "not vectorized: iteration count smaller than "
2157 "vectorization factor.\n");
2158 return 0;
2161 /* If we know the number of iterations we can do better: for the
2162 epilogue we can also decide whether the main loop leaves us
2163 with enough iterations, preferring a smaller vector epilogue that is
2164 then also possibly used for the case we skip the vector loop. */
2165 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2167 widest_int scalar_niters
2168 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2169 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2171 loop_vec_info orig_loop_vinfo
2172 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2173 unsigned lowest_vf
2174 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2175 int prolog_peeling = 0;
2176 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2177 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2178 if (prolog_peeling >= 0
2179 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2180 lowest_vf))
2182 unsigned gap
2183 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2184 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2185 % lowest_vf + gap);
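/* For illustration only (hypothetical numbers): with 23 scalar iterations,
   prolog_peeling == 3, lowest_vf == 8 and no gap, the main loop covers
   16 iterations and the epilogue analyzed here is left with
   (23 - 3 - 0) % 8 + 0 == 4 scalar iterations.  */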
2188 /* Reject vectorizing for a single scalar iteration, even if
2189 we could in principle implement that using partial vectors. */
2190 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2191 if (scalar_niters <= peeling_gap + 1)
2193 if (dump_enabled_p ())
2194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2195 "not vectorized: loop only has a single "
2196 "scalar iteration.\n");
2197 return 0;
2200 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2202 /* Check that the loop processes at least one full vector. */
2203 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2204 if (known_lt (scalar_niters, vf))
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "loop does not have enough iterations "
2209 "to support vectorization.\n");
2210 return 0;
2213 /* If we need to peel an extra epilogue iteration to handle data
2214 accesses with gaps, check that there are enough scalar iterations
2215 available.
2217 The check above is redundant with this one when peeling for gaps,
2218 but the distinction is useful for diagnostics. */
2219 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2220 && known_le (scalar_niters, vf))
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "loop does not have enough iterations "
2225 "to support peeling for gaps.\n");
2226 return 0;
2231 /* If using the "very cheap" model, reject cases in which we'd keep
2232 a copy of the scalar code (even if we might be able to vectorize it). */
2233 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2234 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2235 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2236 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2238 if (dump_enabled_p ())
2239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240 "some scalar iterations would need to be peeled\n");
2241 return 0;
2244 int min_profitable_iters, min_profitable_estimate;
2245 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2246 &min_profitable_estimate,
2247 suggested_unroll_factor);
2249 if (min_profitable_iters < 0)
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2253 "not vectorized: vectorization not profitable.\n");
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "not vectorized: vector version will never be "
2257 "profitable.\n");
2258 return -1;
2261 int min_scalar_loop_bound = (param_min_vect_loop_bound
2262 * assumed_vf);
2264 /* Use the cost model only if it is more conservative than user specified
2265 threshold. */
2266 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2267 min_profitable_iters);
2269 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2271 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2272 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2276 "not vectorized: vectorization not profitable.\n");
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_NOTE, vect_location,
2279 "not vectorized: iteration count smaller than user "
2280 "specified loop bound parameter or minimum profitable "
2281 "iterations (whichever is more conservative).\n");
2282 return 0;
2285 /* The static profitability threshold min_profitable_estimate includes
2286 the cost of having to check at runtime whether the scalar loop
2287 should be used instead. If it turns out that we don't need or want
2288 such a check, the threshold we should use for the static estimate
2289 is simply the point at which the vector loop becomes more profitable
2290 than the scalar loop. */
2291 if (min_profitable_estimate > min_profitable_iters
2292 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2293 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2294 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2295 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2299 " choice between the scalar and vector loops\n");
2300 min_profitable_estimate = min_profitable_iters;
2303 /* If the vector loop needs multiple iterations to be beneficial then
2304 things are probably too close to call, and the conservative thing
2305 would be to stick with the scalar code. */
2306 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2307 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "one iteration of the vector loop would be"
2312 " more expensive than the equivalent number of"
2313 " iterations of the scalar loop\n");
2314 return 0;
2317 HOST_WIDE_INT estimated_niter;
2319 /* If we are vectorizing an epilogue then we know the maximum number of
2320 scalar iterations it will cover is at least one lower than the
2321 vectorization factor of the main loop. */
2322 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2323 estimated_niter
2324 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2325 else
2327 estimated_niter = estimated_stmt_executions_int (loop);
2328 if (estimated_niter == -1)
2329 estimated_niter = likely_max_stmt_executions_int (loop);
2331 if (estimated_niter != -1
2332 && ((unsigned HOST_WIDE_INT) estimated_niter
2333 < MAX (th, (unsigned) min_profitable_estimate)))
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "not vectorized: estimated iteration count too "
2338 "small.\n");
2339 if (dump_enabled_p ())
2340 dump_printf_loc (MSG_NOTE, vect_location,
2341 "not vectorized: estimated iteration count smaller "
2342 "than specified loop bound parameter or minimum "
2343 "profitable iterations (whichever is more "
2344 "conservative).\n");
2345 return -1;
2348 return 1;
2351 static opt_result
2352 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2353 vec<data_reference_p> *datarefs,
2354 unsigned int *n_stmts)
2356 *n_stmts = 0;
2357 for (unsigned i = 0; i < loop->num_nodes; i++)
2358 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2359 !gsi_end_p (gsi); gsi_next (&gsi))
2361 gimple *stmt = gsi_stmt (gsi);
2362 if (is_gimple_debug (stmt))
2363 continue;
2364 ++(*n_stmts);
2365 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2366 NULL, 0);
2367 if (!res)
2369 if (is_gimple_call (stmt) && loop->safelen)
2371 tree fndecl = gimple_call_fndecl (stmt), op;
2372 if (fndecl == NULL_TREE
2373 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2375 fndecl = gimple_call_arg (stmt, 0);
2376 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2377 fndecl = TREE_OPERAND (fndecl, 0);
2378 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2380 if (fndecl != NULL_TREE)
2382 cgraph_node *node = cgraph_node::get (fndecl);
2383 if (node != NULL && node->simd_clones != NULL)
2385 unsigned int j, n = gimple_call_num_args (stmt);
2386 for (j = 0; j < n; j++)
2388 op = gimple_call_arg (stmt, j);
2389 if (DECL_P (op)
2390 || (REFERENCE_CLASS_P (op)
2391 && get_base_address (op)))
2392 break;
2394 op = gimple_call_lhs (stmt);
2395 /* Ignore #pragma omp declare simd functions
2396 if they don't have data references in the
2397 call stmt itself. */
2398 if (j == n
2399 && !(op
2400 && (DECL_P (op)
2401 || (REFERENCE_CLASS_P (op)
2402 && get_base_address (op)))))
2403 continue;
2407 return res;
2409 /* If dependence analysis will give up due to the limit on the
2410 number of datarefs, stop here and fail fatally. */
2411 if (datarefs->length ()
2412 > (unsigned)param_loop_max_datarefs_for_datadeps)
2413 return opt_result::failure_at (stmt, "exceeded param "
2414 "loop-max-datarefs-for-datadeps\n");
2416 return opt_result::success ();
2419 /* Look for SLP-only access groups and turn each individual access into its own
2420 group. */
2421 static void
2422 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2424 unsigned int i;
2425 struct data_reference *dr;
2427 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2429 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2430 FOR_EACH_VEC_ELT (datarefs, i, dr)
2432 gcc_assert (DR_REF (dr));
2433 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2435 /* Check if the load is a part of an interleaving chain. */
2436 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2438 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2439 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2440 unsigned int group_size = DR_GROUP_SIZE (first_element);
2442 /* Check if this is an SLP-only group. */
2443 if (!STMT_SLP_TYPE (stmt_info)
2444 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2446 /* Dissolve the group. */
2447 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2449 stmt_vec_info vinfo = first_element;
2450 while (vinfo)
2452 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2453 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2454 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2455 DR_GROUP_SIZE (vinfo) = 1;
2456 if (STMT_VINFO_STRIDED_P (first_element)
2457 /* We cannot handle stores with gaps. */
2458 || DR_IS_WRITE (dr_info->dr))
2460 STMT_VINFO_STRIDED_P (vinfo) = true;
2461 DR_GROUP_GAP (vinfo) = 0;
2463 else
2464 DR_GROUP_GAP (vinfo) = group_size - 1;
2465 /* Duplicate and adjust the alignment info; it needs to
2466 be present on each group leader, see dr_misalignment. */
2467 if (vinfo != first_element)
2469 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2470 dr_info2->target_alignment = dr_info->target_alignment;
2471 int misalignment = dr_info->misalignment;
2472 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2474 HOST_WIDE_INT diff
2475 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2476 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2477 unsigned HOST_WIDE_INT align_c
2478 = dr_info->target_alignment.to_constant ();
2479 misalignment = (misalignment + diff) % align_c;
2481 dr_info2->misalignment = misalignment;
2483 vinfo = next;
2490 /* Determine if operating on full vectors for LOOP_VINFO might leave
2491 some scalar iterations still to do. If so, decide how we should
2492 handle those scalar iterations. The possibilities are:
2494 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2495 In this case:
2497 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2498 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2499 LOOP_VINFO_PEELING_FOR_NITER == false
2501 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2502 to handle the remaining scalar iterations. In this case:
2504 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2505 LOOP_VINFO_PEELING_FOR_NITER == true
2507 There are two choices:
2509 (2a) Consider vectorizing the epilogue loop at the same VF as the
2510 main loop, but using partial vectors instead of full vectors.
2511 In this case:
2513 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2515 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2516 In this case:
2518 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2521 opt_result
2522 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2524 /* Determine whether there would be any scalar iterations left over. */
2525 bool need_peeling_or_partial_vectors_p
2526 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2528 /* Decide whether to vectorize the loop with partial vectors. */
2529 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2530 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2531 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2532 && need_peeling_or_partial_vectors_p)
2534 /* For partial-vector-usage=1, try to push the handling of partial
2535 vectors to the epilogue, with the main loop continuing to operate
2536 on full vectors.
2538 If we are unrolling we also do not want to use partial vectors. This
2539 is to avoid the overhead of generating multiple masks and also to
2540 avoid having to execute entire iterations of FALSE masked instructions
2541 when dealing with one or fewer full iterations.
2543 ??? We could then end up failing to use partial vectors if we
2544 decide to peel iterations into a prologue, and if the main loop
2545 then ends up processing fewer than VF iterations. */
2546 if ((param_vect_partial_vector_usage == 1
2547 || loop_vinfo->suggested_unroll_factor > 1)
2548 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2549 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2550 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2551 else
2552 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2555 if (dump_enabled_p ())
2556 dump_printf_loc (MSG_NOTE, vect_location,
2557 "operating on %s vectors%s.\n",
2558 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2559 ? "partial" : "full",
2560 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2561 ? " for epilogue loop" : "");
2563 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2564 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2565 && need_peeling_or_partial_vectors_p);
2567 return opt_result::success ();
2570 /* Function vect_analyze_loop_2.
2572 Apply a set of analyses on the LOOP specified by LOOP_VINFO; the different
2573 analyses will record information in some members of LOOP_VINFO. FATAL
2574 indicates whether some analysis hit a fatal error. If the pointer
2575 SUGGESTED_UNROLL_FACTOR is non-NULL, it is intended to be filled with the
2576 worked-out suggested unroll factor, while a NULL pointer indicates that we
2577 are going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2578 holds the SLP decision made when the suggested unroll factor was worked
2579 out. */
2580 static opt_result
2581 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2582 unsigned *suggested_unroll_factor,
2583 bool& slp_done_for_suggested_uf)
2585 opt_result ok = opt_result::success ();
2586 int res;
2587 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2588 poly_uint64 min_vf = 2;
2589 loop_vec_info orig_loop_vinfo = NULL;
2591 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2592 loop_vec_info of the first vectorized loop. */
2593 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2594 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2595 else
2596 orig_loop_vinfo = loop_vinfo;
2597 gcc_assert (orig_loop_vinfo);
2599 /* The first group of checks is independent of the vector size. */
2600 fatal = true;
2602 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2603 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2604 return opt_result::failure_at (vect_location,
2605 "not vectorized: simd if(0)\n");
2607 /* Find all data references in the loop (which correspond to vdefs/vuses)
2608 and analyze their evolution in the loop. */
2610 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2612 /* Gather the data references and count stmts in the loop. */
2613 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2615 opt_result res
2616 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2617 &LOOP_VINFO_DATAREFS (loop_vinfo),
2618 &LOOP_VINFO_N_STMTS (loop_vinfo));
2619 if (!res)
2621 if (dump_enabled_p ())
2622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2623 "not vectorized: loop contains function "
2624 "calls or data references that cannot "
2625 "be analyzed\n");
2626 return res;
2628 loop_vinfo->shared->save_datarefs ();
2630 else
2631 loop_vinfo->shared->check_datarefs ();
2633 /* Analyze the data references and also adjust the minimal
2634 vectorization factor according to the loads and stores. */
2636 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2637 if (!ok)
2639 if (dump_enabled_p ())
2640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2641 "bad data references.\n");
2642 return ok;
2645 /* Check if we are applying unroll factor now. */
2646 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2647 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2649 /* If the SLP decision was false when the suggested unroll factor was worked
2650 out, and we are now applying that suggested unroll factor, we can simply skip
2651 all SLP-related analyses this time. */
2652 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2654 /* Classify all cross-iteration scalar data-flow cycles.
2655 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2656 vect_analyze_scalar_cycles (loop_vinfo, slp);
2658 vect_pattern_recog (loop_vinfo);
2660 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2662 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2663 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2665 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2666 if (!ok)
2668 if (dump_enabled_p ())
2669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2670 "bad data access.\n");
2671 return ok;
2674 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2676 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2677 if (!ok)
2679 if (dump_enabled_p ())
2680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2681 "unexpected pattern.\n");
2682 return ok;
2685 /* Errors are no longer fatal from here on, while the rest of the analysis
below depends on it in some way. */
2686 fatal = false;
2688 /* Analyze data dependences between the data-refs in the loop
2689 and adjust the maximum vectorization factor according to
2690 the dependences.
2691 FORNOW: fail at the first data dependence that we encounter. */
2693 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2694 if (!ok)
2696 if (dump_enabled_p ())
2697 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2698 "bad data dependence.\n");
2699 return ok;
2701 if (max_vf != MAX_VECTORIZATION_FACTOR
2702 && maybe_lt (max_vf, min_vf))
2703 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2704 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2706 ok = vect_determine_vectorization_factor (loop_vinfo);
2707 if (!ok)
2709 if (dump_enabled_p ())
2710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2711 "can't determine vectorization factor.\n");
2712 return ok;
2714 if (max_vf != MAX_VECTORIZATION_FACTOR
2715 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2716 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2718 /* Compute the scalar iteration cost. */
2719 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2721 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2723 if (slp)
2725 /* Check the SLP opportunities in the loop, analyze and build
2726 SLP trees. */
2727 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2728 if (!ok)
2729 return ok;
2731 /* If there are any SLP instances mark them as pure_slp. */
2732 slp = vect_make_slp_decision (loop_vinfo);
2733 if (slp)
2735 /* Find stmts that need to be both vectorized and SLPed. */
2736 vect_detect_hybrid_slp (loop_vinfo);
2738 /* Update the vectorization factor based on the SLP decision. */
2739 vect_update_vf_for_slp (loop_vinfo);
2741 /* Optimize the SLP graph with the vectorization factor fixed. */
2742 vect_optimize_slp (loop_vinfo);
2744 /* Gather the loads reachable from the SLP graph entries. */
2745 vect_gather_slp_loads (loop_vinfo);
2749 bool saved_can_use_partial_vectors_p
2750 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2752 /* We don't expect to have to roll back to anything other than an empty
2753 set of rgroups. */
2754 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2756 /* This is the point where we can re-start analysis with SLP forced off. */
2757 start_over:
2759 /* Apply the suggested unrolling factor; this was determined by the backend
2760 during finish_cost the first time we ran the analysis for this
2761 vector mode. */
2762 if (applying_suggested_uf)
2763 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2765 /* Now the vectorization factor is final. */
2766 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2767 gcc_assert (known_ne (vectorization_factor, 0U));
2769 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "vectorization_factor = ");
2773 dump_dec (MSG_NOTE, vectorization_factor);
2774 dump_printf (MSG_NOTE, ", niters = %wd\n",
2775 LOOP_VINFO_INT_NITERS (loop_vinfo));
2778 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2780 /* Analyze the alignment of the data-refs in the loop.
2781 Fail if a data reference is found that cannot be vectorized. */
2783 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2784 if (!ok)
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788 "bad data alignment.\n");
2789 return ok;
2792 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2793 It is important to call pruning after vect_analyze_data_ref_accesses,
2794 since we use grouping information gathered by interleaving analysis. */
2795 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2796 if (!ok)
2797 return ok;
2799 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2800 vectorization, since we do not want to add extra peeling or
2801 add versioning for alignment. */
2802 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2803 /* This pass will decide on using loop versioning and/or loop peeling in
2804 order to enhance the alignment of data references in the loop. */
2805 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2806 if (!ok)
2807 return ok;
2809 if (slp)
2811 /* Analyze operations in the SLP instances. Note this may
2812 remove unsupported SLP instances which makes the above
2813 SLP kind detection invalid. */
2814 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2815 vect_slp_analyze_operations (loop_vinfo);
2816 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2818 ok = opt_result::failure_at (vect_location,
2819 "unsupported SLP instances\n");
2820 goto again;
2823 /* Check whether any load in ALL SLP instances is possibly permuted. */
2824 slp_tree load_node, slp_root;
2825 unsigned i, x;
2826 slp_instance instance;
2827 bool can_use_lanes = true;
2828 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2830 slp_root = SLP_INSTANCE_TREE (instance);
2831 int group_size = SLP_TREE_LANES (slp_root);
2832 tree vectype = SLP_TREE_VECTYPE (slp_root);
2833 bool loads_permuted = false;
2834 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2836 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2837 continue;
2838 unsigned j;
2839 stmt_vec_info load_info;
2840 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2841 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2843 loads_permuted = true;
2844 break;
2848 /* If the loads and stores can be handled with load/store-lane
2849 instructions record it and move on to the next instance. */
2850 if (loads_permuted
2851 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2852 && vect_store_lanes_supported (vectype, group_size, false)
2853 != IFN_LAST)
2855 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2857 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2858 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2859 /* Use SLP for strided accesses (or if we can't
2860 use load-lanes). */
2861 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2862 || vect_load_lanes_supported
2863 (STMT_VINFO_VECTYPE (stmt_vinfo),
2864 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2865 break;
2868 can_use_lanes
2869 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2871 if (can_use_lanes && dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "SLP instance %p can use load/store-lanes\n",
2874 (void *) instance);
2876 else
2878 can_use_lanes = false;
2879 break;
2883 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2884 with SLP disabled. */
2885 if (can_use_lanes)
2887 ok = opt_result::failure_at (vect_location,
2888 "Built SLP cancelled: can use "
2889 "load/store-lanes\n");
2890 if (dump_enabled_p ())
2891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2892 "Built SLP cancelled: all SLP instances support "
2893 "load/store-lanes\n");
2894 goto again;
2898 /* Dissolve SLP-only groups. */
2899 vect_dissolve_slp_only_groups (loop_vinfo);
2901 /* Scan all the remaining operations in the loop that are not subject
2902 to SLP and make sure they are vectorizable. */
2903 ok = vect_analyze_loop_operations (loop_vinfo);
2904 if (!ok)
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "bad operation or unsupported loop bound.\n");
2909 return ok;
2912 /* For now, we don't expect to mix both the masking and the length
2913 approaches for one loop, so disable partial vectors if both are recorded. */
2914 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2915 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2916 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2918 if (dump_enabled_p ())
2919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2920 "can't vectorize a loop with partial vectors"
2921 " because we don't expect to mix different"
2922 " approaches with partial vectors for the"
2923 " same loop.\n");
2924 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2927 /* If we still have the option of using partial vectors,
2928 check whether we can generate the necessary loop controls. */
2929 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2931 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2933 if (!vect_verify_full_masking (loop_vinfo)
2934 && !vect_verify_full_masking_avx512 (loop_vinfo))
2935 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2937 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2938 if (!vect_verify_loop_lens (loop_vinfo))
2939 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2942 /* If we're vectorizing a loop that uses length "controls" and
2943 can iterate more than once, we apply the decrementing IV approach
2944 to the loop control. */
2945 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2946 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2947 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2948 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2949 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2950 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2951 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2953 /* If a loop uses length controls and has a decrementing loop control IV,
2954 we will normally pass that IV through a MIN_EXPR to calculate the
2955 basis for the length controls. E.g. in a loop that processes one
2956 element per scalar iteration, the number of elements would be
2957 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2959 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2960 step, since only the final iteration of the vector loop can have
2961 inactive lanes.
2963 However, some targets have a dedicated instruction for calculating the
2964 preferred length, given the total number of elements that still need to
2965 be processed. This is encapsulated in the SELECT_VL internal function.
2967 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2968 to determine the basis for the length controls. However, unlike the
2969 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2970 lanes inactive in any iteration of the vector loop, not just the last
2971 iteration. This SELECT_VL approach therefore requires us to use pointer
2972 IVs with variable steps.
2974 Once we've decided how many elements should be processed by one
2975 iteration of the vector loop, we need to populate the rgroup controls.
2976 If a loop has multiple rgroups, we need to make sure that those rgroups
2977 "line up" (that is, they must be consistent about which elements are
2978 active and which aren't). This is done by vect_adjust_loop_lens_control.
2980 In principle, it would be possible to use vect_adjust_loop_lens_control
2981 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2982 However:
2984 (1) In practice, it only makes sense to use SELECT_VL when a vector
2985 operation will be controlled directly by the result. It is not
2986 worth using SELECT_VL if it would only be the input to other
2987 calculations.
2989 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2990 pointer IV will need N updates by a variable amount (N-1 updates
2991 within the iteration and 1 update to move to the next iteration).
2993 Because of this, we prefer to use the MIN_EXPR approach whenever there
2994 is more than one length control.
2996 In addition, SELECT_VL always operates to a granularity of 1 unit.
2997 If we wanted to use it to control an SLP operation on N consecutive
2998 elements, we would need to make the SELECT_VL inputs measure scalar
2999 iterations (rather than elements) and then multiply the SELECT_VL
3000 result by N. But using SELECT_VL this way is inefficient because
3001 of (1) above.
3003 In addition, we don't apply SELECT_VL to a single-rgroup loop when both
3004 of the following are satisfied:
3006 (1) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3007 (2) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3009 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3010 we would also lose the benefits of the subsequent unroll optimizations.
3011 We prefer using the MIN_EXPR approach in this situation. */
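/* For illustration only (a sketch, not the code actually generated): with a
   single length control and VF elements per vector iteration, the MIN_EXPR
   scheme behaves roughly like

     remaining = n;
     do
       {
         len = MIN (remaining, VF);
         ... process LEN elements ...
         remaining -= len;
       }
     while (remaining > 0);

   so only the last iteration can be partial, whereas SELECT_VL may return a
   value smaller than VF in any iteration, which is why pointer IVs must then
   step by the variable LEN.  */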
3012 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3014 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3015 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3016 OPTIMIZE_FOR_SPEED)
3017 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3018 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3019 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3020 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3021 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3024 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3025 assuming that the loop will be used as a main loop. We will redo
3026 this analysis later if we instead decide to use the loop as an
3027 epilogue loop. */
3028 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3029 if (!ok)
3030 return ok;
3032 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3033 to be able to handle fewer than VF scalars, or needs to have a lower VF
3034 than the main loop. */
3035 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3036 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3038 poly_uint64 unscaled_vf
3039 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3040 orig_loop_vinfo->suggested_unroll_factor);
3041 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3042 return opt_result::failure_at (vect_location,
3043 "Vectorization factor too high for"
3044 " epilogue loop.\n");
3047 /* Check the costings of the loop make vectorizing worthwhile. */
3048 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3049 if (res < 0)
3051 ok = opt_result::failure_at (vect_location,
3052 "Loop costings may not be worthwhile.\n");
3053 goto again;
3055 if (!res)
3056 return opt_result::failure_at (vect_location,
3057 "Loop costings not worthwhile.\n");
3059 /* If an epilogue loop is required make sure we can create one. */
3060 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3061 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3063 if (dump_enabled_p ())
3064 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3065 if (!vect_can_advance_ivs_p (loop_vinfo)
3066 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3067 single_exit (LOOP_VINFO_LOOP
3068 (loop_vinfo))))
3070 ok = opt_result::failure_at (vect_location,
3071 "not vectorized: can't create required "
3072 "epilog loop\n");
3073 goto again;
3077 /* During peeling, we need to check if the number of loop iterations is
3078 enough for both the peeled prolog loop and the vector loop. This check
3079 can be merged with the threshold check of loop versioning, so
3080 increase the threshold for this case if necessary.
3082 If we are analyzing an epilogue we still want to check what its
3083 versioning threshold would be. If we decide to vectorize the epilogues we
3084 will want to use the lowest versioning threshold of all epilogues and main
3085 loop. This will enable us to enter a vectorized epilogue even when
3086 versioning the loop. We can't simply check whether the epilogue requires
3087 versioning though since we may have skipped some versioning checks when
3088 analyzing the epilogue. For instance, checks for alias versioning will be
3089 skipped when dealing with epilogues as we assume we already checked them
3090 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3091 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3093 poly_uint64 niters_th = 0;
3094 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3096 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3098 /* Niters for peeled prolog loop. */
3099 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3101 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3102 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3103 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3105 else
3106 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3109 /* Niters for at least one iteration of vectorized loop. */
3110 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3111 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3112 /* One additional iteration because of peeling for gap. */
3113 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3114 niters_th += 1;
3116 /* Use the same condition as vect_transform_loop to decide when to use
3117 the cost to determine a versioning threshold. */
3118 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3119 && ordered_p (th, niters_th))
3120 niters_th = ordered_max (poly_uint64 (th), niters_th);
3122 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3125 gcc_assert (known_eq (vectorization_factor,
3126 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3128 slp_done_for_suggested_uf = slp;
3130 /* Ok to vectorize! */
3131 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3132 return opt_result::success ();
3134 again:
3135 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3136 gcc_assert (!ok);
3138 /* Try again with SLP forced off but if we didn't do any SLP there is
3139 no point in re-trying. */
3140 if (!slp)
3141 return ok;
3143 /* If the SLP decision was true when the suggested unroll factor was worked
3144 out, and we are applying that suggested unroll factor, we don't need to
3145 re-try any more. */
3146 if (applying_suggested_uf && slp_done_for_suggested_uf)
3147 return ok;
3149 /* If there are reduction chains re-trying will fail anyway. */
3150 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3151 return ok;
3153 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3154 via interleaving or lane instructions. */
3155 slp_instance instance;
3156 slp_tree node;
3157 unsigned i, j;
3158 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3160 stmt_vec_info vinfo;
3161 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3162 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3163 continue;
3164 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3165 unsigned int size = DR_GROUP_SIZE (vinfo);
3166 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3167 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3168 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3169 && ! vect_grouped_store_supported (vectype, size))
3170 return opt_result::failure_at (vinfo->stmt,
3171 "unsupported grouped store\n");
3172 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3174 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3175 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3176 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3177 size = DR_GROUP_SIZE (vinfo);
3178 vectype = STMT_VINFO_VECTYPE (vinfo);
3179 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3180 && ! vect_grouped_load_supported (vectype, single_element_p,
3181 size))
3182 return opt_result::failure_at (vinfo->stmt,
3183 "unsupported grouped load\n");
3187 if (dump_enabled_p ())
3188 dump_printf_loc (MSG_NOTE, vect_location,
3189 "re-trying with SLP disabled\n");
3191 /* Roll back state appropriately. No SLP this time. */
3192 slp = false;
3193 /* Restore vectorization factor as it were without SLP. */
3194 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3195 /* Free the SLP instances. */
3196 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3197 vect_free_slp_instance (instance);
3198 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3199 /* Reset SLP type to loop_vect on all stmts. */
3200 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3202 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3203 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3204 !gsi_end_p (si); gsi_next (&si))
3206 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3207 STMT_SLP_TYPE (stmt_info) = loop_vect;
3208 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3209 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3211 /* vectorizable_reduction adjusts reduction stmt def-types,
3212 restore them to that of the PHI. */
3213 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3214 = STMT_VINFO_DEF_TYPE (stmt_info);
3215 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3216 (STMT_VINFO_REDUC_DEF (stmt_info)))
3217 = STMT_VINFO_DEF_TYPE (stmt_info);
3220 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3221 !gsi_end_p (si); gsi_next (&si))
3223 if (is_gimple_debug (gsi_stmt (si)))
3224 continue;
3225 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3226 STMT_SLP_TYPE (stmt_info) = loop_vect;
3227 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3229 stmt_vec_info pattern_stmt_info
3230 = STMT_VINFO_RELATED_STMT (stmt_info);
3231 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3232 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3234 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3235 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3236 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3237 !gsi_end_p (pi); gsi_next (&pi))
3238 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3239 = loop_vect;
3243 /* Free optimized alias test DDRS. */
3244 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3245 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3246 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3247 /* Reset target cost data. */
3248 delete loop_vinfo->vector_costs;
3249 loop_vinfo->vector_costs = nullptr;
3250 /* Reset accumulated rgroup information. */
3251 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3252 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3253 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3254 /* Reset assorted flags. */
3255 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3256 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3257 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3258 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3259 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3260 = saved_can_use_partial_vectors_p;
3262 goto start_over;
3265 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3266 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3267 OLD_LOOP_VINFO is better unless something specifically indicates
3268 otherwise.
3270 Note that this deliberately isn't a partial order. */
3272 static bool
3273 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3274 loop_vec_info old_loop_vinfo)
3276 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3277 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3279 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3280 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3282 /* Always prefer a VF of loop->simdlen over any other VF. */
3283 if (loop->simdlen)
3285 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3286 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3287 if (new_simdlen_p != old_simdlen_p)
3288 return new_simdlen_p;
3291 const auto *old_costs = old_loop_vinfo->vector_costs;
3292 const auto *new_costs = new_loop_vinfo->vector_costs;
3293 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3294 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3296 return new_costs->better_main_loop_than_p (old_costs);
3299 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3300 true if we should. */
3302 static bool
3303 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3304 loop_vec_info old_loop_vinfo)
3306 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3307 return false;
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_NOTE, vect_location,
3311 "***** Preferring vector mode %s to vector mode %s\n",
3312 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3313 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3314 return true;
3317 /* Analyze LOOP with VECTOR_MODES[MODE_I], and as an epilogue if MAIN_LOOP_VINFO
3318 is not NULL. Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is VOIDmode
3319 and advance MODE_I to the next mode useful to analyze.
3320 Return the loop_vinfo on success and a wrapped null on failure. */
3322 static opt_loop_vec_info
3323 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3324 const vect_loop_form_info *loop_form_info,
3325 loop_vec_info main_loop_vinfo,
3326 const vector_modes &vector_modes, unsigned &mode_i,
3327 machine_mode &autodetected_vector_mode,
3328 bool &fatal)
3330 loop_vec_info loop_vinfo
3331 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3333 machine_mode vector_mode = vector_modes[mode_i];
3334 loop_vinfo->vector_mode = vector_mode;
3335 unsigned int suggested_unroll_factor = 1;
3336 bool slp_done_for_suggested_uf = false;
3338 /* Run the main analysis. */
3339 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3340 &suggested_unroll_factor,
3341 slp_done_for_suggested_uf);
3342 if (dump_enabled_p ())
3343 dump_printf_loc (MSG_NOTE, vect_location,
3344 "***** Analysis %s with vector mode %s\n",
3345 res ? "succeeded" : " failed",
3346 GET_MODE_NAME (loop_vinfo->vector_mode));
3348 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3350 if (dump_enabled_p ())
3351 dump_printf_loc (MSG_NOTE, vect_location,
3352 "***** Re-trying analysis for unrolling"
3353 " with unroll factor %d and slp %s.\n",
3354 suggested_unroll_factor,
3355 slp_done_for_suggested_uf ? "on" : "off");
3356 loop_vec_info unroll_vinfo
3357 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3358 unroll_vinfo->vector_mode = vector_mode;
3359 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3360 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3361 slp_done_for_suggested_uf);
3362 if (new_res)
3364 delete loop_vinfo;
3365 loop_vinfo = unroll_vinfo;
3367 else
3368 delete unroll_vinfo;
3371 /* Remember the autodetected vector mode. */
3372 if (vector_mode == VOIDmode)
3373 autodetected_vector_mode = loop_vinfo->vector_mode;
3375 /* Advance mode_i, first skipping modes that would result in the
3376 same analysis result. */
3377 while (mode_i + 1 < vector_modes.length ()
3378 && vect_chooses_same_modes_p (loop_vinfo,
3379 vector_modes[mode_i + 1]))
3381 if (dump_enabled_p ())
3382 dump_printf_loc (MSG_NOTE, vect_location,
3383 "***** The result for vector mode %s would"
3384 " be the same\n",
3385 GET_MODE_NAME (vector_modes[mode_i + 1]));
3386 mode_i += 1;
3388 if (mode_i + 1 < vector_modes.length ()
3389 && VECTOR_MODE_P (autodetected_vector_mode)
3390 && (related_vector_mode (vector_modes[mode_i + 1],
3391 GET_MODE_INNER (autodetected_vector_mode))
3392 == autodetected_vector_mode)
3393 && (related_vector_mode (autodetected_vector_mode,
3394 GET_MODE_INNER (vector_modes[mode_i + 1]))
3395 == vector_modes[mode_i + 1]))
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "***** Skipping vector mode %s, which would"
3400 " repeat the analysis for %s\n",
3401 GET_MODE_NAME (vector_modes[mode_i + 1]),
3402 GET_MODE_NAME (autodetected_vector_mode));
3403 mode_i += 1;
3405 mode_i++;
3407 if (!res)
3409 delete loop_vinfo;
3410 if (fatal)
3411 gcc_checking_assert (main_loop_vinfo == NULL);
3412 return opt_loop_vec_info::propagate_failure (res);
3415 return opt_loop_vec_info::success (loop_vinfo);
3418 /* Function vect_analyze_loop.
3420 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3421 for it. The different analyses will record information in the
3422 loop_vec_info struct. */
3423 opt_loop_vec_info
3424 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3426 DUMP_VECT_SCOPE ("analyze_loop_nest");
3428 if (loop_outer (loop)
3429 && loop_vec_info_for_loop (loop_outer (loop))
3430 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3431 return opt_loop_vec_info::failure_at (vect_location,
3432 "outer-loop already vectorized.\n");
3434 if (!find_loop_nest (loop, &shared->loop_nest))
3435 return opt_loop_vec_info::failure_at
3436 (vect_location,
3437 "not vectorized: loop nest containing two or more consecutive inner"
3438 " loops cannot be vectorized\n");
3440 /* Analyze the loop form. */
3441 vect_loop_form_info loop_form_info;
3442 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3443 if (!res)
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3447 "bad loop form.\n");
3448 return opt_loop_vec_info::propagate_failure (res);
3450 if (!integer_onep (loop_form_info.assumptions))
3452 /* We consider vectorizing this loop by versioning it under
3453 some assumptions. In order to do this, we need to clear
3454 existing information computed by the scev and niter analyzers. */
3455 scev_reset_htab ();
3456 free_numbers_of_iterations_estimates (loop);
3457 /* Also set a flag for this loop so that the following scev and niter
3458 analyses are done under the assumptions. */
3459 loop_constraint_set (loop, LOOP_C_FINITE);
3462 auto_vector_modes vector_modes;
3463 /* Autodetect first vector size we try. */
3464 vector_modes.safe_push (VOIDmode);
3465 unsigned int autovec_flags
3466 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3467 loop->simdlen != 0);
3468 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3469 && !unlimited_cost_model (loop));
3470 machine_mode autodetected_vector_mode = VOIDmode;
3471 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3472 unsigned int mode_i = 0;
3473 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3475 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3476 a mode has not been analyzed. */
3477 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3478 for (unsigned i = 0; i < vector_modes.length (); ++i)
3479 cached_vf_per_mode.safe_push (0);
3481 /* First determine the main loop vectorization mode, either the first
3482 one that works, starting with auto-detecting the vector mode and then
3483 following the targets order of preference, or the one with the
3484 lowest cost if pick_lowest_cost_p. */
3485 while (1)
3487 bool fatal;
3488 unsigned int last_mode_i = mode_i;
3489 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3490 failed. */
3491 cached_vf_per_mode[last_mode_i] = -1;
3492 opt_loop_vec_info loop_vinfo
3493 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3494 NULL, vector_modes, mode_i,
3495 autodetected_vector_mode, fatal);
3496 if (fatal)
3497 break;
3499 if (loop_vinfo)
3501 /* Analysis has been successful so update the VF value. The
3502 VF should always be a multiple of unroll_factor and we want to
3503 capture the original VF here. */
3504 cached_vf_per_mode[last_mode_i]
3505 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3506 loop_vinfo->suggested_unroll_factor);
3507 /* Once we hit the desired simdlen for the first time,
3508 discard any previous attempts. */
3509 if (simdlen
3510 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3512 delete first_loop_vinfo;
3513 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3514 simdlen = 0;
3516 else if (pick_lowest_cost_p
3517 && first_loop_vinfo
3518 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3520 /* Pick loop_vinfo over first_loop_vinfo. */
3521 delete first_loop_vinfo;
3522 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3524 if (first_loop_vinfo == NULL)
3525 first_loop_vinfo = loop_vinfo;
3526 else
3528 delete loop_vinfo;
3529 loop_vinfo = opt_loop_vec_info::success (NULL);
3532 /* Commit to first_loop_vinfo if we have no reason to try
3533 alternatives. */
3534 if (!simdlen && !pick_lowest_cost_p)
3535 break;
3537 if (mode_i == vector_modes.length ()
3538 || autodetected_vector_mode == VOIDmode)
3539 break;
3541 /* Try the next biggest vector size. */
3542 if (dump_enabled_p ())
3543 dump_printf_loc (MSG_NOTE, vect_location,
3544 "***** Re-trying analysis with vector mode %s\n",
3545 GET_MODE_NAME (vector_modes[mode_i]));
3547 if (!first_loop_vinfo)
3548 return opt_loop_vec_info::propagate_failure (res);
3550 if (dump_enabled_p ())
3551 dump_printf_loc (MSG_NOTE, vect_location,
3552 "***** Choosing vector mode %s\n",
3553 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3555 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3556 enabled, SIMDUID is not set, it is the innermost loop and we have
3557 either already found the loop's SIMDLEN or there was no SIMDLEN to
3558 begin with.
3559 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3560 bool vect_epilogues = (!simdlen
3561 && loop->inner == NULL
3562 && param_vect_epilogues_nomask
3563 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3564 && !loop->simduid);
3565 if (!vect_epilogues)
3566 return first_loop_vinfo;
3568 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3569 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3571 /* For epilogues start the analysis from the first mode. The motivation
3572 behind starting from the beginning comes from cases where the VECTOR_MODES
3573 array may contain length-agnostic and length-specific modes. Their
3574 ordering is not guaranteed, so we could end up picking a mode for the main
3575 loop that is after the epilogue's optimal mode. */
3576 vector_modes[0] = autodetected_vector_mode;
3577 mode_i = 0;
3579 bool supports_partial_vectors =
3580 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3581 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3583 while (1)
3585 /* If the target does not support partial vectors we can shorten the
3586 number of modes to analyze for the epilogue as we know we can't pick a
3587 mode that would lead to a VF at least as big as the
3588 FIRST_VINFO_VF. */
3589 if (!supports_partial_vectors
3590 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3592 mode_i++;
3593 if (mode_i == vector_modes.length ())
3594 break;
3595 continue;
3598 if (dump_enabled_p ())
3599 dump_printf_loc (MSG_NOTE, vect_location,
3600 "***** Re-trying epilogue analysis with vector "
3601 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3603 bool fatal;
3604 opt_loop_vec_info loop_vinfo
3605 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3606 first_loop_vinfo,
3607 vector_modes, mode_i,
3608 autodetected_vector_mode, fatal);
3609 if (fatal)
3610 break;
3612 if (loop_vinfo)
3614 if (pick_lowest_cost_p)
3616 /* Keep trying to roll back vectorization attempts while the
3617 loop_vec_infos they produced were worse than this one. */
3618 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3619 while (!vinfos.is_empty ()
3620 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3622 gcc_assert (vect_epilogues);
3623 delete vinfos.pop ();
3626 /* For now only allow one epilogue loop. */
3627 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3629 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3630 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3631 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3632 || maybe_ne (lowest_th, 0U));
3633 /* Keep track of the known smallest versioning
3634 threshold. */
3635 if (ordered_p (lowest_th, th))
3636 lowest_th = ordered_min (lowest_th, th);
3638 else
3640 delete loop_vinfo;
3641 loop_vinfo = opt_loop_vec_info::success (NULL);
3644 /* For now only allow one epilogue loop, but allow
3645 pick_lowest_cost_p to replace it, so commit to the
3646 first epilogue if we have no reason to try alternatives. */
3647 if (!pick_lowest_cost_p)
3648 break;
3651 if (mode_i == vector_modes.length ())
3652 break;
3656 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3658 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location,
3661 "***** Choosing epilogue vector mode %s\n",
3662 GET_MODE_NAME
3663 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3666 return first_loop_vinfo;
3669 /* Return true if there is an in-order reduction function for CODE, storing
3670 it in *REDUC_FN if so. */
3672 static bool
3673 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3675 if (code == PLUS_EXPR)
3677 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3678 return true;
3680 return false;
3683 /* Function reduction_fn_for_scalar_code
3685 Input:
3686 CODE - tree_code of a reduction operation.
3688 Output:
3689 REDUC_FN - the corresponding internal function to be used to reduce the
3690 vector of partial results into a single scalar result, or IFN_LAST
3691 if the operation is a supported reduction operation, but does not have
3692 such an internal function.
3694 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3696 bool
3697 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3699 if (code.is_tree_code ())
3700 switch (tree_code (code))
3702 case MAX_EXPR:
3703 *reduc_fn = IFN_REDUC_MAX;
3704 return true;
3706 case MIN_EXPR:
3707 *reduc_fn = IFN_REDUC_MIN;
3708 return true;
3710 case PLUS_EXPR:
3711 *reduc_fn = IFN_REDUC_PLUS;
3712 return true;
3714 case BIT_AND_EXPR:
3715 *reduc_fn = IFN_REDUC_AND;
3716 return true;
3718 case BIT_IOR_EXPR:
3719 *reduc_fn = IFN_REDUC_IOR;
3720 return true;
3722 case BIT_XOR_EXPR:
3723 *reduc_fn = IFN_REDUC_XOR;
3724 return true;
3726 case MULT_EXPR:
3727 case MINUS_EXPR:
3728 *reduc_fn = IFN_LAST;
3729 return true;
3731 default:
3732 return false;
3734 else
3735 switch (combined_fn (code))
3737 CASE_CFN_FMAX:
3738 *reduc_fn = IFN_REDUC_FMAX;
3739 return true;
3741 CASE_CFN_FMIN:
3742 *reduc_fn = IFN_REDUC_FMIN;
3743 return true;
3745 default:
3746 return false;
3750 /* If there is a neutral value X such that a reduction would not be affected
3751 by the introduction of additional X elements, return that X, otherwise
3752 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3753 of the scalar elements. If the reduction has just a single initial value
3754 then INITIAL_VALUE is that value, otherwise it is null. */
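   /* For example, x + 0 == x and x * 1 == x, so zero and one are the
      neutral values for PLUS_EXPR and MULT_EXPR respectively; MIN_EXPR
      and MAX_EXPR have no such constant, so the single initial value
      itself is used instead, when one is available.  */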
3756 tree
3757 neutral_op_for_reduction (tree scalar_type, code_helper code,
3758 tree initial_value)
3760 if (code.is_tree_code ())
3761 switch (tree_code (code))
3763 case WIDEN_SUM_EXPR:
3764 case DOT_PROD_EXPR:
3765 case SAD_EXPR:
3766 case PLUS_EXPR:
3767 case MINUS_EXPR:
3768 case BIT_IOR_EXPR:
3769 case BIT_XOR_EXPR:
3770 return build_zero_cst (scalar_type);
3772 case MULT_EXPR:
3773 return build_one_cst (scalar_type);
3775 case BIT_AND_EXPR:
3776 return build_all_ones_cst (scalar_type);
3778 case MAX_EXPR:
3779 case MIN_EXPR:
3780 return initial_value;
3782 default:
3783 return NULL_TREE;
3785 else
3786 switch (combined_fn (code))
3788 CASE_CFN_FMIN:
3789 CASE_CFN_FMAX:
3790 return initial_value;
3792 default:
3793 return NULL_TREE;
3797 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3798 STMT is printed with a message MSG. */
3800 static void
3801 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3803 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3806 /* Return true if we need an in-order (fold-left) reduction for operation
3807 CODE on type TYPE, i.e. if the operation cannot safely be reassociated
3808 for that type. */
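   /* A rough illustration: in single-precision float arithmetic
      (1.0e30f + -1.0e30f) + 1.0f is 1.0f, whereas
      1.0e30f + (-1.0e30f + 1.0f) is 0.0f, so a float PLUS_EXPR reduction
      must be evaluated in order unless -fassociative-math is in effect.  */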
3810 bool
3811 needs_fold_left_reduction_p (tree type, code_helper code)
3813 /* CHECKME: check for !flag_finite_math_only too? */
3814 if (SCALAR_FLOAT_TYPE_P (type))
3816 if (code.is_tree_code ())
3817 switch (tree_code (code))
3819 case MIN_EXPR:
3820 case MAX_EXPR:
3821 return false;
3823 default:
3824 return !flag_associative_math;
3826 else
3827 switch (combined_fn (code))
3829 CASE_CFN_FMIN:
3830 CASE_CFN_FMAX:
3831 return false;
3833 default:
3834 return !flag_associative_math;
3838 if (INTEGRAL_TYPE_P (type))
3839 return (!code.is_tree_code ()
3840 || !operation_no_trapping_overflow (type, tree_code (code)));
3842 if (SAT_FIXED_POINT_TYPE_P (type))
3843 return true;
3845 return false;
3848 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3849 has a handled computation expression. Store the main reduction
3850 operation in *CODE. */
3852 static bool
3853 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3854 tree loop_arg, code_helper *code,
3855 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3857 auto_bitmap visited;
3858 tree lookfor = PHI_RESULT (phi);
3859 ssa_op_iter curri;
3860 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3861 while (USE_FROM_PTR (curr) != loop_arg)
3862 curr = op_iter_next_use (&curri);
3863 curri.i = curri.numops;
3866 path.safe_push (std::make_pair (curri, curr));
3867 tree use = USE_FROM_PTR (curr);
3868 if (use == lookfor)
3869 break;
3870 gimple *def = SSA_NAME_DEF_STMT (use);
3871 if (gimple_nop_p (def)
3872 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3874 pop:
3877 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3878 curri = x.first;
3879 curr = x.second;
3881 curr = op_iter_next_use (&curri);
3882 /* Skip already visited or non-SSA operands (from iterating
3883 over PHI args). */
3884 while (curr != NULL_USE_OPERAND_P
3885 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3886 || ! bitmap_set_bit (visited,
3887 SSA_NAME_VERSION
3888 (USE_FROM_PTR (curr)))));
3890 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3891 if (curr == NULL_USE_OPERAND_P)
3892 break;
3894 else
3896 if (gimple_code (def) == GIMPLE_PHI)
3897 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3898 else
3899 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3900 while (curr != NULL_USE_OPERAND_P
3901 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3902 || ! bitmap_set_bit (visited,
3903 SSA_NAME_VERSION
3904 (USE_FROM_PTR (curr)))))
3905 curr = op_iter_next_use (&curri);
3906 if (curr == NULL_USE_OPERAND_P)
3907 goto pop;
3910 while (1);
3911 if (dump_file && (dump_flags & TDF_DETAILS))
3913 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3914 unsigned i;
3915 std::pair<ssa_op_iter, use_operand_p> *x;
3916 FOR_EACH_VEC_ELT (path, i, x)
3917 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3918 dump_printf (MSG_NOTE, "\n");
3921 /* Check whether the reduction path detected is valid. */
3922 bool fail = path.length () == 0;
3923 bool neg = false;
3924 int sign = -1;
3925 *code = ERROR_MARK;
3926 for (unsigned i = 1; i < path.length (); ++i)
3928 gimple *use_stmt = USE_STMT (path[i].second);
3929 gimple_match_op op;
3930 if (!gimple_extract_op (use_stmt, &op))
3932 fail = true;
3933 break;
3935 unsigned int opi = op.num_ops;
3936 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3938 /* The following makes sure we can compute the operand index
3939 easily; it also mostly disallows chaining via COND_EXPR condition
3940 operands. */
3941 for (opi = 0; opi < op.num_ops; ++opi)
3942 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3943 break;
3945 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3947 for (opi = 0; opi < op.num_ops; ++opi)
3948 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3949 break;
3951 if (opi == op.num_ops)
3953 fail = true;
3954 break;
3956 op.code = canonicalize_code (op.code, op.type);
3957 if (op.code == MINUS_EXPR)
3959 op.code = PLUS_EXPR;
3960 /* Track whether we negate the reduction value each iteration. */
3961 if (op.ops[1] == op.ops[opi])
3962 neg = ! neg;
3964 if (CONVERT_EXPR_CODE_P (op.code)
3965 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3967 else if (*code == ERROR_MARK)
3969 *code = op.code;
3970 sign = TYPE_SIGN (op.type);
3972 else if (op.code != *code)
3974 fail = true;
3975 break;
3977 else if ((op.code == MIN_EXPR
3978 || op.code == MAX_EXPR)
3979 && sign != TYPE_SIGN (op.type))
3981 fail = true;
3982 break;
3984 /* Check that the op is used on only a single stmt. For the
3985 non-value-changing tail and the last stmt, allow out-of-loop uses.
3986 ??? We could relax this and handle arbitrary live stmts by
3987 forcing a scalar epilogue for example. */
3988 imm_use_iterator imm_iter;
3989 gimple *op_use_stmt;
3990 unsigned cnt = 0;
3991 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3992 if (!is_gimple_debug (op_use_stmt)
3993 && (*code != ERROR_MARK
3994 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3996 /* We want to allow x + x but not x < 1 ? x : 2. */
3997 if (is_gimple_assign (op_use_stmt)
3998 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
4000 use_operand_p use_p;
4001 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4002 cnt++;
4004 else
4005 cnt++;
4007 if (cnt != 1)
4009 fail = true;
4010 break;
4013 return ! fail && ! neg && *code != ERROR_MARK;
4016 bool
4017 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4018 tree loop_arg, enum tree_code code)
4020 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4021 code_helper code_;
4022 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4023 && code_ == code);
4028 /* Function vect_is_simple_reduction
4030 (1) Detect a cross-iteration def-use cycle that represents a simple
4031 reduction computation. We look for the following pattern:
4033 loop_header:
4034 a1 = phi < a0, a2 >
4035 a3 = ...
4036 a2 = operation (a3, a1)
4040 a3 = ...
4041 loop_header:
4042 a1 = phi < a0, a2 >
4043 a2 = operation (a3, a1)
4045 such that:
4046 1. operation is commutative and associative and it is safe to
4047 change the order of the computation
4048 2. no uses for a2 in the loop (a2 is used out of the loop)
4049 3. no uses of a1 in the loop besides the reduction operation
4050 4. no uses of a1 outside the loop.
4052 Conditions 1,4 are tested here.
4053 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4055 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4056 nested cycles.
4058 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4059 reductions:
4061 a1 = phi < a0, a2 >
4062 inner loop (def of a3)
4063 a2 = phi < a3 >
4065 (4) Detect condition expressions, i.e.:
4066 for (int i = 0; i < N; i++)
4067 if (a[i] < val)
4068 ret_val = a[i];
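   As a concrete sketch of pattern (1) above, a scalar sum reduction like

     for (i = 0; i < N; i++)
       sum += a[i];

   is seen by the vectorizer roughly as

     loop_header:
       sum_1 = phi < sum_0, sum_2 >
       t_3 = a[i];
       sum_2 = t_3 + sum_1;

   with a1 = sum_1, a2 = sum_2 and a3 = t_3 in the notation above.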
4072 static stmt_vec_info
4073 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4074 bool *double_reduc, bool *reduc_chain_p, bool slp)
4076 gphi *phi = as_a <gphi *> (phi_info->stmt);
4077 gimple *phi_use_stmt = NULL;
4078 imm_use_iterator imm_iter;
4079 use_operand_p use_p;
4081 *double_reduc = false;
4082 *reduc_chain_p = false;
4083 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4085 tree phi_name = PHI_RESULT (phi);
4086 /* ??? If there are no uses of the PHI result the inner loop reduction
4087 won't be detected as possibly double-reduction by vectorizable_reduction
4088 because that tries to walk the PHI arg from the preheader edge which
4089 can be constant. See PR60382. */
4090 if (has_zero_uses (phi_name))
4091 return NULL;
4092 class loop *loop = (gimple_bb (phi))->loop_father;
4093 unsigned nphi_def_loop_uses = 0;
4094 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4096 gimple *use_stmt = USE_STMT (use_p);
4097 if (is_gimple_debug (use_stmt))
4098 continue;
4100 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4102 if (dump_enabled_p ())
4103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4104 "intermediate value used outside loop.\n");
4106 return NULL;
4109 nphi_def_loop_uses++;
4110 phi_use_stmt = use_stmt;
4113 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4114 if (TREE_CODE (latch_def) != SSA_NAME)
4116 if (dump_enabled_p ())
4117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4118 "reduction: not ssa_name: %T\n", latch_def);
4119 return NULL;
4122 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4123 if (!def_stmt_info
4124 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4125 return NULL;
4127 bool nested_in_vect_loop
4128 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4129 unsigned nlatch_def_loop_uses = 0;
4130 auto_vec<gphi *, 3> lcphis;
4131 bool inner_loop_of_double_reduc = false;
4132 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4134 gimple *use_stmt = USE_STMT (use_p);
4135 if (is_gimple_debug (use_stmt))
4136 continue;
4137 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4138 nlatch_def_loop_uses++;
4139 else
4141 /* We can have more than one loop-closed PHI. */
4142 lcphis.safe_push (as_a <gphi *> (use_stmt));
4143 if (nested_in_vect_loop
4144 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4145 == vect_double_reduction_def))
4146 inner_loop_of_double_reduc = true;
4150 /* If we are vectorizing an inner reduction we execute it
4151 in the original order only when we are not dealing with a
4152 double reduction. */
4153 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4155 if (dump_enabled_p ())
4156 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4157 "detected nested cycle: ");
4158 return def_stmt_info;
4161 /* When the inner loop of a double reduction ends up with more than
4162 one loop-closed PHI we have failed to classify alternate such
4163 PHIs as double reduction, leading to wrong code. See PR103237. */
4164 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4166 if (dump_enabled_p ())
4167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4168 "unhandle double reduction\n");
4169 return NULL;
4172 /* If this isn't a nested cycle or if the nested cycle reduction value
4173 is used outside of the inner loop we cannot handle uses of the reduction
4174 value. */
4175 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4177 if (dump_enabled_p ())
4178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4179 "reduction used in loop.\n");
4180 return NULL;
4183 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4184 defined in the inner loop. */
4185 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4187 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4188 if (gimple_phi_num_args (def_stmt) != 1
4189 || TREE_CODE (op1) != SSA_NAME)
4191 if (dump_enabled_p ())
4192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4193 "unsupported phi node definition.\n");
4195 return NULL;
4198 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4199 and the latch definition op1. */
4200 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4201 if (gimple_bb (def1)
4202 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4203 && loop->inner
4204 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4205 && (is_gimple_assign (def1) || is_gimple_call (def1))
4206 && is_a <gphi *> (phi_use_stmt)
4207 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4208 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4209 loop_latch_edge (loop->inner))))
4211 if (dump_enabled_p ())
4212 report_vect_op (MSG_NOTE, def_stmt,
4213 "detected double reduction: ");
4215 *double_reduc = true;
4216 return def_stmt_info;
4219 return NULL;
4222 /* Look for the expression computing latch_def from the loop PHI result. */
4223 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4224 code_helper code;
4225 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4226 path))
4228 STMT_VINFO_REDUC_CODE (phi_info) = code;
4229 if (code == COND_EXPR && !nested_in_vect_loop)
4230 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4232 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4233 reduction chain for which the additional restriction is that
4234 all operations in the chain are the same. */
4235 auto_vec<stmt_vec_info, 8> reduc_chain;
4236 unsigned i;
4237 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4238 for (i = path.length () - 1; i >= 1; --i)
4240 gimple *stmt = USE_STMT (path[i].second);
4241 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4242 gimple_match_op op;
4243 if (!gimple_extract_op (stmt, &op))
4244 gcc_unreachable ();
4245 if (gassign *assign = dyn_cast<gassign *> (stmt))
4246 STMT_VINFO_REDUC_IDX (stmt_info)
4247 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4248 else
4250 gcall *call = as_a<gcall *> (stmt);
4251 STMT_VINFO_REDUC_IDX (stmt_info)
4252 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4254 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4255 && (i == 1 || i == path.length () - 1));
4256 if ((op.code != code && !leading_conversion)
4257 /* We can only handle the final value in epilogue
4258 generation for reduction chains. */
4259 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4260 is_slp_reduc = false;
4261 /* For reduction chains we support trailing/leading
4262 conversions. We do not store those in the actual chain. */
4263 if (leading_conversion)
4264 continue;
4265 reduc_chain.safe_push (stmt_info);
4267 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4269 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4271 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4272 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4274 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4275 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4277 /* Save the chain for further analysis in SLP detection. */
4278 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4279 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4281 *reduc_chain_p = true;
4282 if (dump_enabled_p ())
4283 dump_printf_loc (MSG_NOTE, vect_location,
4284 "reduction: detected reduction chain\n");
4286 else if (dump_enabled_p ())
4287 dump_printf_loc (MSG_NOTE, vect_location,
4288 "reduction: detected reduction\n");
4290 return def_stmt_info;
4293 if (dump_enabled_p ())
4294 dump_printf_loc (MSG_NOTE, vect_location,
4295 "reduction: unknown pattern\n");
4297 return NULL;
4300 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4301 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4302 or -1 if not known. */
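   /* A made-up example: with NITERS = 23, PEEL_ITERS_PROLOGUE = 3 and an
      assumed vectorization factor of 8, the epilogue is expected to run
      (23 - 3) % 8 = 4 scalar iterations.  */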
4304 static int
4305 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4307 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4308 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4310 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_NOTE, vect_location,
4312 "cost model: epilogue peel iters set to vf/2 "
4313 "because loop iterations are unknown .\n");
4314 return assumed_vf / 2;
4316 else
4318 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4319 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4320 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4321 /* If we need to peel for gaps, but no epilogue peeling would otherwise
4322 be required, we have to peel VF iterations. */
4323 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4324 peel_iters_epilogue = assumed_vf;
4325 return peel_iters_epilogue;
4329 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4331 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4332 int *peel_iters_epilogue,
4333 stmt_vector_for_cost *scalar_cost_vec,
4334 stmt_vector_for_cost *prologue_cost_vec,
4335 stmt_vector_for_cost *epilogue_cost_vec)
4337 int retval = 0;
4339 *peel_iters_epilogue
4340 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4342 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4344 /* If peeled iterations are known but the number of scalar loop
4345 iterations is unknown, count a taken branch per peeled loop. */
4346 if (peel_iters_prologue > 0)
4347 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4348 vect_prologue);
4349 if (*peel_iters_epilogue > 0)
4350 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4351 vect_epilogue);
4354 stmt_info_for_cost *si;
4355 int j;
4356 if (peel_iters_prologue)
4357 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4358 retval += record_stmt_cost (prologue_cost_vec,
4359 si->count * peel_iters_prologue,
4360 si->kind, si->stmt_info, si->misalign,
4361 vect_prologue);
4362 if (*peel_iters_epilogue)
4363 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4364 retval += record_stmt_cost (epilogue_cost_vec,
4365 si->count * *peel_iters_epilogue,
4366 si->kind, si->stmt_info, si->misalign,
4367 vect_epilogue);
4369 return retval;
4372 /* Function vect_estimate_min_profitable_iters
4374 Return the number of iterations required for the vector version of the
4375 loop to be profitable relative to the cost of the scalar version of the
4376 loop.
4378 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4379 of iterations for vectorization. -1 value means loop vectorization
4380 is not profitable. This returned value may be used for dynamic
4381 profitability check.
4383 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4384 for static check against estimated number of iterations. */
4386 static void
4387 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4388 int *ret_min_profitable_niters,
4389 int *ret_min_profitable_estimate,
4390 unsigned *suggested_unroll_factor)
4392 int min_profitable_iters;
4393 int min_profitable_estimate;
4394 int peel_iters_prologue;
4395 int peel_iters_epilogue;
4396 unsigned vec_inside_cost = 0;
4397 int vec_outside_cost = 0;
4398 unsigned vec_prologue_cost = 0;
4399 unsigned vec_epilogue_cost = 0;
4400 int scalar_single_iter_cost = 0;
4401 int scalar_outside_cost = 0;
4402 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4403 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4404 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4406 /* Cost model disabled. */
4407 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4409 if (dump_enabled_p ())
4410 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4411 *ret_min_profitable_niters = 0;
4412 *ret_min_profitable_estimate = 0;
4413 return;
4416 /* Requires loop versioning tests to handle misalignment. */
4417 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4419 /* FIXME: Make cost depend on complexity of individual check. */
4420 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4421 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4422 if (dump_enabled_p ())
4423 dump_printf (MSG_NOTE,
4424 "cost model: Adding cost of checks for loop "
4425 "versioning to treat misalignment.\n");
4428 /* Requires loop versioning with alias checks. */
4429 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4431 /* FIXME: Make cost depend on complexity of individual check. */
4432 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4433 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4434 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4435 if (len)
4436 /* Count LEN - 1 ANDs and LEN comparisons. */
4437 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4438 scalar_stmt, vect_prologue);
4439 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4440 if (len)
4442 /* Count LEN - 1 ANDs and LEN comparisons. */
4443 unsigned int nstmts = len * 2 - 1;
4444 /* +1 for each bias that needs adding. */
4445 for (unsigned int i = 0; i < len; ++i)
4446 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4447 nstmts += 1;
4448 (void) add_stmt_cost (target_cost_data, nstmts,
4449 scalar_stmt, vect_prologue);
4451 if (dump_enabled_p ())
4452 dump_printf (MSG_NOTE,
4453 "cost model: Adding cost of checks for loop "
4454 "versioning aliasing.\n");
4457 /* Requires loop versioning with niter checks. */
4458 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4460 /* FIXME: Make cost depend on complexity of individual check. */
4461 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4462 NULL, NULL, NULL_TREE, 0, vect_prologue);
4463 if (dump_enabled_p ())
4464 dump_printf (MSG_NOTE,
4465 "cost model: Adding cost of checks for loop "
4466 "versioning niters.\n");
4469 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4470 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4471 vect_prologue);
4473 /* Count statements in scalar loop. Using this as scalar cost for a single
4474 iteration for now.
4476 TODO: Add outer loop support.
4478 TODO: Consider assigning different costs to different scalar
4479 statements. */
4481 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4483 /* Add additional cost for the peeled instructions in prologue and epilogue
4484 loop. (For fully-masked loops there will be no peeling.)
4486 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4487 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4489 TODO: Build an expression that represents peel_iters for prologue and
4490 epilogue to be used in a run-time test. */
4492 bool prologue_need_br_taken_cost = false;
4493 bool prologue_need_br_not_taken_cost = false;
4495 /* Calculate peel_iters_prologue. */
4496 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4497 peel_iters_prologue = 0;
4498 else if (npeel < 0)
4500 peel_iters_prologue = assumed_vf / 2;
4501 if (dump_enabled_p ())
4502 dump_printf (MSG_NOTE, "cost model: "
4503 "prologue peel iters set to vf/2.\n");
4505 /* If peeled iterations are unknown, count a taken branch and a not taken
4506 branch per peeled loop. Even if scalar loop iterations are known,
4507 vector iterations are not known since peeled prologue iterations are
4508 not known. Hence guards remain the same. */
4509 prologue_need_br_taken_cost = true;
4510 prologue_need_br_not_taken_cost = true;
4512 else
4514 peel_iters_prologue = npeel;
4515 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4516 /* If peeled iterations are known but the number of scalar loop
4517 iterations is unknown, count a taken branch per peeled loop. */
4518 prologue_need_br_taken_cost = true;
4521 bool epilogue_need_br_taken_cost = false;
4522 bool epilogue_need_br_not_taken_cost = false;
4524 /* Calculate peel_iters_epilogue. */
4525 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4526 /* We need to peel exactly one iteration for gaps. */
4527 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4528 else if (npeel < 0)
4530 /* If peeling for alignment is unknown, loop bound of main loop
4531 becomes unknown. */
4532 peel_iters_epilogue = assumed_vf / 2;
4533 if (dump_enabled_p ())
4534 dump_printf (MSG_NOTE, "cost model: "
4535 "epilogue peel iters set to vf/2 because "
4536 "peeling for alignment is unknown.\n");
4538 /* See the same reason above in peel_iters_prologue calculation. */
4539 epilogue_need_br_taken_cost = true;
4540 epilogue_need_br_not_taken_cost = true;
4542 else
4544 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4545 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4546 /* If peeled iterations are known but the number of scalar loop
4547 iterations is unknown, count a taken branch per peeled loop. */
4548 epilogue_need_br_taken_cost = true;
4551 stmt_info_for_cost *si;
4552 int j;
4553 /* Add costs associated with peel_iters_prologue. */
4554 if (peel_iters_prologue)
4555 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4557 (void) add_stmt_cost (target_cost_data,
4558 si->count * peel_iters_prologue, si->kind,
4559 si->stmt_info, si->node, si->vectype,
4560 si->misalign, vect_prologue);
4563 /* Add costs associated with peel_iters_epilogue. */
4564 if (peel_iters_epilogue)
4565 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4567 (void) add_stmt_cost (target_cost_data,
4568 si->count * peel_iters_epilogue, si->kind,
4569 si->stmt_info, si->node, si->vectype,
4570 si->misalign, vect_epilogue);
4573 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4575 if (prologue_need_br_taken_cost)
4576 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4577 vect_prologue);
4579 if (prologue_need_br_not_taken_cost)
4580 (void) add_stmt_cost (target_cost_data, 1,
4581 cond_branch_not_taken, vect_prologue);
4583 if (epilogue_need_br_taken_cost)
4584 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4585 vect_epilogue);
4587 if (epilogue_need_br_not_taken_cost)
4588 (void) add_stmt_cost (target_cost_data, 1,
4589 cond_branch_not_taken, vect_epilogue);
4591 /* Take care of special costs for rgroup controls of partial vectors. */
4592 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4593 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4594 == vect_partial_vectors_avx512))
4596 /* Calculate how many masks we need to generate. */
4597 unsigned int num_masks = 0;
4598 bool need_saturation = false;
4599 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4600 if (rgm.type)
4602 unsigned nvectors = rgm.factor;
4603 num_masks += nvectors;
4604 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4605 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4606 need_saturation = true;
4609 /* ??? The target isn't able to identify the costs below as
4610 producing masks so it cannot penalize cases where we'd run
4611 out of mask registers for example. */
4613 /* ??? We are also failing to account for smaller vector masks
4614 we generate by splitting larger masks in vect_get_loop_mask. */
4616 /* In the worst case, we need to generate each mask in the prologue
4617 and in the loop body. We need one splat per group and one
4618 compare per mask.
4620 Sometimes the prologue mask will fold to a constant,
4621 so the actual prologue cost might be smaller. However, it's
4622 simpler and safer to use the worst-case cost; if this ends up
4623 being the tie-breaker between vectorizing or not, then it's
4624 probably better not to vectorize. */
4625 (void) add_stmt_cost (target_cost_data,
4626 num_masks
4627 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4628 vector_stmt, NULL, NULL, NULL_TREE, 0,
4629 vect_prologue);
4630 (void) add_stmt_cost (target_cost_data,
4631 num_masks
4632 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4633 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4635 /* When we need saturation we need it both in the prologue and
4636 the epilogue. */
4637 if (need_saturation)
4639 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4640 NULL, NULL, NULL_TREE, 0, vect_prologue);
4641 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4642 NULL, NULL, NULL_TREE, 0, vect_body);
4645 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4646 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4647 == vect_partial_vectors_while_ult))
4649 /* Calculate how many masks we need to generate. */
4650 unsigned int num_masks = 0;
4651 rgroup_controls *rgm;
4652 unsigned int num_vectors_m1;
4653 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4654 num_vectors_m1, rgm)
4655 if (rgm->type)
4656 num_masks += num_vectors_m1 + 1;
4657 gcc_assert (num_masks > 0);
4659 /* In the worst case, we need to generate each mask in the prologue
4660 and in the loop body. One of the loop body mask instructions
4661 replaces the comparison in the scalar loop, and since we don't
4662 count the scalar comparison against the scalar body, we shouldn't
4663 count that vector instruction against the vector body either.
4665 Sometimes we can use unpacks instead of generating prologue
4666 masks and sometimes the prologue mask will fold to a constant,
4667 so the actual prologue cost might be smaller. However, it's
4668 simpler and safer to use the worst-case cost; if this ends up
4669 being the tie-breaker between vectorizing or not, then it's
4670 probably better not to vectorize. */
4671 (void) add_stmt_cost (target_cost_data, num_masks,
4672 vector_stmt, NULL, NULL, NULL_TREE, 0,
4673 vect_prologue);
4674 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4675 vector_stmt, NULL, NULL, NULL_TREE, 0,
4676 vect_body);
4678 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4680 /* Referring to the functions vect_set_loop_condition_partial_vectors
4681 and vect_set_loop_controls_directly, we need to generate each
4682 length in the prologue and in the loop body if required. Although
4683 there are some possible optimizations, we consider the worst case
4684 here. */
4686 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4687 signed char partial_load_store_bias
4688 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4689 bool need_iterate_p
4690 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4691 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4693 /* Calculate how many statements to be added. */
4694 unsigned int prologue_stmts = 0;
4695 unsigned int body_stmts = 0;
4697 rgroup_controls *rgc;
4698 unsigned int num_vectors_m1;
4699 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4700 if (rgc->type)
4702 /* May need one SHIFT for nitems_total computation. */
4703 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4704 if (nitems != 1 && !niters_known_p)
4705 prologue_stmts += 1;
4707 /* May need one MAX and one MINUS for wrap around. */
4708 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4709 prologue_stmts += 2;
4711 /* Need one MAX and one MINUS for each batch limit except for
4712 the first one. */
4713 prologue_stmts += num_vectors_m1 * 2;
4715 unsigned int num_vectors = num_vectors_m1 + 1;
4717 /* Need to set up lengths in prologue, only one MIN required
4718 for each since start index is zero. */
4719 prologue_stmts += num_vectors;
4721 /* If we have a non-zero partial load bias, we need one PLUS
4722 to adjust the load length. */
4723 if (partial_load_store_bias != 0)
4724 body_stmts += 1;
4726 /* Each may need two MINs and one MINUS to update lengths in body
4727 for next iteration. */
4728 if (need_iterate_p)
4729 body_stmts += 3 * num_vectors;
4732 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4733 scalar_stmt, vect_prologue);
4734 (void) add_stmt_cost (target_cost_data, body_stmts,
4735 scalar_stmt, vect_body);
4738 /* FORNOW: The scalar outside cost is incremented in one of the
4739 following ways:
4741 1. The vectorizer checks for alignment and aliasing and generates
4742 a condition that allows dynamic vectorization. A cost model
4743 check is ANDED with the versioning condition. Hence scalar code
4744 path now has the added cost of the versioning check.
4746 if (cost > th & versioning_check)
4747 jmp to vector code
4749 Hence run-time scalar is incremented by not-taken branch cost.
4751 2. The vectorizer then checks if a prologue is required. If the
4752 cost model check was not done before during versioning, it has to
4753 be done before the prologue check.
4755 if (cost <= th)
4756 prologue = scalar_iters
4757 if (prologue == 0)
4758 jmp to vector code
4759 else
4760 execute prologue
4761 if (prologue == num_iters)
4762 go to exit
4764 Hence the run-time scalar cost is incremented by a taken branch,
4765 plus a not-taken branch, plus a taken branch cost.
4767 3. The vectorizer then checks if an epilogue is required. If the
4768 cost model check was not done before during prologue check, it
4769 has to be done with the epilogue check.
4771 if (prologue == 0)
4772 jmp to vector code
4773 else
4774 execute prologue
4775 if (prologue == num_iters)
4776 go to exit
4777 vector code:
4778 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4779 jmp to epilogue
4781 Hence the run-time scalar cost should be incremented by 2 taken
4782 branches.
4784 TODO: The back end may reorder the BBs differently and reverse
4785 conditions/branch directions. Change the estimates below to
4786 something more reasonable. */
4788 /* If the number of iterations is known and we do not do versioning, we can
4789 decide whether to vectorize at compile time. Hence the scalar version
4790 does not carry cost model guard costs. */
4791 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4792 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4794 /* Cost model check occurs at versioning. */
4795 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4796 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4797 else
4799 /* Cost model check occurs at prologue generation. */
4800 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4801 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4802 + vect_get_stmt_cost (cond_branch_not_taken);
4803 /* Cost model check occurs at epilogue generation. */
4804 else
4805 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4809 /* Complete the target-specific cost calculations. */
4810 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4811 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4812 suggested_unroll_factor);
4814 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4815 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4816 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4817 *suggested_unroll_factor,
4818 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4820 if (dump_enabled_p ())
4821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4822 "can't unroll as unrolled vectorization factor larger"
4823 " than maximum vectorization factor: "
4824 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4825 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4826 *suggested_unroll_factor = 1;
4829 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4831 if (dump_enabled_p ())
4833 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4834 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4835 vec_inside_cost);
4836 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4837 vec_prologue_cost);
4838 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4839 vec_epilogue_cost);
4840 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4841 scalar_single_iter_cost);
4842 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4843 scalar_outside_cost);
4844 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4845 vec_outside_cost);
4846 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4847 peel_iters_prologue);
4848 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4849 peel_iters_epilogue);
4852 /* Calculate number of iterations required to make the vector version
4853 profitable, relative to the loop bodies only. The following condition
4854 must hold true:
4855 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4856 where
4857 SIC = scalar iteration cost, VIC = vector iteration cost,
4858 VOC = vector outside cost, VF = vectorization factor,
4859 NPEEL = prologue iterations + epilogue iterations,
4860 SOC = scalar outside cost for run time cost model check. */
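   /* A made-up numeric sketch: with SIC = 4, VIC = 6, VF = 4, NPEEL = 2,
      VOC = 40 and SOC = 6, and treating the division as exact, the
      condition becomes 4 * niters + 6 > 1.5 * niters + 37, which first
      holds for niters >= 13, i.e. roughly a dozen scalar iterations are
      needed before the vector version starts to win.  */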
4862 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4863 - vec_inside_cost);
4864 if (saving_per_viter <= 0)
4866 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4867 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4868 "vectorization did not happen for a simd loop");
4870 if (dump_enabled_p ())
4871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4872 "cost model: the vector iteration cost = %d "
4873 "divided by the scalar iteration cost = %d "
4874 "is greater or equal to the vectorization factor = %d"
4875 ".\n",
4876 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4877 *ret_min_profitable_niters = -1;
4878 *ret_min_profitable_estimate = -1;
4879 return;
4882 /* ??? The "if" arm is written to handle all cases; see below for what
4883 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4884 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4886 /* Rewriting the condition above in terms of the number of
4887 vector iterations (vniters) rather than the number of
4888 scalar iterations (niters) gives:
4890 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4892 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4894 For integer N, X and Y when X > 0:
4896 N * X > Y <==> N >= (Y /[floor] X) + 1. */
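   /* E.g. with X = 5 and Y = 12: 12 /[floor] 5 + 1 = 3, and indeed
      2 * 5 > 12 is false while 3 * 5 > 12 is true.  */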
4897 int outside_overhead = (vec_outside_cost
4898 - scalar_single_iter_cost * peel_iters_prologue
4899 - scalar_single_iter_cost * peel_iters_epilogue
4900 - scalar_outside_cost);
4901 /* We're only interested in cases that require at least one
4902 vector iteration. */
4903 int min_vec_niters = 1;
4904 if (outside_overhead > 0)
4905 min_vec_niters = outside_overhead / saving_per_viter + 1;
4907 if (dump_enabled_p ())
4908 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4909 min_vec_niters);
4911 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4913 /* Now that we know the minimum number of vector iterations,
4914 find the minimum niters for which the scalar cost is larger:
4916 SIC * niters > VIC * vniters + VOC - SOC
4918 We know that the minimum niters is no more than
4919 vniters * VF + NPEEL, but it might be (and often is) less
4920 than that if a partial vector iteration is cheaper than the
4921 equivalent scalar code. */
4922 int threshold = (vec_inside_cost * min_vec_niters
4923 + vec_outside_cost
4924 - scalar_outside_cost);
4925 if (threshold <= 0)
4926 min_profitable_iters = 1;
4927 else
4928 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4930 else
4931 /* Convert the number of vector iterations into a number of
4932 scalar iterations. */
4933 min_profitable_iters = (min_vec_niters * assumed_vf
4934 + peel_iters_prologue
4935 + peel_iters_epilogue);
4937 else
4939 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4940 * assumed_vf
4941 - vec_inside_cost * peel_iters_prologue
4942 - vec_inside_cost * peel_iters_epilogue);
4943 if (min_profitable_iters <= 0)
4944 min_profitable_iters = 0;
4945 else
4947 min_profitable_iters /= saving_per_viter;
4949 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4950 <= (((int) vec_inside_cost * min_profitable_iters)
4951 + (((int) vec_outside_cost - scalar_outside_cost)
4952 * assumed_vf)))
4953 min_profitable_iters++;
4957 if (dump_enabled_p ())
4958 dump_printf (MSG_NOTE,
4959 " Calculated minimum iters for profitability: %d\n",
4960 min_profitable_iters);
4962 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4963 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4964 /* We want the vectorized loop to execute at least once. */
4965 min_profitable_iters = assumed_vf + peel_iters_prologue;
4966 else if (min_profitable_iters < peel_iters_prologue)
4967 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4968 vectorized loop executes at least once. */
4969 min_profitable_iters = peel_iters_prologue;
4971 if (dump_enabled_p ())
4972 dump_printf_loc (MSG_NOTE, vect_location,
4973 " Runtime profitability threshold = %d\n",
4974 min_profitable_iters);
4976 *ret_min_profitable_niters = min_profitable_iters;
4978 /* Calculate number of iterations required to make the vector version
4979 profitable, relative to the loop bodies only.
4981 The non-vectorized variant costs SIC * niters and it must win over the
4982 vector variant on the expected loop trip count. The following condition must hold true:
4983 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4985 if (vec_outside_cost <= 0)
4986 min_profitable_estimate = 0;
4987 /* ??? This "else if" arm is written to handle all cases; see below for
4988 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4989 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4991 /* This is a repeat of the code above, but with + SOC rather
4992 than - SOC. */
4993 int outside_overhead = (vec_outside_cost
4994 - scalar_single_iter_cost * peel_iters_prologue
4995 - scalar_single_iter_cost * peel_iters_epilogue
4996 + scalar_outside_cost);
4997 int min_vec_niters = 1;
4998 if (outside_overhead > 0)
4999 min_vec_niters = outside_overhead / saving_per_viter + 1;
5001 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5003 int threshold = (vec_inside_cost * min_vec_niters
5004 + vec_outside_cost
5005 + scalar_outside_cost);
5006 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5008 else
5009 min_profitable_estimate = (min_vec_niters * assumed_vf
5010 + peel_iters_prologue
5011 + peel_iters_epilogue);
5013 else
5015 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5016 * assumed_vf
5017 - vec_inside_cost * peel_iters_prologue
5018 - vec_inside_cost * peel_iters_epilogue)
5019 / ((scalar_single_iter_cost * assumed_vf)
5020 - vec_inside_cost);
5022 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_NOTE, vect_location,
5025 " Static estimate profitability threshold = %d\n",
5026 min_profitable_estimate);
5028 *ret_min_profitable_estimate = min_profitable_estimate;
5031 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5032 vector elements (not bits) for a vector with NELT elements. */
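   /* For instance, with NELT = 8 and OFFSET = 2 the three encoded elements
      are { 2, 3, 4 }; vec_perm_indices extends this step-1 series to
      { 2, 3, ..., 9 }, where, with the two-input use in
      have_whole_vector_shift below, indices 8 and 9 select from the second
      input vector, giving a whole-vector shift down by two elements.  */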
5033 static void
5034 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5035 vec_perm_builder *sel)
5037 /* The encoding is a single stepped pattern. Any wrap-around is handled
5038 by vec_perm_indices. */
5039 sel->new_vector (nelt, 1, 3);
5040 for (unsigned int i = 0; i < 3; i++)
5041 sel->quick_push (i + offset);
5044 /* Checks whether the target supports whole-vector shifts for vectors of mode
5045 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5046 it supports vec_perm_const with masks for all necessary shift amounts. */
5047 static bool
5048 have_whole_vector_shift (machine_mode mode)
5050 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5051 return true;
5053 /* Variable-length vectors should be handled via the optab. */
5054 unsigned int nelt;
5055 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5056 return false;
5058 vec_perm_builder sel;
5059 vec_perm_indices indices;
5060 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5062 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5063 indices.new_vector (sel, 2, nelt);
5064 if (!can_vec_perm_const_p (mode, mode, indices, false))
5065 return false;
5067 return true;
5070 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5071 multiplication operands have differing signs and (b) we intend
5072 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5073 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5075 static bool
5076 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5077 stmt_vec_info stmt_info)
5079 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5080 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5081 return false;
5083 tree rhs1 = gimple_assign_rhs1 (assign);
5084 tree rhs2 = gimple_assign_rhs2 (assign);
5085 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5086 return false;
5088 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5089 gcc_assert (reduc_info->is_reduc_info);
5090 return !directly_supported_p (DOT_PROD_EXPR,
5091 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5092 optab_vector_mixed_sign);
5095 /* TODO: vect_model_*_cost and the vectorizable_* functions are closely
5096 coupled; restructure them to avoid maintenance issues. */
5098 /* Function vect_model_reduction_cost.
5100 Models cost for a reduction operation, including the vector ops
5101 generated within the strip-mine loop in some cases, the initial
5102 definition before the loop, and the epilogue code that must be generated. */
5104 static void
5105 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5106 stmt_vec_info stmt_info, internal_fn reduc_fn,
5107 vect_reduction_type reduction_type,
5108 int ncopies, stmt_vector_for_cost *cost_vec)
5110 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5111 tree vectype;
5112 machine_mode mode;
5113 class loop *loop = NULL;
5115 if (loop_vinfo)
5116 loop = LOOP_VINFO_LOOP (loop_vinfo);
5118 /* Condition reductions generate two reductions in the loop. */
5119 if (reduction_type == COND_REDUCTION)
5120 ncopies *= 2;
5122 vectype = STMT_VINFO_VECTYPE (stmt_info);
5123 mode = TYPE_MODE (vectype);
5124 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5126 gimple_match_op op;
5127 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5128 gcc_unreachable ();
5130 bool emulated_mixed_dot_prod
5131 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5132 if (reduction_type == EXTRACT_LAST_REDUCTION)
5133 /* No extra instructions are needed in the prologue. The loop body
5134 operations are costed in vectorizable_condition. */
5135 inside_cost = 0;
5136 else if (reduction_type == FOLD_LEFT_REDUCTION)
5138 /* No extra instructions needed in the prologue. */
5139 prologue_cost = 0;
5141 if (reduc_fn != IFN_LAST)
5142 /* Count one reduction-like operation per vector. */
5143 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5144 stmt_info, 0, vect_body);
5145 else
5147 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5148 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5149 inside_cost = record_stmt_cost (cost_vec, nelements,
5150 vec_to_scalar, stmt_info, 0,
5151 vect_body);
5152 inside_cost += record_stmt_cost (cost_vec, nelements,
5153 scalar_stmt, stmt_info, 0,
5154 vect_body);
5157 else
5159 /* Add in the cost of the initial definitions. */
5160 int prologue_stmts;
5161 if (reduction_type == COND_REDUCTION)
5162 /* For cond reductions we have four vectors: initial index, step,
5163 initial result of the data reduction, initial value of the index
5164 reduction. */
5165 prologue_stmts = 4;
5166 else if (emulated_mixed_dot_prod)
5167 /* We need the initial reduction value and two invariants:
5168 one that contains the minimum signed value and one that
5169 contains half of its negative. */
5170 prologue_stmts = 3;
5171 else
5172 prologue_stmts = 1;
5173 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5174 scalar_to_vec, stmt_info, 0,
5175 vect_prologue);
5178 /* Determine cost of epilogue code.
5180 We have a reduction operator that will reduce the vector in one statement.
5181 Also requires scalar extract. */
5183 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5185 if (reduc_fn != IFN_LAST)
5187 if (reduction_type == COND_REDUCTION)
5189 /* An EQ stmt and a COND_EXPR stmt. */
5190 epilogue_cost += record_stmt_cost (cost_vec, 2,
5191 vector_stmt, stmt_info, 0,
5192 vect_epilogue);
5193 /* Reduction of the max index and a reduction of the found
5194 values. */
5195 epilogue_cost += record_stmt_cost (cost_vec, 2,
5196 vec_to_scalar, stmt_info, 0,
5197 vect_epilogue);
5198 /* A broadcast of the max value. */
5199 epilogue_cost += record_stmt_cost (cost_vec, 1,
5200 scalar_to_vec, stmt_info, 0,
5201 vect_epilogue);
5203 else
5205 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5206 stmt_info, 0, vect_epilogue);
5207 epilogue_cost += record_stmt_cost (cost_vec, 1,
5208 vec_to_scalar, stmt_info, 0,
5209 vect_epilogue);
5212 else if (reduction_type == COND_REDUCTION)
5214 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5215 /* Extraction of scalar elements. */
5216 epilogue_cost += record_stmt_cost (cost_vec,
5217 2 * estimated_nunits,
5218 vec_to_scalar, stmt_info, 0,
5219 vect_epilogue);
5220 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5221 epilogue_cost += record_stmt_cost (cost_vec,
5222 2 * estimated_nunits - 3,
5223 scalar_stmt, stmt_info, 0,
5224 vect_epilogue);
5226 else if (reduction_type == EXTRACT_LAST_REDUCTION
5227 || reduction_type == FOLD_LEFT_REDUCTION)
5228 /* No extra instructions are needed in the epilogue. */
5230 else
5232 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5233 tree bitsize = TYPE_SIZE (op.type);
5234 int element_bitsize = tree_to_uhwi (bitsize);
5235 int nelements = vec_size_in_bits / element_bitsize;
5237 if (op.code == COND_EXPR)
5238 op.code = MAX_EXPR;
5240 /* We have a whole vector shift available. */
5241 if (VECTOR_MODE_P (mode)
5242 && directly_supported_p (op.code, vectype)
5243 && have_whole_vector_shift (mode))
5245 /* Final reduction via vector shifts and the reduction operator.
5246 Also requires scalar extract. */
5247 epilogue_cost += record_stmt_cost (cost_vec,
5248 exact_log2 (nelements) * 2,
5249 vector_stmt, stmt_info, 0,
5250 vect_epilogue);
5251 epilogue_cost += record_stmt_cost (cost_vec, 1,
5252 vec_to_scalar, stmt_info, 0,
5253 vect_epilogue);
5255 else
5256 /* Use extracts and reduction op for final reduction. For N
5257 elements, we have N extracts and N-1 reduction ops. */
5258 epilogue_cost += record_stmt_cost (cost_vec,
5259 nelements + nelements - 1,
5260 vector_stmt, stmt_info, 0,
5261 vect_epilogue);
5265 if (dump_enabled_p ())
5266 dump_printf (MSG_NOTE,
5267 "vect_model_reduction_cost: inside_cost = %d, "
5268 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5269 prologue_cost, epilogue_cost);
5272 /* SEQ is a sequence of instructions that initialize the reduction
5273 described by REDUC_INFO. Emit them in the appropriate place. */
5275 static void
5276 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5277 stmt_vec_info reduc_info, gimple *seq)
5279 if (reduc_info->reused_accumulator)
5281 /* When reusing an accumulator from the main loop, we only need
5282 initialization instructions if the main loop can be skipped.
5283 In that case, emit the initialization instructions at the end
5284 of the guard block that does the skip. */
5285 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5286 gcc_assert (skip_edge);
5287 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5288 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5290 else
5292 /* The normal case: emit the initialization instructions on the
5293 preheader edge. */
5294 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5295 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5299 /* Function get_initial_def_for_reduction
5301 Input:
5302 REDUC_INFO - the info_for_reduction
5303 INIT_VAL - the initial value of the reduction variable
5304 NEUTRAL_OP - a value that has no effect on the reduction, as per
5305 neutral_op_for_reduction
5307 Output:
5308 Return a vector variable, initialized according to the reduction
5309 operation that REDUC_INFO describes. This vector will be used as the initial value
5310 of the vector of partial results.
5312 The value we need is a vector in which element 0 has value INIT_VAL
5313 and every other element has value NEUTRAL_OP. */
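   /* For example, for a V4SI PLUS reduction with INIT_VAL 10 and NEUTRAL_OP 0
      the result is {10, 0, 0, 0}, so the scalar initial value is counted
      exactly once when the partial results are summed.  When INIT_VAL and
      NEUTRAL_OP are equal (as for MIN/MAX reductions, where the initial value
      itself is passed as the neutral value) the vector degenerates to a
      splat, which is the first case handled below.  */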
5315 static tree
5316 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5317 stmt_vec_info reduc_info,
5318 tree init_val, tree neutral_op)
5320 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5321 tree scalar_type = TREE_TYPE (init_val);
5322 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5323 tree init_def;
5324 gimple_seq stmts = NULL;
5326 gcc_assert (vectype);
5328 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5329 || SCALAR_FLOAT_TYPE_P (scalar_type));
5331 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5332 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5334 if (operand_equal_p (init_val, neutral_op))
5336 /* If both elements are equal then the vector described above is
5337 just a splat. */
5338 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5339 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5341 else
5343 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5344 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5345 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5347 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5348 element 0. */
5349 init_def = gimple_build_vector_from_val (&stmts, vectype,
5350 neutral_op);
5351 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5352 vectype, init_def, init_val);
5354 else
5356 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5357 tree_vector_builder elts (vectype, 1, 2);
5358 elts.quick_push (init_val);
5359 elts.quick_push (neutral_op);
5360 init_def = gimple_build_vector (&stmts, &elts);
5364 if (stmts)
5365 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5366 return init_def;
5369 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5370 which performs a reduction involving GROUP_SIZE scalar statements.
5371 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5372 is nonnull, introducing extra elements of that value will not change the
5373 result. */
5375 static void
5376 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5377 stmt_vec_info reduc_info,
5378 vec<tree> *vec_oprnds,
5379 unsigned int number_of_vectors,
5380 unsigned int group_size, tree neutral_op)
5382 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5383 unsigned HOST_WIDE_INT nunits;
5384 unsigned j, number_of_places_left_in_vector;
5385 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5386 unsigned int i;
5388 gcc_assert (group_size == initial_values.length () || neutral_op);
5390 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5391 created vectors. It is greater than 1 if unrolling is performed.
5393 For example, we have two scalar operands, s1 and s2 (e.g., group of
5394 strided accesses of size two), while NUNITS is four (i.e., four scalars
5395 of this type can be packed in a vector). The output vector will contain
5396 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5397 will be 2).
5399 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5400 vectors containing the operands.
5402 For example, NUNITS is four as before, and the group size is 8
5403 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5404 {s5, s6, s7, s8}. */
5406 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5407 nunits = group_size;
5409 number_of_places_left_in_vector = nunits;
5410 bool constant_p = true;
5411 tree_vector_builder elts (vector_type, nunits, 1);
5412 elts.quick_grow (nunits);
5413 gimple_seq ctor_seq = NULL;
5414 for (j = 0; j < nunits * number_of_vectors; ++j)
5416 tree op;
5417 i = j % group_size;
5419 /* Get the def before the loop. In a reduction chain we have only
5420 one initial value; otherwise we have as many as there are PHIs in the group. */
5421 if (i >= initial_values.length () || (j > i && neutral_op))
5422 op = neutral_op;
5423 else
5424 op = initial_values[i];
5426 /* Create 'vect_ = {op0,op1,...,opn}'. */
5427 number_of_places_left_in_vector--;
5428 elts[nunits - number_of_places_left_in_vector - 1] = op;
5429 if (!CONSTANT_CLASS_P (op))
5430 constant_p = false;
5432 if (number_of_places_left_in_vector == 0)
5434 tree init;
5435 if (constant_p && !neutral_op
5436 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5437 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5438 /* Build the vector directly from ELTS. */
5439 init = gimple_build_vector (&ctor_seq, &elts);
5440 else if (neutral_op)
5442 /* Build a vector of the neutral value and shift the
5443 other elements into place. */
5444 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5445 neutral_op);
5446 int k = nunits;
5447 while (k > 0 && elts[k - 1] == neutral_op)
5448 k -= 1;
5449 while (k > 0)
5451 k -= 1;
5452 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5453 vector_type, init, elts[k]);
5456 else
5458 /* First time round, duplicate ELTS to fill the
5459 required number of vectors. */
5460 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5461 elts, number_of_vectors, *vec_oprnds);
5462 break;
5464 vec_oprnds->quick_push (init);
5466 number_of_places_left_in_vector = nunits;
5467 elts.new_vector (vector_type, nunits, 1);
5468 elts.quick_grow (nunits);
5469 constant_p = true;
5472 if (ctor_seq != NULL)
5473 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5476 /* For a statement STMT_INFO taking part in a reduction operation, return
5477 the stmt_vec_info that the meta information is stored on. */
5479 stmt_vec_info
5480 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5482 stmt_info = vect_orig_stmt (stmt_info);
5483 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5484 if (!is_a <gphi *> (stmt_info->stmt)
5485 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5486 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5487 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5488 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5490 if (gimple_phi_num_args (phi) == 1)
5491 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5493 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5495 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5496 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5497 stmt_info = info;
5499 return stmt_info;
5502 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5503 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5504 return false. */
5506 static bool
5507 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5508 stmt_vec_info reduc_info)
5510 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5511 if (!main_loop_vinfo)
5512 return false;
5514 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5515 return false;
5517 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5518 auto_vec<tree, 16> main_loop_results (num_phis);
5519 auto_vec<tree, 16> initial_values (num_phis);
5520 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5522 /* The epilogue loop can be entered either from the main loop or
5523 from an earlier guard block. */
5524 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5525 for (tree incoming_value : reduc_info->reduc_initial_values)
5527 /* Look for:
5529 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5530 INITIAL_VALUE(guard block)>. */
5531 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5533 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5534 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5536 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5537 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5539 main_loop_results.quick_push (from_main_loop);
5540 initial_values.quick_push (from_skip);
5543 else
5544 /* The main loop dominates the epilogue loop. */
5545 main_loop_results.splice (reduc_info->reduc_initial_values);
5547 /* See if the main loop has the kind of accumulator we need. */
5548 vect_reusable_accumulator *accumulator
5549 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5550 if (!accumulator
5551 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5552 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5553 accumulator->reduc_info->reduc_scalar_results.begin ()))
5554 return false;
5556 /* Handle the case where we can reduce wider vectors to narrower ones. */
5557 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5558 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5559 unsigned HOST_WIDE_INT m;
5560 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5561 TYPE_VECTOR_SUBPARTS (vectype), &m))
5562 return false;
5563 /* Check the intermediate vector types and operations are available. */
5564 tree prev_vectype = old_vectype;
5565 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5566 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5568 intermediate_nunits = exact_div (intermediate_nunits, 2);
5569 tree intermediate_vectype = get_related_vectype_for_scalar_type
5570 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5571 if (!intermediate_vectype
5572 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5573 intermediate_vectype)
5574 || !can_vec_extract (TYPE_MODE (prev_vectype),
5575 TYPE_MODE (intermediate_vectype)))
5576 return false;
5577 prev_vectype = intermediate_vectype;
5580 /* Non-SLP reductions might apply an adjustment after the reduction
5581 operation, in order to simplify the initialization of the accumulator.
5582 If the epilogue loop carries on from where the main loop left off,
5583 it should apply the same adjustment to the final reduction result.
5585 If the epilogue loop can also be entered directly (rather than via
5586 the main loop), we need to be able to handle that case in the same way,
5587 with the same adjustment. (In principle we could add a PHI node
5588 to select the correct adjustment, but in practice that shouldn't be
5589 necessary.) */
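   /* As an illustration: for "s = 10; for (...) s += a[i];" the main loop may
      start its accumulator at {0, ...} and record 10 as the epilogue
      adjustment.  If this epilogue loop can also be entered directly, its
      incoming initial value is that same 10, so below we replace it with the
      neutral value 0 and keep applying the same adjustment afterwards.  */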
5590 tree main_adjustment
5591 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5592 if (loop_vinfo->main_loop_edge && main_adjustment)
5594 gcc_assert (num_phis == 1);
5595 tree initial_value = initial_values[0];
5596 /* Check that we can use INITIAL_VALUE as the adjustment and
5597 initialize the accumulator with a neutral value instead. */
5598 if (!operand_equal_p (initial_value, main_adjustment))
5599 return false;
5600 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5601 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5602 code, initial_value);
5604 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5605 reduc_info->reduc_initial_values.truncate (0);
5606 reduc_info->reduc_initial_values.splice (initial_values);
5607 reduc_info->reused_accumulator = accumulator;
5608 return true;
5611 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5612 CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
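   /* For example, reducing a V8SI accumulator to V4SI takes one halving step:
      the low and high V4SI halves are extracted (directly via vec_extract, or
      by punning through a two-element integer vector when that is all the
      target provides) and combined with CODE.  Larger ratios simply repeat
      the halving in the loop below.  */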
5614 static tree
5615 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5616 gimple_seq *seq)
5618 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5619 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5620 tree stype = TREE_TYPE (vectype);
5621 tree new_temp = vec_def;
5622 while (nunits > nunits1)
5624 nunits /= 2;
5625 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5626 stype, nunits);
5627 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5629 /* The target has to make sure we support lowpart/highpart
5630 extraction, either via direct vector extract or through
5631 an integer mode punning. */
5632 tree dst1, dst2;
5633 gimple *epilog_stmt;
5634 if (convert_optab_handler (vec_extract_optab,
5635 TYPE_MODE (TREE_TYPE (new_temp)),
5636 TYPE_MODE (vectype1))
5637 != CODE_FOR_nothing)
5639 /* Extract sub-vectors directly once vec_extract becomes
5640 a conversion optab. */
5641 dst1 = make_ssa_name (vectype1);
5642 epilog_stmt
5643 = gimple_build_assign (dst1, BIT_FIELD_REF,
5644 build3 (BIT_FIELD_REF, vectype1,
5645 new_temp, TYPE_SIZE (vectype1),
5646 bitsize_int (0)));
5647 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5648 dst2 = make_ssa_name (vectype1);
5649 epilog_stmt
5650 = gimple_build_assign (dst2, BIT_FIELD_REF,
5651 build3 (BIT_FIELD_REF, vectype1,
5652 new_temp, TYPE_SIZE (vectype1),
5653 bitsize_int (bitsize)));
5654 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5656 else
5658 /* Extract via punning to appropriately sized integer mode
5659 vector. */
5660 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5661 tree etype = build_vector_type (eltype, 2);
5662 gcc_assert (convert_optab_handler (vec_extract_optab,
5663 TYPE_MODE (etype),
5664 TYPE_MODE (eltype))
5665 != CODE_FOR_nothing);
5666 tree tem = make_ssa_name (etype);
5667 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5668 build1 (VIEW_CONVERT_EXPR,
5669 etype, new_temp));
5670 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5671 new_temp = tem;
5672 tem = make_ssa_name (eltype);
5673 epilog_stmt
5674 = gimple_build_assign (tem, BIT_FIELD_REF,
5675 build3 (BIT_FIELD_REF, eltype,
5676 new_temp, TYPE_SIZE (eltype),
5677 bitsize_int (0)));
5678 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5679 dst1 = make_ssa_name (vectype1);
5680 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5681 build1 (VIEW_CONVERT_EXPR,
5682 vectype1, tem));
5683 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5684 tem = make_ssa_name (eltype);
5685 epilog_stmt
5686 = gimple_build_assign (tem, BIT_FIELD_REF,
5687 build3 (BIT_FIELD_REF, eltype,
5688 new_temp, TYPE_SIZE (eltype),
5689 bitsize_int (bitsize)));
5690 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5691 dst2 = make_ssa_name (vectype1);
5692 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5693 build1 (VIEW_CONVERT_EXPR,
5694 vectype1, tem));
5695 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5698 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5701 return new_temp;
5704 /* Function vect_create_epilog_for_reduction
5706 Create code at the loop-epilog to finalize the result of a reduction
5707 computation.
5709 STMT_INFO is the scalar reduction stmt that is being vectorized.
5710 SLP_NODE is an SLP node containing a group of reduction statements. The
5711 first one in this group is STMT_INFO.
5712 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5713 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5714 (counting from 0)
5716 This function:
5717 1. Completes the reduction def-use cycles.
5718 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5719 by calling the function specified by REDUC_FN if available, or by
5720 other means (whole-vector shifts or a scalar loop).
5721 The function also creates a new phi node at the loop exit to preserve
5722 loop-closed form, as illustrated below.
5724 The flow at the entry to this function:
5726 loop:
5727 vec_def = phi <vec_init, null> # REDUCTION_PHI
5728 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5729 s_loop = scalar_stmt # (scalar) STMT_INFO
5730 loop_exit:
5731 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5732 use <s_out0>
5733 use <s_out0>
5735 The above is transformed by this function into:
5737 loop:
5738 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5739 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5740 s_loop = scalar_stmt # (scalar) STMT_INFO
5741 loop_exit:
5742 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5743 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5744 v_out2 = reduce <v_out1>
5745 s_out3 = extract_field <v_out2, 0>
5746 s_out4 = adjust_result <s_out3>
5747 use <s_out4>
5748 use <s_out4>
5751 static void
5752 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5753 stmt_vec_info stmt_info,
5754 slp_tree slp_node,
5755 slp_instance slp_node_instance)
5757 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5758 gcc_assert (reduc_info->is_reduc_info);
5759 /* For double reductions we need to get at the inner loop reduction
5760 stmt which has the meta info attached. Our stmt_info is that of the
5761 loop-closed PHI of the inner loop which we remember as
5762 def for the reduction PHI generation. */
5763 bool double_reduc = false;
5764 stmt_vec_info rdef_info = stmt_info;
5765 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5767 gcc_assert (!slp_node);
5768 double_reduc = true;
5769 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5770 (stmt_info->stmt, 0));
5771 stmt_info = vect_stmt_to_vectorize (stmt_info);
5773 gphi *reduc_def_stmt
5774 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5775 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5776 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5777 tree vectype;
5778 machine_mode mode;
5779 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5780 basic_block exit_bb;
5781 tree scalar_dest;
5782 tree scalar_type;
5783 gimple *new_phi = NULL, *phi;
5784 gimple_stmt_iterator exit_gsi;
5785 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5786 gimple *epilog_stmt = NULL;
5787 gimple *exit_phi;
5788 tree bitsize;
5789 tree def;
5790 tree orig_name, scalar_result;
5791 imm_use_iterator imm_iter, phi_imm_iter;
5792 use_operand_p use_p, phi_use_p;
5793 gimple *use_stmt;
5794 auto_vec<tree> reduc_inputs;
5795 int j, i;
5796 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5797 unsigned int group_size = 1, k;
5798 auto_vec<gimple *> phis;
5799 /* SLP reduction without reduction chain, e.g.,
5800 # a1 = phi <a2, a0>
5801 # b1 = phi <b2, b0>
5802 a2 = operation (a1)
5803 b2 = operation (b1) */
5804 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5805 bool direct_slp_reduc;
5806 tree induction_index = NULL_TREE;
5808 if (slp_node)
5809 group_size = SLP_TREE_LANES (slp_node);
5811 if (nested_in_vect_loop_p (loop, stmt_info))
5813 outer_loop = loop;
5814 loop = loop->inner;
5815 gcc_assert (!slp_node && double_reduc);
5818 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5819 gcc_assert (vectype);
5820 mode = TYPE_MODE (vectype);
5822 tree induc_val = NULL_TREE;
5823 tree adjustment_def = NULL;
5824 if (slp_node)
5826 else
5828 /* Optimize: for induction condition reduction, if we can't use zero
5829 for induc_val, use initial_def. */
5830 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5831 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5832 else if (double_reduc)
5834 else
5835 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5838 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5839 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5840 if (slp_reduc)
5841 /* All statements produce live-out values. */
5842 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5843 else if (slp_node)
5845 /* The last statement in the reduction chain produces the live-out
5846 value. Note SLP optimization can shuffle scalar stmts to
5847 optimize permutations so we have to search for the last stmt. */
5848 for (k = 0; k < group_size; ++k)
5849 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5851 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5852 break;
5856 unsigned vec_num;
5857 int ncopies;
5858 if (slp_node)
5860 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5861 ncopies = 1;
5863 else
5865 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5866 vec_num = 1;
5867 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5870 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5871 which is updated with the current index of the loop for every match of
5872 the original loop's cond_expr (VEC_STMT). This results in a vector
5873 containing the last time the condition passed for that vector lane.
5874 The first match will be a 1 to allow 0 to be used for non-matching
5875 indexes. If there are no matches at all then the vector will be all
5876 zeroes.
5878 PR92772: This algorithm is broken for architectures that support
5879 masked vectors, but do not provide fold_extract_last. */
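  /* As an illustration, with VF = 4 the index IV has the value {1, 2, 3, 4}
     in the first vector iteration and {5, 6, 7, 8} in the second.  If lane 1
     matches only in the first iteration and lane 3 matches in both, the
     resulting INDUCTION_INDEX is {0, 2, 0, 8}: the IV value of the last match
     in each lane, or 0 for lanes that never matched.  */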
5880 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5882 auto_vec<std::pair<tree, bool>, 2> ccompares;
5883 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5884 cond_info = vect_stmt_to_vectorize (cond_info);
5885 while (cond_info != reduc_info)
5887 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5889 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5890 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5891 ccompares.safe_push
5892 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5893 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5895 cond_info
5896 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5897 1 + STMT_VINFO_REDUC_IDX
5898 (cond_info)));
5899 cond_info = vect_stmt_to_vectorize (cond_info);
5901 gcc_assert (ccompares.length () != 0);
5903 tree indx_before_incr, indx_after_incr;
5904 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5905 int scalar_precision
5906 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5907 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5908 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5909 (TYPE_MODE (vectype), cr_index_scalar_type,
5910 TYPE_VECTOR_SUBPARTS (vectype));
5912 /* First we create a simple vector induction variable which starts
5913 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5914 vector size (STEP). */
5916 /* Create a {1,2,3,...} vector. */
5917 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5919 /* Create a vector of the step value. */
5920 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5921 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5923 /* Create an induction variable. */
5924 gimple_stmt_iterator incr_gsi;
5925 bool insert_after;
5926 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5927 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5928 insert_after, &indx_before_incr, &indx_after_incr);
5930 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5931 filled with zeros (VEC_ZERO). */
5933 /* Create a vector of 0s. */
5934 tree zero = build_zero_cst (cr_index_scalar_type);
5935 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5937 /* Create a vector phi node. */
5938 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5939 new_phi = create_phi_node (new_phi_tree, loop->header);
5940 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5941 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5943 /* Now take the condition from the loop's original cond_exprs
5944 and produce a new cond_expr (INDEX_COND_EXPR) which for
5945 every match uses values from the induction variable
5946 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5947 (NEW_PHI_TREE).
5948 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5949 the new cond_expr (INDEX_COND_EXPR). */
5950 gimple_seq stmts = NULL;
5951 for (int i = ccompares.length () - 1; i != -1; --i)
5953 tree ccompare = ccompares[i].first;
5954 if (ccompares[i].second)
5955 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5956 cr_index_vector_type,
5957 ccompare,
5958 indx_before_incr, new_phi_tree);
5959 else
5960 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5961 cr_index_vector_type,
5962 ccompare,
5963 new_phi_tree, indx_before_incr);
5965 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5967 /* Update the phi with the vec cond. */
5968 induction_index = new_phi_tree;
5969 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5970 loop_latch_edge (loop), UNKNOWN_LOCATION);
5973 /* 2. Create epilog code.
5974 The reduction epilog code operates across the elements of the vector
5975 of partial results computed by the vectorized loop.
5976 The reduction epilog code consists of:
5978 step 1: compute the scalar result in a vector (v_out2)
5979 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5980 step 3: adjust the scalar result (s_out3) if needed.
5982 Step 1 can be accomplished using one of the following three schemes:
5983 (scheme 1) using reduc_fn, if available.
5984 (scheme 2) using whole-vector shifts, if available.
5985 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5986 combined.
5988 The overall epilog code looks like this:
5990 s_out0 = phi <s_loop> # original EXIT_PHI
5991 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5992 v_out2 = reduce <v_out1> # step 1
5993 s_out3 = extract_field <v_out2, 0> # step 2
5994 s_out4 = adjust_result <s_out3> # step 3
5996 (step 3 is optional, and steps 1 and 2 may be combined).
5997 Lastly, the uses of s_out0 are replaced by s_out4. */
6000 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6001 v_out1 = phi <VECT_DEF>
6002 Store them in NEW_PHIS. */
6003 if (double_reduc)
6004 loop = outer_loop;
6005 exit_bb = single_exit (loop)->dest;
6006 exit_gsi = gsi_after_labels (exit_bb);
6007 reduc_inputs.create (slp_node ? vec_num : ncopies);
6008 for (unsigned i = 0; i < vec_num; i++)
6010 gimple_seq stmts = NULL;
6011 if (slp_node)
6012 def = vect_get_slp_vect_def (slp_node, i);
6013 else
6014 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6015 for (j = 0; j < ncopies; j++)
6017 tree new_def = copy_ssa_name (def);
6018 phi = create_phi_node (new_def, exit_bb);
6019 if (j)
6020 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6021 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
6022 new_def = gimple_convert (&stmts, vectype, new_def);
6023 reduc_inputs.quick_push (new_def);
6025 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6028 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6029 (i.e. when reduc_fn is not available) and in the final adjustment
6030 code (if needed). Also get the original scalar reduction variable as
6031 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6032 represents a reduction pattern), the tree-code and scalar-def are
6033 taken from the original stmt that the pattern-stmt (STMT) replaces.
6034 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6035 are taken from STMT. */
6037 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6038 if (orig_stmt_info != stmt_info)
6040 /* Reduction pattern */
6041 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6042 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6045 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6046 scalar_type = TREE_TYPE (scalar_dest);
6047 scalar_results.truncate (0);
6048 scalar_results.reserve_exact (group_size);
6049 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6050 bitsize = TYPE_SIZE (scalar_type);
6052 /* True if we should implement SLP_REDUC using native reduction operations
6053 instead of scalar operations. */
6054 direct_slp_reduc = (reduc_fn != IFN_LAST
6055 && slp_reduc
6056 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6058 /* In case of reduction chain, e.g.,
6059 # a1 = phi <a3, a0>
6060 a2 = operation (a1)
6061 a3 = operation (a2),
6063 we may end up with more than one vector result. Here we reduce them
6064 to one vector.
6066 The same is true for an SLP reduction, e.g.,
6067 # a1 = phi <a2, a0>
6068 # b1 = phi <b2, b0>
6069 a2 = operation (a1)
6070 b2 = operation (b1),
6072 where we can end up with more than one vector as well. We can
6073 easily accumulate vectors when the number of vector elements is
6074 a multiple of the SLP group size.
6076 The same is true if we couldn't use a single defuse cycle. */
6077 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6078 || direct_slp_reduc
6079 || (slp_reduc
6080 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6081 || ncopies > 1)
6083 gimple_seq stmts = NULL;
6084 tree single_input = reduc_inputs[0];
6085 for (k = 1; k < reduc_inputs.length (); k++)
6086 single_input = gimple_build (&stmts, code, vectype,
6087 single_input, reduc_inputs[k]);
6088 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6090 reduc_inputs.truncate (0);
6091 reduc_inputs.safe_push (single_input);
6094 tree orig_reduc_input = reduc_inputs[0];
6096 /* If this loop is an epilogue loop that can be skipped after the
6097 main loop, we can only share a reduction operation between the
6098 main loop and the epilogue if we put it at the target of the
6099 skip edge.
6101 We can still reuse accumulators if this check fails. Doing so has
6102 the minor(?) benefit of making the epilogue loop's scalar result
6103 independent of the main loop's scalar result. */
6104 bool unify_with_main_loop_p = false;
6105 if (reduc_info->reused_accumulator
6106 && loop_vinfo->skip_this_loop_edge
6107 && single_succ_p (exit_bb)
6108 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6110 unify_with_main_loop_p = true;
6112 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6113 reduc_inputs[0] = make_ssa_name (vectype);
6114 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6115 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6116 UNKNOWN_LOCATION);
6117 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6118 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6119 exit_gsi = gsi_after_labels (reduc_block);
6122 /* Shouldn't be used beyond this point. */
6123 exit_bb = nullptr;
6125 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6126 && reduc_fn != IFN_LAST)
6128 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6129 various data values where the condition matched and another vector
6130 (INDUCTION_INDEX) containing all the indexes of those matches. We
6131 need to extract the last matching index (which will be the index with
6132 highest value) and use this to index into the data vector.
6133 For the case where there were no matches, the data vector will contain
6134 all default values and the index vector will be all zeros. */
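      /* Continuing the {0, 2, 0, 8} illustration above: REDUC_MAX over the
	 index vector gives 8, comparing against {8, 8, 8, 8} selects only
	 lane 3, the VEC_COND keeps that lane's data value and zeros the rest,
	 and a final REDUC_MAX on the values (viewed as unsigned) extracts it
	 as the scalar result.  */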
6136 /* Get various versions of the type of the vector of indexes. */
6137 tree index_vec_type = TREE_TYPE (induction_index);
6138 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6139 tree index_scalar_type = TREE_TYPE (index_vec_type);
6140 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6142 /* Get an unsigned integer version of the type of the data vector. */
6143 int scalar_precision
6144 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6145 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6146 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6147 vectype);
6149 /* First we need to create a vector (ZERO_VEC) of zeros and another
6150 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6151 can create using a MAX reduction and then expanding.
6152 In the case where the loop never made any matches, the max index will
6153 be zero. */
6155 /* Vector of {0, 0, 0,...}. */
6156 tree zero_vec = build_zero_cst (vectype);
6158 /* Find maximum value from the vector of found indexes. */
6159 tree max_index = make_ssa_name (index_scalar_type);
6160 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6161 1, induction_index);
6162 gimple_call_set_lhs (max_index_stmt, max_index);
6163 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6165 /* Vector of {max_index, max_index, max_index,...}. */
6166 tree max_index_vec = make_ssa_name (index_vec_type);
6167 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6168 max_index);
6169 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6170 max_index_vec_rhs);
6171 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6173 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6174 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6175 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6176 otherwise. Only one value should match, resulting in a vector
6177 (VEC_COND) with one data value and the rest zeros.
6178 In the case where the loop never made any matches, every index will
6179 match, resulting in a vector with all data values (which will all be
6180 the default value). */
6182 /* Compare the max index vector to the vector of found indexes to find
6183 the position of the max value. */
6184 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6185 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6186 induction_index,
6187 max_index_vec);
6188 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6190 /* Use the compare to choose either values from the data vector or
6191 zero. */
6192 tree vec_cond = make_ssa_name (vectype);
6193 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6194 vec_compare,
6195 reduc_inputs[0],
6196 zero_vec);
6197 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6199 /* Finally we need to extract the data value from the vector (VEC_COND)
6200 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6201 reduction, but because this doesn't exist, we can use a MAX reduction
6202 instead. The data value might be signed or a float so we need to cast
6203 it first.
6204 In the case where the loop never made any matches, the data values are
6205 all identical, and so will reduce down correctly. */
6207 /* Make the matched data values unsigned. */
6208 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6209 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6210 vec_cond);
6211 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6212 VIEW_CONVERT_EXPR,
6213 vec_cond_cast_rhs);
6214 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6216 /* Reduce down to a scalar value. */
6217 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6218 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6219 1, vec_cond_cast);
6220 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6221 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6223 /* Convert the reduced value back to the result type and set as the
6224 result. */
6225 gimple_seq stmts = NULL;
6226 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6227 data_reduc);
6228 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6229 scalar_results.safe_push (new_temp);
6231 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6232 && reduc_fn == IFN_LAST)
6234 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6235 the equivalent of:
6236 idx_val = induction_index[0];
6237 val = data_reduc[0];
6238 for (i = 1; i < nelts; ++i)
6239 if (induction_index[i] > idx_val)
6240 val = data_reduc[i], idx_val = induction_index[i];
6241 return val; */
6243 tree data_eltype = TREE_TYPE (vectype);
6244 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6245 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6246 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6247 /* Enforced by vectorizable_reduction, which ensures we have target
6248 support before allowing a conditional reduction on variable-length
6249 vectors. */
6250 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6251 tree idx_val = NULL_TREE, val = NULL_TREE;
6252 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6254 tree old_idx_val = idx_val;
6255 tree old_val = val;
6256 idx_val = make_ssa_name (idx_eltype);
6257 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6258 build3 (BIT_FIELD_REF, idx_eltype,
6259 induction_index,
6260 bitsize_int (el_size),
6261 bitsize_int (off)));
6262 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6263 val = make_ssa_name (data_eltype);
6264 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6265 build3 (BIT_FIELD_REF,
6266 data_eltype,
6267 reduc_inputs[0],
6268 bitsize_int (el_size),
6269 bitsize_int (off)));
6270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6271 if (off != 0)
6273 tree new_idx_val = idx_val;
6274 if (off != v_size - el_size)
6276 new_idx_val = make_ssa_name (idx_eltype);
6277 epilog_stmt = gimple_build_assign (new_idx_val,
6278 MAX_EXPR, idx_val,
6279 old_idx_val);
6280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6282 tree cond = make_ssa_name (boolean_type_node);
6283 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6284 idx_val, old_idx_val);
6285 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6286 tree new_val = make_ssa_name (data_eltype);
6287 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6288 cond, val, old_val);
6289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6290 idx_val = new_idx_val;
6291 val = new_val;
6294 /* Convert the reduced value back to the result type and set as the
6295 result. */
6296 gimple_seq stmts = NULL;
6297 val = gimple_convert (&stmts, scalar_type, val);
6298 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6299 scalar_results.safe_push (val);
6302 /* 2.3 Create the reduction code, using one of the three schemes described
6303 above. In SLP we simply need to extract all the elements from the
6304 vector (without reducing them), so we use scalar extracts (case 3 below). */
6305 else if (reduc_fn != IFN_LAST && !slp_reduc)
6307 tree tmp;
6308 tree vec_elem_type;
6310 /* Case 1: Create:
6311 v_out2 = reduc_expr <v_out1> */
6313 if (dump_enabled_p ())
6314 dump_printf_loc (MSG_NOTE, vect_location,
6315 "Reduce using direct vector reduction.\n");
6317 gimple_seq stmts = NULL;
6318 vec_elem_type = TREE_TYPE (vectype);
6319 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6320 vec_elem_type, reduc_inputs[0]);
6321 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6322 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6324 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6325 && induc_val)
6327 /* Earlier we set the initial value to be a vector of induc_val
6328 values. Check the result and if it is induc_val then replace it
6329 with the original initial value, unless induc_val is
6330 the same as initial_def already. */
6331 tree zcompare = make_ssa_name (boolean_type_node);
6332 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6333 new_temp, induc_val);
6334 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6335 tree initial_def = reduc_info->reduc_initial_values[0];
6336 tmp = make_ssa_name (new_scalar_dest);
6337 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6338 initial_def, new_temp);
6339 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6340 new_temp = tmp;
6343 scalar_results.safe_push (new_temp);
6345 else if (direct_slp_reduc)
6347 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6348 with the elements for other SLP statements replaced with the
6349 neutral value. We can then do a normal reduction on each vector. */
6351 /* Enforced by vectorizable_reduction. */
6352 gcc_assert (reduc_inputs.length () == 1);
6353 gcc_assert (pow2p_hwi (group_size));
6355 gimple_seq seq = NULL;
6357 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6358 and the same element size as VECTYPE. */
6359 tree index = build_index_vector (vectype, 0, 1);
6360 tree index_type = TREE_TYPE (index);
6361 tree index_elt_type = TREE_TYPE (index_type);
6362 tree mask_type = truth_type_for (index_type);
6364 /* Create a vector that, for each element, identifies which of
6365 the REDUC_GROUP_SIZE results should use it. */
6366 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6367 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6368 build_vector_from_val (index_type, index_mask));
6370 /* Get a neutral vector value. This is simply a splat of the neutral
6371 scalar value if we have one, otherwise the initial scalar value
6372 is itself a neutral value. */
6373 tree vector_identity = NULL_TREE;
6374 tree neutral_op = NULL_TREE;
6375 if (slp_node)
6377 tree initial_value = NULL_TREE;
6378 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6379 initial_value = reduc_info->reduc_initial_values[0];
6380 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6381 initial_value);
6383 if (neutral_op)
6384 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6385 neutral_op);
6386 for (unsigned int i = 0; i < group_size; ++i)
6388 /* If there's no universal neutral value, we can use the
6389 initial scalar value from the original PHI. This is used
6390 for MIN and MAX reduction, for example. */
6391 if (!neutral_op)
6393 tree scalar_value = reduc_info->reduc_initial_values[i];
6394 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6395 scalar_value);
6396 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6397 scalar_value);
6400 /* Calculate the equivalent of:
6402 sel[j] = (index[j] == i);
6404 which selects the elements of REDUC_INPUTS[0] that should
6405 be included in the result. */
6406 tree compare_val = build_int_cst (index_elt_type, i);
6407 compare_val = build_vector_from_val (index_type, compare_val);
6408 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6409 index, compare_val);
6411 /* Calculate the equivalent of:
6413 vec = sel ? reduc_inputs[0] : vector_identity;
6415 VEC is now suitable for a full vector reduction. */
6416 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6417 sel, reduc_inputs[0], vector_identity);
6419 /* Do the reduction and convert it to the appropriate type. */
6420 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6421 TREE_TYPE (vectype), vec);
6422 scalar = gimple_convert (&seq, scalar_type, scalar);
6423 scalar_results.safe_push (scalar);
6425 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6427 else
6429 bool reduce_with_shift;
6430 tree vec_temp;
6432 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6434 /* See if the target wants to do the final (shift) reduction
6435 in a vector mode of smaller size and first reduce upper/lower
6436 halves against each other. */
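      /* For instance, a target could ask to split a V8SI accumulation into
	 two V4SI halves here, add the halves, and continue the shift-based
	 reduction in the narrower mode; whether that happens is entirely up
	 to targetm.vectorize.split_reduction.  */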
6437 enum machine_mode mode1 = mode;
6438 tree stype = TREE_TYPE (vectype);
6439 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6440 unsigned nunits1 = nunits;
6441 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6442 && reduc_inputs.length () == 1)
6444 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6445 /* For SLP reductions we have to make sure lanes match up, but
6446 since we're doing an individual-element final reduction, reducing
6447 the vector width here is even more important.
6448 ??? We could also separate lanes with permutes; for the common
6449 case of a power-of-two group size, odd/even extracts would work. */
6450 if (slp_reduc && nunits != nunits1)
6452 nunits1 = least_common_multiple (nunits1, group_size);
6453 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6456 if (!slp_reduc
6457 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6458 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6460 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6461 stype, nunits1);
6462 reduce_with_shift = have_whole_vector_shift (mode1);
6463 if (!VECTOR_MODE_P (mode1)
6464 || !directly_supported_p (code, vectype1))
6465 reduce_with_shift = false;
6467 /* First reduce the vector to the desired vector size we should
6468 do shift reduction on by combining upper and lower halves. */
6469 gimple_seq stmts = NULL;
6470 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6471 code, &stmts);
6472 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6473 reduc_inputs[0] = new_temp;
6475 if (reduce_with_shift && !slp_reduc)
6477 int element_bitsize = tree_to_uhwi (bitsize);
6478 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6479 for variable-length vectors and also requires direct target support
6480 for loop reductions. */
6481 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6482 int nelements = vec_size_in_bits / element_bitsize;
6483 vec_perm_builder sel;
6484 vec_perm_indices indices;
6486 int elt_offset;
6488 tree zero_vec = build_zero_cst (vectype1);
6489 /* Case 2: Create:
6490 for (offset = nelements/2; offset >= 1; offset/=2)
6492 Create: va' = vec_shift <va, offset>
6493 Create: va = vop <va, va'>
6494 } */
6496 tree rhs;
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_NOTE, vect_location,
6500 "Reduce using vector shifts\n");
6502 gimple_seq stmts = NULL;
6503 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6504 for (elt_offset = nelements / 2;
6505 elt_offset >= 1;
6506 elt_offset /= 2)
6508 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6509 indices.new_vector (sel, 2, nelements);
6510 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6511 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6512 new_temp, zero_vec, mask);
6513 new_temp = gimple_build (&stmts, code,
6514 vectype1, new_name, new_temp);
6516 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6518 /* 2.4 Extract the final scalar result. Create:
6519 s_out3 = extract_field <v_out2, bitpos> */
6521 if (dump_enabled_p ())
6522 dump_printf_loc (MSG_NOTE, vect_location,
6523 "extract scalar result\n");
6525 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6526 bitsize, bitsize_zero_node);
6527 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6528 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6529 gimple_assign_set_lhs (epilog_stmt, new_temp);
6530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6531 scalar_results.safe_push (new_temp);
6533 else
6535 /* Case 3: Create:
6536 s = extract_field <v_out2, 0>
6537 for (offset = element_size;
6538 offset < vector_size;
6539 offset += element_size;)
6541 Create: s' = extract_field <v_out2, offset>
6542 Create: s = op <s, s'> // For non SLP cases
6543 } */
6545 if (dump_enabled_p ())
6546 dump_printf_loc (MSG_NOTE, vect_location,
6547 "Reduce using scalar code.\n");
6549 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6550 int element_bitsize = tree_to_uhwi (bitsize);
6551 tree compute_type = TREE_TYPE (vectype);
6552 gimple_seq stmts = NULL;
6553 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6555 int bit_offset;
6556 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6557 vec_temp, bitsize, bitsize_zero_node);
6559 /* In SLP we don't need to apply reduction operation, so we just
6560 collect s' values in SCALAR_RESULTS. */
6561 if (slp_reduc)
6562 scalar_results.safe_push (new_temp);
6564 for (bit_offset = element_bitsize;
6565 bit_offset < vec_size_in_bits;
6566 bit_offset += element_bitsize)
6568 tree bitpos = bitsize_int (bit_offset);
6569 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6570 compute_type, vec_temp,
6571 bitsize, bitpos);
6572 if (slp_reduc)
6574 /* In SLP we don't need to apply reduction operation, so
6575 we just collect s' values in SCALAR_RESULTS. */
6576 new_temp = new_name;
6577 scalar_results.safe_push (new_name);
6579 else
6580 new_temp = gimple_build (&stmts, code, compute_type,
6581 new_name, new_temp);
6585 /* The only case where we need to reduce scalar results in SLP is
6586 unrolling. If the size of SCALAR_RESULTS is greater than
6587 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6588 REDUC_GROUP_SIZE. */
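	  /* For example, with REDUC_GROUP_SIZE 2 and an unrolled SLP
	     reduction, SCALAR_RESULTS may hold {a0, b0, a1, b1}; the loop
	     below folds this to {a0 op a1, b0 op b1} before converting the
	     results to the scalar type.  */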
6589 if (slp_reduc)
6591 tree res, first_res, new_res;
6593 /* Reduce multiple scalar results in case of SLP unrolling. */
6594 for (j = group_size; scalar_results.iterate (j, &res);
6595 j++)
6597 first_res = scalar_results[j % group_size];
6598 new_res = gimple_build (&stmts, code, compute_type,
6599 first_res, res);
6600 scalar_results[j % group_size] = new_res;
6602 scalar_results.truncate (group_size);
6603 for (k = 0; k < group_size; k++)
6604 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6605 scalar_results[k]);
6607 else
6609 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6610 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6611 scalar_results.safe_push (new_temp);
6614 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6617 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6618 && induc_val)
6620 /* Earlier we set the initial value to be a vector of induc_val
6621 values. Check the result and if it is induc_val then replace it
6622 with the original initial value, unless induc_val is
6623 the same as initial_def already. */
6624 tree zcompare = make_ssa_name (boolean_type_node);
6625 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6626 induc_val);
6627 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6628 tree initial_def = reduc_info->reduc_initial_values[0];
6629 tree tmp = make_ssa_name (new_scalar_dest);
6630 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6631 initial_def, new_temp);
6632 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6633 scalar_results[0] = tmp;
6637 /* 2.5 Adjust the final result by the initial value of the reduction
6638 variable. (When such adjustment is not needed, then
6639 'adjustment_def' is zero). For example, if code is PLUS we create:
6640 new_temp = loop_exit_def + adjustment_def */
6642 if (adjustment_def)
6644 gcc_assert (!slp_reduc);
6645 gimple_seq stmts = NULL;
6646 if (double_reduc)
6648 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6649 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6650 new_temp = gimple_build (&stmts, code, vectype,
6651 reduc_inputs[0], adjustment_def);
6653 else
6655 new_temp = scalar_results[0];
6656 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6657 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6658 adjustment_def);
6659 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6660 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6661 new_temp, adjustment_def);
6662 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6665 epilog_stmt = gimple_seq_last_stmt (stmts);
6666 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6667 scalar_results[0] = new_temp;
6670 /* Record this operation if it could be reused by the epilogue loop. */
6671 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6672 && reduc_inputs.length () == 1)
6673 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6674 { orig_reduc_input, reduc_info });
6676 if (double_reduc)
6677 loop = outer_loop;
6679 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6680 phis with new adjusted scalar results, i.e., replace use <s_out0>
6681 with use <s_out4>.
6683 Transform:
6684 loop_exit:
6685 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6686 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6687 v_out2 = reduce <v_out1>
6688 s_out3 = extract_field <v_out2, 0>
6689 s_out4 = adjust_result <s_out3>
6690 use <s_out0>
6691 use <s_out0>
6693 into:
6695 loop_exit:
6696 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6697 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6698 v_out2 = reduce <v_out1>
6699 s_out3 = extract_field <v_out2, 0>
6700 s_out4 = adjust_result <s_out3>
6701 use <s_out4>
6702 use <s_out4> */
6704 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6705 for (k = 0; k < live_out_stmts.size (); k++)
6707 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6708 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6710 phis.create (3);
6711 /* Find the loop-closed-use at the loop exit of the original scalar
6712 result. (The reduction result is expected to have two immediate uses,
6713 one at the latch block, and one at the loop exit). For double
6714 reductions we are looking for exit phis of the outer loop. */
6715 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6717 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6719 if (!is_gimple_debug (USE_STMT (use_p)))
6720 phis.safe_push (USE_STMT (use_p));
6722 else
6724 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6726 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6728 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6730 if (!flow_bb_inside_loop_p (loop,
6731 gimple_bb (USE_STMT (phi_use_p)))
6732 && !is_gimple_debug (USE_STMT (phi_use_p)))
6733 phis.safe_push (USE_STMT (phi_use_p));
6739 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6741 /* Replace the uses: */
6742 orig_name = PHI_RESULT (exit_phi);
6744 /* Look for a single use at the target of the skip edge. */
6745 if (unify_with_main_loop_p)
6747 use_operand_p use_p;
6748 gimple *user;
6749 if (!single_imm_use (orig_name, &use_p, &user))
6750 gcc_unreachable ();
6751 orig_name = gimple_get_lhs (user);
6754 scalar_result = scalar_results[k];
6755 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6757 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6758 SET_USE (use_p, scalar_result);
6759 update_stmt (use_stmt);
6763 phis.release ();
6767 /* Return a vector of type VECTYPE that is equal to the vector select
6768 operation "MASK ? VEC : IDENTITY". Insert the select statements
6769 before GSI. */
6771 static tree
6772 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6773 tree vec, tree identity)
6775 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6776 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6777 mask, vec, identity);
6778 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6779 return cond;
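/* For example (SSA names invented), with a 4-lane float vector and a zero
   identity this emits a single statement of the form

     cond_7 = VEC_COND_EXPR <mask_5, vec_3, { 0.0, 0.0, 0.0, 0.0 }>;

   so that masked-off lanes feed the identity value into the in-order
   reduction that follows.  */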
6782 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6783 order, starting with LHS. Insert the extraction statements before GSI and
6784 associate the new scalar SSA names with variable SCALAR_DEST.
6785 Return the SSA name for the result. */
6787 static tree
6788 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6789 tree_code code, tree lhs, tree vector_rhs)
6791 tree vectype = TREE_TYPE (vector_rhs);
6792 tree scalar_type = TREE_TYPE (vectype);
6793 tree bitsize = TYPE_SIZE (scalar_type);
6794 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6795 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6797 for (unsigned HOST_WIDE_INT bit_offset = 0;
6798 bit_offset < vec_size_in_bits;
6799 bit_offset += element_bitsize)
6801 tree bitpos = bitsize_int (bit_offset);
6802 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6803 bitsize, bitpos);
6805 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6806 rhs = make_ssa_name (scalar_dest, stmt);
6807 gimple_assign_set_lhs (stmt, rhs);
6808 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6810 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6811 tree new_name = make_ssa_name (scalar_dest, stmt);
6812 gimple_assign_set_lhs (stmt, new_name);
6813 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6814 lhs = new_name;
6816 return lhs;
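/* Illustrative expansion (SSA names invented) for a 4 x 32-bit vector and
   CODE == PLUS_EXPR:

     s_0 = BIT_FIELD_REF <vector_rhs, 32, 0>;
     lhs_1 = lhs + s_0;
     s_1 = BIT_FIELD_REF <vector_rhs, 32, 32>;
     lhs_2 = lhs_1 + s_1;
     s_2 = BIT_FIELD_REF <vector_rhs, 32, 64>;
     lhs_3 = lhs_2 + s_2;
     s_3 = BIT_FIELD_REF <vector_rhs, 32, 96>;
     lhs_4 = lhs_3 + s_3;

   and lhs_4 is returned, preserving the left-to-right association that an
   in-order reduction requires.  */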
6819 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6820 type of the vector input. */
6822 static internal_fn
6823 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6825 internal_fn mask_reduc_fn;
6826 internal_fn mask_len_reduc_fn;
6828 switch (reduc_fn)
6830 case IFN_FOLD_LEFT_PLUS:
6831 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6832 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6833 break;
6835 default:
6836 return IFN_LAST;
6839 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6840 OPTIMIZE_FOR_SPEED))
6841 return mask_reduc_fn;
6842 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6843 OPTIMIZE_FOR_SPEED))
6844 return mask_len_reduc_fn;
6845 return IFN_LAST;
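/* For example, for IFN_FOLD_LEFT_PLUS on a target that only provides the
   length-and-mask variant, this returns IFN_MASK_LEN_FOLD_LEFT_PLUS; on a
   target with neither masked variant it returns IFN_LAST and the caller
   has to fall back to merging with an identity vector or open-coding the
   reduction.  */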
6848 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6849 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6850 statement. CODE is the operation performed by STMT_INFO and OPS are
6851 its scalar operands. REDUC_INDEX is the index of the operand in
6852 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6853 implements in-order reduction, or IFN_LAST if we should open-code it.
6854 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6855 that should be used to control the operation in a fully-masked loop. */
6857 static bool
6858 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6859 stmt_vec_info stmt_info,
6860 gimple_stmt_iterator *gsi,
6861 gimple **vec_stmt, slp_tree slp_node,
6862 gimple *reduc_def_stmt,
6863 tree_code code, internal_fn reduc_fn,
6864 tree ops[3], tree vectype_in,
6865 int reduc_index, vec_loop_masks *masks,
6866 vec_loop_lens *lens)
6868 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6869 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6870 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6872 int ncopies;
6873 if (slp_node)
6874 ncopies = 1;
6875 else
6876 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6878 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6879 gcc_assert (ncopies == 1);
6880 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6882 if (slp_node)
6883 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6884 TYPE_VECTOR_SUBPARTS (vectype_in)));
6886 tree op0 = ops[1 - reduc_index];
6888 int group_size = 1;
6889 stmt_vec_info scalar_dest_def_info;
6890 auto_vec<tree> vec_oprnds0;
6891 if (slp_node)
6893 auto_vec<vec<tree> > vec_defs (2);
6894 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6895 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6896 vec_defs[0].release ();
6897 vec_defs[1].release ();
6898 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6899 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6901 else
6903 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6904 op0, &vec_oprnds0);
6905 scalar_dest_def_info = stmt_info;
6908 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6909 tree scalar_type = TREE_TYPE (scalar_dest);
6910 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6912 int vec_num = vec_oprnds0.length ();
6913 gcc_assert (vec_num == 1 || slp_node);
6914 tree vec_elem_type = TREE_TYPE (vectype_out);
6915 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6917 tree vector_identity = NULL_TREE;
6918 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6920 vector_identity = build_zero_cst (vectype_out);
6921 if (!HONOR_SIGNED_ZEROS (vectype_out))
6923 else
6925 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
6926 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
6927 vector_identity);
6931 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6932 int i;
6933 tree def0;
6934 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6936 gimple *new_stmt;
6937 tree mask = NULL_TREE;
6938 tree len = NULL_TREE;
6939 tree bias = NULL_TREE;
6940 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6941 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6942 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6944 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6945 i, 1);
6946 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6947 bias = build_int_cst (intQI_type_node, biasval);
6948 mask = build_minus_one_cst (truth_type_for (vectype_in));
6951 /* Handle MINUS by adding the negative. */
6952 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6954 tree negated = make_ssa_name (vectype_out);
6955 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6956 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6957 def0 = negated;
6960 if (mask && mask_reduc_fn == IFN_LAST)
6961 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6962 vector_identity);
6964 /* On the first iteration the input is simply the scalar phi
6965 result, and for subsequent iterations it is the output of
6966 the preceding operation. */
6967 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6969 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6970 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6971 def0, mask, len, bias);
6972 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6973 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6974 def0, mask);
6975 else
6976 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6977 def0);
6978 /* For chained SLP reductions the output of the previous reduction
6979 operation serves as the input of the next. For the final statement
6980 the output cannot be a temporary - we reuse the original
6981 scalar destination of the last statement. */
6982 if (i != vec_num - 1)
6984 gimple_set_lhs (new_stmt, scalar_dest_var);
6985 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6986 gimple_set_lhs (new_stmt, reduc_var);
6989 else
6991 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6992 reduc_var, def0);
6993 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6994 /* Remove the statement, so that we can use the same code paths
6995 as for statements that we've just created. */
6996 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6997 gsi_remove (&tmp_gsi, true);
7000 if (i == vec_num - 1)
7002 gimple_set_lhs (new_stmt, scalar_dest);
7003 vect_finish_replace_stmt (loop_vinfo,
7004 scalar_dest_def_info,
7005 new_stmt);
7007 else
7008 vect_finish_stmt_generation (loop_vinfo,
7009 scalar_dest_def_info,
7010 new_stmt, gsi);
7012 if (slp_node)
7013 slp_node->push_vec_def (new_stmt);
7014 else
7016 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7017 *vec_stmt = new_stmt;
7021 return true;
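/* Illustrative output (SSA names invented): in a fully-masked loop where
   IFN_MASK_FOLD_LEFT_PLUS is available, each iteration of the loop above
   ends up emitting a call of the form

     reduc_4 = .MASK_FOLD_LEFT_PLUS (reduc_3, vect_def_2, loop_mask_1);

   i.e. a strictly in-order accumulation of the active lanes into the
   scalar accumulator carried by the reduction PHI.  */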
7024 /* Function is_nonwrapping_integer_induction.
7026 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7027 does not cause overflow. */
7029 static bool
7030 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7032 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7033 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7034 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7035 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7036 widest_int ni, max_loop_value, lhs_max;
7037 wi::overflow_type overflow = wi::OVF_NONE;
7039 /* Make sure the loop is integer based. */
7040 if (TREE_CODE (base) != INTEGER_CST
7041 || TREE_CODE (step) != INTEGER_CST)
7042 return false;
7044 /* Check that the max size of the loop will not wrap. */
7046 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7047 return true;
7049 if (! max_stmt_executions (loop, &ni))
7050 return false;
7052 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7053 &overflow);
7054 if (overflow)
7055 return false;
7057 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7058 TYPE_SIGN (lhs_type), &overflow);
7059 if (overflow)
7060 return false;
7062 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7063 <= TYPE_PRECISION (lhs_type));
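/* Worked example (numbers invented): with BASE 0, STEP 4 and at most 1000
   statement executions, the largest value the induction reaches is
   0 + 4 * 1000 = 4000, which needs only 13 bits of signed precision and so
   fits a 32-bit LHS type; the function returns true.  (If the LHS type has
   undefined overflow, the check is skipped and true is returned directly.)  */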
7066 /* Check if masking can be supported by inserting a conditional expression.
7067 CODE is the code for the operation. COND_FN is the conditional internal
7068 function, if it exists. VECTYPE_IN is the type of the vector input. */
7069 static bool
7070 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7071 tree vectype_in)
7073 if (cond_fn != IFN_LAST
7074 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7075 OPTIMIZE_FOR_SPEED))
7076 return false;
7078 if (code.is_tree_code ())
7079 switch (tree_code (code))
7081 case DOT_PROD_EXPR:
7082 case SAD_EXPR:
7083 return true;
7085 default:
7086 break;
7088 return false;
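/* In other words, the VEC_COND_EXPR fallback below is only used for
   DOT_PROD_EXPR and SAD_EXPR, and only when the target does not already
   provide a conditional internal function for the operation.  */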
7091 /* Insert a conditional expression to enable masked vectorization. CODE is the
7092 code for the operation. VOP is the array of operands. MASK is the loop
7093 mask. GSI is a statement iterator used to place the new conditional
7094 expression. */
7095 static void
7096 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7097 gimple_stmt_iterator *gsi)
7099 switch (tree_code (code))
7101 case DOT_PROD_EXPR:
7103 tree vectype = TREE_TYPE (vop[1]);
7104 tree zero = build_zero_cst (vectype);
7105 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7106 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7107 mask, vop[1], zero);
7108 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7109 vop[1] = masked_op1;
7110 break;
7113 case SAD_EXPR:
7115 tree vectype = TREE_TYPE (vop[1]);
7116 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7117 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7118 mask, vop[1], vop[0]);
7119 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7120 vop[1] = masked_op1;
7121 break;
7124 default:
7125 gcc_unreachable ();
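/* E.g. for a masked DOT_PROD_EXPR the code above rewrites operand 1 as
   (SSA names invented)

     masked_op1_6 = VEC_COND_EXPR <loop_mask_3, vop1_4, { 0, ... }>;

   so inactive lanes multiply by zero and leave the accumulator unchanged;
   for SAD_EXPR the inactive lanes of operand 1 are replaced by operand 0,
   making the absolute difference zero for those lanes.  */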
7129 /* Function vectorizable_reduction.
7131 Check if STMT_INFO performs a reduction operation that can be vectorized.
7132 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7133 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7134 Return true if STMT_INFO is vectorizable in this way.
7136 This function also handles reduction idioms (patterns) that have been
7137 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7138 may be of this form:
7139 X = pattern_expr (arg0, arg1, ..., X)
7140 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7141 sequence that had been detected and replaced by the pattern-stmt
7142 (STMT_INFO).
7144 This function also handles reduction of condition expressions, for example:
7145 for (int i = 0; i < N; i++)
7146 if (a[i] < value)
7147 last = a[i];
7148 This is handled by vectorising the loop and creating an additional vector
7149 containing the loop indexes for which "a[i] < value" was true. In the
7150 function epilogue this is reduced to a single max value and then used to
7151 index into the vector of results.
7153 In some cases of reduction patterns, the type of the reduction variable X is
7154 different than the type of the other arguments of STMT_INFO.
7155 In such cases, the vectype that is used when transforming STMT_INFO into
7156 a vector stmt is different than the vectype that is used to determine the
7157 vectorization factor, because it consists of a different number of elements
7158 than the actual number of elements that are being operated upon in parallel.
7160 For example, consider an accumulation of shorts into an int accumulator.
7161 On some targets it's possible to vectorize this pattern operating on 8
7162 shorts at a time (hence, the vectype for purposes of determining the
7163 vectorization factor should be V8HI); on the other hand, the vectype that
7164 is used to create the vector form is actually V4SI (the type of the result).
7166 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7167 indicates what is the actual level of parallelism (V8HI in the example), so
7168 that the right vectorization factor would be derived. This vectype
7169 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7170 be used to create the vectorized stmt. The right vectype for the vectorized
7171 stmt is obtained from the type of the result X:
7172 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7174 This means that, contrary to "regular" reductions (or "regular" stmts in
7175 general), the following equation:
7176 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7177 does *NOT* necessarily hold for reduction patterns. */
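/* As a concrete (target-dependent, purely illustrative) instance of the
   above: for the widening accumulation

     short s; int sum;
     ...
     sum += s;        // recognized as sum = WIDEN_SUM_EXPR <s, sum>

   STMT_VINFO_VECTYPE might be V8HI, which determines the vectorization
   factor, while the vectorized statement itself is built with the V4SI
   returned by get_vectype_for_scalar_type (vinfo, TREE_TYPE (sum)).  */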
7179 bool
7180 vectorizable_reduction (loop_vec_info loop_vinfo,
7181 stmt_vec_info stmt_info, slp_tree slp_node,
7182 slp_instance slp_node_instance,
7183 stmt_vector_for_cost *cost_vec)
7185 tree vectype_in = NULL_TREE;
7186 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7187 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7188 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7189 stmt_vec_info cond_stmt_vinfo = NULL;
7190 int i;
7191 int ncopies;
7192 bool single_defuse_cycle = false;
7193 bool nested_cycle = false;
7194 bool double_reduc = false;
7195 int vec_num;
7196 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7197 tree cond_reduc_val = NULL_TREE;
7199 /* Make sure it was already recognized as a reduction computation. */
7200 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7201 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7202 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7203 return false;
7205 /* The stmt we store reduction analysis meta on. */
7206 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7207 reduc_info->is_reduc_info = true;
7209 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7211 if (is_a <gphi *> (stmt_info->stmt))
7213 if (slp_node)
7215 /* We eventually need to set a vector type on invariant
7216 arguments. */
7217 unsigned j;
7218 slp_tree child;
7219 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7220 if (!vect_maybe_update_slp_op_vectype
7221 (child, SLP_TREE_VECTYPE (slp_node)))
7223 if (dump_enabled_p ())
7224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7225 "incompatible vector types for "
7226 "invariants\n");
7227 return false;
7230 /* Analysis for double-reduction is done on the outer
7231 loop PHI, nested cycles have no further restrictions. */
7232 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7234 else
7235 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7236 return true;
7239 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7240 stmt_vec_info phi_info = stmt_info;
7241 if (!is_a <gphi *> (stmt_info->stmt))
7243 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7244 return true;
7246 if (slp_node)
7248 slp_node_instance->reduc_phis = slp_node;
7249 /* ??? We're leaving slp_node pointing to the PHIs; we only
7250 need it to get at the number of vector stmts, which wasn't
7251 yet initialized for the instance root. */
7253 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7255 use_operand_p use_p;
7256 gimple *use_stmt;
7257 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7258 &use_p, &use_stmt);
7259 gcc_assert (res);
7260 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7263 /* PHIs should not participate in patterns. */
7264 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7265 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7267 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7268 and compute the reduction chain length. Discover the real
7269 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7270 tree reduc_def
7271 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7272 loop_latch_edge
7273 (gimple_bb (reduc_def_phi)->loop_father));
7274 unsigned reduc_chain_length = 0;
7275 bool only_slp_reduc_chain = true;
7276 stmt_info = NULL;
7277 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7278 while (reduc_def != PHI_RESULT (reduc_def_phi))
7280 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7281 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7282 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7284 if (dump_enabled_p ())
7285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7286 "reduction chain broken by patterns.\n");
7287 return false;
7289 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7290 only_slp_reduc_chain = false;
7291 /* For epilogue generation live members of the chain need
7292 to point back to the PHI via their original stmt for
7293 info_for_reduction to work. For SLP we need to look at
7294 all lanes here - even though we will only vectorize from
7295 the SLP node with live lane zero, the other live lanes also
7296 need to be identified as part of a reduction to be able
7297 to skip code generation for them. */
7298 if (slp_for_stmt_info)
7300 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7301 if (STMT_VINFO_LIVE_P (s))
7302 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7304 else if (STMT_VINFO_LIVE_P (vdef))
7305 STMT_VINFO_REDUC_DEF (def) = phi_info;
7306 gimple_match_op op;
7307 if (!gimple_extract_op (vdef->stmt, &op))
7309 if (dump_enabled_p ())
7310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7311 "reduction chain includes unsupported"
7312 " statement type.\n");
7313 return false;
7315 if (CONVERT_EXPR_CODE_P (op.code))
7317 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7319 if (dump_enabled_p ())
7320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7321 "conversion in the reduction chain.\n");
7322 return false;
7325 else if (!stmt_info)
7326 /* First non-conversion stmt. */
7327 stmt_info = vdef;
7328 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7329 reduc_chain_length++;
7330 if (!stmt_info && slp_node)
7331 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7333 /* PHIs should not participate in patterns. */
7334 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7336 if (nested_in_vect_loop_p (loop, stmt_info))
7338 loop = loop->inner;
7339 nested_cycle = true;
7342 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7343 element. */
7344 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7346 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7347 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7349 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7350 gcc_assert (slp_node
7351 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7353 /* 1. Is vectorizable reduction? */
7354 /* Not supportable if the reduction variable is used in the loop, unless
7355 it's a reduction chain. */
7356 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7357 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7358 return false;
7360 /* Reductions that are not used even in an enclosing outer-loop,
7361 are expected to be "live" (used out of the loop). */
7362 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7363 && !STMT_VINFO_LIVE_P (stmt_info))
7364 return false;
7366 /* 2. Has this been recognized as a reduction pattern?
7368 Check if STMT represents a pattern that has been recognized
7369 in earlier analysis stages. For stmts that represent a pattern,
7370 the STMT_VINFO_RELATED_STMT field records the last stmt in
7371 the original sequence that constitutes the pattern. */
7373 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7374 if (orig_stmt_info)
7376 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7377 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7380 /* 3. Check the operands of the operation. The first operands are defined
7381 inside the loop body. The last operand is the reduction variable,
7382 which is defined by the loop-header-phi. */
7384 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7385 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7386 gimple_match_op op;
7387 if (!gimple_extract_op (stmt_info->stmt, &op))
7388 gcc_unreachable ();
7389 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7390 || op.code == WIDEN_SUM_EXPR
7391 || op.code == SAD_EXPR);
7393 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7394 && !SCALAR_FLOAT_TYPE_P (op.type))
7395 return false;
7397 /* Do not try to vectorize bit-precision reductions. */
7398 if (!type_has_mode_precision_p (op.type))
7399 return false;
7401 /* For lane-reducing ops we're reducing the number of reduction PHIs
7402 which means the only use of that may be in the lane-reducing operation. */
7403 if (lane_reduc_code_p
7404 && reduc_chain_length != 1
7405 && !only_slp_reduc_chain)
7407 if (dump_enabled_p ())
7408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7409 "lane-reducing reduction with extra stmts.\n");
7410 return false;
7413 /* All uses but the last are expected to be defined in the loop.
7414 The last use is the reduction variable. In case of nested cycle this
7415 assumption is not true: we use reduc_index to record the index of the
7416 reduction variable. */
7417 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7418 /* We need to skip an extra operand for COND_EXPRs with embedded
7419 comparison. */
7420 unsigned opno_adjust = 0;
7421 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7422 opno_adjust = 1;
7423 for (i = 0; i < (int) op.num_ops; i++)
7425 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7426 if (i == 0 && op.code == COND_EXPR)
7427 continue;
7429 stmt_vec_info def_stmt_info;
7430 enum vect_def_type dt;
7431 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7432 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7433 &vectype_op[i], &def_stmt_info))
7435 if (dump_enabled_p ())
7436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7437 "use not simple.\n");
7438 return false;
7440 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7441 continue;
7443 /* There should be only one cycle def in the stmt, the one
7444 leading to reduc_def. */
7445 if (VECTORIZABLE_CYCLE_DEF (dt))
7446 return false;
7448 if (!vectype_op[i])
7449 vectype_op[i]
7450 = get_vectype_for_scalar_type (loop_vinfo,
7451 TREE_TYPE (op.ops[i]), slp_op[i]);
7453 /* To properly compute ncopies we are interested in the widest
7454 non-reduction input type in case we're looking at a widening
7455 accumulation that we later handle in vect_transform_reduction. */
7456 if (lane_reduc_code_p
7457 && vectype_op[i]
7458 && (!vectype_in
7459 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7460 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7461 vectype_in = vectype_op[i];
7463 if (op.code == COND_EXPR)
7465 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7466 if (dt == vect_constant_def)
7468 cond_reduc_dt = dt;
7469 cond_reduc_val = op.ops[i];
7471 if (dt == vect_induction_def
7472 && def_stmt_info
7473 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7475 cond_reduc_dt = dt;
7476 cond_stmt_vinfo = def_stmt_info;
7480 if (!vectype_in)
7481 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7482 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7484 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7485 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7486 /* If we have a condition reduction, see if we can simplify it further. */
7487 if (v_reduc_type == COND_REDUCTION)
7489 if (slp_node)
7490 return false;
7492 /* When the reduction value is used in the condition itself, fail. */
7493 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7495 if (dump_enabled_p ())
7496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7497 "condition depends on previous iteration\n");
7498 return false;
7501 if (reduc_chain_length == 1
7502 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7503 OPTIMIZE_FOR_SPEED)
7504 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7505 vectype_in,
7506 OPTIMIZE_FOR_SPEED)))
7508 if (dump_enabled_p ())
7509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7510 "optimizing condition reduction with"
7511 " FOLD_EXTRACT_LAST.\n");
7512 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7514 else if (cond_reduc_dt == vect_induction_def)
7516 tree base
7517 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7518 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7520 gcc_assert (TREE_CODE (base) == INTEGER_CST
7521 && TREE_CODE (step) == INTEGER_CST);
7522 cond_reduc_val = NULL_TREE;
7523 enum tree_code cond_reduc_op_code = ERROR_MARK;
7524 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7525 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7527 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7528 above base; punt if base is the minimum value of the type for
7529 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7530 else if (tree_int_cst_sgn (step) == -1)
7532 cond_reduc_op_code = MIN_EXPR;
7533 if (tree_int_cst_sgn (base) == -1)
7534 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7535 else if (tree_int_cst_lt (base,
7536 TYPE_MAX_VALUE (TREE_TYPE (base))))
7537 cond_reduc_val
7538 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7540 else
7542 cond_reduc_op_code = MAX_EXPR;
7543 if (tree_int_cst_sgn (base) == 1)
7544 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7545 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7546 base))
7547 cond_reduc_val
7548 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7550 if (cond_reduc_val)
7552 if (dump_enabled_p ())
7553 dump_printf_loc (MSG_NOTE, vect_location,
7554 "condition expression based on "
7555 "integer induction.\n");
7556 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7557 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7558 = cond_reduc_val;
7559 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7562 else if (cond_reduc_dt == vect_constant_def)
7564 enum vect_def_type cond_initial_dt;
7565 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7566 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7567 if (cond_initial_dt == vect_constant_def
7568 && types_compatible_p (TREE_TYPE (cond_initial_val),
7569 TREE_TYPE (cond_reduc_val)))
7571 tree e = fold_binary (LE_EXPR, boolean_type_node,
7572 cond_initial_val, cond_reduc_val);
7573 if (e && (integer_onep (e) || integer_zerop (e)))
7575 if (dump_enabled_p ())
7576 dump_printf_loc (MSG_NOTE, vect_location,
7577 "condition expression based on "
7578 "compile time constant.\n");
7579 /* Record reduction code at analysis stage. */
7580 STMT_VINFO_REDUC_CODE (reduc_info)
7581 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7582 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7588 if (STMT_VINFO_LIVE_P (phi_info))
7589 return false;
7591 if (slp_node)
7592 ncopies = 1;
7593 else
7594 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7596 gcc_assert (ncopies >= 1);
7598 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7600 if (nested_cycle)
7602 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7603 == vect_double_reduction_def);
7604 double_reduc = true;
7607 /* 4.2. Check support for the epilog operation.
7609 If STMT represents a reduction pattern, then the type of the
7610 reduction variable may be different than the type of the rest
7611 of the arguments. For example, consider the case of accumulation
7612 of shorts into an int accumulator. The original code:
7613 S1: int_a = (int) short_a;
7614 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7616 was replaced with:
7617 STMT: int_acc = widen_sum <short_a, int_acc>
7619 This means that:
7620 1. The tree-code that is used to create the vector operation in the
7621 epilog code (that reduces the partial results) is not the
7622 tree-code of STMT, but is rather the tree-code of the original
7623 stmt from the pattern that STMT is replacing. I.e, in the example
7624 above we want to use 'widen_sum' in the loop, but 'plus' in the
7625 epilog.
7626 2. The type (mode) we use to check available target support
7627 for the vector operation to be created in the *epilog*, is
7628 determined by the type of the reduction variable (in the example
7629 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7630 However the type (mode) we use to check available target support
7631 for the vector operation to be created *inside the loop*, is
7632 determined by the type of the other arguments to STMT (in the
7633 example we'd check this: optab_handler (widen_sum_optab,
7634 vect_short_mode)).
7636 This is contrary to "regular" reductions, in which the types of all
7637 the arguments are the same as the type of the reduction variable.
7638 For "regular" reductions we can therefore use the same vector type
7639 (and also the same tree-code) when generating the epilog code and
7640 when generating the code inside the loop. */
7642 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7643 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7645 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7646 if (reduction_type == TREE_CODE_REDUCTION)
7648 /* Check whether it's ok to change the order of the computation.
7649 Generally, when vectorizing a reduction we change the order of the
7650 computation. This may change the behavior of the program in some
7651 cases, so we need to check that this is ok. One exception is when
7652 vectorizing an outer-loop: the inner-loop is executed sequentially,
7653 and therefore vectorizing reductions in the inner-loop during
7654 outer-loop vectorization is safe. Likewise when we are vectorizing
7655 a series of reductions using SLP and the VF is one the reductions
7656 are performed in scalar order. */
7657 if (slp_node
7658 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7659 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7661 else if (needs_fold_left_reduction_p (op.type, orig_code))
7663 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7664 is not directly used in stmt. */
7665 if (!only_slp_reduc_chain
7666 && reduc_chain_length != 1)
7668 if (dump_enabled_p ())
7669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7670 "in-order reduction chain without SLP.\n");
7671 return false;
7673 STMT_VINFO_REDUC_TYPE (reduc_info)
7674 = reduction_type = FOLD_LEFT_REDUCTION;
7676 else if (!commutative_binary_op_p (orig_code, op.type)
7677 || !associative_binary_op_p (orig_code, op.type))
7679 if (dump_enabled_p ())
7680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7681 "reduction: not commutative/associative");
7682 return false;
7686 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7687 && ncopies > 1)
7689 if (dump_enabled_p ())
7690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7691 "multiple types in double reduction or condition "
7692 "reduction or fold-left reduction.\n");
7693 return false;
7696 internal_fn reduc_fn = IFN_LAST;
7697 if (reduction_type == TREE_CODE_REDUCTION
7698 || reduction_type == FOLD_LEFT_REDUCTION
7699 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7700 || reduction_type == CONST_COND_REDUCTION)
7702 if (reduction_type == FOLD_LEFT_REDUCTION
7703 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7704 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7706 if (reduc_fn != IFN_LAST
7707 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7708 OPTIMIZE_FOR_SPEED))
7710 if (dump_enabled_p ())
7711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7712 "reduc op not supported by target.\n");
7714 reduc_fn = IFN_LAST;
7717 else
7719 if (!nested_cycle || double_reduc)
7721 if (dump_enabled_p ())
7722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7723 "no reduc code for scalar code.\n");
7725 return false;
7729 else if (reduction_type == COND_REDUCTION)
7731 int scalar_precision
7732 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7733 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7734 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7735 vectype_out);
7737 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7738 OPTIMIZE_FOR_SPEED))
7739 reduc_fn = IFN_REDUC_MAX;
7741 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7743 if (reduction_type != EXTRACT_LAST_REDUCTION
7744 && (!nested_cycle || double_reduc)
7745 && reduc_fn == IFN_LAST
7746 && !nunits_out.is_constant ())
7748 if (dump_enabled_p ())
7749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7750 "missing target support for reduction on"
7751 " variable-length vectors.\n");
7752 return false;
7755 /* For SLP reductions, see if there is a neutral value we can use. */
7756 tree neutral_op = NULL_TREE;
7757 if (slp_node)
7759 tree initial_value = NULL_TREE;
7760 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7761 initial_value = vect_phi_initial_value (reduc_def_phi);
7762 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7763 orig_code, initial_value);
7766 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7768 /* We can't support in-order reductions of code such as this:
7770 for (int i = 0; i < n1; ++i)
7771 for (int j = 0; j < n2; ++j)
7772 l += a[j];
7774 since GCC effectively transforms the loop when vectorizing:
7776 for (int i = 0; i < n1 / VF; ++i)
7777 for (int j = 0; j < n2; ++j)
7778 for (int k = 0; k < VF; ++k)
7779 l += a[j];
7781 which is a reassociation of the original operation. */
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7784 "in-order double reduction not supported.\n");
7786 return false;
7789 if (reduction_type == FOLD_LEFT_REDUCTION
7790 && slp_node
7791 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7793 /* We cannot use in-order reductions in this case because there is
7794 an implicit reassociation of the operations involved. */
7795 if (dump_enabled_p ())
7796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7797 "in-order unchained SLP reductions not supported.\n");
7798 return false;
7801 /* For double reductions, and for SLP reductions with a neutral value,
7802 we construct a variable-length initial vector by loading a vector
7803 full of the neutral value and then shift-and-inserting the start
7804 values into the low-numbered elements. */
7805 if ((double_reduc || neutral_op)
7806 && !nunits_out.is_constant ()
7807 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7808 vectype_out, OPTIMIZE_FOR_SPEED))
7810 if (dump_enabled_p ())
7811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7812 "reduction on variable-length vectors requires"
7813 " target support for a vector-shift-and-insert"
7814 " operation.\n");
7815 return false;
7818 /* Check extra constraints for variable-length unchained SLP reductions. */
7819 if (slp_node
7820 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7821 && !nunits_out.is_constant ())
7823 /* We checked above that we could build the initial vector when
7824 there's a neutral element value. Check here for the case in
7825 which each SLP statement has its own initial value and in which
7826 that value needs to be repeated for every instance of the
7827 statement within the initial vector. */
7828 unsigned int group_size = SLP_TREE_LANES (slp_node);
7829 if (!neutral_op
7830 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7831 TREE_TYPE (vectype_out)))
7833 if (dump_enabled_p ())
7834 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7835 "unsupported form of SLP reduction for"
7836 " variable-length vectors: cannot build"
7837 " initial vector.\n");
7838 return false;
7840 /* The epilogue code relies on the number of elements being a multiple
7841 of the group size. The duplicate-and-interleave approach to setting
7842 up the initial vector does too. */
7843 if (!multiple_p (nunits_out, group_size))
7845 if (dump_enabled_p ())
7846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7847 "unsupported form of SLP reduction for"
7848 " variable-length vectors: the vector size"
7849 " is not a multiple of the number of results.\n");
7850 return false;
7854 if (reduction_type == COND_REDUCTION)
7856 widest_int ni;
7858 if (! max_loop_iterations (loop, &ni))
7860 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_NOTE, vect_location,
7862 "loop count not known, cannot create cond "
7863 "reduction.\n");
7864 return false;
7866 /* Convert backedges to iterations. */
7867 ni += 1;
7869 /* The additional index will be the same type as the condition. Check
7870 that the loop iteration count fits into this type less one (because we'll use up the
7871 zero slot for when there are no matches). */
7872 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7873 if (wi::geu_p (ni, wi::to_widest (max_index)))
7875 if (dump_enabled_p ())
7876 dump_printf_loc (MSG_NOTE, vect_location,
7877 "loop size is greater than data size.\n");
7878 return false;
7882 /* In case the vectorization factor (VF) is bigger than the number
7883 of elements that we can fit in a vectype (nunits), we have to generate
7884 more than one vector stmt - i.e. - we need to "unroll" the
7885 vector stmt by a factor VF/nunits. For more details see documentation
7886 in vectorizable_operation. */
7888 /* If the reduction is used in an outer loop we need to generate
7889 VF intermediate results, like so (e.g. for ncopies=2):
7890 r0 = phi (init, r0)
7891 r1 = phi (init, r1)
7892 r0 = x0 + r0;
7893 r1 = x1 + r1;
7894 (i.e. we generate VF results in 2 registers).
7895 In this case we have a separate def-use cycle for each copy, and therefore
7896 for each copy we get the vector def for the reduction variable from the
7897 respective phi node created for this copy.
7899 Otherwise (the reduction is unused in the loop nest), we can combine
7900 together intermediate results, like so (e.g. for ncopies=2):
7901 r = phi (init, r)
7902 r = x0 + r;
7903 r = x1 + r;
7904 (i.e. we generate VF/2 results in a single register).
7905 In this case for each copy we get the vector def for the reduction variable
7906 from the vectorized reduction operation generated in the previous iteration.
7908 This only works when we see both the reduction PHI and its only consumer
7909 in vectorizable_reduction and there are no intermediate stmts
7910 participating. When unrolling we want each unrolled iteration to have its
7911 own reduction accumulator since one of the main goals of unrolling a
7912 reduction is to reduce the aggregate loop-carried latency. */
7913 if (ncopies > 1
7914 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7915 && reduc_chain_length == 1
7916 && loop_vinfo->suggested_unroll_factor == 1)
7917 single_defuse_cycle = true;
7919 if (single_defuse_cycle || lane_reduc_code_p)
7921 gcc_assert (op.code != COND_EXPR);
7923 /* 4. Supportable by target? */
7924 bool ok = true;
7926 /* 4.1. check support for the operation in the loop
7928 This isn't necessary for the lane reduction codes, since they
7929 can only be produced by pattern matching, and it's up to the
7930 pattern matcher to test for support. The main reason for
7931 specifically skipping this step is to avoid rechecking whether
7932 mixed-sign dot-products can be implemented using signed
7933 dot-products. */
7934 machine_mode vec_mode = TYPE_MODE (vectype_in);
7935 if (!lane_reduc_code_p
7936 && !directly_supported_p (op.code, vectype_in, optab_vector))
7938 if (dump_enabled_p ())
7939 dump_printf (MSG_NOTE, "op not supported by target.\n");
7940 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7941 || !vect_can_vectorize_without_simd_p (op.code))
7942 ok = false;
7943 else
7944 if (dump_enabled_p ())
7945 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7948 if (vect_emulated_vector_p (vectype_in)
7949 && !vect_can_vectorize_without_simd_p (op.code))
7951 if (dump_enabled_p ())
7952 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7953 return false;
7956 /* Lane-reducing operations have to go through vect_transform_reduction.
7957 For the other cases, try without the single-defuse-cycle optimization. */
7958 if (!ok)
7960 if (lane_reduc_code_p)
7961 return false;
7962 else
7963 single_defuse_cycle = false;
7966 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7968 /* If the reduction stmt is one of the patterns that have lane
7969 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7970 if ((ncopies > 1 && ! single_defuse_cycle)
7971 && lane_reduc_code_p)
7973 if (dump_enabled_p ())
7974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7975 "multi def-use cycle not possible for lane-reducing "
7976 "reduction operation\n");
7977 return false;
7980 if (slp_node
7981 && !(!single_defuse_cycle
7982 && !lane_reduc_code_p
7983 && reduction_type != FOLD_LEFT_REDUCTION))
7984 for (i = 0; i < (int) op.num_ops; i++)
7985 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989 "incompatible vector types for invariants\n");
7990 return false;
7993 if (slp_node)
7994 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7995 else
7996 vec_num = 1;
7998 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7999 reduction_type, ncopies, cost_vec);
8000 /* Cost the reduction op inside the loop if transformed via
8001 vect_transform_reduction. Otherwise this is costed by the
8002 separate vectorizable_* routines. */
8003 if (single_defuse_cycle || lane_reduc_code_p)
8005 int factor = 1;
8006 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8007 /* Three dot-products and a subtraction. */
8008 factor = 4;
8009 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8010 stmt_info, 0, vect_body);
8013 if (dump_enabled_p ()
8014 && reduction_type == FOLD_LEFT_REDUCTION)
8015 dump_printf_loc (MSG_NOTE, vect_location,
8016 "using an in-order (fold-left) reduction.\n");
8017 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8018 /* All reductions except single-defuse-cycle optimized, lane-reducing and
8019 fold-left ones go through their own vectorizable_* routines. */
8020 if (!single_defuse_cycle
8021 && !lane_reduc_code_p
8022 && reduction_type != FOLD_LEFT_REDUCTION)
8024 stmt_vec_info tem
8025 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8026 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8028 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8029 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8031 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8032 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8034 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8036 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8037 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8038 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8040 if (reduction_type != FOLD_LEFT_REDUCTION
8041 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8042 && (cond_fn == IFN_LAST
8043 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8044 OPTIMIZE_FOR_SPEED)))
8046 if (dump_enabled_p ())
8047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8048 "can't operate on partial vectors because"
8049 " no conditional operation is available.\n");
8050 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8052 else if (reduction_type == FOLD_LEFT_REDUCTION
8053 && reduc_fn == IFN_LAST
8054 && !expand_vec_cond_expr_p (vectype_in,
8055 truth_type_for (vectype_in),
8056 SSA_NAME))
8058 if (dump_enabled_p ())
8059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8060 "can't operate on partial vectors because"
8061 " no conditional operation is available.\n");
8062 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8064 else if (reduction_type == FOLD_LEFT_REDUCTION
8065 && reduc_fn == IFN_LAST
8066 && FLOAT_TYPE_P (vectype_in)
8067 && HONOR_SIGNED_ZEROS (vectype_in)
8068 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8070 if (dump_enabled_p ())
8071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8072 "can't operate on partial vectors because"
8073 " signed zeros cannot be preserved.\n");
8074 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8076 else
8078 internal_fn mask_reduc_fn
8079 = get_masked_reduction_fn (reduc_fn, vectype_in);
8081 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8082 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8083 vectype_in, 1);
8084 else
8085 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8086 vectype_in, NULL);
8089 return true;
8092 /* STMT_INFO is a dot-product reduction whose multiplication operands
8093 have different signs. Emit a sequence to emulate the operation
8094 using a series of signed DOT_PROD_EXPRs and return the last
8095 statement generated. VEC_DEST is the result of the vector operation
8096 and VOP lists its inputs. */
8098 static gassign *
8099 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8100 gimple_stmt_iterator *gsi, tree vec_dest,
8101 tree vop[3])
8103 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8104 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8105 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8106 gimple *new_stmt;
8108 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8109 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8110 std::swap (vop[0], vop[1]);
8112 /* Convert all inputs to signed types. */
8113 for (int i = 0; i < 3; ++i)
8114 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8116 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8117 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8118 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8119 vop[i] = tmp;
8122 /* In the comments below we assume 8-bit inputs for simplicity,
8123 but the approach works for any full integer type. */
8125 /* Create a vector of -128. */
8126 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8127 tree min_narrow = build_vector_from_val (narrow_vectype,
8128 min_narrow_elttype);
8130 /* Create a vector of 64. */
8131 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8132 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8133 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8135 /* Emit: SUB_RES = VOP[0] - 128. */
8136 tree sub_res = make_ssa_name (narrow_vectype);
8137 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8138 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8140 /* Emit:
8142 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8143 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8144 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8146 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8147 Doing the two 64 * y steps first allows more time to compute x. */
8148 tree stage1 = make_ssa_name (wide_vectype);
8149 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8150 vop[1], half_narrow, vop[2]);
8151 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8153 tree stage2 = make_ssa_name (wide_vectype);
8154 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8155 vop[1], half_narrow, stage1);
8156 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8158 tree stage3 = make_ssa_name (wide_vectype);
8159 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8160 sub_res, vop[1], stage2);
8161 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8163 /* Convert STAGE3 to the reduction type. */
8164 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
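/* Sanity check of the identity with concrete 8-bit values (illustration
   only): for x = 200 (unsigned) and y = -3 (signed), x * y = -600, and
   (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600 as well,
   so summing the three signed dot-products above reproduces the mixed-sign
   dot-product being emulated.  */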
8167 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8168 value. */
8170 bool
8171 vect_transform_reduction (loop_vec_info loop_vinfo,
8172 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8173 gimple **vec_stmt, slp_tree slp_node)
8175 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8176 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8177 int i;
8178 int ncopies;
8179 int vec_num;
8181 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8182 gcc_assert (reduc_info->is_reduc_info);
8184 if (nested_in_vect_loop_p (loop, stmt_info))
8186 loop = loop->inner;
8187 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8190 gimple_match_op op;
8191 if (!gimple_extract_op (stmt_info->stmt, &op))
8192 gcc_unreachable ();
8194 /* All uses but the last are expected to be defined in the loop.
8195 The last use is the reduction variable. In case of nested cycle this
8196 assumption is not true: we use reduc_index to record the index of the
8197 reduction variable. */
8198 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8199 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8200 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8201 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8203 if (slp_node)
8205 ncopies = 1;
8206 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8208 else
8210 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8211 vec_num = 1;
8214 code_helper code = canonicalize_code (op.code, op.type);
8215 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8216 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8217 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8218 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8220 /* Transform. */
8221 tree new_temp = NULL_TREE;
8222 auto_vec<tree> vec_oprnds0;
8223 auto_vec<tree> vec_oprnds1;
8224 auto_vec<tree> vec_oprnds2;
8225 tree def0;
8227 if (dump_enabled_p ())
8228 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8230 /* FORNOW: Multiple types are not supported for condition. */
8231 if (code == COND_EXPR)
8232 gcc_assert (ncopies == 1);
8234 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8236 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8237 if (reduction_type == FOLD_LEFT_REDUCTION)
8239 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8240 gcc_assert (code.is_tree_code ());
8241 return vectorize_fold_left_reduction
8242 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8243 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8244 lens);
8247 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8248 gcc_assert (single_defuse_cycle
8249 || code == DOT_PROD_EXPR
8250 || code == WIDEN_SUM_EXPR
8251 || code == SAD_EXPR);
8253 /* Create the destination vector */
8254 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8255 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8257 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8258 single_defuse_cycle && reduc_index == 0
8259 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8260 single_defuse_cycle && reduc_index == 1
8261 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8262 op.num_ops == 3
8263 && !(single_defuse_cycle && reduc_index == 2)
8264 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8265 if (single_defuse_cycle)
8267 gcc_assert (!slp_node);
8268 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8269 op.ops[reduc_index],
8270 reduc_index == 0 ? &vec_oprnds0
8271 : (reduc_index == 1 ? &vec_oprnds1
8272 : &vec_oprnds2));
8275 bool emulated_mixed_dot_prod
8276 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8277 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8279 gimple *new_stmt;
8280 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8281 if (masked_loop_p && !mask_by_cond_expr)
8283 /* No conditional ifns have been defined for dot-product yet. */
8284 gcc_assert (code != DOT_PROD_EXPR);
8286 /* Make sure that the reduction accumulator is vop[0]. */
8287 if (reduc_index == 1)
8289 gcc_assert (commutative_binary_op_p (code, op.type));
8290 std::swap (vop[0], vop[1]);
8292 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8293 vec_num * ncopies, vectype_in, i);
8294 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8295 vop[0], vop[1], vop[0]);
8296 new_temp = make_ssa_name (vec_dest, call);
8297 gimple_call_set_lhs (call, new_temp);
8298 gimple_call_set_nothrow (call, true);
8299 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8300 new_stmt = call;
8302 else
8304 if (op.num_ops == 3)
8305 vop[2] = vec_oprnds2[i];
8307 if (masked_loop_p && mask_by_cond_expr)
8309 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8310 vec_num * ncopies, vectype_in, i);
8311 build_vect_cond_expr (code, vop, mask, gsi);
8314 if (emulated_mixed_dot_prod)
8315 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8316 vec_dest, vop);
8317 else if (code.is_internal_fn ())
8318 new_stmt = gimple_build_call_internal (internal_fn (code),
8319 op.num_ops,
8320 vop[0], vop[1], vop[2]);
8321 else
8322 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8323 vop[0], vop[1], vop[2]);
8324 new_temp = make_ssa_name (vec_dest, new_stmt);
8325 gimple_set_lhs (new_stmt, new_temp);
8326 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8329 if (slp_node)
8330 slp_node->push_vec_def (new_stmt);
8331 else if (single_defuse_cycle
8332 && i < ncopies - 1)
8334 if (reduc_index == 0)
8335 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8336 else if (reduc_index == 1)
8337 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8338 else if (reduc_index == 2)
8339 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8341 else
8342 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8345 if (!slp_node)
8346 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8348 return true;
8351 /* Transform phase of a cycle PHI. */
8353 bool
8354 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8355 stmt_vec_info stmt_info, gimple **vec_stmt,
8356 slp_tree slp_node, slp_instance slp_node_instance)
8358 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8359 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8360 int i;
8361 int ncopies;
8362 int j;
8363 bool nested_cycle = false;
8364 int vec_num;
8366 if (nested_in_vect_loop_p (loop, stmt_info))
8368 loop = loop->inner;
8369 nested_cycle = true;
8372 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8373 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8374 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8375 gcc_assert (reduc_info->is_reduc_info);
8377 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8378 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8379 /* Leave the scalar phi in place. */
8380 return true;
8382 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8383 /* For a nested cycle we do not fill the above. */
8384 if (!vectype_in)
8385 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8386 gcc_assert (vectype_in);
8388 if (slp_node)
8390 /* The size vect_schedule_slp_instance computes is off for us. */
8391 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8392 * SLP_TREE_LANES (slp_node), vectype_in);
8393 ncopies = 1;
8395 else
8397 vec_num = 1;
8398 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8401 /* Check whether we should use a single PHI node and accumulate
8402 vectors to one before the backedge. */
8403 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8404 ncopies = 1;
8406 /* Create the destination vector */
8407 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8408 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8409 vectype_out);
8411 /* Get the loop-entry arguments. */
8412 tree vec_initial_def = NULL_TREE;
8413 auto_vec<tree> vec_initial_defs;
8414 if (slp_node)
8416 vec_initial_defs.reserve (vec_num);
8417 if (nested_cycle)
8419 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8420 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8421 &vec_initial_defs);
8423 else
8425 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8426 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8427 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8429 unsigned int num_phis = stmts.length ();
8430 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8431 num_phis = 1;
8432 initial_values.reserve (num_phis);
8433 for (unsigned int i = 0; i < num_phis; ++i)
8435 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8436 initial_values.quick_push (vect_phi_initial_value (this_phi));
8438 if (vec_num == 1)
8439 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8440 if (!initial_values.is_empty ())
8442 tree initial_value
8443 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8444 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8445 tree neutral_op
8446 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8447 code, initial_value);
8448 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8449 &vec_initial_defs, vec_num,
8450 stmts.length (), neutral_op);
8454 else
8456 /* Get at the scalar def before the loop, that defines the initial
8457 value of the reduction variable. */
8458 tree initial_def = vect_phi_initial_value (phi);
8459 reduc_info->reduc_initial_values.safe_push (initial_def);
8460 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8461 and we can't use zero for induc_val, use initial_def. Similarly
8462 for REDUC_MIN and initial_def larger than the base. */
8463 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8465 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8466 if (TREE_CODE (initial_def) == INTEGER_CST
8467 && !integer_zerop (induc_val)
8468 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8469 && tree_int_cst_lt (initial_def, induc_val))
8470 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8471 && tree_int_cst_lt (induc_val, initial_def))))
8473 induc_val = initial_def;
8474 /* Communicate we used the initial_def to epilogue
8475 generation. */
8476 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8478 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8480 else if (nested_cycle)
8482 /* Do not use an adjustment def as that case is not supported
8483 correctly if ncopies is not one. */
8484 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8485 ncopies, initial_def,
8486 &vec_initial_defs);
8488 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8489 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8490 /* Fill the initial vector with the initial scalar value. */
8491 vec_initial_def
8492 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8493 initial_def, initial_def);
8494 else
8496 if (ncopies == 1)
8497 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8498 if (!reduc_info->reduc_initial_values.is_empty ())
8500 initial_def = reduc_info->reduc_initial_values[0];
8501 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8502 tree neutral_op
8503 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8504 code, initial_def);
8505 gcc_assert (neutral_op);
8506 /* Try to simplify the vector initialization by applying an
8507 adjustment after the reduction has been performed. */
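/* E.g. (editorial note) a sum reduction whose scalar initial value is 10
   can run with the neutral value 0 inside the loop, with the 10 folded
   back in by the epilogue via STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT.  */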
8508 if (!reduc_info->reused_accumulator
8509 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8510 && !operand_equal_p (neutral_op, initial_def))
8512 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8513 = initial_def;
8514 initial_def = neutral_op;
8516 vec_initial_def
8517 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8518 initial_def, neutral_op);
8523 if (vec_initial_def)
8525 vec_initial_defs.create (ncopies);
8526 for (i = 0; i < ncopies; ++i)
8527 vec_initial_defs.quick_push (vec_initial_def);
8530 if (auto *accumulator = reduc_info->reused_accumulator)
8532 tree def = accumulator->reduc_input;
8533 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8535 unsigned int nreduc;
8536 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8537 (TREE_TYPE (def)),
8538 TYPE_VECTOR_SUBPARTS (vectype_out),
8539 &nreduc);
8540 gcc_assert (res);
8541 gimple_seq stmts = NULL;
8542 /* Reduce the single vector to a smaller one. */
8543 if (nreduc != 1)
8545 /* Perform the reduction in the appropriate type. */
8546 tree rvectype = vectype_out;
8547 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8548 TREE_TYPE (TREE_TYPE (def))))
8549 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8550 TYPE_VECTOR_SUBPARTS
8551 (vectype_out));
8552 def = vect_create_partial_epilog (def, rvectype,
8553 STMT_VINFO_REDUC_CODE
8554 (reduc_info),
8555 &stmts);
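/* Roughly (editorial note), a reused V8SI accumulator feeding a V4SI
   epilogue is first folded to V4SI here, e.g. by combining its two
   halves with the reduction code for a PLUS reduction.  */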
8557 /* The epilogue loop might use a different vector mode, like
8558 VNx2DI vs. V2DI. */
8559 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8561 tree reduc_type = build_vector_type_for_mode
8562 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8563 def = gimple_convert (&stmts, reduc_type, def);
8565 /* Adjust the input so we pick up the partially reduced value
8566 for the skip edge in vect_create_epilog_for_reduction. */
8567 accumulator->reduc_input = def;
8568 /* And the reduction could be carried out using a different sign. */
8569 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8570 def = gimple_convert (&stmts, vectype_out, def);
8571 if (loop_vinfo->main_loop_edge)
8573 /* While we'd like to insert on the edge, this would split
8574 blocks and disturb bookkeeping; we will also eventually
8575 need this on the skip edge. Rely on sinking to
8576 fix up the optimal placement and insert in the pred. */
8577 gimple_stmt_iterator gsi
8578 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8579 /* Insert before a cond that eventually skips the
8580 epilogue. */
8581 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8582 gsi_prev (&gsi);
8583 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8585 else
8586 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8587 stmts);
8589 if (loop_vinfo->main_loop_edge)
8590 vec_initial_defs[0]
8591 = vect_get_main_loop_result (loop_vinfo, def,
8592 vec_initial_defs[0]);
8593 else
8594 vec_initial_defs.safe_push (def);
8597 /* Generate the reduction PHIs upfront. */
8598 for (i = 0; i < vec_num; i++)
8600 tree vec_init_def = vec_initial_defs[i];
8601 for (j = 0; j < ncopies; j++)
8603 /* Create the reduction-phi that defines the reduction
8604 operand. */
8605 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8607 /* Set the loop-entry arg of the reduction-phi. */
8608 if (j != 0 && nested_cycle)
8609 vec_init_def = vec_initial_defs[j];
8610 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8611 UNKNOWN_LOCATION);
8613 /* The loop-latch arg is set in epilogue processing. */
8615 if (slp_node)
8616 slp_node->push_vec_def (new_phi);
8617 else
8619 if (j == 0)
8620 *vec_stmt = new_phi;
8621 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8626 return true;
8629 /* Vectorizes LC PHIs. */
8631 bool
8632 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8633 stmt_vec_info stmt_info, gimple **vec_stmt,
8634 slp_tree slp_node)
8636 if (!loop_vinfo
8637 || !is_a <gphi *> (stmt_info->stmt)
8638 || gimple_phi_num_args (stmt_info->stmt) != 1)
8639 return false;
8641 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8642 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8643 return false;
8645 if (!vec_stmt) /* transformation not required. */
8647 /* Deal with copies from externs or constants that are disguised as
8648 loop-closed PHI nodes (PR97886). */
8649 if (slp_node
8650 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8651 SLP_TREE_VECTYPE (slp_node)))
8653 if (dump_enabled_p ())
8654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8655 "incompatible vector types for invariants\n");
8656 return false;
8658 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8659 return true;
8662 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8663 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8664 basic_block bb = gimple_bb (stmt_info->stmt);
8665 edge e = single_pred_edge (bb);
8666 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8667 auto_vec<tree> vec_oprnds;
8668 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8669 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8670 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8671 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8673 /* Create the vectorized LC PHI node. */
8674 gphi *new_phi = create_phi_node (vec_dest, bb);
8675 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8676 if (slp_node)
8677 slp_node->push_vec_def (new_phi);
8678 else
8679 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8681 if (!slp_node)
8682 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8684 return true;
8687 /* Vectorizes PHIs. */
8689 bool
8690 vectorizable_phi (vec_info *,
8691 stmt_vec_info stmt_info, gimple **vec_stmt,
8692 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8694 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8695 return false;
8697 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8698 return false;
8700 tree vectype = SLP_TREE_VECTYPE (slp_node);
8702 if (!vec_stmt) /* transformation not required. */
8704 slp_tree child;
8705 unsigned i;
8706 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8707 if (!child)
8709 if (dump_enabled_p ())
8710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8711 "PHI node with unvectorized backedge def\n");
8712 return false;
8714 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8716 if (dump_enabled_p ())
8717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8718 "incompatible vector types for invariants\n");
8719 return false;
8721 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8722 && !useless_type_conversion_p (vectype,
8723 SLP_TREE_VECTYPE (child)))
8725 /* With bools we can have mask and non-mask precision vectors
8726 or different non-mask precisions. While pattern recognition is
8727 supposed to guarantee consistency here, bugs in it can cause
8728 mismatches (PR103489 and PR103800 for example).
8729 Deal with them here instead of ICEing later. */
8730 if (dump_enabled_p ())
8731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8732 "incompatible vector type setup from "
8733 "bool pattern detection\n");
8734 return false;
8737 /* For single-argument PHIs assume coalescing which means zero cost
8738 for the scalar and the vector PHIs. This avoids artificially
8739 favoring the vector path (but may pessimize it in some cases). */
8740 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8741 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8742 vector_stmt, stmt_info, vectype, 0, vect_body);
8743 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8744 return true;
8747 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8748 basic_block bb = gimple_bb (stmt_info->stmt);
8749 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8750 auto_vec<gphi *> new_phis;
8751 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8753 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8755 /* Skip not yet vectorized defs. */
8756 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8757 && SLP_TREE_VEC_DEFS (child).is_empty ())
8758 continue;
8760 auto_vec<tree> vec_oprnds;
8761 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8762 if (!new_phis.exists ())
8764 new_phis.create (vec_oprnds.length ());
8765 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8767 /* Create the vectorized PHI node. */
8768 new_phis.quick_push (create_phi_node (vec_dest, bb));
8769 slp_node->push_vec_def (new_phis[j]);
8772 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8773 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8774 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8776 /* We should have at least one already vectorized child. */
8777 gcc_assert (new_phis.exists ());
8779 return true;
8782 /* Vectorizes first order recurrences. An overview of the transformation
8783 is described below. Suppose we have the following loop.
8785 int t = 0;
8786 for (int i = 0; i < n; ++i)
8788 b[i] = a[i] - t;
8789 t = a[i];
8792 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8793 looks (simplified) like:
8795 scalar.preheader:
8796 init = 0;
8798 scalar.body:
8799 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8800 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8801 _1 = a[i]
8802 b[i] = _1 - _2
8803 if (i < n) goto scalar.body
8805 In this example, _2 is a recurrence because its value depends on the
8806 previous iteration. We vectorize this as (VF = 4)
8808 vector.preheader:
8809 vect_init = vect_cst(..., ..., ..., 0)
8811 vector.body
8812 i = PHI <0(vector.preheader), i+4(vector.body)>
8813 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8814 vect_2 = a[i, i+1, i+2, i+3];
8815 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8816 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8817 if (..) goto vector.body
8819 In this function, vectorizable_recurr, we code generate both the
8820 vector PHI node and the permute since those together compute the
8821 vectorized value of the scalar PHI. We do not yet have the
8822 backedge value to fill in there nor into the vec_perm. Those
8823 are filled in maybe_set_vectorized_backedge_value and
8824 vect_schedule_scc.
8826 TODO: Since the scalar loop does not have a use of the recurrence
8827 outside of the loop the natural way to implement peeling via
8828 vectorizing the live value doesn't work. For now peeling of loops
8829 with a recurrence is not implemented. For SLP the supported cases
8830 are restricted to those requiring a single vector recurrence PHI. */
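/* Editorial illustration of the permute above, not generated code: with
   VF = 4, a[] = { 10, 20, 30, 40 } and t starting at 0, the scalar loop
   stores b[] = { 10, 10, 10, 10 }.  In the vector form vect_1 ends in
   lane value 0 (only its last lane matters on the first iteration),
   vect_2 = { 10, 20, 30, 40 }, and the permute { 3, 4, 5, 6 } selects
   { 0, 10, 20, 30 }, so vect_2 - vect_3 is again { 10, 10, 10, 10 }.  */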
8832 bool
8833 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8834 gimple **vec_stmt, slp_tree slp_node,
8835 stmt_vector_for_cost *cost_vec)
8837 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8838 return false;
8840 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8842 /* So far we only support first-order recurrence auto-vectorization. */
8843 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8844 return false;
8846 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8847 unsigned ncopies;
8848 if (slp_node)
8849 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8850 else
8851 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8852 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8853 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8854 /* We need to be able to make progress with a single vector. */
8855 if (maybe_gt (dist * 2, nunits))
8857 if (dump_enabled_p ())
8858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8859 "first order recurrence exceeds half of "
8860 "a vector\n");
8861 return false;
8864 /* First-order recurrence autovectorization needs to handle permutation
8865 with indices = [nunits-1, nunits, nunits+1, ...]. */
8866 vec_perm_builder sel (nunits, 1, 3);
8867 for (int i = 0; i < 3; ++i)
8868 sel.quick_push (nunits - dist + i);
8869 vec_perm_indices indices (sel, 2, nunits);
8871 if (!vec_stmt) /* transformation not required. */
8873 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8874 indices))
8875 return false;
8877 if (slp_node)
8879 /* We eventually need to set a vector type on invariant
8880 arguments. */
8881 unsigned j;
8882 slp_tree child;
8883 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8884 if (!vect_maybe_update_slp_op_vectype
8885 (child, SLP_TREE_VECTYPE (slp_node)))
8887 if (dump_enabled_p ())
8888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8889 "incompatible vector types for "
8890 "invariants\n");
8891 return false;
8894 /* The recurrence costs the initialization vector and one permute
8895 for each copy. */
8896 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8897 stmt_info, 0, vect_prologue);
8898 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8899 stmt_info, 0, vect_body);
8900 if (dump_enabled_p ())
8901 dump_printf_loc (MSG_NOTE, vect_location,
8902 "vectorizable_recurr: inside_cost = %d, "
8903 "prologue_cost = %d .\n", inside_cost,
8904 prologue_cost);
8906 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8907 return true;
8910 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8911 basic_block bb = gimple_bb (phi);
8912 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8913 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8915 gimple_seq stmts = NULL;
8916 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8917 gsi_insert_seq_on_edge_immediate (pe, stmts);
8919 tree vec_init = build_vector_from_val (vectype, preheader);
8920 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8922 /* Create the vectorized first-order PHI node. */
8923 tree vec_dest = vect_get_new_vect_var (vectype,
8924 vect_simple_var, "vec_recur_");
8925 gphi *new_phi = create_phi_node (vec_dest, bb);
8926 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8928 /* Insert shuffles for the first-order recurrence autovectorization.
8929 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8930 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8932 /* Insert the required permute after the latch definition. The
8933 second and later operands are tentative and will be updated when we have
8934 vectorized the latch definition. */
8935 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8936 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8937 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8938 gsi_next (&gsi2);
8940 for (unsigned i = 0; i < ncopies; ++i)
8942 vec_dest = make_ssa_name (vectype);
8943 gassign *vperm
8944 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8945 i == 0 ? gimple_phi_result (new_phi) : NULL,
8946 NULL, perm);
8947 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8949 if (slp_node)
8950 slp_node->push_vec_def (vperm);
8951 else
8952 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8955 if (!slp_node)
8956 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8957 return true;
8960 /* Return true if VECTYPE represents a vector that requires lowering
8961 by the vector lowering pass. */
8963 bool
8964 vect_emulated_vector_p (tree vectype)
8966 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8967 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8968 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8971 /* Return true if we can emulate CODE on an integer mode representation
8972 of a vector. */
8974 bool
8975 vect_can_vectorize_without_simd_p (tree_code code)
8977 switch (code)
8979 case PLUS_EXPR:
8980 case MINUS_EXPR:
8981 case NEGATE_EXPR:
8982 case BIT_AND_EXPR:
8983 case BIT_IOR_EXPR:
8984 case BIT_XOR_EXPR:
8985 case BIT_NOT_EXPR:
8986 return true;
8988 default:
8989 return false;
8993 /* Likewise, but taking a code_helper. */
8995 bool
8996 vect_can_vectorize_without_simd_p (code_helper code)
8998 return (code.is_tree_code ()
8999 && vect_can_vectorize_without_simd_p (tree_code (code)));
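/* Editorial illustration of the emulation these predicates allow,
   assuming four 8-bit lanes held in a plain 32-bit integer: the bitwise
   codes need no lane handling at all, e.g. a hypothetical helper
   (assuming <stdint.h>)

     uint32_t vand (uint32_t a, uint32_t b) { return a & b; }

   performs four lane-wise ANDs at once, while PLUS/MINUS/NEGATE also
   need carries contained between lanes, which the generic vector
   lowering pass is expected to take care of.  */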
9002 /* Create vector init for vectorized iv. */
9003 static tree
9004 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9005 tree step_expr, poly_uint64 nunits,
9006 tree vectype,
9007 enum vect_induction_op_type induction_type)
9009 unsigned HOST_WIDE_INT const_nunits;
9010 tree vec_shift, vec_init, new_name;
9011 unsigned i;
9012 tree itype = TREE_TYPE (vectype);
9014 /* iv_loop is the loop to be vectorized. Create the first nunits values
9015 of the iv according to induction_type (S = step_expr, X = init_expr). */
9016 new_name = gimple_convert (stmts, itype, init_expr);
9017 switch (induction_type)
9019 case vect_step_op_shr:
9020 case vect_step_op_shl:
9021 /* Build the initial value: splat init_expr and shift lane i by i * step_expr. */
9022 vec_init = gimple_build_vector_from_val (stmts,
9023 vectype,
9024 new_name);
9025 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9026 build_zero_cst (itype), step_expr);
9027 vec_init = gimple_build (stmts,
9028 (induction_type == vect_step_op_shr
9029 ? RSHIFT_EXPR : LSHIFT_EXPR),
9030 vectype, vec_init, vec_shift);
9031 break;
9033 case vect_step_op_neg:
9035 vec_init = gimple_build_vector_from_val (stmts,
9036 vectype,
9037 new_name);
9038 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9039 vectype, vec_init);
9040 /* The encoding has 2 interleaved stepped patterns. */
9041 vec_perm_builder sel (nunits, 2, 3);
9042 sel.quick_grow (6);
9043 for (i = 0; i < 3; i++)
9045 sel[2 * i] = i;
9046 sel[2 * i + 1] = i + nunits;
9048 vec_perm_indices indices (sel, 2, nunits);
9049 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9050 fail when vec_init is a const vector. In that situation the vec_perm is not
9051 really needed. */
9052 tree perm_mask_even
9053 = vect_gen_perm_mask_any (vectype, indices);
9054 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9055 vectype,
9056 vec_init, vec_neg,
9057 perm_mask_even);
9059 break;
9061 case vect_step_op_mul:
9063 /* Use an unsigned mult to avoid undefined behavior on signed integer overflow. */
9064 gcc_assert (nunits.is_constant (&const_nunits));
9065 tree utype = unsigned_type_for (itype);
9066 tree uvectype = build_vector_type (utype,
9067 TYPE_VECTOR_SUBPARTS (vectype));
9068 new_name = gimple_convert (stmts, utype, new_name);
9069 vec_init = gimple_build_vector_from_val (stmts,
9070 uvectype,
9071 new_name);
9072 tree_vector_builder elts (uvectype, const_nunits, 1);
9073 tree elt_step = build_one_cst (utype);
9075 elts.quick_push (elt_step);
9076 for (i = 1; i < const_nunits; i++)
9078 /* Create: elt_step_i = elt_step_(i-1) * step_expr. */
9079 elt_step = gimple_build (stmts, MULT_EXPR,
9080 utype, elt_step, step_expr);
9081 elts.quick_push (elt_step);
9083 /* Create a vector from [1, step_expr, pow (step_expr, 2), ...,
9084 pow (step_expr, nunits-1)]. */
9085 tree vec_mul = gimple_build_vector (stmts, &elts);
9086 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9087 vec_init, vec_mul);
9088 vec_init = gimple_convert (stmts, vectype, vec_init);
9090 break;
9092 default:
9093 gcc_unreachable ();
9096 return vec_init;
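/* Editorial example of the vect_step_op_mul case above: with X = 3, S = 2
   and four lanes, the element vector built is [ 1, 2, 4, 8 ], and
   multiplying by the splatted X gives vec_init = [ 3, 6, 12, 24 ],
   i.e. X * pow (S, i) in lane i.  */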
9099 /* Peel init_expr by skip_niters iterations for induction_type. */
9100 tree
9101 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9102 tree skip_niters, tree step_expr,
9103 enum vect_induction_op_type induction_type)
9105 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9106 tree type = TREE_TYPE (init_expr);
9107 unsigned prec = TYPE_PRECISION (type);
9108 switch (induction_type)
9110 case vect_step_op_neg:
9111 if (TREE_INT_CST_LOW (skip_niters) % 2)
9112 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9113 /* else no change. */
9114 break;
9116 case vect_step_op_shr:
9117 case vect_step_op_shl:
9118 skip_niters = gimple_convert (stmts, type, skip_niters);
9119 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9120 /* When the shift amount >= precision, we need to avoid undefined behavior.
9121 In the original loop there is no UB, and according to the semantics
9122 init_expr should become 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9123 if (!tree_fits_uhwi_p (step_expr)
9124 || tree_to_uhwi (step_expr) >= prec)
9126 if (induction_type == vect_step_op_shl
9127 || TYPE_UNSIGNED (type))
9128 init_expr = build_zero_cst (type);
9129 else
9130 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9131 init_expr,
9132 wide_int_to_tree (type, prec - 1));
9134 else
9135 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9136 ? RSHIFT_EXPR : LSHIFT_EXPR),
9137 type, init_expr, step_expr);
9138 break;
9140 case vect_step_op_mul:
9142 tree utype = unsigned_type_for (type);
9143 init_expr = gimple_convert (stmts, utype, init_expr);
9144 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9145 wide_int begin = wi::to_wide (step_expr);
9146 for (unsigned i = 0; i != skipn - 1; i++)
9147 begin = wi::mul (begin, wi::to_wide (step_expr));
9148 tree mult_expr = wide_int_to_tree (utype, begin);
9149 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9150 init_expr = gimple_convert (stmts, type, init_expr);
9152 break;
9154 default:
9155 gcc_unreachable ();
9158 return init_expr;
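/* Editorial example of the vect_step_op_mul peeling above: skipping
   skip_niters = 3 iterations of an iv with init X = 3 and step S = 2
   computes begin = 2 * 2 * 2 = 8 and returns 3 * 8 = 24, matching the
   scalar iv after three iterations (3 -> 6 -> 12 -> 24).  */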
9161 /* Create vector step for vectorized iv. */
9162 static tree
9163 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9164 poly_uint64 vf,
9165 enum vect_induction_op_type induction_type)
9167 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9168 tree new_name = NULL;
9169 /* Step should be pow (step, vf) for mult induction. */
9170 if (induction_type == vect_step_op_mul)
9172 gcc_assert (vf.is_constant ());
9173 wide_int begin = wi::to_wide (step_expr);
9175 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9176 begin = wi::mul (begin, wi::to_wide (step_expr));
9178 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9180 else if (induction_type == vect_step_op_neg)
9181 /* Do nothing. */
9183 else
9184 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9185 expr, step_expr);
9186 return new_name;
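/* For instance (editorial note), with vf = 4 the step returned above is
   pow (S, 4) for a mult iv (16 for S = 2) and 4 * S for a shift iv,
   since one vector iteration advances the scalar iv by vf steps.  */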
9189 static tree
9190 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9191 stmt_vec_info stmt_info,
9192 tree new_name, tree vectype,
9193 enum vect_induction_op_type induction_type)
9195 /* No step is needed for neg induction. */
9196 if (induction_type == vect_step_op_neg)
9197 return NULL;
9199 tree t = unshare_expr (new_name);
9200 gcc_assert (CONSTANT_CLASS_P (new_name)
9201 || TREE_CODE (new_name) == SSA_NAME);
9202 tree new_vec = build_vector_from_val (vectype, t);
9203 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9204 new_vec, vectype, NULL);
9205 return vec_step;
9208 /* Update the vectorized iv with vec_step; induc_def is the initial value. */
9209 static tree
9210 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9211 tree induc_def, tree vec_step,
9212 enum vect_induction_op_type induction_type)
9214 tree vec_def = induc_def;
9215 switch (induction_type)
9217 case vect_step_op_mul:
9219 /* Use an unsigned mult to avoid undefined behavior on signed integer overflow. */
9220 tree uvectype
9221 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9222 TYPE_VECTOR_SUBPARTS (vectype));
9223 vec_def = gimple_convert (stmts, uvectype, vec_def);
9224 vec_step = gimple_convert (stmts, uvectype, vec_step);
9225 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9226 vec_def, vec_step);
9227 vec_def = gimple_convert (stmts, vectype, vec_def);
9229 break;
9231 case vect_step_op_shr:
9232 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9233 vec_def, vec_step);
9234 break;
9236 case vect_step_op_shl:
9237 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9238 vec_def, vec_step);
9239 break;
9240 case vect_step_op_neg:
9241 vec_def = induc_def;
9242 /* Do nothing. */
9243 break;
9244 default:
9245 gcc_unreachable ();
9248 return vec_def;
9252 /* Function vectorizable_induction
9254 Check if STMT_INFO performs a nonlinear induction computation that can be
9255 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9256 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9257 basic block.
9258 Return true if STMT_INFO is vectorizable in this way. */
9260 static bool
9261 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9262 stmt_vec_info stmt_info,
9263 gimple **vec_stmt, slp_tree slp_node,
9264 stmt_vector_for_cost *cost_vec)
9266 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9267 unsigned ncopies;
9268 bool nested_in_vect_loop = false;
9269 class loop *iv_loop;
9270 tree vec_def;
9271 edge pe = loop_preheader_edge (loop);
9272 basic_block new_bb;
9273 tree vec_init, vec_step;
9274 tree new_name;
9275 gimple *new_stmt;
9276 gphi *induction_phi;
9277 tree induc_def, vec_dest;
9278 tree init_expr, step_expr;
9279 tree niters_skip;
9280 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9281 unsigned i;
9282 gimple_stmt_iterator si;
9284 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9286 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9287 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9288 enum vect_induction_op_type induction_type
9289 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9291 gcc_assert (induction_type > vect_step_op_add);
9293 if (slp_node)
9294 ncopies = 1;
9295 else
9296 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9297 gcc_assert (ncopies >= 1);
9299 /* FORNOW. Only handle nonlinear induction in the same loop. */
9300 if (nested_in_vect_loop_p (loop, stmt_info))
9302 if (dump_enabled_p ())
9303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9304 "nonlinear induction in nested loop.\n");
9305 return false;
9308 iv_loop = loop;
9309 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9311 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9312 update for each iv and a permutation to generate the wanted vector iv. */
9313 if (slp_node)
9315 if (dump_enabled_p ())
9316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9317 "SLP induction not supported for nonlinear"
9318 " induction.\n");
9319 return false;
9322 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9324 if (dump_enabled_p ())
9325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9326 "floating point nonlinear induction vectorization"
9327 " not supported.\n");
9328 return false;
9331 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9332 init_expr = vect_phi_initial_value (phi);
9333 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9334 && TREE_CODE (step_expr) == INTEGER_CST);
9335 /* step_expr should be aligned with init_expr,
9336 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9337 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9339 if (TREE_CODE (init_expr) == INTEGER_CST)
9340 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9341 else
9342 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9343 TREE_TYPE (init_expr)));
9345 switch (induction_type)
9347 case vect_step_op_neg:
9348 if (TREE_CODE (init_expr) != INTEGER_CST
9349 && TREE_CODE (init_expr) != REAL_CST)
9351 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9352 if (!directly_supported_p (NEGATE_EXPR, vectype))
9353 return false;
9355 /* The encoding has 2 interleaved stepped patterns. */
9356 vec_perm_builder sel (nunits, 2, 3);
9357 machine_mode mode = TYPE_MODE (vectype);
9358 sel.quick_grow (6);
9359 for (i = 0; i < 3; i++)
9361 sel[i * 2] = i;
9362 sel[i * 2 + 1] = i + nunits;
9364 vec_perm_indices indices (sel, 2, nunits);
9365 if (!can_vec_perm_const_p (mode, mode, indices))
9366 return false;
9368 break;
9370 case vect_step_op_mul:
9372 /* Check for backend support of MULT_EXPR. */
9373 if (!directly_supported_p (MULT_EXPR, vectype))
9374 return false;
9376 /* ??? How to construct the vector step for a variable-length vector:
9377 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9378 if (!vf.is_constant ())
9379 return false;
9381 break;
9383 case vect_step_op_shr:
9384 /* Check for backend support of RSHIFT_EXPR. */
9385 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9386 return false;
9388 /* Don't shift more than the type precision, to avoid undefined behavior. */
9389 if (!tree_fits_uhwi_p (step_expr)
9390 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9391 TYPE_PRECISION (TREE_TYPE (init_expr))))
9392 return false;
9393 break;
9395 case vect_step_op_shl:
9396 /* Check for backend support of LSHIFT_EXPR. */
9397 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9398 return false;
9400 /* Don't shift more than the type precision, to avoid undefined behavior. */
9401 if (!tree_fits_uhwi_p (step_expr)
9402 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9403 TYPE_PRECISION (TREE_TYPE (init_expr))))
9404 return false;
9406 break;
9408 default:
9409 gcc_unreachable ();
9412 if (!vec_stmt) /* transformation not required. */
9414 unsigned inside_cost = 0, prologue_cost = 0;
9415 /* loop cost for vec_loop. */
9417 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9418 stmt_info, 0, vect_body);
9420 /* Neg induction doesn't have any inside_cost. */
9422 if (induction_type == vect_step_op_neg)
9423 inside_cost = 0;
9425 /* prologue cost for vec_init and vec_step. */
9426 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9427 stmt_info, 0, vect_prologue);
9429 if (dump_enabled_p ())
9430 dump_printf_loc (MSG_NOTE, vect_location,
9431 "vect_model_induction_cost: inside_cost = %d, "
9432 "prologue_cost = %d. \n", inside_cost,
9433 prologue_cost);
9435 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9436 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9437 return true;
9440 /* Transform. */
9442 /* Compute a vector variable, initialized with the first VF values of
9443 the induction variable. E.g., for an iv with IV_PHI='X' and
9444 evolution S, for a vector of 4 units, we want to compute:
9445 [X, X + S, X + 2*S, X + 3*S]. */
9447 if (dump_enabled_p ())
9448 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9450 pe = loop_preheader_edge (iv_loop);
9451 /* Find the first insertion point in the BB. */
9452 basic_block bb = gimple_bb (phi);
9453 si = gsi_after_labels (bb);
9455 gimple_seq stmts = NULL;
9457 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9458 /* If we are using the loop mask to "peel" for alignment then we need
9459 to adjust the start value here. */
9460 if (niters_skip != NULL_TREE)
9461 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9462 step_expr, induction_type);
9464 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9465 step_expr, nunits, vectype,
9466 induction_type);
9467 if (stmts)
9469 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9470 gcc_assert (!new_bb);
9473 stmts = NULL;
9474 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9475 vf, induction_type);
9476 if (stmts)
9478 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9479 gcc_assert (!new_bb);
9482 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9483 new_name, vectype,
9484 induction_type);
9485 /* Create the following def-use cycle:
9486 loop prolog:
9487 vec_init = ...
9488 vec_step = ...
9489 loop:
9490 vec_iv = PHI <vec_init, vec_loop>
9492 STMT
9494 vec_loop = vec_iv + vec_step; */
9496 /* Create the induction-phi that defines the induction-operand. */
9497 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9498 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9499 induc_def = PHI_RESULT (induction_phi);
9501 /* Create the iv update inside the loop. */
9502 stmts = NULL;
9503 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9504 induc_def, vec_step,
9505 induction_type);
9507 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9508 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9510 /* Set the arguments of the phi node: */
9511 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9512 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9513 UNKNOWN_LOCATION);
9515 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9516 *vec_stmt = induction_phi;
9518 /* In case the vectorization factor (VF) is bigger than the number
9519 of elements that we can fit in a vectype (nunits), we have to generate
9520 more than one vector stmt, i.e. we need to "unroll" the
9521 vector stmt by a factor VF/nunits. For more details see documentation
9522 in vectorizable_operation. */
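/* E.g. (editorial note) with VF = 8 and a vectype holding nunits = 4
   elements, ncopies = 2 and the iv update below is emitted once more,
   each copy advancing the previous one by the vector step.  */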
9524 if (ncopies > 1)
9526 stmts = NULL;
9527 /* FORNOW. This restriction should be relaxed. */
9528 gcc_assert (!nested_in_vect_loop);
9530 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9531 nunits, induction_type);
9533 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9534 new_name, vectype,
9535 induction_type);
9536 vec_def = induc_def;
9537 for (i = 1; i < ncopies; i++)
9539 /* vec_i = vec_prev + vec_step. */
9540 stmts = NULL;
9541 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9542 vec_def, vec_step,
9543 induction_type);
9544 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9545 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9546 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9550 if (dump_enabled_p ())
9551 dump_printf_loc (MSG_NOTE, vect_location,
9552 "transform induction: created def-use cycle: %G%G",
9553 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9555 return true;
9558 /* Function vectorizable_induction
9560 Check if STMT_INFO performs an induction computation that can be vectorized.
9561 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9562 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9563 Return true if STMT_INFO is vectorizable in this way. */
9565 bool
9566 vectorizable_induction (loop_vec_info loop_vinfo,
9567 stmt_vec_info stmt_info,
9568 gimple **vec_stmt, slp_tree slp_node,
9569 stmt_vector_for_cost *cost_vec)
9571 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9572 unsigned ncopies;
9573 bool nested_in_vect_loop = false;
9574 class loop *iv_loop;
9575 tree vec_def;
9576 edge pe = loop_preheader_edge (loop);
9577 basic_block new_bb;
9578 tree new_vec, vec_init, vec_step, t;
9579 tree new_name;
9580 gimple *new_stmt;
9581 gphi *induction_phi;
9582 tree induc_def, vec_dest;
9583 tree init_expr, step_expr;
9584 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9585 unsigned i;
9586 tree expr;
9587 gimple_stmt_iterator si;
9588 enum vect_induction_op_type induction_type
9589 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9591 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9592 if (!phi)
9593 return false;
9595 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9596 return false;
9598 /* Make sure it was recognized as an induction computation. */
9599 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9600 return false;
9602 /* Handle nonlinear induction in a separate place. */
9603 if (induction_type != vect_step_op_add)
9604 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9605 vec_stmt, slp_node, cost_vec);
9607 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9608 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9610 if (slp_node)
9611 ncopies = 1;
9612 else
9613 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9614 gcc_assert (ncopies >= 1);
9616 /* FORNOW. These restrictions should be relaxed. */
9617 if (nested_in_vect_loop_p (loop, stmt_info))
9619 imm_use_iterator imm_iter;
9620 use_operand_p use_p;
9621 gimple *exit_phi;
9622 edge latch_e;
9623 tree loop_arg;
9625 if (ncopies > 1)
9627 if (dump_enabled_p ())
9628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9629 "multiple types in nested loop.\n");
9630 return false;
9633 exit_phi = NULL;
9634 latch_e = loop_latch_edge (loop->inner);
9635 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9636 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9638 gimple *use_stmt = USE_STMT (use_p);
9639 if (is_gimple_debug (use_stmt))
9640 continue;
9642 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9644 exit_phi = use_stmt;
9645 break;
9648 if (exit_phi)
9650 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9651 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9652 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9654 if (dump_enabled_p ())
9655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9656 "inner-loop induction only used outside "
9657 "of the outer vectorized loop.\n");
9658 return false;
9662 nested_in_vect_loop = true;
9663 iv_loop = loop->inner;
9665 else
9666 iv_loop = loop;
9667 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9669 if (slp_node && !nunits.is_constant ())
9671 /* The current SLP code creates the step value element-by-element. */
9672 if (dump_enabled_p ())
9673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9674 "SLP induction not supported for variable-length"
9675 " vectors.\n");
9676 return false;
9679 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9681 if (dump_enabled_p ())
9682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9683 "floating point induction vectorization disabled\n");
9684 return false;
9687 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9688 gcc_assert (step_expr != NULL_TREE);
9689 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9691 /* Check for backend support of PLUS/MINUS_EXPR. */
9692 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9693 || !directly_supported_p (MINUS_EXPR, step_vectype))
9694 return false;
9696 if (!vec_stmt) /* transformation not required. */
9698 unsigned inside_cost = 0, prologue_cost = 0;
9699 if (slp_node)
9701 /* We eventually need to set a vector type on invariant
9702 arguments. */
9703 unsigned j;
9704 slp_tree child;
9705 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9706 if (!vect_maybe_update_slp_op_vectype
9707 (child, SLP_TREE_VECTYPE (slp_node)))
9709 if (dump_enabled_p ())
9710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9711 "incompatible vector types for "
9712 "invariants\n");
9713 return false;
9715 /* loop cost for vec_loop. */
9716 inside_cost
9717 = record_stmt_cost (cost_vec,
9718 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9719 vector_stmt, stmt_info, 0, vect_body);
9720 /* prologue cost for vec_init (if not nested) and step. */
9721 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9722 scalar_to_vec,
9723 stmt_info, 0, vect_prologue);
9725 else /* if (!slp_node) */
9727 /* loop cost for vec_loop. */
9728 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9729 stmt_info, 0, vect_body);
9730 /* prologue cost for vec_init and vec_step. */
9731 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9732 stmt_info, 0, vect_prologue);
9734 if (dump_enabled_p ())
9735 dump_printf_loc (MSG_NOTE, vect_location,
9736 "vect_model_induction_cost: inside_cost = %d, "
9737 "prologue_cost = %d .\n", inside_cost,
9738 prologue_cost);
9740 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9741 DUMP_VECT_SCOPE ("vectorizable_induction");
9742 return true;
9745 /* Transform. */
9747 /* Compute a vector variable, initialized with the first VF values of
9748 the induction variable. E.g., for an iv with IV_PHI='X' and
9749 evolution S, for a vector of 4 units, we want to compute:
9750 [X, X + S, X + 2*S, X + 3*S]. */
9752 if (dump_enabled_p ())
9753 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9755 pe = loop_preheader_edge (iv_loop);
9756 /* Find the first insertion point in the BB. */
9757 basic_block bb = gimple_bb (phi);
9758 si = gsi_after_labels (bb);
9760 /* For SLP induction we have to generate several IVs as for example
9761 with group size 3 we need
9762 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9763 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9764 if (slp_node)
9766 /* Enforced above. */
9767 unsigned int const_nunits = nunits.to_constant ();
9769 /* The initial values are vectorized, but any lanes > group_size
9770 need adjustment. */
9771 slp_tree init_node
9772 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9774 /* Gather steps. Since we do not vectorize inductions as
9775 cycles we have to reconstruct the step from SCEV data. */
9776 unsigned group_size = SLP_TREE_LANES (slp_node);
9777 tree *steps = XALLOCAVEC (tree, group_size);
9778 tree *inits = XALLOCAVEC (tree, group_size);
9779 stmt_vec_info phi_info;
9780 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9782 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9783 if (!init_node)
9784 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9785 pe->dest_idx);
9788 /* Now generate the IVs. */
9789 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9790 gcc_assert ((const_nunits * nvects) % group_size == 0);
9791 unsigned nivs;
9792 if (nested_in_vect_loop)
9793 nivs = nvects;
9794 else
9796 /* Compute the number of distinct IVs we need. First reduce
9797 group_size if it is a multiple of const_nunits so we get
9798 one IV for a group_size of 4 but const_nunits 2. */
9799 unsigned group_sizep = group_size;
9800 if (group_sizep % const_nunits == 0)
9801 group_sizep = group_sizep / const_nunits;
9802 nivs = least_common_multiple (group_sizep,
9803 const_nunits) / const_nunits;
9805 tree stept = TREE_TYPE (step_vectype);
9806 tree lupdate_mul = NULL_TREE;
9807 if (!nested_in_vect_loop)
9809 /* The number of iterations covered in one vector iteration. */
9810 unsigned lup_mul = (nvects * const_nunits) / group_size;
9811 lupdate_mul
9812 = build_vector_from_val (step_vectype,
9813 SCALAR_FLOAT_TYPE_P (stept)
9814 ? build_real_from_wide (stept, lup_mul,
9815 UNSIGNED)
9816 : build_int_cstu (stept, lup_mul));
9818 tree peel_mul = NULL_TREE;
9819 gimple_seq init_stmts = NULL;
9820 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9822 if (SCALAR_FLOAT_TYPE_P (stept))
9823 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9824 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9825 else
9826 peel_mul = gimple_convert (&init_stmts, stept,
9827 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9828 peel_mul = gimple_build_vector_from_val (&init_stmts,
9829 step_vectype, peel_mul);
9831 unsigned ivn;
9832 auto_vec<tree> vec_steps;
9833 for (ivn = 0; ivn < nivs; ++ivn)
9835 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9836 tree_vector_builder init_elts (vectype, const_nunits, 1);
9837 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9838 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9840 /* The scalar steps of the IVs. */
9841 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9842 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9843 step_elts.quick_push (elt);
9844 if (!init_node)
9846 /* The scalar inits of the IVs if not vectorized. */
9847 elt = inits[(ivn*const_nunits + eltn) % group_size];
9848 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9849 TREE_TYPE (elt)))
9850 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9851 TREE_TYPE (vectype), elt);
9852 init_elts.quick_push (elt);
9854 /* The number of steps to add to the initial values. */
9855 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9856 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9857 ? build_real_from_wide (stept,
9858 mul_elt, UNSIGNED)
9859 : build_int_cstu (stept, mul_elt));
9861 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9862 vec_steps.safe_push (vec_step);
9863 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9864 if (peel_mul)
9865 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9866 step_mul, peel_mul);
9867 if (!init_node)
9868 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9870 /* Create the induction-phi that defines the induction-operand. */
9871 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9872 "vec_iv_");
9873 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9874 induc_def = PHI_RESULT (induction_phi);
9876 /* Create the iv update inside the loop */
9877 tree up = vec_step;
9878 if (lupdate_mul)
9879 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9880 vec_step, lupdate_mul);
9881 gimple_seq stmts = NULL;
9882 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9883 vec_def = gimple_build (&stmts,
9884 PLUS_EXPR, step_vectype, vec_def, up);
9885 vec_def = gimple_convert (&stmts, vectype, vec_def);
9886 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9887 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9888 UNKNOWN_LOCATION);
9890 if (init_node)
9891 vec_init = vect_get_slp_vect_def (init_node, ivn);
9892 if (!nested_in_vect_loop
9893 && !integer_zerop (step_mul))
9895 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9896 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9897 vec_step, step_mul);
9898 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9899 vec_def, up);
9900 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9903 /* Set the arguments of the phi node: */
9904 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9906 slp_node->push_vec_def (induction_phi);
9908 if (!nested_in_vect_loop)
9910 /* Fill up to the number of vectors we need for the whole group. */
9911 nivs = least_common_multiple (group_size,
9912 const_nunits) / const_nunits;
9913 vec_steps.reserve (nivs-ivn);
9914 for (; ivn < nivs; ++ivn)
9916 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9917 vec_steps.quick_push (vec_steps[0]);
9921 /* Re-use IVs when we can. We are generating further vector
9922 stmts by adding VF' * stride to the IVs generated above. */
9923 if (ivn < nvects)
9925 unsigned vfp
9926 = least_common_multiple (group_size, const_nunits) / group_size;
9927 tree lupdate_mul
9928 = build_vector_from_val (step_vectype,
9929 SCALAR_FLOAT_TYPE_P (stept)
9930 ? build_real_from_wide (stept,
9931 vfp, UNSIGNED)
9932 : build_int_cstu (stept, vfp));
9933 for (; ivn < nvects; ++ivn)
9935 gimple *iv
9936 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9937 tree def = gimple_get_lhs (iv);
9938 if (ivn < 2*nivs)
9939 vec_steps[ivn - nivs]
9940 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9941 vec_steps[ivn - nivs], lupdate_mul);
9942 gimple_seq stmts = NULL;
9943 def = gimple_convert (&stmts, step_vectype, def);
9944 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9945 def, vec_steps[ivn % nivs]);
9946 def = gimple_convert (&stmts, vectype, def);
9947 if (gimple_code (iv) == GIMPLE_PHI)
9948 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9949 else
9951 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9952 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9954 slp_node->push_vec_def (def);
9958 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9959 gcc_assert (!new_bb);
9961 return true;
9964 init_expr = vect_phi_initial_value (phi);
9966 gimple_seq stmts = NULL;
9967 if (!nested_in_vect_loop)
9969 /* Convert the initial value to the IV update type. */
9970 tree new_type = TREE_TYPE (step_expr);
9971 init_expr = gimple_convert (&stmts, new_type, init_expr);
9973 /* If we are using the loop mask to "peel" for alignment then we need
9974 to adjust the start value here. */
9975 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9976 if (skip_niters != NULL_TREE)
9978 if (FLOAT_TYPE_P (vectype))
9979 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9980 skip_niters);
9981 else
9982 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9983 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9984 skip_niters, step_expr);
9985 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9986 init_expr, skip_step);
9990 if (stmts)
9992 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9993 gcc_assert (!new_bb);
9996 /* Create the vector that holds the initial_value of the induction. */
9997 if (nested_in_vect_loop)
9999 /* iv_loop is nested in the loop to be vectorized. init_expr has already
10000 been created during vectorization of previous stmts. We obtain it
10001 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10002 auto_vec<tree> vec_inits;
10003 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10004 init_expr, &vec_inits);
10005 vec_init = vec_inits[0];
10006 /* If the initial value is not of proper type, convert it. */
10007 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10009 new_stmt
10010 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10011 vect_simple_var,
10012 "vec_iv_"),
10013 VIEW_CONVERT_EXPR,
10014 build1 (VIEW_CONVERT_EXPR, vectype,
10015 vec_init));
10016 vec_init = gimple_assign_lhs (new_stmt);
10017 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10018 new_stmt);
10019 gcc_assert (!new_bb);
10022 else
10024 /* iv_loop is the loop to be vectorized. Create:
10025 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10026 stmts = NULL;
10027 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10029 unsigned HOST_WIDE_INT const_nunits;
10030 if (nunits.is_constant (&const_nunits))
10032 tree_vector_builder elts (step_vectype, const_nunits, 1);
10033 elts.quick_push (new_name);
10034 for (i = 1; i < const_nunits; i++)
10036 /* Create: new_name_i = new_name + step_expr */
10037 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10038 new_name, step_expr);
10039 elts.quick_push (new_name);
10041 /* Create a vector from [new_name_0, new_name_1, ...,
10042 new_name_nunits-1] */
10043 vec_init = gimple_build_vector (&stmts, &elts);
10045 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10046 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10047 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10048 new_name, step_expr);
10049 else
10051 /* Build:
10052 [base, base, base, ...]
10053 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10054 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10055 gcc_assert (flag_associative_math);
10056 tree index = build_index_vector (step_vectype, 0, 1);
10057 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10058 new_name);
10059 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10060 step_expr);
10061 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10062 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10063 vec_init, step_vec);
10064 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10065 vec_init, base_vec);
10067 vec_init = gimple_convert (&stmts, vectype, vec_init);
10069 if (stmts)
10071 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10072 gcc_assert (!new_bb);
10077 /* Create the vector that holds the step of the induction. */
10078 if (nested_in_vect_loop)
10079 /* iv_loop is nested in the loop to be vectorized. Generate:
10080 vec_step = [S, S, S, S] */
10081 new_name = step_expr;
10082 else
10084 /* iv_loop is the loop to be vectorized. Generate:
10085 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10086 gimple_seq seq = NULL;
10087 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10089 expr = build_int_cst (integer_type_node, vf);
10090 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10092 else
10093 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10094 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10095 expr, step_expr);
10096 if (seq)
10098 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10099 gcc_assert (!new_bb);
10103 t = unshare_expr (new_name);
10104 gcc_assert (CONSTANT_CLASS_P (new_name)
10105 || TREE_CODE (new_name) == SSA_NAME);
10106 new_vec = build_vector_from_val (step_vectype, t);
10107 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10108 new_vec, step_vectype, NULL);
10111 /* Create the following def-use cycle:
10112 loop prolog:
10113 vec_init = ...
10114 vec_step = ...
10115 loop:
10116 vec_iv = PHI <vec_init, vec_loop>
10118 STMT
10120 vec_loop = vec_iv + vec_step; */
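/* As a rough illustration only (not the exact IL that is emitted): for a
   scalar induction  i = init; ...; i += step  and an assumed VF of 4 with
   a single vector copy, the prolog values above are

     vec_init = { init, init+step, init+2*step, init+3*step }
     vec_step = { 4*step, 4*step, 4*step, 4*step }

   so adding vec_step once per vector iteration advances every lane by a
   full vector's worth of scalar iterations.  */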
10122 /* Create the induction-phi that defines the induction-operand. */
10123 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10124 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10125 induc_def = PHI_RESULT (induction_phi);
10127 /* Create the iv update inside the loop */
10128 stmts = NULL;
10129 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10130 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10131 vec_def = gimple_convert (&stmts, vectype, vec_def);
10132 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10133 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10135 /* Set the arguments of the phi node: */
10136 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10137 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10138 UNKNOWN_LOCATION);
10140 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10141 *vec_stmt = induction_phi;
10143 /* In case the vectorization factor (VF) is bigger than the number
10144 of elements that we can fit in a vectype (nunits), we have to generate
10145 more than one vector stmt - i.e. we need to "unroll" the
10146 vector stmt by a factor of VF/nunits. For more details see the
10147 documentation in vectorizable_operation. */
10149 if (ncopies > 1)
10151 gimple_seq seq = NULL;
10152 /* FORNOW. This restriction should be relaxed. */
10153 gcc_assert (!nested_in_vect_loop);
10155 /* Create the vector that holds the step of the induction. */
10156 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10158 expr = build_int_cst (integer_type_node, nunits);
10159 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10161 else
10162 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10163 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10164 expr, step_expr);
10165 if (seq)
10167 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10168 gcc_assert (!new_bb);
10171 t = unshare_expr (new_name);
10172 gcc_assert (CONSTANT_CLASS_P (new_name)
10173 || TREE_CODE (new_name) == SSA_NAME);
10174 new_vec = build_vector_from_val (step_vectype, t);
10175 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10176 new_vec, step_vectype, NULL);
10178 vec_def = induc_def;
10179 for (i = 1; i < ncopies + 1; i++)
10181 /* vec_i = vec_prev + vec_step */
10182 gimple_seq stmts = NULL;
10183 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10184 vec_def = gimple_build (&stmts,
10185 PLUS_EXPR, step_vectype, vec_def, vec_step);
10186 vec_def = gimple_convert (&stmts, vectype, vec_def);
10188 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10189 if (i < ncopies)
10191 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10192 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10194 else
10196 /* vec_1 = vec_iv + (VF/n * S)
10197 vec_2 = vec_1 + (VF/n * S)
10199 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10201 vec_n is used as vec_loop to save the large step register and
10202 related operations. */
10203 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10204 UNKNOWN_LOCATION);
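/* A concrete, purely illustrative instance: with nunits == 4 and
   ncopies == 2 (an assumed VF of 8), vec_step above is
   { 4*S, 4*S, 4*S, 4*S } and the loop over i emits

     vec_1 = vec_iv + vec_step;   // second vector copy of the IV
     vec_2 = vec_1 + vec_step;    // == vec_iv + 8*S, fed to the PHI latch

   so the wider step vector { 8*S, ... } never has to be materialized.  */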
10209 if (dump_enabled_p ())
10210 dump_printf_loc (MSG_NOTE, vect_location,
10211 "transform induction: created def-use cycle: %G%G",
10212 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10214 return true;
10217 /* Function vectorizable_live_operation.
10219 STMT_INFO computes a value that is used outside the loop. Check if
10220 it can be supported. */
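/* For illustration, a typical live operation is T in the assumed example
   below; the value computed by the last scalar iteration is needed after
   the loop:

     int t = 0;
     for (int i = 0; i < n; i++)
       {
         t = x[i] * y[i];
         out[i] = t;
       }
     use (t);

   In the simplest case the vectorizer extracts T from the last lane of
   the last vector copy of the vectorized statement after the loop.  */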
10222 bool
10223 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10224 slp_tree slp_node, slp_instance slp_node_instance,
10225 int slp_index, bool vec_stmt_p,
10226 stmt_vector_for_cost *cost_vec)
10228 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10229 imm_use_iterator imm_iter;
10230 tree lhs, lhs_type, bitsize;
10231 tree vectype = (slp_node
10232 ? SLP_TREE_VECTYPE (slp_node)
10233 : STMT_VINFO_VECTYPE (stmt_info));
10234 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10235 int ncopies;
10236 gimple *use_stmt;
10237 auto_vec<tree> vec_oprnds;
10238 int vec_entry = 0;
10239 poly_uint64 vec_index = 0;
10241 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10243 /* If a stmt of a reduction is live, vectorize it via
10244 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10245 validity so just trigger the transform here. */
10246 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10248 if (!vec_stmt_p)
10249 return true;
10250 if (slp_node)
10252 /* For reduction chains the meta-info is attached to
10253 the group leader. */
10254 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10255 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10256 /* For SLP reductions we vectorize the epilogue for
10257 all involved stmts together. */
10258 else if (slp_index != 0)
10259 return true;
10261 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10262 gcc_assert (reduc_info->is_reduc_info);
10263 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10264 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10265 return true;
10266 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10267 slp_node_instance);
10268 return true;
10271 /* If STMT is not relevant and it is a simple assignment and its inputs are
10272 invariant then it can remain in place, unvectorized. The original last
10273 scalar value that it computes will be used. */
10274 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10276 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10277 if (dump_enabled_p ())
10278 dump_printf_loc (MSG_NOTE, vect_location,
10279 "statement is simple and uses invariant. Leaving in "
10280 "place.\n");
10281 return true;
10284 if (slp_node)
10285 ncopies = 1;
10286 else
10287 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10289 if (slp_node)
10291 gcc_assert (slp_index >= 0);
10293 /* Get the last occurrence of the scalar index from the concatenation of
10294 all the slp vectors. Calculate which slp vector it is and the index
10295 within. */
10296 int num_scalar = SLP_TREE_LANES (slp_node);
10297 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10298 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
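/* E.g. (illustrative numbers): with num_scalar == 3, num_vec == 2 and
   nunits == 4 the group occupies the last 3 of the 8 concatenated lanes,
   so slp_index == 1 gives pos == 2*4 - 3 + 1 == 6, i.e. vec_entry == 1
   and vec_index == 2 below.  */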
10300 /* Calculate which vector contains the result, and which lane of
10301 that vector we need. */
10302 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10304 if (dump_enabled_p ())
10305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10306 "Cannot determine which vector holds the"
10307 " final result.\n");
10308 return false;
10312 if (!vec_stmt_p)
10314 /* No transformation required. */
10315 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10317 if (slp_node)
10319 if (dump_enabled_p ())
10320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10321 "can't operate on partial vectors "
10322 "because an SLP statement is live after "
10323 "the loop.\n");
10324 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10326 else if (ncopies > 1)
10328 if (dump_enabled_p ())
10329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10330 "can't operate on partial vectors "
10331 "because ncopies is greater than 1.\n");
10332 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10334 else
10336 gcc_assert (ncopies == 1 && !slp_node);
10337 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10338 OPTIMIZE_FOR_SPEED))
10339 vect_record_loop_mask (loop_vinfo,
10340 &LOOP_VINFO_MASKS (loop_vinfo),
10341 1, vectype, NULL);
10342 else if (can_vec_extract_var_idx_p (
10343 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10344 vect_record_loop_len (loop_vinfo,
10345 &LOOP_VINFO_LENS (loop_vinfo),
10346 1, vectype, 1);
10347 else
10349 if (dump_enabled_p ())
10350 dump_printf_loc (
10351 MSG_MISSED_OPTIMIZATION, vect_location,
10352 "can't operate on partial vectors "
10353 "because the target doesn't support extract "
10354 "last reduction.\n");
10355 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10359 /* ??? Enable for loop costing as well. */
10360 if (!loop_vinfo)
10361 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10362 0, vect_epilogue);
10363 return true;
10366 /* Use the lhs of the original scalar statement. */
10367 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10368 if (dump_enabled_p ())
10369 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10370 "stmt %G", stmt);
10372 lhs = gimple_get_lhs (stmt);
10373 lhs_type = TREE_TYPE (lhs);
10375 bitsize = vector_element_bits_tree (vectype);
10377 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10378 tree vec_lhs, bitstart;
10379 gimple *vec_stmt;
10380 if (slp_node)
10382 gcc_assert (!loop_vinfo
10383 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10384 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10386 /* Get the correct slp vectorized stmt. */
10387 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10388 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10390 /* Get entry to use. */
10391 bitstart = bitsize_int (vec_index);
10392 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10394 else
10396 /* For multiple copies, get the last copy. */
10397 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10398 vec_lhs = gimple_get_lhs (vec_stmt);
10400 /* Get the last lane in the vector. */
10401 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10404 if (loop_vinfo)
10406 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10407 PHI requirement, insert one PHI node for it. It looks like:
10408 loop;
10410 # lhs' = PHI <lhs>
10412 loop;
10414 # vec_lhs' = PHI <vec_lhs>
10415 new_tree = lane_extract <vec_lhs', ...>;
10416 lhs' = new_tree; */
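/* A sketch of the simplest (neither masked nor length-controlled) case,
   not the exact IL built below: the scalar result is recovered as

     new_tree = BIT_FIELD_REF <vec_lhs_phi, bitsize, bitstart>;

   i.e. the requested lane of the vectorized def, inserted in the exit
   block so that loop-closed SSA form is preserved.  */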
10418 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10419 basic_block exit_bb = single_exit (loop)->dest;
10420 gcc_assert (single_pred_p (exit_bb));
10422 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10423 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10424 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10426 gimple_seq stmts = NULL;
10427 tree new_tree;
10428 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10430 /* Emit:
10432 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10434 where VEC_LHS is the vectorized live-out result and LEN is
10435 the loop length for the final iteration. */
10436 gcc_assert (ncopies == 1 && !slp_node);
10437 gimple_seq tem = NULL;
10438 gimple_stmt_iterator gsi = gsi_last (tem);
10439 tree len
10440 = vect_get_loop_len (loop_vinfo, &gsi,
10441 &LOOP_VINFO_LENS (loop_vinfo),
10442 1, vectype, 0, 0);
10444 /* BIAS - 1. */
10445 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10446 tree bias_minus_one
10447 = int_const_binop (MINUS_EXPR,
10448 build_int_cst (TREE_TYPE (len), biasval),
10449 build_one_cst (TREE_TYPE (len)));
10451 /* LAST_INDEX = LEN + (BIAS - 1). */
10452 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10453 len, bias_minus_one);
10455 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10456 tree scalar_res
10457 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10458 vec_lhs_phi, last_index);
10460 /* Convert the extracted vector element to the scalar type. */
10461 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10463 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10465 /* Emit:
10467 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10469 where VEC_LHS is the vectorized live-out result and MASK is
10470 the loop mask for the final iteration. */
10471 gcc_assert (ncopies == 1 && !slp_node);
10472 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10473 gimple_seq tem = NULL;
10474 gimple_stmt_iterator gsi = gsi_last (tem);
10475 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10476 &LOOP_VINFO_MASKS (loop_vinfo),
10477 1, vectype, 0);
10478 gimple_seq_add_seq (&stmts, tem);
10479 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10480 mask, vec_lhs_phi);
10482 /* Convert the extracted vector element to the scalar type. */
10483 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10485 else
10487 tree bftype = TREE_TYPE (vectype);
10488 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10489 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10490 new_tree = build3 (BIT_FIELD_REF, bftype,
10491 vec_lhs_phi, bitsize, bitstart);
10492 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10493 &stmts, true, NULL_TREE);
10496 if (stmts)
10498 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10499 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10501 /* Remove existing phi from lhs and create one copy from new_tree. */
10502 tree lhs_phi = NULL_TREE;
10503 gimple_stmt_iterator gsi;
10504 for (gsi = gsi_start_phis (exit_bb);
10505 !gsi_end_p (gsi); gsi_next (&gsi))
10507 gimple *phi = gsi_stmt (gsi);
10508 if ((gimple_phi_arg_def (phi, 0) == lhs))
10510 remove_phi_node (&gsi, false);
10511 lhs_phi = gimple_phi_result (phi);
10512 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10513 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10514 break;
10519 /* Replace the use of lhs with the newly computed result. If the use stmt is
10520 a single-arg PHI, just replace all uses of the PHI result. This is necessary
10521 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10522 use_operand_p use_p;
10523 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10524 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10525 && !is_gimple_debug (use_stmt))
10527 if (gimple_code (use_stmt) == GIMPLE_PHI
10528 && gimple_phi_num_args (use_stmt) == 1)
10530 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10532 else
10534 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10535 SET_USE (use_p, new_tree);
10537 update_stmt (use_stmt);
10540 else
10542 /* For basic-block vectorization simply insert the lane-extraction. */
10543 tree bftype = TREE_TYPE (vectype);
10544 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10545 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10546 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10547 vec_lhs, bitsize, bitstart);
10548 gimple_seq stmts = NULL;
10549 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10550 &stmts, true, NULL_TREE);
10551 if (TREE_CODE (new_tree) == SSA_NAME
10552 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10553 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10554 if (is_a <gphi *> (vec_stmt))
10556 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10557 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10559 else
10561 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10562 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10565 /* Replace the use of lhs with the newly computed result. If the use stmt is
10566 a single-arg PHI, just replace all uses of the PHI result. This is necessary
10567 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10568 use_operand_p use_p;
10569 stmt_vec_info use_stmt_info;
10570 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10571 if (!is_gimple_debug (use_stmt)
10572 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10573 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10575 /* ??? This can happen when the live lane ends up being
10576 used in a vector construction code-generated by an
10577 external SLP node (and code-generation for that already
10578 happened). See gcc.dg/vect/bb-slp-47.c.
10579 Doing this is what would happen if that vector CTOR
10580 were not code-generated yet so it is not too bad.
10581 ??? In fact we'd likely want to avoid this situation
10582 in the first place. */
10583 if (TREE_CODE (new_tree) == SSA_NAME
10584 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10585 && gimple_code (use_stmt) != GIMPLE_PHI
10586 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10587 use_stmt))
10589 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10590 gcc_checking_assert (code == SSA_NAME
10591 || code == CONSTRUCTOR
10592 || code == VIEW_CONVERT_EXPR
10593 || CONVERT_EXPR_CODE_P (code));
10594 if (dump_enabled_p ())
10595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10596 "Using original scalar computation for "
10597 "live lane because use precedes vector "
10598 "def\n");
10599 continue;
10601 /* ??? It can also happen that we end up pulling a def into
10602 a loop where replacing out-of-loop uses would require
10603 a new LC SSA PHI node. Retain the original scalar in
10604 those cases as well. PR98064. */
10605 if (TREE_CODE (new_tree) == SSA_NAME
10606 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10607 && (gimple_bb (use_stmt)->loop_father
10608 != gimple_bb (vec_stmt)->loop_father)
10609 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10610 gimple_bb (use_stmt)->loop_father))
10612 if (dump_enabled_p ())
10613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10614 "Using original scalar computation for "
10615 "live lane because there is an out-of-loop "
10616 "definition for it\n");
10617 continue;
10619 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10620 SET_USE (use_p, new_tree);
10621 update_stmt (use_stmt);
10625 return true;
10628 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10630 static void
10631 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10633 ssa_op_iter op_iter;
10634 imm_use_iterator imm_iter;
10635 def_operand_p def_p;
10636 gimple *ustmt;
10638 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10640 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10642 basic_block bb;
10644 if (!is_gimple_debug (ustmt))
10645 continue;
10647 bb = gimple_bb (ustmt);
10649 if (!flow_bb_inside_loop_p (loop, bb))
10651 if (gimple_debug_bind_p (ustmt))
10653 if (dump_enabled_p ())
10654 dump_printf_loc (MSG_NOTE, vect_location,
10655 "killing debug use\n");
10657 gimple_debug_bind_reset_value (ustmt);
10658 update_stmt (ustmt);
10660 else
10661 gcc_unreachable ();
10667 /* Given the loop represented by LOOP_VINFO, return true if computation of
10668 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10669 otherwise. */
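/* For example (illustrative only): if the niters type is a 32-bit
   unsigned type and the latch executes 0xffffffff times, so that
   LOOP_VINFO_NITERSM1 == 0xffffffff, then NITERSM1 + 1 wraps around to 0
   and this function would return false.  */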
10671 static bool
10672 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10674 /* Constant case. */
10675 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10677 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10678 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10680 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10681 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10682 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10683 return true;
10686 widest_int max;
10687 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10688 /* Check the upper bound of loop niters. */
10689 if (get_max_loop_iterations (loop, &max))
10691 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10692 signop sgn = TYPE_SIGN (type);
10693 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10694 if (max < type_max)
10695 return true;
10697 return false;
10700 /* Return a mask type with half the number of elements as OLD_TYPE,
10701 given that it should have mode NEW_MODE. */
10703 tree
10704 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10706 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10707 return build_truth_vector_type_for_mode (nunits, new_mode);
10710 /* Return a mask type with twice as many elements as OLD_TYPE,
10711 given that it should have mode NEW_MODE. */
10713 tree
10714 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10716 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10717 return build_truth_vector_type_for_mode (nunits, new_mode);
10720 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10721 contain a sequence of NVECTORS masks that each control a vector of type
10722 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10723 these vector masks with the vector version of SCALAR_MASK. */
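/* A minimal sketch of what a fully-masked loop looks like, assuming a
   single rgroup and VF == 4 (illustration only, not the generated IL):

     for (i = 0; i < n; i += 4)
       {
         mask = { i+0 < n, i+1 < n, i+2 < n, i+3 < n };
         val = .MASK_LOAD (&src[i], mask);
         .MASK_STORE (&dst[i], mask, val);
       }

   so the final, partial iteration needs no scalar epilogue.  */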
10725 void
10726 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10727 unsigned int nvectors, tree vectype, tree scalar_mask)
10729 gcc_assert (nvectors != 0);
10731 if (scalar_mask)
10733 scalar_cond_masked_key cond (scalar_mask, nvectors);
10734 loop_vinfo->scalar_cond_masked_set.add (cond);
10737 masks->mask_set.add (std::make_pair (vectype, nvectors));
10740 /* Given a complete set of masks MASKS, extract mask number INDEX
10741 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10742 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10744 See the comment above vec_loop_masks for more details about the mask
10745 arrangement. */
10747 tree
10748 vect_get_loop_mask (loop_vec_info loop_vinfo,
10749 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10750 unsigned int nvectors, tree vectype, unsigned int index)
10752 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10753 == vect_partial_vectors_while_ult)
10755 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10756 tree mask_type = rgm->type;
10758 /* Populate the rgroup's mask array, if this is the first time we've
10759 used it. */
10760 if (rgm->controls.is_empty ())
10762 rgm->controls.safe_grow_cleared (nvectors, true);
10763 for (unsigned int i = 0; i < nvectors; ++i)
10765 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10766 /* Provide a dummy definition until the real one is available. */
10767 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10768 rgm->controls[i] = mask;
10772 tree mask = rgm->controls[index];
10773 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10774 TYPE_VECTOR_SUBPARTS (vectype)))
10776 /* A loop mask for data type X can be reused for data type Y
10777 if X has N times more elements than Y and if Y's elements
10778 are N times bigger than X's. In this case each sequence
10779 of N elements in the loop mask will be all-zero or all-one.
10780 We can then view-convert the mask so that each sequence of
10781 N elements is replaced by a single element. */
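/* For example (illustration only): a mask computed for 8 x short can be
   reused for 4 x int; each adjacent pair of mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below just reinterprets the
   same bits as a mask with half as many, twice as wide elements.  */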
10782 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10783 TYPE_VECTOR_SUBPARTS (vectype)));
10784 gimple_seq seq = NULL;
10785 mask_type = truth_type_for (vectype);
10786 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10787 if (seq)
10788 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10790 return mask;
10792 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10793 == vect_partial_vectors_avx512)
10795 /* The number of scalars per iteration and the number of vectors are
10796 both compile-time constants. */
10797 unsigned int nscalars_per_iter
10798 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10799 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10801 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10803 /* The stored nV is dependent on the mask type produced. */
10804 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10805 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10806 == rgm->factor);
10807 nvectors = rgm->factor;
10809 /* Populate the rgroup's mask array, if this is the first time we've
10810 used it. */
10811 if (rgm->controls.is_empty ())
10813 rgm->controls.safe_grow_cleared (nvectors, true);
10814 for (unsigned int i = 0; i < nvectors; ++i)
10816 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10817 /* Provide a dummy definition until the real one is available. */
10818 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10819 rgm->controls[i] = mask;
10822 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10823 TYPE_VECTOR_SUBPARTS (vectype)))
10824 return rgm->controls[index];
10826 /* Split the vector if needed. Since we are dealing with integer-mode
10827 masks with AVX512 we can operate on the integer representation,
10828 performing whole-vector shifts. */
10829 unsigned HOST_WIDE_INT factor;
10830 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10831 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10832 gcc_assert (ok);
10833 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10834 tree mask_type = truth_type_for (vectype);
10835 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10836 unsigned vi = index / factor;
10837 unsigned vpart = index % factor;
10838 tree vec = rgm->controls[vi];
10839 gimple_seq seq = NULL;
10840 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10841 lang_hooks.types.type_for_mode
10842 (TYPE_MODE (rgm->type), 1), vec);
10843 /* For integer mode masks simply shift the right bits into position. */
10844 if (vpart != 0)
10845 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10846 build_int_cst (integer_type_node,
10847 (TYPE_VECTOR_SUBPARTS (vectype)
10848 * vpart)));
10849 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10850 (TYPE_MODE (mask_type), 1), vec);
10851 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10852 if (seq)
10853 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10854 return vec;
10856 else
10857 gcc_unreachable ();
10860 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10861 lengths for controlling an operation on VECTYPE. The operation splits
10862 each element of VECTYPE into FACTOR separate subelements, measuring the
10863 length as a number of these subelements. */
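/* For instance (illustrative): recording a length for V4SI with
   FACTOR == 4 means each 32-bit element is split into four byte-sized
   subelements and the length counts those, so a full vector corresponds
   to a length of 16 rather than 4.  */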
10865 void
10866 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10867 unsigned int nvectors, tree vectype, unsigned int factor)
10869 gcc_assert (nvectors != 0);
10870 if (lens->length () < nvectors)
10871 lens->safe_grow_cleared (nvectors, true);
10872 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10874 /* The number of scalars per iteration, the number of scalar-occupied
10875 bytes and the number of vectors are all compile-time constants. */
10876 unsigned int nscalars_per_iter
10877 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10878 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10880 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10882 /* For now, we only support cases in which all loads and stores fall back
10883 to VnQI or none do. */
10884 gcc_assert (!rgl->max_nscalars_per_iter
10885 || (rgl->factor == 1 && factor == 1)
10886 || (rgl->max_nscalars_per_iter * rgl->factor
10887 == nscalars_per_iter * factor));
10888 rgl->max_nscalars_per_iter = nscalars_per_iter;
10889 rgl->type = vectype;
10890 rgl->factor = factor;
10894 /* Given a complete set of lengths LENS, extract length number INDEX
10895 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10896 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10897 multiplied by the number of elements that should be processed.
10898 Insert any set-up statements before GSI. */
10900 tree
10901 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10902 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10903 unsigned int index, unsigned int factor)
10905 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10906 bool use_bias_adjusted_len =
10907 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10909 /* Populate the rgroup's len array, if this is the first time we've
10910 used it. */
10911 if (rgl->controls.is_empty ())
10913 rgl->controls.safe_grow_cleared (nvectors, true);
10914 for (unsigned int i = 0; i < nvectors; ++i)
10916 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10917 gcc_assert (len_type != NULL_TREE);
10919 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10921 /* Provide a dummy definition until the real one is available. */
10922 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10923 rgl->controls[i] = len;
10925 if (use_bias_adjusted_len)
10927 gcc_assert (i == 0);
10928 tree adjusted_len =
10929 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10930 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10931 rgl->bias_adjusted_ctrl = adjusted_len;
10936 if (use_bias_adjusted_len)
10937 return rgl->bias_adjusted_ctrl;
10939 tree loop_len = rgl->controls[index];
10940 if (rgl->factor == 1 && factor == 1)
10942 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10943 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10944 if (maybe_ne (nunits1, nunits2))
10946 /* A loop len for data type X can be reused for data type Y
10947 if X has N times more elements than Y and if Y's elements
10948 are N times bigger than X's. */
10949 gcc_assert (multiple_p (nunits1, nunits2));
10950 factor = exact_div (nunits1, nunits2).to_constant ();
10951 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10952 gimple_seq seq = NULL;
10953 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10954 build_int_cst (iv_type, factor));
10955 if (seq)
10956 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10959 return loop_len;
10962 /* Scale the profiling counters for LOOP, which is vectorized by
10963 factor VF.
10964 If FLAT is true, the loop we started with had an unrealistically flat
10965 profile. */
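/* As a rough, illustrative example: if the loop body was executed about
   1000 times per entry and the loop is vectorized with VF == 4, after
   scaling the body should be executed about 250 times per entry, with
   the exit edge correspondingly about four times more likely on each
   iteration.  */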
10967 static void
10968 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10970 /* For flat profiles do not scale down proportionally by VF and only
10971 cap by known iteration count bounds. */
10972 if (flat)
10974 if (dump_file && (dump_flags & TDF_DETAILS))
10975 fprintf (dump_file,
10976 "Vectorized loop profile seems flat; not scaling iteration "
10977 "count down by the vectorization factor %i\n", vf);
10978 scale_loop_profile (loop, profile_probability::always (),
10979 get_likely_max_loop_iterations_int (loop));
10980 return;
10982 /* The loop body executes VF times fewer iterations and the exit edge is taken VF times more often. */
10983 edge exit_e = single_exit (loop);
10984 profile_count entry_count = loop_preheader_edge (loop)->count ();
10986 /* If we have an unreliable loop profile, avoid dropping the entry
10987 count below the header count. This can happen since loops may
10988 have unrealistically low trip counts. */
10989 while (vf > 1
10990 && loop->header->count > entry_count
10991 && loop->header->count < entry_count * vf)
10993 if (dump_file && (dump_flags & TDF_DETAILS))
10994 fprintf (dump_file,
10995 "Vectorization factor %i seems too large for profile "
10996 "previously believed to be consistent; reducing.\n", vf);
10997 vf /= 2;
11000 if (entry_count.nonzero_p ())
11001 set_edge_probability_and_rescale_others
11002 (exit_e,
11003 entry_count.probability_in (loop->header->count / vf));
11004 /* Avoid producing a very large exit probability when we do not have
11005 a sensible profile. */
11006 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11007 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11008 loop->latch->count = single_pred_edge (loop->latch)->count ();
11010 scale_loop_profile (loop, profile_probability::always () / vf,
11011 get_likely_max_loop_iterations_int (loop));
11014 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11015 latch edge values originally defined by it. */
11017 static void
11018 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11019 stmt_vec_info def_stmt_info)
11021 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11022 if (!def || TREE_CODE (def) != SSA_NAME)
11023 return;
11024 stmt_vec_info phi_info;
11025 imm_use_iterator iter;
11026 use_operand_p use_p;
11027 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11029 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11030 if (!phi)
11031 continue;
11032 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11033 && (phi_info = loop_vinfo->lookup_stmt (phi))
11034 && STMT_VINFO_RELEVANT_P (phi_info)))
11035 continue;
11036 loop_p loop = gimple_bb (phi)->loop_father;
11037 edge e = loop_latch_edge (loop);
11038 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11039 continue;
11041 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11042 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11043 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11045 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11046 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11047 gcc_assert (phi_defs.length () == latch_defs.length ());
11048 for (unsigned i = 0; i < phi_defs.length (); ++i)
11049 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11050 gimple_get_lhs (latch_defs[i]), e,
11051 gimple_phi_arg_location (phi, e->dest_idx));
11053 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11055 /* For first order recurrences we have to update both uses of
11056 the latch definition, the one in the PHI node and the one
11057 in the generated VEC_PERM_EXPR. */
11058 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11059 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11060 gcc_assert (phi_defs.length () == latch_defs.length ());
11061 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11062 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11063 for (unsigned i = 0; i < phi_defs.length (); ++i)
11065 gassign *perm = as_a <gassign *> (phi_defs[i]);
11066 if (i > 0)
11067 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11068 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11069 update_stmt (perm);
11071 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11072 gimple_phi_arg_location (phi, e->dest_idx));
11077 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11078 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11079 stmt_vec_info. */
11081 static bool
11082 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11083 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11085 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11086 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11088 if (dump_enabled_p ())
11089 dump_printf_loc (MSG_NOTE, vect_location,
11090 "------>vectorizing statement: %G", stmt_info->stmt);
11092 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11093 vect_loop_kill_debug_uses (loop, stmt_info);
11095 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11096 && !STMT_VINFO_LIVE_P (stmt_info))
11097 return false;
11099 if (STMT_VINFO_VECTYPE (stmt_info))
11101 poly_uint64 nunits
11102 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11103 if (!STMT_SLP_TYPE (stmt_info)
11104 && maybe_ne (nunits, vf)
11105 && dump_enabled_p ())
11106 /* For SLP, VF is set according to the unrolling factor, not the
11107 vector size, hence this message is not valid for SLP. */
11108 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11111 /* Pure SLP statements have already been vectorized. We still need
11112 to apply loop vectorization to hybrid SLP statements. */
11113 if (PURE_SLP_STMT (stmt_info))
11114 return false;
11116 if (dump_enabled_p ())
11117 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11119 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11120 *seen_store = stmt_info;
11122 return true;
11125 /* Helper function to pass to simplify_replace_tree to enable replacing
11126 trees in the hash_map with their corresponding values. */
11128 static tree
11129 find_in_mapping (tree t, void *context)
11131 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11133 tree *value = mapping->get (t);
11134 return value ? *value : t;
11137 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11138 original loop that has now been vectorized.
11140 The inits of the data_references need to be advanced with the number of
11141 iterations of the main loop. This has been computed in vect_do_peeling and
11142 is stored in parameter ADVANCE. We first restore the data_references
11143 initial offset with the values recorded in ORIG_DRS_INIT.
11145 Since the loop_vec_info of this EPILOGUE was constructed for the original
11146 loop, its stmt_vec_infos all point to the original statements. These need
11147 to be updated to point to their corresponding copies as well as the SSA_NAMES
11148 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11150 The data_reference's connections also need to be updated. Their
11151 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11152 stmt_vec_infos, their statements need to point to their corresponding copy,
11153 if they are gather loads or scatter stores then their reference needs to be
11154 updated to point to its corresponding copy and finally we set
11155 'base_misaligned' to false as we have already peeled for alignment in the
11156 prologue of the main loop. */
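/* Illustrative example (assumed numbers): if the prologue peeled P
   scalar iterations and the main vectorized loop consumed another M,
   ADVANCE corresponds to P + M, so a unit-stride access a[i] whose DR
   init was a[0] in the original loop starts at a[P + M] in the
   epilogue.  */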
11158 static void
11159 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11161 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11162 auto_vec<gimple *> stmt_worklist;
11163 hash_map<tree,tree> mapping;
11164 gimple *orig_stmt, *new_stmt;
11165 gimple_stmt_iterator epilogue_gsi;
11166 gphi_iterator epilogue_phi_gsi;
11167 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11168 basic_block *epilogue_bbs = get_loop_body (epilogue);
11169 unsigned i;
11171 free (LOOP_VINFO_BBS (epilogue_vinfo));
11172 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11174 /* Advance the data_references by the number of iterations of the previous
11175 loop and its prologue. */
11176 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11179 /* The EPILOGUE loop is a copy of the original loop so they share the same
11180 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11181 point to the copied statements. We also create a mapping of all LHS' in
11182 the original loop and all the LHS' in the EPILOGUE and create worklists to
11183 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11184 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11186 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11187 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11189 new_stmt = epilogue_phi_gsi.phi ();
11191 gcc_assert (gimple_uid (new_stmt) > 0);
11192 stmt_vinfo
11193 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11195 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11196 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11198 mapping.put (gimple_phi_result (orig_stmt),
11199 gimple_phi_result (new_stmt));
11200 /* PHI nodes cannot have patterns or related statements. */
11201 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11202 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11205 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11206 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11208 new_stmt = gsi_stmt (epilogue_gsi);
11209 if (is_gimple_debug (new_stmt))
11210 continue;
11212 gcc_assert (gimple_uid (new_stmt) > 0);
11213 stmt_vinfo
11214 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11216 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11217 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11219 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11220 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11222 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11224 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11225 for (gimple_stmt_iterator gsi = gsi_start (seq);
11226 !gsi_end_p (gsi); gsi_next (&gsi))
11227 stmt_worklist.safe_push (gsi_stmt (gsi));
11230 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11231 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11233 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11234 stmt_worklist.safe_push (stmt);
11235 /* Set BB such that the assert in
11236 'get_initial_def_for_reduction' is able to determine that
11237 the BB of the related stmt is inside this loop. */
11238 gimple_set_bb (stmt,
11239 gimple_bb (new_stmt));
11240 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11241 gcc_assert (related_vinfo == NULL
11242 || related_vinfo == stmt_vinfo);
11247 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11248 using the original main loop and thus need to be updated to refer to the
11249 cloned variables used in the epilogue. */
11250 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11252 gimple *stmt = stmt_worklist[i];
11253 tree *new_op;
11255 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11257 tree op = gimple_op (stmt, j);
11258 if ((new_op = mapping.get(op)))
11259 gimple_set_op (stmt, j, *new_op);
11260 else
11262 /* PR92429: The last argument of simplify_replace_tree disables
11263 folding when replacing arguments. This is required as
11264 otherwise you might end up with different statements than the
11265 ones analyzed in vect_loop_analyze, leading to different
11266 vectorization. */
11267 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11268 &find_in_mapping, &mapping, false);
11269 gimple_set_op (stmt, j, op);
11274 struct data_reference *dr;
11275 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11276 FOR_EACH_VEC_ELT (datarefs, i, dr)
11278 orig_stmt = DR_STMT (dr);
11279 gcc_assert (gimple_uid (orig_stmt) > 0);
11280 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11281 /* Data references for gather loads and scatter stores do not use the
11282 updated offset we set using ADVANCE. Instead we have to make sure the
11283 reference in each data reference points to the corresponding copy of
11284 the original in the epilogue. */
11285 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11286 == VMAT_GATHER_SCATTER)
11288 DR_REF (dr)
11289 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11290 &find_in_mapping, &mapping);
11291 DR_BASE_ADDRESS (dr)
11292 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11293 &find_in_mapping, &mapping);
11295 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11296 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11297 /* The vector size of the epilogue is smaller than that of the main loop,
11298 so the alignment is either the same or lower. This means the DR will
11299 by definition be aligned. */
11300 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11303 epilogue_vinfo->shared->datarefs_copy.release ();
11304 epilogue_vinfo->shared->save_datarefs ();
11307 /* Function vect_transform_loop.
11309 The analysis phase has determined that the loop is vectorizable.
11310 Vectorize the loop - create vectorized stmts to replace the scalar
11311 stmts in the loop, and update the loop exit condition.
11312 Returns the scalar epilogue loop if any. */
11314 class loop *
11315 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11317 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11318 class loop *epilogue = NULL;
11319 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11320 int nbbs = loop->num_nodes;
11321 int i;
11322 tree niters_vector = NULL_TREE;
11323 tree step_vector = NULL_TREE;
11324 tree niters_vector_mult_vf = NULL_TREE;
11325 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11326 unsigned int lowest_vf = constant_lower_bound (vf);
11327 gimple *stmt;
11328 bool check_profitability = false;
11329 unsigned int th;
11330 bool flat = maybe_flat_loop_profile (loop);
11332 DUMP_VECT_SCOPE ("vec_transform_loop");
11334 loop_vinfo->shared->check_datarefs ();
11336 /* Use the more conservative vectorization threshold. If the number
11337 of iterations is constant assume the cost check has been performed
11338 by our caller. If the threshold makes all loops profitable that
11339 run at least the (estimated) vectorization factor number of times
11340 checking is pointless, too. */
11341 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11342 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11344 if (dump_enabled_p ())
11345 dump_printf_loc (MSG_NOTE, vect_location,
11346 "Profitability threshold is %d loop iterations.\n",
11347 th);
11348 check_profitability = true;
11351 /* Make sure there exists a single-predecessor exit bb. Do this before
11352 versioning. */
11353 edge e = single_exit (loop);
11354 if (! single_pred_p (e->dest))
11356 split_loop_exit_edge (e, true);
11357 if (dump_enabled_p ())
11358 dump_printf (MSG_NOTE, "split exit edge\n");
11361 /* Version the loop first, if required, so the profitability check
11362 comes first. */
11364 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11366 class loop *sloop
11367 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11368 sloop->force_vectorize = false;
11369 check_profitability = false;
11372 /* Make sure there exists a single-predecessor exit bb also on the
11373 scalar loop copy. Do this after versioning but before peeling
11374 so CFG structure is fine for both scalar and if-converted loop
11375 to make slpeel_duplicate_current_defs_from_edges face matched
11376 loop closed PHI nodes on the exit. */
11377 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11379 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11380 if (! single_pred_p (e->dest))
11382 split_loop_exit_edge (e, true);
11383 if (dump_enabled_p ())
11384 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11388 tree niters = vect_build_loop_niters (loop_vinfo);
11389 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11390 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11391 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11392 tree advance;
11393 drs_init_vec orig_drs_init;
11395 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11396 &step_vector, &niters_vector_mult_vf, th,
11397 check_profitability, niters_no_overflow,
11398 &advance);
11399 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11400 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11402 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11403 block after the loop exit. We need to scale all of that. */
11404 basic_block preheader
11405 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11406 preheader->count
11407 = preheader->count.apply_probability
11408 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11409 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11410 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11411 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11412 = preheader->count;
11415 if (niters_vector == NULL_TREE)
11417 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11418 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11419 && known_eq (lowest_vf, vf))
11421 niters_vector
11422 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11423 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11424 step_vector = build_one_cst (TREE_TYPE (niters));
11426 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11427 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11428 &step_vector, niters_no_overflow);
11429 else
11430 /* vect_do_peeling subtracted the number of peeled prologue
11431 iterations from LOOP_VINFO_NITERS. */
11432 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11433 &niters_vector, &step_vector,
11434 niters_no_overflow);
11437 /* 1) Make sure the loop header has exactly two entries
11438 2) Make sure we have a preheader basic block. */
11440 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11442 split_edge (loop_preheader_edge (loop));
11444 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11445 /* This will deal with any possible peeling. */
11446 vect_prepare_for_masked_peels (loop_vinfo);
11448 /* Schedule the SLP instances first, then handle loop vectorization
11449 below. */
11450 if (!loop_vinfo->slp_instances.is_empty ())
11452 DUMP_VECT_SCOPE ("scheduling SLP instances");
11453 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11456 /* FORNOW: the vectorizer supports only loops whose body consists
11457 of one basic block (header + empty latch). When the vectorizer
11458 supports more involved loop forms, the order in which the BBs are
11459 traversed needs to be reconsidered. */
11461 for (i = 0; i < nbbs; i++)
11463 basic_block bb = bbs[i];
11464 stmt_vec_info stmt_info;
11466 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11467 gsi_next (&si))
11469 gphi *phi = si.phi ();
11470 if (dump_enabled_p ())
11471 dump_printf_loc (MSG_NOTE, vect_location,
11472 "------>vectorizing phi: %G", (gimple *) phi);
11473 stmt_info = loop_vinfo->lookup_stmt (phi);
11474 if (!stmt_info)
11475 continue;
11477 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11478 vect_loop_kill_debug_uses (loop, stmt_info);
11480 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11481 && !STMT_VINFO_LIVE_P (stmt_info))
11482 continue;
11484 if (STMT_VINFO_VECTYPE (stmt_info)
11485 && (maybe_ne
11486 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11487 && dump_enabled_p ())
11488 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11490 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11491 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11492 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11493 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11494 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11495 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11496 && ! PURE_SLP_STMT (stmt_info))
11498 if (dump_enabled_p ())
11499 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11500 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11504 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11505 gsi_next (&si))
11507 gphi *phi = si.phi ();
11508 stmt_info = loop_vinfo->lookup_stmt (phi);
11509 if (!stmt_info)
11510 continue;
11512 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11513 && !STMT_VINFO_LIVE_P (stmt_info))
11514 continue;
11516 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11517 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11518 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11519 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11520 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11521 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11522 && ! PURE_SLP_STMT (stmt_info))
11523 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11526 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11527 !gsi_end_p (si);)
11529 stmt = gsi_stmt (si);
11530 /* During vectorization remove existing clobber stmts. */
11531 if (gimple_clobber_p (stmt))
11533 unlink_stmt_vdef (stmt);
11534 gsi_remove (&si, true);
11535 release_defs (stmt);
11537 else
11539 /* Ignore vector stmts created in the outer loop. */
11540 stmt_info = loop_vinfo->lookup_stmt (stmt);
11542 /* vector stmts created in the outer-loop during vectorization of
11543 stmts in an inner-loop may not have a stmt_info, and do not
11544 need to be vectorized. */
11545 stmt_vec_info seen_store = NULL;
11546 if (stmt_info)
11548 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11550 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11551 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11552 !gsi_end_p (subsi); gsi_next (&subsi))
11554 stmt_vec_info pat_stmt_info
11555 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11556 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11557 &si, &seen_store);
11559 stmt_vec_info pat_stmt_info
11560 = STMT_VINFO_RELATED_STMT (stmt_info);
11561 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11562 &si, &seen_store))
11563 maybe_set_vectorized_backedge_value (loop_vinfo,
11564 pat_stmt_info);
11566 else
11568 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11569 &seen_store))
11570 maybe_set_vectorized_backedge_value (loop_vinfo,
11571 stmt_info);
11574 gsi_next (&si);
11575 if (seen_store)
11577 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11578 /* Interleaving. If IS_STORE is TRUE, the
11579 vectorization of the interleaving chain was
11580 completed - free all the stores in the chain. */
11581 vect_remove_stores (loop_vinfo,
11582 DR_GROUP_FIRST_ELEMENT (seen_store));
11583 else
11584 /* Free the attached stmt_vec_info and remove the stmt. */
11585 loop_vinfo->remove_stmt (stmt_info);
11590 /* Stub out scalar statements that must not survive vectorization.
11591 Doing this here helps with grouped statements, or statements that
11592 are involved in patterns. */
11593 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11594 !gsi_end_p (gsi); gsi_next (&gsi))
11596 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11597 if (!call || !gimple_call_internal_p (call))
11598 continue;
11599 internal_fn ifn = gimple_call_internal_fn (call);
11600 if (ifn == IFN_MASK_LOAD)
11602 tree lhs = gimple_get_lhs (call);
11603 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11605 tree zero = build_zero_cst (TREE_TYPE (lhs));
11606 gimple *new_stmt = gimple_build_assign (lhs, zero);
11607 gsi_replace (&gsi, new_stmt, true);
11610 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11612 tree lhs = gimple_get_lhs (call);
11613 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11615 tree else_arg
11616 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11617 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11618 gsi_replace (&gsi, new_stmt, true);
11622 } /* BBs in loop */
11624 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11625 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11626 if (integer_onep (step_vector))
11627 niters_no_overflow = true;
11628 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11629 niters_vector_mult_vf, !niters_no_overflow);
11631 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11633 /* True if the final iteration might not handle a full vector's
11634 worth of scalar iterations. */
11635 bool final_iter_may_be_partial
11636 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11637 /* The minimum number of iterations performed by the epilogue. This
11638 is 1 when peeling for gaps because we always need a final scalar
11639 iteration. */
11640 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11641 /* +1 to convert latch counts to loop iteration counts,
11642 -min_epilogue_iters to remove iterations that cannot be performed
11643 by the vector code. */
11644 int bias_for_lowest = 1 - min_epilogue_iters;
11645 int bias_for_assumed = bias_for_lowest;
11646 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11647 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11649 /* When the amount of peeling is known at compile time, the first
11650 iteration will have exactly alignment_npeels active elements.
11651 In the worst case it will have at least one. */
11652 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11653 bias_for_lowest += lowest_vf - min_first_active;
11654 bias_for_assumed += assumed_vf - min_first_active;
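/* Illustrative example, not in the original source: with partial vectors
   in use, lowest_vf == assumed_vf == 4, no peeling for gaps
   (min_epilogue_iters == 0) and a compile-time alignment peel of 2
   iterations, min_first_active == 2 and both biases become
   1 + (4 - 2) == 3.  */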
11656 /* In these calculations the "- 1" converts loop iteration counts
11657 back to latch counts. */
11658 if (loop->any_upper_bound)
11660 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11661 loop->nb_iterations_upper_bound
11662 = (final_iter_may_be_partial
11663 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11664 lowest_vf) - 1
11665 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11666 lowest_vf) - 1);
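/* Illustrative example, not in the original source: with a latch-count
   upper bound of 103, bias_for_lowest == 1 and lowest_vf == 4, the bound
   becomes floor ((103 + 1) / 4) - 1 == 25, i.e. at most 26 vector
   iterations.  */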
11667 if (main_vinfo
11668 /* Both peeling for alignment and peeling for gaps can end up
11669 with the scalar epilogue running for more than VF-1 iterations. */
11670 && !main_vinfo->peeling_for_alignment
11671 && !main_vinfo->peeling_for_gaps)
11673 unsigned int bound;
11674 poly_uint64 main_iters
11675 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11676 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11677 main_iters
11678 = upper_bound (main_iters,
11679 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11680 if (can_div_away_from_zero_p (main_iters,
11681 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11682 &bound))
11683 loop->nb_iterations_upper_bound
11684 = wi::umin ((widest_int) (bound - 1),
11685 loop->nb_iterations_upper_bound);
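/* Added commentary, not in the original source: MAIN_ITERS above is a
   rough upper bound on the scalar iterations the vectorized epilogue can
   receive; either the main vector loop was skipped because NITERS was
   below the cost or versioning threshold, or it ran and left behind
   fewer than its own VF iterations.  */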
11688 if (loop->any_likely_upper_bound)
11689 loop->nb_iterations_likely_upper_bound
11690 = (final_iter_may_be_partial
11691 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11692 + bias_for_lowest, lowest_vf) - 1
11693 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11694 + bias_for_lowest, lowest_vf) - 1);
11695 if (loop->any_estimate)
11696 loop->nb_iterations_estimate
11697 = (final_iter_may_be_partial
11698 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11699 assumed_vf) - 1
11700 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11701 assumed_vf) - 1);
11702 scale_profile_for_vect_loop (loop, assumed_vf, flat);
11704 if (dump_enabled_p ())
11706 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11708 dump_printf_loc (MSG_NOTE, vect_location,
11709 "LOOP VECTORIZED\n");
11710 if (loop->inner)
11711 dump_printf_loc (MSG_NOTE, vect_location,
11712 "OUTER LOOP VECTORIZED\n");
11713 dump_printf (MSG_NOTE, "\n");
11715 else
11716 dump_printf_loc (MSG_NOTE, vect_location,
11717 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11718 GET_MODE_NAME (loop_vinfo->vector_mode));
11721 /* Loops vectorized with a variable factor won't benefit from
11722 unrolling/peeling. */
11723 if (!vf.is_constant ())
11725 loop->unroll = 1;
11726 if (dump_enabled_p ())
11727 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11728 " variable-length vectorization factor\n");
11730 /* Free SLP instances here because otherwise stmt reference counting
11731 won't work. */
11732 slp_instance instance;
11733 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11734 vect_free_slp_instance (instance);
11735 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11736 /* Clear the safelen field since its value is invalid after vectorization:
11737 the vectorized loop can have loop-carried dependencies. */
11738 loop->safelen = 0;
11740 if (epilogue)
11742 update_epilogue_loop_vinfo (epilogue, advance);
11744 epilogue->simduid = loop->simduid;
11745 epilogue->force_vectorize = loop->force_vectorize;
11746 epilogue->dont_vectorize = false;
11749 return epilogue;
11752 /* The code below performs a simple optimization: it reverts
11753 if-conversion for masked stores, i.e. if the mask of a store is all-zero,
11754 the store is skipped and, where possible, so are the stored-value producers.
11755 For example,
11756 for (i=0; i<n; i++)
11757 if (c[i])
11759 p1[i] += 1;
11760 p2[i] = p3[i] +2;
11762 this transformation will produce the following semi-hammock:
11764 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11766 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11767 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11768 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11769 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11770 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11771 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11775 void
11776 optimize_mask_stores (class loop *loop)
11778 basic_block *bbs = get_loop_body (loop);
11779 unsigned nbbs = loop->num_nodes;
11780 unsigned i;
11781 basic_block bb;
11782 class loop *bb_loop;
11783 gimple_stmt_iterator gsi;
11784 gimple *stmt;
11785 auto_vec<gimple *> worklist;
11786 auto_purge_vect_location sentinel;
11788 vect_location = find_loop_location (loop);
11789 /* Pick up all masked stores in the loop, if any. */
11790 for (i = 0; i < nbbs; i++)
11792 bb = bbs[i];
11793 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11794 gsi_next (&gsi))
11796 stmt = gsi_stmt (gsi);
11797 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11798 worklist.safe_push (stmt);
11802 free (bbs);
11803 if (worklist.is_empty ())
11804 return;
11806 /* Loop has masked stores. */
11807 while (!worklist.is_empty ())
11809 gimple *last, *last_store;
11810 edge e, efalse;
11811 tree mask;
11812 basic_block store_bb, join_bb;
11813 gimple_stmt_iterator gsi_to;
11814 tree vdef, new_vdef;
11815 gphi *phi;
11816 tree vectype;
11817 tree zero;
11819 last = worklist.pop ();
11820 mask = gimple_call_arg (last, 2);
11821 bb = gimple_bb (last);
11822 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
11823 the same loop as if_bb. That loop can differ from LOOP when a
11824 two-level loop nest is vectorized and the mask_store belongs to the inner
11825 one. */
11826 e = split_block (bb, last);
11827 bb_loop = bb->loop_father;
11828 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11829 join_bb = e->dest;
11830 store_bb = create_empty_bb (bb);
11831 add_bb_to_loop (store_bb, bb_loop);
11832 e->flags = EDGE_TRUE_VALUE;
11833 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11834 /* Put STORE_BB on the likely path. */
11835 efalse->probability = profile_probability::likely ();
11836 e->probability = efalse->probability.invert ();
11837 store_bb->count = efalse->count ();
11838 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11839 if (dom_info_available_p (CDI_DOMINATORS))
11840 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11841 if (dump_enabled_p ())
11842 dump_printf_loc (MSG_NOTE, vect_location,
11843 "Create new block %d to sink mask stores.",
11844 store_bb->index);
11845 /* Create vector comparison with boolean result. */
11846 vectype = TREE_TYPE (mask);
11847 zero = build_zero_cst (vectype);
11848 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11849 gsi = gsi_last_bb (bb);
11850 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
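/* Added commentary, not in the original source: the TRUE edge
   (mask == {0, ...}) falls through to JOIN_BB and skips the stores; the
   FALSE edge executes the masked stores sunk into STORE_BB.  */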
11851 /* Create a new PHI node for the vdef of the last masked store:
11852 .MEM_2 = VDEF <.MEM_1>
11853 will be converted to
11854 .MEM_3 = VDEF <.MEM_1>
11855 and a new PHI node will be created in the join bb
11856 .MEM_2 = PHI <.MEM_1, .MEM_3>
11858 vdef = gimple_vdef (last);
11859 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11860 gimple_set_vdef (last, new_vdef);
11861 phi = create_phi_node (vdef, join_bb);
11862 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11864 /* Put all masked stores with the same mask into STORE_BB if possible. */
11865 while (true)
11867 gimple_stmt_iterator gsi_from;
11868 gimple *stmt1 = NULL;
11870 /* Move masked store to STORE_BB. */
11871 last_store = last;
11872 gsi = gsi_for_stmt (last);
11873 gsi_from = gsi;
11874 /* Shift GSI to the previous stmt for further traversal. */
11875 gsi_prev (&gsi);
11876 gsi_to = gsi_start_bb (store_bb);
11877 gsi_move_before (&gsi_from, &gsi_to);
11878 /* Set GSI_TO to the start of the now non-empty block. */
11879 gsi_to = gsi_start_bb (store_bb);
11880 if (dump_enabled_p ())
11881 dump_printf_loc (MSG_NOTE, vect_location,
11882 "Move stmt to created bb\n%G", last);
11883 /* Move all stored value producers if possible. */
11884 while (!gsi_end_p (gsi))
11886 tree lhs;
11887 imm_use_iterator imm_iter;
11888 use_operand_p use_p;
11889 bool res;
11891 /* Skip debug statements. */
11892 if (is_gimple_debug (gsi_stmt (gsi)))
11894 gsi_prev (&gsi);
11895 continue;
11897 stmt1 = gsi_stmt (gsi);
11898 /* Do not consider statements writing to memory or having
11899 a volatile operand. */
11900 if (gimple_vdef (stmt1)
11901 || gimple_has_volatile_ops (stmt1))
11902 break;
11903 gsi_from = gsi;
11904 gsi_prev (&gsi);
11905 lhs = gimple_get_lhs (stmt1);
11906 if (!lhs)
11907 break;
11909 /* The LHS of a vectorized stmt must be an SSA_NAME. */
11910 if (TREE_CODE (lhs) != SSA_NAME)
11911 break;
11913 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11915 /* Remove dead scalar statement. */
11916 if (has_zero_uses (lhs))
11918 gsi_remove (&gsi_from, true);
11919 continue;
11923 /* Check that LHS does not have uses outside of STORE_BB. */
11924 res = true;
11925 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11927 gimple *use_stmt;
11928 use_stmt = USE_STMT (use_p);
11929 if (is_gimple_debug (use_stmt))
11930 continue;
11931 if (gimple_bb (use_stmt) != store_bb)
11933 res = false;
11934 break;
11937 if (!res)
11938 break;
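/* Added commentary, not in the original source: a statement that reads
   memory may only be sunk if it shares the masked store's virtual use,
   i.e. no other memory write intervenes between it and the store.  */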
11940 if (gimple_vuse (stmt1)
11941 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11942 break;
11944 /* Can move STMT1 to STORE_BB. */
11945 if (dump_enabled_p ())
11946 dump_printf_loc (MSG_NOTE, vect_location,
11947 "Move stmt to created bb\n%G", stmt1);
11948 gsi_move_before (&gsi_from, &gsi_to);
11949 /* Shift GSI_TO for further insertion. */
11950 gsi_prev (&gsi_to);
11952 /* Put other masked stores with the same mask into STORE_BB. */
11953 if (worklist.is_empty ()
11954 || gimple_call_arg (worklist.last (), 2) != mask
11955 || worklist.last () != stmt1)
11956 break;
11957 last = worklist.pop ();
11959 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11963 /* Decide whether it is possible to use a zero-based induction variable
11964 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11965 the value that the induction variable must be able to hold in order
11966 to ensure that the rgroups eventually have no active vector elements.
11967 Return -1 otherwise. */
11969 widest_int
11970 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11972 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11973 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11974 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11976 /* Calculate the value that the induction variable must be able
11977 to hit in order to ensure that we end the loop with an all-false mask.
11978 This involves adding the maximum number of inactive trailing scalar
11979 iterations. */
11980 widest_int iv_limit = -1;
11981 if (max_loop_iterations (loop, &iv_limit))
11983 if (niters_skip)
11985 /* Add the maximum number of skipped iterations to the
11986 maximum iteration count. */
11987 if (TREE_CODE (niters_skip) == INTEGER_CST)
11988 iv_limit += wi::to_widest (niters_skip);
11989 else
11990 iv_limit += max_vf - 1;
11992 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11993 /* Make a conservatively-correct assumption. */
11994 iv_limit += max_vf - 1;
11996 /* IV_LIMIT is the maximum number of latch iterations, which is also
11997 the maximum in-range IV value. Round this value down to the previous
11998 vector alignment boundary and then add an extra full iteration. */
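/* Illustrative example, not in the original source: with a constant VF of
   4 (so known_alignment (vf) == 4), max_vf == 4 and IV_LIMIT == 17 at this
   point, the result is (17 & -4) + 4 == 20.  */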
11999 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12000 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12002 return iv_limit;
12005 /* For the given rgroup_controls RGC, check whether an induction variable
12006 would ever hit a value that produces a set of all-false masks or zero
12007 lengths before wrapping around. Return true if it's possible to wrap
12008 around before hitting the desired value, otherwise return false. */
12010 bool
12011 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12013 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12015 if (iv_limit == -1)
12016 return true;
12018 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12019 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12020 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
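/* Illustrative example, not in the original source: with iv_limit == 1000
   and nitems == 4 the IV must reach 4000, which needs 12 bits; a 16-bit
   compare type therefore cannot wrap and we return false.  */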
12022 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12023 return true;
12025 return false;