Daily bump.
[official-gcc.git] / gcc / tree-vect-loop.cc
blob30b90d99925bea74caf14833d8ab1695607d0fe9
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS which base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations which vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4byte elements,
264 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variables in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer type
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates there will be some subsequent
585 slp analyses or not. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also to its
732 inner-loop, if exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
945 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 unshare_expr (niter),
947 build_int_cst (TREE_TYPE (niter), 1));
948 if (TREE_CODE (niter) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, ensure back
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
954 PR113210. */
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 build_minus_one_cst (TREE_TYPE (niter)));
960 *number_of_iterations = niter;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
966 return conds;
969 /* Determine the main loop exit for the vectorizer. */
971 edge
972 vec_init_loop_exit_info (class loop *loop)
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxilary exits. */
976 auto_vec<edge> exits = get_loop_exit_edges (loop);
977 if (exits.length () == 1)
978 return exits[0];
980 /* If we have multiple exits we only support counting IV at the moment.
981 Analyze all exits and return the last one we can analyze. */
982 class tree_niter_desc niter_desc;
983 edge candidate = NULL;
984 for (edge exit : exits)
986 if (!get_loop_exit_condition (exit))
987 continue;
989 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 && !chrec_contains_undetermined (niter_desc.niter))
992 tree may_be_zero = niter_desc.may_be_zero;
993 if ((integer_zerop (may_be_zero)
994 /* As we are handling may_be_zero that's not false by
995 rewriting niter to may_be_zero ? 0 : niter we require
996 an empty latch. */
997 || (single_pred_p (loop->latch)
998 && exit->src == single_pred (loop->latch)
999 && (integer_nonzerop (may_be_zero)
1000 || COMPARISON_CLASS_P (may_be_zero))))
1001 && (!candidate
1002 || dominated_by_p (CDI_DOMINATORS, exit->src,
1003 candidate->src)))
1004 candidate = exit;
1008 return candidate;
1011 /* Function bb_in_loop_p
1013 Used as predicate for dfs order traversal of the loop bbs. */
1015 static bool
1016 bb_in_loop_p (const_basic_block bb, const void *data)
1018 const class loop *const loop = (const class loop *)data;
1019 if (flow_bb_inside_loop_p (loop, bb))
1020 return true;
1021 return false;
1025 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1026 stmt_vec_info structs for all the stmts in LOOP_IN. */
1028 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1029 : vec_info (vec_info::loop, shared),
1030 loop (loop_in),
1031 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1032 num_itersm1 (NULL_TREE),
1033 num_iters (NULL_TREE),
1034 num_iters_unchanged (NULL_TREE),
1035 num_iters_assumptions (NULL_TREE),
1036 vector_costs (nullptr),
1037 scalar_costs (nullptr),
1038 th (0),
1039 versioning_threshold (0),
1040 vectorization_factor (0),
1041 main_loop_edge (nullptr),
1042 skip_main_loop_edge (nullptr),
1043 skip_this_loop_edge (nullptr),
1044 reusable_accumulators (),
1045 suggested_unroll_factor (1),
1046 max_vectorization_factor (0),
1047 mask_skip_niters (NULL_TREE),
1048 rgroup_compare_type (NULL_TREE),
1049 simd_if_cond (NULL_TREE),
1050 partial_vector_style (vect_partial_vectors_none),
1051 unaligned_dr (NULL),
1052 peeling_for_alignment (0),
1053 ptr_mask (0),
1054 ivexpr_map (NULL),
1055 scan_map (NULL),
1056 slp_unrolling_factor (1),
1057 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1058 vectorizable (false),
1059 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1060 using_partial_vectors_p (false),
1061 using_decrementing_iv_p (false),
1062 using_select_vl_p (false),
1063 epil_using_partial_vectors_p (false),
1064 partial_load_store_bias (0),
1065 peeling_for_gaps (false),
1066 peeling_for_niter (false),
1067 early_breaks (false),
1068 no_data_dependencies (false),
1069 has_mask_store (false),
1070 scalar_loop_scaling (profile_probability::uninitialized ()),
1071 scalar_loop (NULL),
1072 orig_loop_info (NULL),
1073 vec_loop_iv_exit (NULL),
1074 vec_epilogue_loop_iv_exit (NULL),
1075 scalar_loop_iv_exit (NULL)
1077 /* CHECKME: We want to visit all BBs before their successors (except for
1078 latch blocks, for which this assertion wouldn't hold). In the simple
1079 case of the loop forms we allow, a dfs order of the BBs would the same
1080 as reversed postorder traversal, so we are safe. */
1082 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1083 bbs, loop->num_nodes, loop);
1084 gcc_assert (nbbs == loop->num_nodes);
1086 for (unsigned int i = 0; i < nbbs; i++)
1088 basic_block bb = bbs[i];
1089 gimple_stmt_iterator si;
1091 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1093 gimple *phi = gsi_stmt (si);
1094 gimple_set_uid (phi, 0);
1095 add_stmt (phi);
1098 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1100 gimple *stmt = gsi_stmt (si);
1101 gimple_set_uid (stmt, 0);
1102 if (is_gimple_debug (stmt))
1103 continue;
1104 add_stmt (stmt);
1105 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1106 third argument is the #pragma omp simd if (x) condition, when 0,
1107 loop shouldn't be vectorized, when non-zero constant, it should
1108 be vectorized normally, otherwise versioned with vectorized loop
1109 done if the condition is non-zero at runtime. */
1110 if (loop_in->simduid
1111 && is_gimple_call (stmt)
1112 && gimple_call_internal_p (stmt)
1113 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1114 && gimple_call_num_args (stmt) >= 3
1115 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1116 && (loop_in->simduid
1117 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1119 tree arg = gimple_call_arg (stmt, 2);
1120 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1121 simd_if_cond = arg;
1122 else
1123 gcc_assert (integer_nonzerop (arg));
1128 epilogue_vinfos.create (6);
1131 /* Free all levels of rgroup CONTROLS. */
1133 void
1134 release_vec_loop_controls (vec<rgroup_controls> *controls)
1136 rgroup_controls *rgc;
1137 unsigned int i;
1138 FOR_EACH_VEC_ELT (*controls, i, rgc)
1139 rgc->controls.release ();
1140 controls->release ();
1143 /* Free all memory used by the _loop_vec_info, as well as all the
1144 stmt_vec_info structs of all the stmts in the loop. */
1146 _loop_vec_info::~_loop_vec_info ()
1148 free (bbs);
1150 release_vec_loop_controls (&masks.rgc_vec);
1151 release_vec_loop_controls (&lens);
1152 delete ivexpr_map;
1153 delete scan_map;
1154 epilogue_vinfos.release ();
1155 delete scalar_costs;
1156 delete vector_costs;
1158 /* When we release an epiloge vinfo that we do not intend to use
1159 avoid clearing AUX of the main loop which should continue to
1160 point to the main loop vinfo since otherwise we'll leak that. */
1161 if (loop->aux == this)
1162 loop->aux = NULL;
1165 /* Return an invariant or register for EXPR and emit necessary
1166 computations in the LOOP_VINFO loop preheader. */
1168 tree
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1171 if (is_gimple_reg (expr)
1172 || is_gimple_min_invariant (expr))
1173 return expr;
1175 if (! loop_vinfo->ivexpr_map)
1176 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1178 if (! cached)
1180 gimple_seq stmts = NULL;
1181 cached = force_gimple_operand (unshare_expr (expr),
1182 &stmts, true, NULL_TREE);
1183 if (stmts)
1185 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186 gsi_insert_seq_on_edge_immediate (e, stmts);
1189 return cached;
1192 /* Return true if we can use CMP_TYPE as the comparison type to produce
1193 all masks required to mask LOOP_VINFO. */
1195 static bool
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1198 rgroup_controls *rgm;
1199 unsigned int i;
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1201 if (rgm->type != NULL_TREE
1202 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1203 cmp_type, rgm->type,
1204 OPTIMIZE_FOR_SPEED))
1205 return false;
1206 return true;
1209 /* Calculate the maximum number of scalars per iteration for every
1210 rgroup in LOOP_VINFO. */
1212 static unsigned int
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1215 unsigned int res = 1;
1216 unsigned int i;
1217 rgroup_controls *rgm;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1219 res = MAX (res, rgm->max_nscalars_per_iter);
1220 return res;
1223 /* Calculate the minimum precision necessary to represent:
1225 MAX_NITERS * FACTOR
1227 as an unsigned integer, where MAX_NITERS is the maximum number of
1228 loop header iterations for the original scalar form of LOOP_VINFO. */
1230 static unsigned
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1233 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235 /* Get the maximum number of iterations that is representable
1236 in the counter type. */
1237 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1238 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1240 /* Get a more refined estimate for the number of iterations. */
1241 widest_int max_back_edges;
1242 if (max_loop_iterations (loop, &max_back_edges))
1243 max_ni = wi::smin (max_ni, max_back_edges + 1);
1245 /* Work out how many bits we need to represent the limit. */
1246 return wi::min_precision (max_ni * factor, UNSIGNED);
1249 /* True if the loop needs peeling or partial vectors when vectorized. */
1251 static bool
1252 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1254 unsigned HOST_WIDE_INT const_vf;
1255 HOST_WIDE_INT max_niter
1256 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1258 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1259 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1260 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1261 (loop_vinfo));
1263 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1264 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1266 /* Work out the (constant) number of iterations that need to be
1267 peeled for reasons other than niters. */
1268 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1270 peel_niter += 1;
1271 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1272 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1273 return true;
1275 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1276 /* ??? When peeling for gaps but not alignment, we could
1277 try to check whether the (variable) niters is known to be
1278 VF * N + 1. That's something of a niche case though. */
1279 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1280 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1281 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1282 < (unsigned) exact_log2 (const_vf))
1283 /* In case of versioning, check if the maximum number of
1284 iterations is greater than th. If they are identical,
1285 the epilogue is unnecessary. */
1286 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1287 || ((unsigned HOST_WIDE_INT) max_niter
1288 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1289 but that's only computed later based on our result.
1290 The following is the most conservative approximation. */
1291 > (std::max ((unsigned HOST_WIDE_INT) th,
1292 const_vf) / const_vf) * const_vf))))
1293 return true;
1295 return false;
1298 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1299 whether we can actually generate the masks required. Return true if so,
1300 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1302 static bool
1303 vect_verify_full_masking (loop_vec_info loop_vinfo)
1305 unsigned int min_ni_width;
1307 /* Use a normal loop if there are no statements that need masking.
1308 This only happens in rare degenerate cases: it means that the loop
1309 has no loads, no stores, and no live-out values. */
1310 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1311 return false;
1313 /* Produce the rgroup controls. */
1314 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1316 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1317 tree vectype = mask.first;
1318 unsigned nvectors = mask.second;
1320 if (masks->rgc_vec.length () < nvectors)
1321 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1322 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1323 /* The number of scalars per iteration and the number of vectors are
1324 both compile-time constants. */
1325 unsigned int nscalars_per_iter
1326 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1327 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1329 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1331 rgm->max_nscalars_per_iter = nscalars_per_iter;
1332 rgm->type = truth_type_for (vectype);
1333 rgm->factor = 1;
1337 unsigned int max_nscalars_per_iter
1338 = vect_get_max_nscalars_per_iter (loop_vinfo);
1340 /* Work out how many bits we need to represent the limit. */
1341 min_ni_width
1342 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1344 /* Find a scalar mode for which WHILE_ULT is supported. */
1345 opt_scalar_int_mode cmp_mode_iter;
1346 tree cmp_type = NULL_TREE;
1347 tree iv_type = NULL_TREE;
1348 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1349 unsigned int iv_precision = UINT_MAX;
1351 if (iv_limit != -1)
1352 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1353 UNSIGNED);
1355 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1357 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1358 if (cmp_bits >= min_ni_width
1359 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1361 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1362 if (this_type
1363 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1365 /* Although we could stop as soon as we find a valid mode,
1366 there are at least two reasons why that's not always the
1367 best choice:
1369 - An IV that's Pmode or wider is more likely to be reusable
1370 in address calculations than an IV that's narrower than
1371 Pmode.
1373 - Doing the comparison in IV_PRECISION or wider allows
1374 a natural 0-based IV, whereas using a narrower comparison
1375 type requires mitigations against wrap-around.
1377 Conversely, if the IV limit is variable, doing the comparison
1378 in a wider type than the original type can introduce
1379 unnecessary extensions, so picking the widest valid mode
1380 is not always a good choice either.
1382 Here we prefer the first IV type that's Pmode or wider,
1383 and the first comparison type that's IV_PRECISION or wider.
1384 (The comparison type must be no wider than the IV type,
1385 to avoid extensions in the vector loop.)
1387 ??? We might want to try continuing beyond Pmode for ILP32
1388 targets if CMP_BITS < IV_PRECISION. */
1389 iv_type = this_type;
1390 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1391 cmp_type = this_type;
1392 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1393 break;
1398 if (!cmp_type)
1400 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1401 return false;
1404 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1405 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1406 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1407 return true;
1410 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1411 whether we can actually generate AVX512 style masks. Return true if so,
1412 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1414 static bool
1415 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1417 /* Produce differently organized rgc_vec and differently check
1418 we can produce masks. */
1420 /* Use a normal loop if there are no statements that need masking.
1421 This only happens in rare degenerate cases: it means that the loop
1422 has no loads, no stores, and no live-out values. */
1423 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1424 return false;
1426 /* For the decrementing IV we need to represent all values in
1427 [0, niter + niter_skip] where niter_skip is the elements we
1428 skip in the first iteration for prologue peeling. */
1429 tree iv_type = NULL_TREE;
1430 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1431 unsigned int iv_precision = UINT_MAX;
1432 if (iv_limit != -1)
1433 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1435 /* First compute the type for the IV we use to track the remaining
1436 scalar iterations. */
1437 opt_scalar_int_mode cmp_mode_iter;
1438 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1440 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1441 if (cmp_bits >= iv_precision
1442 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1444 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1445 if (iv_type)
1446 break;
1449 if (!iv_type)
1450 return false;
1452 /* Produce the rgroup controls. */
1453 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1455 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1456 tree vectype = mask.first;
1457 unsigned nvectors = mask.second;
1459 /* The number of scalars per iteration and the number of vectors are
1460 both compile-time constants. */
1461 unsigned int nscalars_per_iter
1462 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1463 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1465 /* We index the rgroup_controls vector with nscalars_per_iter
1466 which we keep constant and instead have a varying nvectors,
1467 remembering the vector mask with the fewest nV. */
1468 if (masks->rgc_vec.length () < nscalars_per_iter)
1469 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1470 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1472 if (!rgm->type || rgm->factor > nvectors)
1474 rgm->type = truth_type_for (vectype);
1475 rgm->compare_type = NULL_TREE;
1476 rgm->max_nscalars_per_iter = nscalars_per_iter;
1477 rgm->factor = nvectors;
1478 rgm->bias_adjusted_ctrl = NULL_TREE;
1482 /* There is no fixed compare type we are going to use but we have to
1483 be able to get at one for each mask group. */
1484 unsigned int min_ni_width
1485 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1487 bool ok = true;
1488 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1490 tree mask_type = rgc.type;
1491 if (!mask_type)
1492 continue;
1494 /* For now vect_get_loop_mask only supports integer mode masks
1495 when we need to split it. */
1496 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1497 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1499 ok = false;
1500 break;
1503 /* If iv_type is usable as compare type use that - we can elide the
1504 saturation in that case. */
1505 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1507 tree cmp_vectype
1508 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1509 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1510 rgc.compare_type = cmp_vectype;
1512 if (!rgc.compare_type)
1513 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1515 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1516 if (cmp_bits >= min_ni_width
1517 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1519 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1520 if (!cmp_type)
1521 continue;
1523 /* Check whether we can produce the mask with cmp_type. */
1524 tree cmp_vectype
1525 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1526 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1528 rgc.compare_type = cmp_vectype;
1529 break;
1533 if (!rgc.compare_type)
1535 ok = false;
1536 break;
1539 if (!ok)
1541 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1542 return false;
1545 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1546 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1547 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1548 return true;
1551 /* Check whether we can use vector access with length based on precison
1552 comparison. So far, to keep it simple, we only allow the case that the
1553 precision of the target supported length is larger than the precision
1554 required by loop niters. */
1556 static bool
1557 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1559 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1560 return false;
1562 machine_mode len_load_mode, len_store_mode;
1563 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1564 .exists (&len_load_mode))
1565 return false;
1566 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1567 .exists (&len_store_mode))
1568 return false;
1570 signed char partial_load_bias = internal_len_load_store_bias
1571 (IFN_LEN_LOAD, len_load_mode);
1573 signed char partial_store_bias = internal_len_load_store_bias
1574 (IFN_LEN_STORE, len_store_mode);
1576 gcc_assert (partial_load_bias == partial_store_bias);
1578 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1579 return false;
1581 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1582 len_loads with a length of zero. In order to avoid that we prohibit
1583 more than one loop length here. */
1584 if (partial_load_bias == -1
1585 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1586 return false;
1588 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1590 unsigned int max_nitems_per_iter = 1;
1591 unsigned int i;
1592 rgroup_controls *rgl;
1593 /* Find the maximum number of items per iteration for every rgroup. */
1594 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1596 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1597 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1600 /* Work out how many bits we need to represent the length limit. */
1601 unsigned int min_ni_prec
1602 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1604 /* Now use the maximum of below precisions for one suitable IV type:
1605 - the IV's natural precision
1606 - the precision needed to hold: the maximum number of scalar
1607 iterations multiplied by the scale factor (min_ni_prec above)
1608 - the Pmode precision
1610 If min_ni_prec is less than the precision of the current niters,
1611 we perfer to still use the niters type. Prefer to use Pmode and
1612 wider IV to avoid narrow conversions. */
1614 unsigned int ni_prec
1615 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1616 min_ni_prec = MAX (min_ni_prec, ni_prec);
1617 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1619 tree iv_type = NULL_TREE;
1620 opt_scalar_int_mode tmode_iter;
1621 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1623 scalar_mode tmode = tmode_iter.require ();
1624 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1626 /* ??? Do we really want to construct one IV whose precision exceeds
1627 BITS_PER_WORD? */
1628 if (tbits > BITS_PER_WORD)
1629 break;
1631 /* Find the first available standard integral type. */
1632 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1634 iv_type = build_nonstandard_integer_type (tbits, true);
1635 break;
1639 if (!iv_type)
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "can't vectorize with length-based partial vectors"
1644 " because there is no suitable iv type.\n");
1645 return false;
1648 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1649 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1650 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1652 return true;
1655 /* Calculate the cost of one scalar iteration of the loop. */
1656 static void
1657 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1659 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1660 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1661 int nbbs = loop->num_nodes, factor;
1662 int innerloop_iters, i;
1664 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1666 /* Gather costs for statements in the scalar loop. */
1668 /* FORNOW. */
1669 innerloop_iters = 1;
1670 if (loop->inner)
1671 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1673 for (i = 0; i < nbbs; i++)
1675 gimple_stmt_iterator si;
1676 basic_block bb = bbs[i];
1678 if (bb->loop_father == loop->inner)
1679 factor = innerloop_iters;
1680 else
1681 factor = 1;
1683 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1685 gimple *stmt = gsi_stmt (si);
1686 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1688 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1689 continue;
1691 /* Skip stmts that are not vectorized inside the loop. */
1692 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1693 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1694 && (!STMT_VINFO_LIVE_P (vstmt_info)
1695 || !VECTORIZABLE_CYCLE_DEF
1696 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1697 continue;
1699 vect_cost_for_stmt kind;
1700 if (STMT_VINFO_DATA_REF (stmt_info))
1702 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1703 kind = scalar_load;
1704 else
1705 kind = scalar_store;
1707 else if (vect_nop_conversion_p (stmt_info))
1708 continue;
1709 else
1710 kind = scalar_stmt;
1712 /* We are using vect_prologue here to avoid scaling twice
1713 by the inner loop factor. */
1714 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1715 factor, kind, stmt_info, 0, vect_prologue);
1719 /* Now accumulate cost. */
1720 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1721 add_stmt_costs (loop_vinfo->scalar_costs,
1722 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1723 loop_vinfo->scalar_costs->finish_cost (nullptr);
1726 /* Function vect_analyze_loop_form.
1728 Verify that certain CFG restrictions hold, including:
1729 - the loop has a pre-header
1730 - the loop has a single entry
1731 - nested loops can have only a single exit.
1732 - the loop exit condition is simple enough
1733 - the number of iterations can be analyzed, i.e, a countable loop. The
1734 niter could be analyzed under some assumptions. */
1736 opt_result
1737 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1739 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1741 edge exit_e = vec_init_loop_exit_info (loop);
1742 if (!exit_e)
1743 return opt_result::failure_at (vect_location,
1744 "not vectorized:"
1745 " could not determine main exit from"
1746 " loop with multiple exits.\n");
1747 info->loop_exit = exit_e;
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location,
1750 "using as main loop exit: %d -> %d [AUX: %p]\n",
1751 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1753 /* Check if we have any control flow that doesn't leave the loop. */
1754 class loop *v_loop = loop->inner ? loop->inner : loop;
1755 basic_block *bbs = get_loop_body (v_loop);
1756 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1757 if (EDGE_COUNT (bbs[i]->succs) != 1
1758 && (EDGE_COUNT (bbs[i]->succs) != 2
1759 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1761 free (bbs);
1762 return opt_result::failure_at (vect_location,
1763 "not vectorized:"
1764 " unsupported control flow in loop.\n");
1766 free (bbs);
1768 /* Different restrictions apply when we are considering an inner-most loop,
1769 vs. an outer (nested) loop.
1770 (FORNOW. May want to relax some of these restrictions in the future). */
1772 info->inner_loop_cond = NULL;
1773 if (!loop->inner)
1775 /* Inner-most loop. */
1777 if (empty_block_p (loop->header))
1778 return opt_result::failure_at (vect_location,
1779 "not vectorized: empty loop.\n");
1781 else
1783 class loop *innerloop = loop->inner;
1784 edge entryedge;
1786 /* Nested loop. We currently require that the loop is doubly-nested,
1787 contains a single inner loop with a single exit to the block
1788 with the single exit condition in the outer loop.
1789 Vectorizable outer-loops look like this:
1791 (pre-header)
1793 header <---+
1795 inner-loop |
1797 tail ------+
1799 (exit-bb)
1801 The inner-loop also has the properties expected of inner-most loops
1802 as described above. */
1804 if ((loop->inner)->inner || (loop->inner)->next)
1805 return opt_result::failure_at (vect_location,
1806 "not vectorized:"
1807 " multiple nested loops.\n");
1809 entryedge = loop_preheader_edge (innerloop);
1810 if (entryedge->src != loop->header
1811 || !single_exit (innerloop)
1812 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1813 return opt_result::failure_at (vect_location,
1814 "not vectorized:"
1815 " unsupported outerloop form.\n");
1817 /* Analyze the inner-loop. */
1818 vect_loop_form_info inner;
1819 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1820 if (!res)
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: Bad inner loop.\n");
1825 return res;
1828 /* Don't support analyzing niter under assumptions for inner
1829 loop. */
1830 if (!integer_onep (inner.assumptions))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: Bad inner loop.\n");
1834 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1835 return opt_result::failure_at (vect_location,
1836 "not vectorized: inner-loop count not"
1837 " invariant.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "Considering outer-loop vectorization.\n");
1842 info->inner_loop_cond = inner.conds[0];
1845 if (EDGE_COUNT (loop->header->preds) != 2)
1846 return opt_result::failure_at (vect_location,
1847 "not vectorized:"
1848 " too many incoming edges.\n");
1850 /* We assume that the latch is empty. */
1851 if (!empty_block_p (loop->latch)
1852 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1853 return opt_result::failure_at (vect_location,
1854 "not vectorized: latch block not empty.\n");
1856 /* Make sure there is no abnormal exit. */
1857 auto_vec<edge> exits = get_loop_exit_edges (loop);
1858 for (edge e : exits)
1860 if (e->flags & EDGE_ABNORMAL)
1861 return opt_result::failure_at (vect_location,
1862 "not vectorized:"
1863 " abnormal loop exit edge.\n");
1866 info->conds
1867 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1868 &info->number_of_iterations,
1869 &info->number_of_iterationsm1);
1870 if (info->conds.is_empty ())
1871 return opt_result::failure_at
1872 (vect_location,
1873 "not vectorized: complicated exit condition.\n");
1875 /* Determine what the primary and alternate exit conds are. */
1876 for (unsigned i = 0; i < info->conds.length (); i++)
1878 gcond *cond = info->conds[i];
1879 if (exit_e->src == gimple_bb (cond))
1880 std::swap (info->conds[0], info->conds[i]);
1883 if (integer_zerop (info->assumptions)
1884 || !info->number_of_iterations
1885 || chrec_contains_undetermined (info->number_of_iterations))
1886 return opt_result::failure_at
1887 (info->conds[0],
1888 "not vectorized: number of iterations cannot be computed.\n");
1890 if (integer_zerop (info->number_of_iterations))
1891 return opt_result::failure_at
1892 (info->conds[0],
1893 "not vectorized: number of iterations = 0.\n");
1895 if (!(tree_fits_shwi_p (info->number_of_iterations)
1896 && tree_to_shwi (info->number_of_iterations) > 0))
1898 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE, vect_location,
1901 "Symbolic number of iterations is ");
1902 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1903 dump_printf (MSG_NOTE, "\n");
1907 return opt_result::success ();
1910 /* Create a loop_vec_info for LOOP with SHARED and the
1911 vect_analyze_loop_form result. */
1913 loop_vec_info
1914 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1915 const vect_loop_form_info *info,
1916 loop_vec_info main_loop_info)
1918 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1919 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1920 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1921 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1922 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1923 /* Also record the assumptions for versioning. */
1924 if (!integer_onep (info->assumptions) && !main_loop_info)
1925 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1927 for (gcond *cond : info->conds)
1929 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1930 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1931 /* Mark the statement as a condition. */
1932 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1935 for (unsigned i = 1; i < info->conds.length (); i ++)
1936 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1937 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1939 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1941 /* Check to see if we're vectorizing multiple exits. */
1942 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1943 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1945 if (info->inner_loop_cond)
1947 stmt_vec_info inner_loop_cond_info
1948 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1949 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1950 /* If we have an estimate on the number of iterations of the inner
1951 loop use that to limit the scale for costing, otherwise use
1952 --param vect-inner-loop-cost-factor literally. */
1953 widest_int nit;
1954 if (estimated_stmt_executions (loop->inner, &nit))
1955 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1956 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1959 return loop_vinfo;
1964 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1965 statements update the vectorization factor. */
1967 static void
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972 int nbbs = loop->num_nodes;
1973 poly_uint64 vectorization_factor;
1974 int i;
1976 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1978 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 gcc_assert (known_ne (vectorization_factor, 0U));
1981 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982 vectorization factor of the loop is the unrolling factor required by
1983 the SLP instances. If that unrolling factor is 1, we say, that we
1984 perform pure SLP on loop - cross iteration parallelism is not
1985 exploited. */
1986 bool only_slp_in_loop = true;
1987 for (i = 0; i < nbbs; i++)
1989 basic_block bb = bbs[i];
1990 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991 gsi_next (&si))
1993 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994 if (!stmt_info)
1995 continue;
1996 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998 && !PURE_SLP_STMT (stmt_info))
1999 /* STMT needs both SLP and loop-based vectorization. */
2000 only_slp_in_loop = false;
2002 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003 gsi_next (&si))
2005 if (is_gimple_debug (gsi_stmt (si)))
2006 continue;
2007 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008 stmt_info = vect_stmt_to_vectorize (stmt_info);
2009 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011 && !PURE_SLP_STMT (stmt_info))
2012 /* STMT needs both SLP and loop-based vectorization. */
2013 only_slp_in_loop = false;
2017 if (only_slp_in_loop)
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Loop contains only SLP stmts\n");
2022 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2024 else
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location,
2028 "Loop contains SLP and non-SLP stmts\n");
2029 /* Both the vectorization factor and unroll factor have the form
2030 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031 so they must have a common multiple. */
2032 vectorization_factor
2033 = force_common_multiple (vectorization_factor,
2034 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2037 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Updating vectorization factor to ");
2042 dump_dec (MSG_NOTE, vectorization_factor);
2043 dump_printf (MSG_NOTE, ".\n");
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048 the other phi in the reduction is also relevant for vectorization.
2049 This rejects cases such as:
2051 outer1:
2052 x_1 = PHI <x_3(outer2), ...>;
2055 inner:
2056 x_2 = ...;
2059 outer2:
2060 x_3 = PHI <x_2(inner)>;
2062 if nothing in x_2 or elsewhere makes x_1 relevant. */
2064 static bool
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2067 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068 return false;
2070 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2073 /* Function vect_analyze_loop_operations.
2075 Scan the loop stmts and make sure they are all vectorizable. */
2077 static opt_result
2078 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2080 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2081 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2082 int nbbs = loop->num_nodes;
2083 int i;
2084 stmt_vec_info stmt_info;
2085 bool need_to_vectorize = false;
2086 bool ok;
2088 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2090 auto_vec<stmt_info_for_cost> cost_vec;
2092 for (i = 0; i < nbbs; i++)
2094 basic_block bb = bbs[i];
2096 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2097 gsi_next (&si))
2099 gphi *phi = si.phi ();
2100 ok = true;
2102 stmt_info = loop_vinfo->lookup_stmt (phi);
2103 if (dump_enabled_p ())
2104 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2105 (gimple *) phi);
2106 if (virtual_operand_p (gimple_phi_result (phi)))
2107 continue;
2109 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2110 (i.e., a phi in the tail of the outer-loop). */
2111 if (! is_loop_header_bb_p (bb))
2113 /* FORNOW: we currently don't support the case that these phis
2114 are not used in the outerloop (unless it is double reduction,
2115 i.e., this phi is vect_reduction_def), cause this case
2116 requires to actually do something here. */
2117 if (STMT_VINFO_LIVE_P (stmt_info)
2118 && !vect_active_double_reduction_p (stmt_info))
2119 return opt_result::failure_at (phi,
2120 "Unsupported loop-closed phi"
2121 " in outer-loop.\n");
2123 /* If PHI is used in the outer loop, we check that its operand
2124 is defined in the inner loop. */
2125 if (STMT_VINFO_RELEVANT_P (stmt_info))
2127 tree phi_op;
2129 if (gimple_phi_num_args (phi) != 1)
2130 return opt_result::failure_at (phi, "unsupported phi");
2132 phi_op = PHI_ARG_DEF (phi, 0);
2133 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2134 if (!op_def_info)
2135 return opt_result::failure_at (phi, "unsupported phi\n");
2137 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2138 && (STMT_VINFO_RELEVANT (op_def_info)
2139 != vect_used_in_outer_by_reduction))
2140 return opt_result::failure_at (phi, "unsupported phi\n");
2142 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2143 || (STMT_VINFO_DEF_TYPE (stmt_info)
2144 == vect_double_reduction_def))
2145 && !vectorizable_lc_phi (loop_vinfo,
2146 stmt_info, NULL, NULL))
2147 return opt_result::failure_at (phi, "unsupported phi\n");
2150 continue;
2153 gcc_assert (stmt_info);
2155 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2156 || STMT_VINFO_LIVE_P (stmt_info))
2157 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2158 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2159 /* A scalar-dependence cycle that we don't support. */
2160 return opt_result::failure_at (phi,
2161 "not vectorized:"
2162 " scalar dependence cycle.\n");
2164 if (STMT_VINFO_RELEVANT_P (stmt_info))
2166 need_to_vectorize = true;
2167 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2168 && ! PURE_SLP_STMT (stmt_info))
2169 ok = vectorizable_induction (loop_vinfo,
2170 stmt_info, NULL, NULL,
2171 &cost_vec);
2172 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2173 || (STMT_VINFO_DEF_TYPE (stmt_info)
2174 == vect_double_reduction_def)
2175 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2176 && ! PURE_SLP_STMT (stmt_info))
2177 ok = vectorizable_reduction (loop_vinfo,
2178 stmt_info, NULL, NULL, &cost_vec);
2179 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2180 == vect_first_order_recurrence)
2181 && ! PURE_SLP_STMT (stmt_info))
2182 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2183 &cost_vec);
2186 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2187 if (ok
2188 && STMT_VINFO_LIVE_P (stmt_info)
2189 && !PURE_SLP_STMT (stmt_info))
2190 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2191 -1, false, &cost_vec);
2193 if (!ok)
2194 return opt_result::failure_at (phi,
2195 "not vectorized: relevant phi not "
2196 "supported: %G",
2197 static_cast <gimple *> (phi));
2200 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2201 gsi_next (&si))
2203 gimple *stmt = gsi_stmt (si);
2204 if (!gimple_clobber_p (stmt)
2205 && !is_gimple_debug (stmt))
2207 opt_result res
2208 = vect_analyze_stmt (loop_vinfo,
2209 loop_vinfo->lookup_stmt (stmt),
2210 &need_to_vectorize,
2211 NULL, NULL, &cost_vec);
2212 if (!res)
2213 return res;
2216 } /* bbs */
2218 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2220 /* All operations in the loop are either irrelevant (deal with loop
2221 control, or dead), or only used outside the loop and can be moved
2222 out of the loop (e.g. invariants, inductions). The loop can be
2223 optimized away by scalar optimizations. We're better off not
2224 touching this loop. */
2225 if (!need_to_vectorize)
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE, vect_location,
2229 "All the computation can be taken out of the loop.\n");
2230 return opt_result::failure_at
2231 (vect_location,
2232 "not vectorized: redundant loop. no profit to vectorize.\n");
2235 return opt_result::success ();
2238 /* Return true if we know that the iteration count is smaller than the
2239 vectorization factor. Return false if it isn't, or if we can't be sure
2240 either way. */
2242 static bool
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2245 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2247 HOST_WIDE_INT max_niter;
2248 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250 else
2251 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2253 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254 return true;
2256 return false;
2259 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2260 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2261 definitely no, or -1 if it's worth retrying. */
2263 static int
2264 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2265 unsigned *suggested_unroll_factor)
2267 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2268 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2270 /* Only loops that can handle partially-populated vectors can have iteration
2271 counts less than the vectorization factor. */
2272 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2273 && vect_known_niters_smaller_than_vf (loop_vinfo))
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2277 "not vectorized: iteration count smaller than "
2278 "vectorization factor.\n");
2279 return 0;
2282 /* If we know the number of iterations we can do better, for the
2283 epilogue we can also decide whether the main loop leaves us
2284 with enough iterations, prefering a smaller vector epilog then
2285 also possibly used for the case we skip the vector loop. */
2286 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2288 widest_int scalar_niters
2289 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2290 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2292 loop_vec_info orig_loop_vinfo
2293 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2294 unsigned lowest_vf
2295 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2296 int prolog_peeling = 0;
2297 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2298 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2299 if (prolog_peeling >= 0
2300 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2301 lowest_vf))
2303 unsigned gap
2304 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2305 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2306 % lowest_vf + gap);
2309 /* Reject vectorizing for a single scalar iteration, even if
2310 we could in principle implement that using partial vectors. */
2311 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2312 if (scalar_niters <= peeling_gap + 1)
2314 if (dump_enabled_p ())
2315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316 "not vectorized: loop only has a single "
2317 "scalar iteration.\n");
2318 return 0;
2321 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2323 /* Check that the loop processes at least one full vector. */
2324 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2325 if (known_lt (scalar_niters, vf))
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329 "loop does not have enough iterations "
2330 "to support vectorization.\n");
2331 return 0;
2334 /* If we need to peel an extra epilogue iteration to handle data
2335 accesses with gaps, check that there are enough scalar iterations
2336 available.
2338 The check above is redundant with this one when peeling for gaps,
2339 but the distinction is useful for diagnostics. */
2340 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2341 && known_le (scalar_niters, vf))
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "loop does not have enough iterations "
2346 "to support peeling for gaps.\n");
2347 return 0;
2352 /* If using the "very cheap" model. reject cases in which we'd keep
2353 a copy of the scalar code (even if we might be able to vectorize it). */
2354 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2355 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2356 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361 "some scalar iterations would need to be peeled\n");
2362 return 0;
2365 int min_profitable_iters, min_profitable_estimate;
2366 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2367 &min_profitable_estimate,
2368 suggested_unroll_factor);
2370 if (min_profitable_iters < 0)
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2374 "not vectorized: vectorization not profitable.\n");
2375 if (dump_enabled_p ())
2376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377 "not vectorized: vector version will never be "
2378 "profitable.\n");
2379 return -1;
2382 int min_scalar_loop_bound = (param_min_vect_loop_bound
2383 * assumed_vf);
2385 /* Use the cost model only if it is more conservative than user specified
2386 threshold. */
2387 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2388 min_profitable_iters);
2390 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2392 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2393 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2395 if (dump_enabled_p ())
2396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2397 "not vectorized: vectorization not profitable.\n");
2398 if (dump_enabled_p ())
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "not vectorized: iteration count smaller than user "
2401 "specified loop bound parameter or minimum profitable "
2402 "iterations (whichever is more conservative).\n");
2403 return 0;
2406 /* The static profitablity threshold min_profitable_estimate includes
2407 the cost of having to check at runtime whether the scalar loop
2408 should be used instead. If it turns out that we don't need or want
2409 such a check, the threshold we should use for the static estimate
2410 is simply the point at which the vector loop becomes more profitable
2411 than the scalar loop. */
2412 if (min_profitable_estimate > min_profitable_iters
2413 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2414 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2415 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2416 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2420 " choice between the scalar and vector loops\n");
2421 min_profitable_estimate = min_profitable_iters;
2424 /* If the vector loop needs multiple iterations to be beneficial then
2425 things are probably too close to call, and the conservative thing
2426 would be to stick with the scalar code. */
2427 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2428 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432 "one iteration of the vector loop would be"
2433 " more expensive than the equivalent number of"
2434 " iterations of the scalar loop\n");
2435 return 0;
2438 HOST_WIDE_INT estimated_niter;
2440 /* If we are vectorizing an epilogue then we know the maximum number of
2441 scalar iterations it will cover is at least one lower than the
2442 vectorization factor of the main loop. */
2443 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2444 estimated_niter
2445 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2446 else
2448 estimated_niter = estimated_stmt_executions_int (loop);
2449 if (estimated_niter == -1)
2450 estimated_niter = likely_max_stmt_executions_int (loop);
2452 if (estimated_niter != -1
2453 && ((unsigned HOST_WIDE_INT) estimated_niter
2454 < MAX (th, (unsigned) min_profitable_estimate)))
2456 if (dump_enabled_p ())
2457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2458 "not vectorized: estimated iteration count too "
2459 "small.\n");
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE, vect_location,
2462 "not vectorized: estimated iteration count smaller "
2463 "than specified loop bound parameter or minimum "
2464 "profitable iterations (whichever is more "
2465 "conservative).\n");
2466 return -1;
2469 return 1;
2472 static opt_result
2473 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2474 vec<data_reference_p> *datarefs,
2475 unsigned int *n_stmts)
2477 *n_stmts = 0;
2478 for (unsigned i = 0; i < loop->num_nodes; i++)
2479 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2480 !gsi_end_p (gsi); gsi_next (&gsi))
2482 gimple *stmt = gsi_stmt (gsi);
2483 if (is_gimple_debug (stmt))
2484 continue;
2485 ++(*n_stmts);
2486 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2487 NULL, 0);
2488 if (!res)
2490 if (is_gimple_call (stmt) && loop->safelen)
2492 tree fndecl = gimple_call_fndecl (stmt), op;
2493 if (fndecl == NULL_TREE
2494 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2496 fndecl = gimple_call_arg (stmt, 0);
2497 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2498 fndecl = TREE_OPERAND (fndecl, 0);
2499 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2501 if (fndecl != NULL_TREE)
2503 cgraph_node *node = cgraph_node::get (fndecl);
2504 if (node != NULL && node->simd_clones != NULL)
2506 unsigned int j, n = gimple_call_num_args (stmt);
2507 for (j = 0; j < n; j++)
2509 op = gimple_call_arg (stmt, j);
2510 if (DECL_P (op)
2511 || (REFERENCE_CLASS_P (op)
2512 && get_base_address (op)))
2513 break;
2515 op = gimple_call_lhs (stmt);
2516 /* Ignore #pragma omp declare simd functions
2517 if they don't have data references in the
2518 call stmt itself. */
2519 if (j == n
2520 && !(op
2521 && (DECL_P (op)
2522 || (REFERENCE_CLASS_P (op)
2523 && get_base_address (op)))))
2524 continue;
2528 return res;
2530 /* If dependence analysis will give up due to the limit on the
2531 number of datarefs stop here and fail fatally. */
2532 if (datarefs->length ()
2533 > (unsigned)param_loop_max_datarefs_for_datadeps)
2534 return opt_result::failure_at (stmt, "exceeded param "
2535 "loop-max-datarefs-for-datadeps\n");
2537 return opt_result::success ();
2540 /* Look for SLP-only access groups and turn each individual access into its own
2541 group. */
2542 static void
2543 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2545 unsigned int i;
2546 struct data_reference *dr;
2548 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2550 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2551 FOR_EACH_VEC_ELT (datarefs, i, dr)
2553 gcc_assert (DR_REF (dr));
2554 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2556 /* Check if the load is a part of an interleaving chain. */
2557 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2559 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2560 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2561 unsigned int group_size = DR_GROUP_SIZE (first_element);
2563 /* Check if SLP-only groups. */
2564 if (!STMT_SLP_TYPE (stmt_info)
2565 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2567 /* Dissolve the group. */
2568 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2570 stmt_vec_info vinfo = first_element;
2571 while (vinfo)
2573 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2574 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2575 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2576 DR_GROUP_SIZE (vinfo) = 1;
2577 if (STMT_VINFO_STRIDED_P (first_element)
2578 /* We cannot handle stores with gaps. */
2579 || DR_IS_WRITE (dr_info->dr))
2581 STMT_VINFO_STRIDED_P (vinfo) = true;
2582 DR_GROUP_GAP (vinfo) = 0;
2584 else
2585 DR_GROUP_GAP (vinfo) = group_size - 1;
2586 /* Duplicate and adjust alignment info, it needs to
2587 be present on each group leader, see dr_misalignment. */
2588 if (vinfo != first_element)
2590 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2591 dr_info2->target_alignment = dr_info->target_alignment;
2592 int misalignment = dr_info->misalignment;
2593 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2595 HOST_WIDE_INT diff
2596 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2597 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2598 unsigned HOST_WIDE_INT align_c
2599 = dr_info->target_alignment.to_constant ();
2600 misalignment = (misalignment + diff) % align_c;
2602 dr_info2->misalignment = misalignment;
2604 vinfo = next;
2611 /* Determine if operating on full vectors for LOOP_VINFO might leave
2612 some scalar iterations still to do. If so, decide how we should
2613 handle those scalar iterations. The possibilities are:
2615 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2616 In this case:
2618 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2619 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2620 LOOP_VINFO_PEELING_FOR_NITER == false
2622 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2623 to handle the remaining scalar iterations. In this case:
2625 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2626 LOOP_VINFO_PEELING_FOR_NITER == true
2628 There are two choices:
2630 (2a) Consider vectorizing the epilogue loop at the same VF as the
2631 main loop, but using partial vectors instead of full vectors.
2632 In this case:
2634 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2637 In this case:
2639 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2642 opt_result
2643 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 /* Determine whether there would be any scalar iterations left over. */
2646 bool need_peeling_or_partial_vectors_p
2647 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649 /* Decide whether to vectorize the loop with partial vectors. */
2650 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2651 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2653 && need_peeling_or_partial_vectors_p)
2655 /* For partial-vector-usage=1, try to push the handling of partial
2656 vectors to the epilogue, with the main loop continuing to operate
2657 on full vectors.
2659 If we are unrolling we also do not want to use partial vectors. This
2660 is to avoid the overhead of generating multiple masks and also to
2661 avoid having to execute entire iterations of FALSE masked instructions
2662 when dealing with one or less full iterations.
2664 ??? We could then end up failing to use partial vectors if we
2665 decide to peel iterations into a prologue, and if the main loop
2666 then ends up processing fewer than VF iterations. */
2667 if ((param_vect_partial_vector_usage == 1
2668 || loop_vinfo->suggested_unroll_factor > 1)
2669 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2670 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2671 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2672 else
2673 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2676 if (dump_enabled_p ())
2677 dump_printf_loc (MSG_NOTE, vect_location,
2678 "operating on %s vectors%s.\n",
2679 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2680 ? "partial" : "full",
2681 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2682 ? " for epilogue loop" : "");
2684 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2685 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2686 && need_peeling_or_partial_vectors_p);
2688 /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2689 analysis that we don't know whether the loop is vectorized by partial
2690 vectors (More details see tree-vect-loop-manip.cc).
2692 However, SELECT_VL vectorizaton style should only applied on partial
2693 vectorization since SELECT_VL is the GIMPLE IR that calculates the
2694 number of elements to be process for each iteration.
2696 After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2697 if it is not partial vectorized loop. */
2698 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2699 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701 return opt_result::success ();
2704 /* Function vect_analyze_loop_2.
2706 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2707 analyses will record information in some members of LOOP_VINFO. FATAL
2708 indicates if some analysis meets fatal error. If one non-NULL pointer
2709 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2710 worked out suggested unroll factor, while one NULL pointer shows it's
2711 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2712 is to hold the slp decision when the suggested unroll factor is worked
2713 out. */
2714 static opt_result
2715 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2716 unsigned *suggested_unroll_factor,
2717 bool& slp_done_for_suggested_uf)
2719 opt_result ok = opt_result::success ();
2720 int res;
2721 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2722 poly_uint64 min_vf = 2;
2723 loop_vec_info orig_loop_vinfo = NULL;
2725 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2726 loop_vec_info of the first vectorized loop. */
2727 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2728 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2729 else
2730 orig_loop_vinfo = loop_vinfo;
2731 gcc_assert (orig_loop_vinfo);
2733 /* The first group of checks is independent of the vector size. */
2734 fatal = true;
2736 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2737 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2738 return opt_result::failure_at (vect_location,
2739 "not vectorized: simd if(0)\n");
2741 /* Find all data references in the loop (which correspond to vdefs/vuses)
2742 and analyze their evolution in the loop. */
2744 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746 /* Gather the data references and count stmts in the loop. */
2747 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749 opt_result res
2750 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2751 &LOOP_VINFO_DATAREFS (loop_vinfo),
2752 &LOOP_VINFO_N_STMTS (loop_vinfo));
2753 if (!res)
2755 if (dump_enabled_p ())
2756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2757 "not vectorized: loop contains function "
2758 "calls or data references that cannot "
2759 "be analyzed\n");
2760 return res;
2762 loop_vinfo->shared->save_datarefs ();
2764 else
2765 loop_vinfo->shared->check_datarefs ();
2767 /* Analyze the data references and also adjust the minimal
2768 vectorization factor according to the loads and stores. */
2770 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2771 if (!ok)
2773 if (dump_enabled_p ())
2774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2775 "bad data references.\n");
2776 return ok;
2779 /* Check if we are applying unroll factor now. */
2780 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2781 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783 /* If the slp decision is false when suggested unroll factor is worked
2784 out, and we are applying suggested unroll factor, we can simply skip
2785 all slp related analyses this time. */
2786 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788 /* Classify all cross-iteration scalar data-flow cycles.
2789 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2790 vect_analyze_scalar_cycles (loop_vinfo, slp);
2792 vect_pattern_recog (loop_vinfo);
2794 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2797 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2799 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2800 if (!ok)
2802 if (dump_enabled_p ())
2803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2804 "bad data access.\n");
2805 return ok;
2808 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2810 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2811 if (!ok)
2813 if (dump_enabled_p ())
2814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2815 "unexpected pattern.\n");
2816 return ok;
2819 /* While the rest of the analysis below depends on it in some way. */
2820 fatal = false;
2822 /* Analyze data dependences between the data-refs in the loop
2823 and adjust the maximum vectorization factor according to
2824 the dependences.
2825 FORNOW: fail at the first data dependence that we encounter. */
2827 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2828 if (!ok)
2830 if (dump_enabled_p ())
2831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2832 "bad data dependence.\n");
2833 return ok;
2835 if (max_vf != MAX_VECTORIZATION_FACTOR
2836 && maybe_lt (max_vf, min_vf))
2837 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2838 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840 ok = vect_determine_vectorization_factor (loop_vinfo);
2841 if (!ok)
2843 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2845 "can't determine vectorization factor.\n");
2846 return ok;
2849 /* Compute the scalar iteration cost. */
2850 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854 if (slp)
2856 /* Check the SLP opportunities in the loop, analyze and build
2857 SLP trees. */
2858 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2859 if (!ok)
2860 return ok;
2862 /* If there are any SLP instances mark them as pure_slp. */
2863 slp = vect_make_slp_decision (loop_vinfo);
2864 if (slp)
2866 /* Find stmts that need to be both vectorized and SLPed. */
2867 vect_detect_hybrid_slp (loop_vinfo);
2869 /* Update the vectorization factor based on the SLP decision. */
2870 vect_update_vf_for_slp (loop_vinfo);
2872 /* Optimize the SLP graph with the vectorization factor fixed. */
2873 vect_optimize_slp (loop_vinfo);
2875 /* Gather the loads reachable from the SLP graph entries. */
2876 vect_gather_slp_loads (loop_vinfo);
2880 bool saved_can_use_partial_vectors_p
2881 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883 /* We don't expect to have to roll back to anything other than an empty
2884 set of rgroups. */
2885 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887 /* This is the point where we can re-start analysis with SLP forced off. */
2888 start_over:
2890 /* Apply the suggested unrolling factor, this was determined by the backend
2891 during finish_cost the first time we ran the analyzis for this
2892 vector mode. */
2893 if (applying_suggested_uf)
2894 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896 /* Now the vectorization factor is final. */
2897 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2898 gcc_assert (known_ne (vectorization_factor, 0U));
2900 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902 dump_printf_loc (MSG_NOTE, vect_location,
2903 "vectorization_factor = ");
2904 dump_dec (MSG_NOTE, vectorization_factor);
2905 dump_printf (MSG_NOTE, ", niters = %wd\n",
2906 LOOP_VINFO_INT_NITERS (loop_vinfo));
2909 if (max_vf != MAX_VECTORIZATION_FACTOR
2910 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2911 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915 /* Analyze the alignment of the data-refs in the loop.
2916 Fail if a data reference is found that cannot be vectorized. */
2918 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2919 if (!ok)
2921 if (dump_enabled_p ())
2922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923 "bad data alignment.\n");
2924 return ok;
2927 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2928 It is important to call pruning after vect_analyze_data_ref_accesses,
2929 since we use grouping information gathered by interleaving analysis. */
2930 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2931 if (!ok)
2932 return ok;
2934 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2935 vectorization, since we do not want to add extra peeling or
2936 add versioning for alignment. */
2937 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2938 /* This pass will decide on using loop versioning and/or loop peeling in
2939 order to enhance the alignment of data references in the loop. */
2940 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2941 if (!ok)
2942 return ok;
2944 if (slp)
2946 /* Analyze operations in the SLP instances. Note this may
2947 remove unsupported SLP instances which makes the above
2948 SLP kind detection invalid. */
2949 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2950 vect_slp_analyze_operations (loop_vinfo);
2951 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953 ok = opt_result::failure_at (vect_location,
2954 "unsupported SLP instances\n");
2955 goto again;
2958 /* Check whether any load in ALL SLP instances is possibly permuted. */
2959 slp_tree load_node, slp_root;
2960 unsigned i, x;
2961 slp_instance instance;
2962 bool can_use_lanes = true;
2963 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965 slp_root = SLP_INSTANCE_TREE (instance);
2966 int group_size = SLP_TREE_LANES (slp_root);
2967 tree vectype = SLP_TREE_VECTYPE (slp_root);
2968 bool loads_permuted = false;
2969 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2972 continue;
2973 unsigned j;
2974 stmt_vec_info load_info;
2975 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2976 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978 loads_permuted = true;
2979 break;
2983 /* If the loads and stores can be handled with load/store-lane
2984 instructions record it and move on to the next instance. */
2985 if (loads_permuted
2986 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2987 && vect_store_lanes_supported (vectype, group_size, false)
2988 != IFN_LAST)
2990 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2991 if (STMT_VINFO_GROUPED_ACCESS
2992 (SLP_TREE_REPRESENTATIVE (load_node)))
2994 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2995 (SLP_TREE_REPRESENTATIVE (load_node));
2996 /* Use SLP for strided accesses (or if we can't
2997 load-lanes). */
2998 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2999 || vect_load_lanes_supported
3000 (STMT_VINFO_VECTYPE (stmt_vinfo),
3001 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3002 break;
3005 can_use_lanes
3006 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008 if (can_use_lanes && dump_enabled_p ())
3009 dump_printf_loc (MSG_NOTE, vect_location,
3010 "SLP instance %p can use load/store-lanes\n",
3011 (void *) instance);
3013 else
3015 can_use_lanes = false;
3016 break;
3020 /* If all SLP instances can use load/store-lanes abort SLP and try again
3021 with SLP disabled. */
3022 if (can_use_lanes)
3024 ok = opt_result::failure_at (vect_location,
3025 "Built SLP cancelled: can use "
3026 "load/store-lanes\n");
3027 if (dump_enabled_p ())
3028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3029 "Built SLP cancelled: all SLP instances support "
3030 "load/store-lanes\n");
3031 goto again;
3035 /* Dissolve SLP-only groups. */
3036 vect_dissolve_slp_only_groups (loop_vinfo);
3038 /* Scan all the remaining operations in the loop that are not subject
3039 to SLP and make sure they are vectorizable. */
3040 ok = vect_analyze_loop_operations (loop_vinfo);
3041 if (!ok)
3043 if (dump_enabled_p ())
3044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3045 "bad operation or unsupported loop bound.\n");
3046 return ok;
3049 /* For now, we don't expect to mix both masking and length approaches for one
3050 loop, disable it if both are recorded. */
3051 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3052 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3053 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3057 "can't vectorize a loop with partial vectors"
3058 " because we don't expect to mix different"
3059 " approaches with partial vectors for the"
3060 " same loop.\n");
3061 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3064 /* If we still have the option of using partial vectors,
3065 check whether we can generate the necessary loop controls. */
3066 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070 if (!vect_verify_full_masking (loop_vinfo)
3071 && !vect_verify_full_masking_avx512 (loop_vinfo))
3072 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3075 if (!vect_verify_loop_lens (loop_vinfo))
3076 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3079 /* If we're vectorizing a loop that uses length "controls" and
3080 can iterate more than once, we apply decrementing IV approach
3081 in loop control. */
3082 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3083 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3084 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3085 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3086 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3087 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3088 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090 /* If a loop uses length controls and has a decrementing loop control IV,
3091 we will normally pass that IV through a MIN_EXPR to calcaluate the
3092 basis for the length controls. E.g. in a loop that processes one
3093 element per scalar iteration, the number of elements would be
3094 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3097 step, since only the final iteration of the vector loop can have
3098 inactive lanes.
3100 However, some targets have a dedicated instruction for calculating the
3101 preferred length, given the total number of elements that still need to
3102 be processed. This is encapsulated in the SELECT_VL internal function.
3104 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3105 to determine the basis for the length controls. However, unlike the
3106 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3107 lanes inactive in any iteration of the vector loop, not just the last
3108 iteration. This SELECT_VL approach therefore requires us to use pointer
3109 IVs with variable steps.
3111 Once we've decided how many elements should be processed by one
3112 iteration of the vector loop, we need to populate the rgroup controls.
3113 If a loop has multiple rgroups, we need to make sure that those rgroups
3114 "line up" (that is, they must be consistent about which elements are
3115 active and which aren't). This is done by vect_adjust_loop_lens_control.
3117 In principle, it would be possible to use vect_adjust_loop_lens_control
3118 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3119 However:
3121 (1) In practice, it only makes sense to use SELECT_VL when a vector
3122 operation will be controlled directly by the result. It is not
3123 worth using SELECT_VL if it would only be the input to other
3124 calculations.
3126 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3127 pointer IV will need N updates by a variable amount (N-1 updates
3128 within the iteration and 1 update to move to the next iteration).
3130 Because of this, we prefer to use the MIN_EXPR approach whenever there
3131 is more than one length control.
3133 In addition, SELECT_VL always operates to a granularity of 1 unit.
3134 If we wanted to use it to control an SLP operation on N consecutive
3135 elements, we would need to make the SELECT_VL inputs measure scalar
3136 iterations (rather than elements) and then multiply the SELECT_VL
3137 result by N. But using SELECT_VL this way is inefficient because
3138 of (1) above.
3140 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3141 satisfied:
3143 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3144 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146 Since SELECT_VL (variable step) will make SCEV analysis failed and then
3147 we will fail to gain benefits of following unroll optimizations. We prefer
3148 using the MIN_EXPR approach in this situation. */
3149 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3152 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3153 OPTIMIZE_FOR_SPEED)
3154 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3155 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3156 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3157 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3158 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3161 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3162 assuming that the loop will be used as a main loop. We will redo
3163 this analysis later if we instead decide to use the loop as an
3164 epilogue loop. */
3165 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3166 if (!ok)
3167 return ok;
3169 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3170 to be able to handle fewer than VF scalars, or needs to have a lower VF
3171 than the main loop. */
3172 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3173 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175 poly_uint64 unscaled_vf
3176 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3177 orig_loop_vinfo->suggested_unroll_factor);
3178 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3179 return opt_result::failure_at (vect_location,
3180 "Vectorization factor too high for"
3181 " epilogue loop.\n");
3184 /* Check the costings of the loop make vectorizing worthwhile. */
3185 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3186 if (res < 0)
3188 ok = opt_result::failure_at (vect_location,
3189 "Loop costings may not be worthwhile.\n");
3190 goto again;
3192 if (!res)
3193 return opt_result::failure_at (vect_location,
3194 "Loop costings not worthwhile.\n");
3196 /* If an epilogue loop is required make sure we can create one. */
3197 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3198 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3199 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201 if (dump_enabled_p ())
3202 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3203 if (!vect_can_advance_ivs_p (loop_vinfo)
3204 || !slpeel_can_duplicate_loop_p (loop,
3205 LOOP_VINFO_IV_EXIT (loop_vinfo),
3206 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208 ok = opt_result::failure_at (vect_location,
3209 "not vectorized: can't create required "
3210 "epilog loop\n");
3211 goto again;
3215 /* During peeling, we need to check if number of loop iterations is
3216 enough for both peeled prolog loop and vector loop. This check
3217 can be merged along with threshold check of loop versioning, so
3218 increase threshold for this case if necessary.
3220 If we are analyzing an epilogue we still want to check what its
3221 versioning threshold would be. If we decide to vectorize the epilogues we
3222 will want to use the lowest versioning threshold of all epilogues and main
3223 loop. This will enable us to enter a vectorized epilogue even when
3224 versioning the loop. We can't simply check whether the epilogue requires
3225 versioning though since we may have skipped some versioning checks when
3226 analyzing the epilogue. For instance, checks for alias versioning will be
3227 skipped when dealing with epilogues as we assume we already checked them
3228 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3229 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231 poly_uint64 niters_th = 0;
3232 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236 /* Niters for peeled prolog loop. */
3237 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3240 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3241 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243 else
3244 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3247 /* Niters for at least one iteration of vectorized loop. */
3248 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3249 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3250 /* One additional iteration because of peeling for gap. */
3251 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3252 niters_th += 1;
3254 /* Use the same condition as vect_transform_loop to decide when to use
3255 the cost to determine a versioning threshold. */
3256 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3257 && ordered_p (th, niters_th))
3258 niters_th = ordered_max (poly_uint64 (th), niters_th);
3260 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3263 gcc_assert (known_eq (vectorization_factor,
3264 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266 slp_done_for_suggested_uf = slp;
3268 /* Ok to vectorize! */
3269 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3270 return opt_result::success ();
3272 again:
3273 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3274 gcc_assert (!ok);
3276 /* Try again with SLP forced off but if we didn't do any SLP there is
3277 no point in re-trying. */
3278 if (!slp)
3279 return ok;
3281 /* If the slp decision is true when suggested unroll factor is worked
3282 out, and we are applying suggested unroll factor, we don't need to
3283 re-try any more. */
3284 if (applying_suggested_uf && slp_done_for_suggested_uf)
3285 return ok;
3287 /* If there are reduction chains re-trying will fail anyway. */
3288 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3289 return ok;
3291 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3292 via interleaving or lane instructions. */
3293 slp_instance instance;
3294 slp_tree node;
3295 unsigned i, j;
3296 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298 stmt_vec_info vinfo;
3299 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3300 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3301 continue;
3302 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3303 unsigned int size = DR_GROUP_SIZE (vinfo);
3304 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3305 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3306 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3307 && ! vect_grouped_store_supported (vectype, size))
3308 return opt_result::failure_at (vinfo->stmt,
3309 "unsupported grouped store\n");
3310 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312 vinfo = SLP_TREE_REPRESENTATIVE (node);
3313 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3316 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3317 size = DR_GROUP_SIZE (vinfo);
3318 vectype = STMT_VINFO_VECTYPE (vinfo);
3319 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3320 && ! vect_grouped_load_supported (vectype, single_element_p,
3321 size))
3322 return opt_result::failure_at (vinfo->stmt,
3323 "unsupported grouped load\n");
3328 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_NOTE, vect_location,
3330 "re-trying with SLP disabled\n");
3332 /* Roll back state appropriately. No SLP this time. */
3333 slp = false;
3334 /* Restore vectorization factor as it were without SLP. */
3335 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3336 /* Free the SLP instances. */
3337 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3338 vect_free_slp_instance (instance);
3339 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3340 /* Reset SLP type to loop_vect on all stmts. */
3341 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3344 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3345 !gsi_end_p (si); gsi_next (&si))
3347 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3348 STMT_SLP_TYPE (stmt_info) = loop_vect;
3349 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3350 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352 /* vectorizable_reduction adjusts reduction stmt def-types,
3353 restore them to that of the PHI. */
3354 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3355 = STMT_VINFO_DEF_TYPE (stmt_info);
3356 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3357 (STMT_VINFO_REDUC_DEF (stmt_info)))
3358 = STMT_VINFO_DEF_TYPE (stmt_info);
3361 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3362 !gsi_end_p (si); gsi_next (&si))
3364 if (is_gimple_debug (gsi_stmt (si)))
3365 continue;
3366 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3367 STMT_SLP_TYPE (stmt_info) = loop_vect;
3368 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370 stmt_vec_info pattern_stmt_info
3371 = STMT_VINFO_RELATED_STMT (stmt_info);
3372 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3373 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3376 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3377 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3378 !gsi_end_p (pi); gsi_next (&pi))
3379 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3380 = loop_vect;
3384 /* Free optimized alias test DDRS. */
3385 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3386 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3387 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3388 /* Reset target cost data. */
3389 delete loop_vinfo->vector_costs;
3390 loop_vinfo->vector_costs = nullptr;
3391 /* Reset accumulated rgroup information. */
3392 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3393 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3394 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3395 /* Reset assorted flags. */
3396 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3397 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3398 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3399 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3400 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3401 = saved_can_use_partial_vectors_p;
3403 goto start_over;
3406 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3407 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3408 OLD_LOOP_VINFO is better unless something specifically indicates
3409 otherwise.
3411 Note that this deliberately isn't a partial order. */
3413 static bool
3414 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3415 loop_vec_info old_loop_vinfo)
3417 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3418 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3420 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3421 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3423 /* Always prefer a VF of loop->simdlen over any other VF. */
3424 if (loop->simdlen)
3426 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3427 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3428 if (new_simdlen_p != old_simdlen_p)
3429 return new_simdlen_p;
3432 const auto *old_costs = old_loop_vinfo->vector_costs;
3433 const auto *new_costs = new_loop_vinfo->vector_costs;
3434 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3435 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3437 return new_costs->better_main_loop_than_p (old_costs);
3440 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3441 true if we should. */
3443 static bool
3444 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3445 loop_vec_info old_loop_vinfo)
3447 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3448 return false;
3450 if (dump_enabled_p ())
3451 dump_printf_loc (MSG_NOTE, vect_location,
3452 "***** Preferring vector mode %s to vector mode %s\n",
3453 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3454 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3455 return true;
3458 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3459 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3460 MODE_I to the next mode useful to analyze.
3461 Return the loop_vinfo on success and wrapped null on failure. */
3463 static opt_loop_vec_info
3464 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3465 const vect_loop_form_info *loop_form_info,
3466 loop_vec_info main_loop_vinfo,
3467 const vector_modes &vector_modes, unsigned &mode_i,
3468 machine_mode &autodetected_vector_mode,
3469 bool &fatal)
3471 loop_vec_info loop_vinfo
3472 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3474 machine_mode vector_mode = vector_modes[mode_i];
3475 loop_vinfo->vector_mode = vector_mode;
3476 unsigned int suggested_unroll_factor = 1;
3477 bool slp_done_for_suggested_uf = false;
3479 /* Run the main analysis. */
3480 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3481 &suggested_unroll_factor,
3482 slp_done_for_suggested_uf);
3483 if (dump_enabled_p ())
3484 dump_printf_loc (MSG_NOTE, vect_location,
3485 "***** Analysis %s with vector mode %s\n",
3486 res ? "succeeded" : " failed",
3487 GET_MODE_NAME (loop_vinfo->vector_mode));
3489 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3491 if (dump_enabled_p ())
3492 dump_printf_loc (MSG_NOTE, vect_location,
3493 "***** Re-trying analysis for unrolling"
3494 " with unroll factor %d and slp %s.\n",
3495 suggested_unroll_factor,
3496 slp_done_for_suggested_uf ? "on" : "off");
3497 loop_vec_info unroll_vinfo
3498 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3499 unroll_vinfo->vector_mode = vector_mode;
3500 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3501 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3502 slp_done_for_suggested_uf);
3503 if (new_res)
3505 delete loop_vinfo;
3506 loop_vinfo = unroll_vinfo;
3508 else
3509 delete unroll_vinfo;
3512 /* Remember the autodetected vector mode. */
3513 if (vector_mode == VOIDmode)
3514 autodetected_vector_mode = loop_vinfo->vector_mode;
3516 /* Advance mode_i, first skipping modes that would result in the
3517 same analysis result. */
3518 while (mode_i + 1 < vector_modes.length ()
3519 && vect_chooses_same_modes_p (loop_vinfo,
3520 vector_modes[mode_i + 1]))
3522 if (dump_enabled_p ())
3523 dump_printf_loc (MSG_NOTE, vect_location,
3524 "***** The result for vector mode %s would"
3525 " be the same\n",
3526 GET_MODE_NAME (vector_modes[mode_i + 1]));
3527 mode_i += 1;
3529 if (mode_i + 1 < vector_modes.length ()
3530 && VECTOR_MODE_P (autodetected_vector_mode)
3531 && (related_vector_mode (vector_modes[mode_i + 1],
3532 GET_MODE_INNER (autodetected_vector_mode))
3533 == autodetected_vector_mode)
3534 && (related_vector_mode (autodetected_vector_mode,
3535 GET_MODE_INNER (vector_modes[mode_i + 1]))
3536 == vector_modes[mode_i + 1]))
3538 if (dump_enabled_p ())
3539 dump_printf_loc (MSG_NOTE, vect_location,
3540 "***** Skipping vector mode %s, which would"
3541 " repeat the analysis for %s\n",
3542 GET_MODE_NAME (vector_modes[mode_i + 1]),
3543 GET_MODE_NAME (autodetected_vector_mode));
3544 mode_i += 1;
3546 mode_i++;
3548 if (!res)
3550 delete loop_vinfo;
3551 if (fatal)
3552 gcc_checking_assert (main_loop_vinfo == NULL);
3553 return opt_loop_vec_info::propagate_failure (res);
3556 return opt_loop_vec_info::success (loop_vinfo);
3559 /* Function vect_analyze_loop.
3561 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3562 for it. The different analyses will record information in the
3563 loop_vec_info struct. */
3564 opt_loop_vec_info
3565 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3567 DUMP_VECT_SCOPE ("analyze_loop_nest");
3569 if (loop_outer (loop)
3570 && loop_vec_info_for_loop (loop_outer (loop))
3571 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3572 return opt_loop_vec_info::failure_at (vect_location,
3573 "outer-loop already vectorized.\n");
3575 if (!find_loop_nest (loop, &shared->loop_nest))
3576 return opt_loop_vec_info::failure_at
3577 (vect_location,
3578 "not vectorized: loop nest containing two or more consecutive inner"
3579 " loops cannot be vectorized\n");
3581 /* Analyze the loop form. */
3582 vect_loop_form_info loop_form_info;
3583 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3584 if (!res)
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588 "bad loop form.\n");
3589 return opt_loop_vec_info::propagate_failure (res);
3591 if (!integer_onep (loop_form_info.assumptions))
3593 /* We consider to vectorize this loop by versioning it under
3594 some assumptions. In order to do this, we need to clear
3595 existing information computed by scev and niter analyzer. */
3596 scev_reset_htab ();
3597 free_numbers_of_iterations_estimates (loop);
3598 /* Also set flag for this loop so that following scev and niter
3599 analysis are done under the assumptions. */
3600 loop_constraint_set (loop, LOOP_C_FINITE);
3602 else
3603 /* Clear the existing niter information to make sure the nonwrapping flag
3604 will be calculated and set propriately. */
3605 free_numbers_of_iterations_estimates (loop);
3607 auto_vector_modes vector_modes;
3608 /* Autodetect first vector size we try. */
3609 vector_modes.safe_push (VOIDmode);
3610 unsigned int autovec_flags
3611 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3612 loop->simdlen != 0);
3613 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3614 && !unlimited_cost_model (loop));
3615 machine_mode autodetected_vector_mode = VOIDmode;
3616 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3617 unsigned int mode_i = 0;
3618 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3620 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3621 a mode has not been analyzed. */
3622 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3623 for (unsigned i = 0; i < vector_modes.length (); ++i)
3624 cached_vf_per_mode.safe_push (0);
3626 /* First determine the main loop vectorization mode, either the first
3627 one that works, starting with auto-detecting the vector mode and then
3628 following the targets order of preference, or the one with the
3629 lowest cost if pick_lowest_cost_p. */
3630 while (1)
3632 bool fatal;
3633 unsigned int last_mode_i = mode_i;
3634 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3635 failed. */
3636 cached_vf_per_mode[last_mode_i] = -1;
3637 opt_loop_vec_info loop_vinfo
3638 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3639 NULL, vector_modes, mode_i,
3640 autodetected_vector_mode, fatal);
3641 if (fatal)
3642 break;
3644 if (loop_vinfo)
3646 /* Analyzis has been successful so update the VF value. The
3647 VF should always be a multiple of unroll_factor and we want to
3648 capture the original VF here. */
3649 cached_vf_per_mode[last_mode_i]
3650 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3651 loop_vinfo->suggested_unroll_factor);
3652 /* Once we hit the desired simdlen for the first time,
3653 discard any previous attempts. */
3654 if (simdlen
3655 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3657 delete first_loop_vinfo;
3658 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3659 simdlen = 0;
3661 else if (pick_lowest_cost_p
3662 && first_loop_vinfo
3663 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3665 /* Pick loop_vinfo over first_loop_vinfo. */
3666 delete first_loop_vinfo;
3667 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3669 if (first_loop_vinfo == NULL)
3670 first_loop_vinfo = loop_vinfo;
3671 else
3673 delete loop_vinfo;
3674 loop_vinfo = opt_loop_vec_info::success (NULL);
3677 /* Commit to first_loop_vinfo if we have no reason to try
3678 alternatives. */
3679 if (!simdlen && !pick_lowest_cost_p)
3680 break;
3682 if (mode_i == vector_modes.length ()
3683 || autodetected_vector_mode == VOIDmode)
3684 break;
3686 /* Try the next biggest vector size. */
3687 if (dump_enabled_p ())
3688 dump_printf_loc (MSG_NOTE, vect_location,
3689 "***** Re-trying analysis with vector mode %s\n",
3690 GET_MODE_NAME (vector_modes[mode_i]));
3692 if (!first_loop_vinfo)
3693 return opt_loop_vec_info::propagate_failure (res);
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE, vect_location,
3697 "***** Choosing vector mode %s\n",
3698 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3700 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3701 enabled, SIMDUID is not set, it is the innermost loop and we have
3702 either already found the loop's SIMDLEN or there was no SIMDLEN to
3703 begin with.
3704 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3705 bool vect_epilogues = (!simdlen
3706 && loop->inner == NULL
3707 && param_vect_epilogues_nomask
3708 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3709 /* No code motion support for multiple epilogues so for now
3710 not supported when multiple exits. */
3711 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3712 && !loop->simduid);
3713 if (!vect_epilogues)
3714 return first_loop_vinfo;
3716 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3717 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3719 /* For epilogues start the analysis from the first mode. The motivation
3720 behind starting from the beginning comes from cases where the VECTOR_MODES
3721 array may contain length-agnostic and length-specific modes. Their
3722 ordering is not guaranteed, so we could end up picking a mode for the main
3723 loop that is after the epilogue's optimal mode. */
3724 vector_modes[0] = autodetected_vector_mode;
3725 mode_i = 0;
3727 bool supports_partial_vectors =
3728 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3729 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3731 while (1)
3733 /* If the target does not support partial vectors we can shorten the
3734 number of modes to analyze for the epilogue as we know we can't pick a
3735 mode that would lead to a VF at least as big as the
3736 FIRST_VINFO_VF. */
3737 if (!supports_partial_vectors
3738 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3740 mode_i++;
3741 if (mode_i == vector_modes.length ())
3742 break;
3743 continue;
3746 if (dump_enabled_p ())
3747 dump_printf_loc (MSG_NOTE, vect_location,
3748 "***** Re-trying epilogue analysis with vector "
3749 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3751 bool fatal;
3752 opt_loop_vec_info loop_vinfo
3753 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3754 first_loop_vinfo,
3755 vector_modes, mode_i,
3756 autodetected_vector_mode, fatal);
3757 if (fatal)
3758 break;
3760 if (loop_vinfo)
3762 if (pick_lowest_cost_p)
3764 /* Keep trying to roll back vectorization attempts while the
3765 loop_vec_infos they produced were worse than this one. */
3766 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3767 while (!vinfos.is_empty ()
3768 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3770 gcc_assert (vect_epilogues);
3771 delete vinfos.pop ();
3774 /* For now only allow one epilogue loop. */
3775 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3777 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3778 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3779 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3780 || maybe_ne (lowest_th, 0U));
3781 /* Keep track of the known smallest versioning
3782 threshold. */
3783 if (ordered_p (lowest_th, th))
3784 lowest_th = ordered_min (lowest_th, th);
3786 else
3788 delete loop_vinfo;
3789 loop_vinfo = opt_loop_vec_info::success (NULL);
3792 /* For now only allow one epilogue loop, but allow
3793 pick_lowest_cost_p to replace it, so commit to the
3794 first epilogue if we have no reason to try alternatives. */
3795 if (!pick_lowest_cost_p)
3796 break;
3799 if (mode_i == vector_modes.length ())
3800 break;
3804 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3806 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3807 if (dump_enabled_p ())
3808 dump_printf_loc (MSG_NOTE, vect_location,
3809 "***** Choosing epilogue vector mode %s\n",
3810 GET_MODE_NAME
3811 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3814 return first_loop_vinfo;
3817 /* Return true if there is an in-order reduction function for CODE, storing
3818 it in *REDUC_FN if so. */
3820 static bool
3821 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3823 /* We support MINUS_EXPR by negating the operand. This also preserves an
3824 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3825 (-0.0) = -0.0. */
3826 if (code == PLUS_EXPR || code == MINUS_EXPR)
3828 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3829 return true;
3831 return false;
3834 /* Function reduction_fn_for_scalar_code
3836 Input:
3837 CODE - tree_code of a reduction operations.
3839 Output:
3840 REDUC_FN - the corresponding internal function to be used to reduce the
3841 vector of partial results into a single scalar result, or IFN_LAST
3842 if the operation is a supported reduction operation, but does not have
3843 such an internal function.
3845 Return FALSE if CODE currently cannot be vectorized as reduction. */
3847 bool
3848 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3850 if (code.is_tree_code ())
3851 switch (tree_code (code))
3853 case MAX_EXPR:
3854 *reduc_fn = IFN_REDUC_MAX;
3855 return true;
3857 case MIN_EXPR:
3858 *reduc_fn = IFN_REDUC_MIN;
3859 return true;
3861 case PLUS_EXPR:
3862 *reduc_fn = IFN_REDUC_PLUS;
3863 return true;
3865 case BIT_AND_EXPR:
3866 *reduc_fn = IFN_REDUC_AND;
3867 return true;
3869 case BIT_IOR_EXPR:
3870 *reduc_fn = IFN_REDUC_IOR;
3871 return true;
3873 case BIT_XOR_EXPR:
3874 *reduc_fn = IFN_REDUC_XOR;
3875 return true;
3877 case MULT_EXPR:
3878 case MINUS_EXPR:
3879 *reduc_fn = IFN_LAST;
3880 return true;
3882 default:
3883 return false;
3885 else
3886 switch (combined_fn (code))
3888 CASE_CFN_FMAX:
3889 *reduc_fn = IFN_REDUC_FMAX;
3890 return true;
3892 CASE_CFN_FMIN:
3893 *reduc_fn = IFN_REDUC_FMIN;
3894 return true;
3896 default:
3897 return false;
3901 /* If there is a neutral value X such that a reduction would not be affected
3902 by the introduction of additional X elements, return that X, otherwise
3903 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3904 of the scalar elements. If the reduction has just a single initial value
3905 then INITIAL_VALUE is that value, otherwise it is null.
3906 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3907 In that case no signed zero is returned. */
3909 tree
3910 neutral_op_for_reduction (tree scalar_type, code_helper code,
3911 tree initial_value, bool as_initial)
3913 if (code.is_tree_code ())
3914 switch (tree_code (code))
3916 case DOT_PROD_EXPR:
3917 case SAD_EXPR:
3918 case MINUS_EXPR:
3919 case BIT_IOR_EXPR:
3920 case BIT_XOR_EXPR:
3921 return build_zero_cst (scalar_type);
3922 case WIDEN_SUM_EXPR:
3923 case PLUS_EXPR:
3924 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3925 return build_real (scalar_type, dconstm0);
3926 else
3927 return build_zero_cst (scalar_type);
3929 case MULT_EXPR:
3930 return build_one_cst (scalar_type);
3932 case BIT_AND_EXPR:
3933 return build_all_ones_cst (scalar_type);
3935 case MAX_EXPR:
3936 case MIN_EXPR:
3937 return initial_value;
3939 default:
3940 return NULL_TREE;
3942 else
3943 switch (combined_fn (code))
3945 CASE_CFN_FMIN:
3946 CASE_CFN_FMAX:
3947 return initial_value;
3949 default:
3950 return NULL_TREE;
3954 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3955 STMT is printed with a message MSG. */
3957 static void
3958 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3960 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3963 /* Return true if we need an in-order reduction for operation CODE
3964 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3965 overflow must wrap. */
3967 bool
3968 needs_fold_left_reduction_p (tree type, code_helper code)
3970 /* CHECKME: check for !flag_finite_math_only too? */
3971 if (SCALAR_FLOAT_TYPE_P (type))
3973 if (code.is_tree_code ())
3974 switch (tree_code (code))
3976 case MIN_EXPR:
3977 case MAX_EXPR:
3978 return false;
3980 default:
3981 return !flag_associative_math;
3983 else
3984 switch (combined_fn (code))
3986 CASE_CFN_FMIN:
3987 CASE_CFN_FMAX:
3988 return false;
3990 default:
3991 return !flag_associative_math;
3995 if (INTEGRAL_TYPE_P (type))
3996 return (!code.is_tree_code ()
3997 || !operation_no_trapping_overflow (type, tree_code (code)));
3999 if (SAT_FIXED_POINT_TYPE_P (type))
4000 return true;
4002 return false;
4005 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4006 has a handled computation expression. Store the main reduction
4007 operation in *CODE. */
4009 static bool
4010 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4011 tree loop_arg, code_helper *code,
4012 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4014 auto_bitmap visited;
4015 tree lookfor = PHI_RESULT (phi);
4016 ssa_op_iter curri;
4017 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4018 while (USE_FROM_PTR (curr) != loop_arg)
4019 curr = op_iter_next_use (&curri);
4020 curri.i = curri.numops;
4023 path.safe_push (std::make_pair (curri, curr));
4024 tree use = USE_FROM_PTR (curr);
4025 if (use == lookfor)
4026 break;
4027 gimple *def = SSA_NAME_DEF_STMT (use);
4028 if (gimple_nop_p (def)
4029 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4031 pop:
4034 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4035 curri = x.first;
4036 curr = x.second;
4038 curr = op_iter_next_use (&curri);
4039 /* Skip already visited or non-SSA operands (from iterating
4040 over PHI args). */
4041 while (curr != NULL_USE_OPERAND_P
4042 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4043 || ! bitmap_set_bit (visited,
4044 SSA_NAME_VERSION
4045 (USE_FROM_PTR (curr)))));
4047 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4048 if (curr == NULL_USE_OPERAND_P)
4049 break;
4051 else
4053 if (gimple_code (def) == GIMPLE_PHI)
4054 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4055 else
4056 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4057 while (curr != NULL_USE_OPERAND_P
4058 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4059 || ! bitmap_set_bit (visited,
4060 SSA_NAME_VERSION
4061 (USE_FROM_PTR (curr)))))
4062 curr = op_iter_next_use (&curri);
4063 if (curr == NULL_USE_OPERAND_P)
4064 goto pop;
4067 while (1);
4068 if (dump_file && (dump_flags & TDF_DETAILS))
4070 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4071 unsigned i;
4072 std::pair<ssa_op_iter, use_operand_p> *x;
4073 FOR_EACH_VEC_ELT (path, i, x)
4074 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4075 dump_printf (MSG_NOTE, "\n");
4078 /* Check whether the reduction path detected is valid. */
4079 bool fail = path.length () == 0;
4080 bool neg = false;
4081 int sign = -1;
4082 *code = ERROR_MARK;
4083 for (unsigned i = 1; i < path.length (); ++i)
4085 gimple *use_stmt = USE_STMT (path[i].second);
4086 gimple_match_op op;
4087 if (!gimple_extract_op (use_stmt, &op))
4089 fail = true;
4090 break;
4092 unsigned int opi = op.num_ops;
4093 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4095 /* The following make sure we can compute the operand index
4096 easily plus it mostly disallows chaining via COND_EXPR condition
4097 operands. */
4098 for (opi = 0; opi < op.num_ops; ++opi)
4099 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4100 break;
4102 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4104 for (opi = 0; opi < op.num_ops; ++opi)
4105 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4106 break;
4108 if (opi == op.num_ops)
4110 fail = true;
4111 break;
4113 op.code = canonicalize_code (op.code, op.type);
4114 if (op.code == MINUS_EXPR)
4116 op.code = PLUS_EXPR;
4117 /* Track whether we negate the reduction value each iteration. */
4118 if (op.ops[1] == op.ops[opi])
4119 neg = ! neg;
4121 else if (op.code == IFN_COND_SUB)
4123 op.code = IFN_COND_ADD;
4124 /* Track whether we negate the reduction value each iteration. */
4125 if (op.ops[2] == op.ops[opi])
4126 neg = ! neg;
4128 if (CONVERT_EXPR_CODE_P (op.code)
4129 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4131 else if (*code == ERROR_MARK)
4133 *code = op.code;
4134 sign = TYPE_SIGN (op.type);
4136 else if (op.code != *code)
4138 fail = true;
4139 break;
4141 else if ((op.code == MIN_EXPR
4142 || op.code == MAX_EXPR)
4143 && sign != TYPE_SIGN (op.type))
4145 fail = true;
4146 break;
4148 /* Check there's only a single stmt the op is used on. For the
4149 not value-changing tail and the last stmt allow out-of-loop uses.
4150 ??? We could relax this and handle arbitrary live stmts by
4151 forcing a scalar epilogue for example. */
4152 imm_use_iterator imm_iter;
4153 use_operand_p use_p;
4154 gimple *op_use_stmt;
4155 unsigned cnt = 0;
4156 bool cond_fn_p = op.code.is_internal_fn ()
4157 && (conditional_internal_fn_code (internal_fn (op.code))
4158 != ERROR_MARK);
4160 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4162 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4163 op1 twice (once as definition, once as else) in the same operation.
4164 Allow this. */
4165 if (cond_fn_p && op_use_stmt == use_stmt)
4167 gcall *call = as_a<gcall *> (use_stmt);
4168 unsigned else_pos
4169 = internal_fn_else_index (internal_fn (op.code));
4171 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4173 if (j == else_pos)
4174 continue;
4175 if (gimple_call_arg (call, j) == op.ops[opi])
4176 cnt++;
4179 else if (!is_gimple_debug (op_use_stmt)
4180 && (*code != ERROR_MARK
4181 || flow_bb_inside_loop_p (loop,
4182 gimple_bb (op_use_stmt))))
4183 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4184 cnt++;
4187 if (cnt != 1)
4189 fail = true;
4190 break;
4193 return ! fail && ! neg && *code != ERROR_MARK;
4196 bool
4197 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4198 tree loop_arg, enum tree_code code)
4200 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4201 code_helper code_;
4202 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4203 && code_ == code);
4208 /* Function vect_is_simple_reduction
4210 (1) Detect a cross-iteration def-use cycle that represents a simple
4211 reduction computation. We look for the following pattern:
4213 loop_header:
4214 a1 = phi < a0, a2 >
4215 a3 = ...
4216 a2 = operation (a3, a1)
4220 a3 = ...
4221 loop_header:
4222 a1 = phi < a0, a2 >
4223 a2 = operation (a3, a1)
4225 such that:
4226 1. operation is commutative and associative and it is safe to
4227 change the order of the computation
4228 2. no uses for a2 in the loop (a2 is used out of the loop)
4229 3. no uses of a1 in the loop besides the reduction operation
4230 4. no uses of a1 outside the loop.
4232 Conditions 1,4 are tested here.
4233 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4235 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4236 nested cycles.
4238 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4239 reductions:
4241 a1 = phi < a0, a2 >
4242 inner loop (def of a3)
4243 a2 = phi < a3 >
4245 (4) Detect condition expressions, ie:
4246 for (int i = 0; i < N; i++)
4247 if (a[i] < val)
4248 ret_val = a[i];
4252 static stmt_vec_info
4253 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4254 bool *double_reduc, bool *reduc_chain_p, bool slp)
4256 gphi *phi = as_a <gphi *> (phi_info->stmt);
4257 gimple *phi_use_stmt = NULL;
4258 imm_use_iterator imm_iter;
4259 use_operand_p use_p;
4261 *double_reduc = false;
4262 *reduc_chain_p = false;
4263 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4265 tree phi_name = PHI_RESULT (phi);
4266 /* ??? If there are no uses of the PHI result the inner loop reduction
4267 won't be detected as possibly double-reduction by vectorizable_reduction
4268 because that tries to walk the PHI arg from the preheader edge which
4269 can be constant. See PR60382. */
4270 if (has_zero_uses (phi_name))
4271 return NULL;
4272 class loop *loop = (gimple_bb (phi))->loop_father;
4273 unsigned nphi_def_loop_uses = 0;
4274 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4276 gimple *use_stmt = USE_STMT (use_p);
4277 if (is_gimple_debug (use_stmt))
4278 continue;
4280 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4282 if (dump_enabled_p ())
4283 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4284 "intermediate value used outside loop.\n");
4286 return NULL;
4289 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4290 op1 twice (once as definition, once as else) in the same operation.
4291 Only count it as one. */
4292 if (use_stmt != phi_use_stmt)
4294 nphi_def_loop_uses++;
4295 phi_use_stmt = use_stmt;
4299 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4300 if (TREE_CODE (latch_def) != SSA_NAME)
4302 if (dump_enabled_p ())
4303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4304 "reduction: not ssa_name: %T\n", latch_def);
4305 return NULL;
4308 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4309 if (!def_stmt_info
4310 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4311 return NULL;
4313 bool nested_in_vect_loop
4314 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4315 unsigned nlatch_def_loop_uses = 0;
4316 auto_vec<gphi *, 3> lcphis;
4317 bool inner_loop_of_double_reduc = false;
4318 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4320 gimple *use_stmt = USE_STMT (use_p);
4321 if (is_gimple_debug (use_stmt))
4322 continue;
4323 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4324 nlatch_def_loop_uses++;
4325 else
4327 /* We can have more than one loop-closed PHI. */
4328 lcphis.safe_push (as_a <gphi *> (use_stmt));
4329 if (nested_in_vect_loop
4330 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4331 == vect_double_reduction_def))
4332 inner_loop_of_double_reduc = true;
4336 /* If we are vectorizing an inner reduction we are executing that
4337 in the original order only in case we are not dealing with a
4338 double reduction. */
4339 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4341 if (dump_enabled_p ())
4342 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4343 "detected nested cycle: ");
4344 return def_stmt_info;
4347 /* When the inner loop of a double reduction ends up with more than
4348 one loop-closed PHI we have failed to classify alternate such
4349 PHIs as double reduction, leading to wrong code. See PR103237. */
4350 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4352 if (dump_enabled_p ())
4353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4354 "unhandle double reduction\n");
4355 return NULL;
4358 /* If this isn't a nested cycle or if the nested cycle reduction value
4359 is used ouside of the inner loop we cannot handle uses of the reduction
4360 value. */
4361 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4365 "reduction used in loop.\n");
4366 return NULL;
4369 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4370 defined in the inner loop. */
4371 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4373 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4374 if (gimple_phi_num_args (def_stmt) != 1
4375 || TREE_CODE (op1) != SSA_NAME)
4377 if (dump_enabled_p ())
4378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4379 "unsupported phi node definition.\n");
4381 return NULL;
4384 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4385 and the latch definition op1. */
4386 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4387 if (gimple_bb (def1)
4388 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4389 && loop->inner
4390 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4391 && (is_gimple_assign (def1) || is_gimple_call (def1))
4392 && is_a <gphi *> (phi_use_stmt)
4393 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4394 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4395 loop_latch_edge (loop->inner))))
4397 if (dump_enabled_p ())
4398 report_vect_op (MSG_NOTE, def_stmt,
4399 "detected double reduction: ");
4401 *double_reduc = true;
4402 return def_stmt_info;
4405 return NULL;
4408 /* Look for the expression computing latch_def from then loop PHI result. */
4409 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4410 code_helper code;
4411 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4412 path))
4414 STMT_VINFO_REDUC_CODE (phi_info) = code;
4415 if (code == COND_EXPR && !nested_in_vect_loop)
4416 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4418 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4419 reduction chain for which the additional restriction is that
4420 all operations in the chain are the same. */
4421 auto_vec<stmt_vec_info, 8> reduc_chain;
4422 unsigned i;
4423 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4424 for (i = path.length () - 1; i >= 1; --i)
4426 gimple *stmt = USE_STMT (path[i].second);
4427 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4428 gimple_match_op op;
4429 if (!gimple_extract_op (stmt, &op))
4430 gcc_unreachable ();
4431 if (gassign *assign = dyn_cast<gassign *> (stmt))
4432 STMT_VINFO_REDUC_IDX (stmt_info)
4433 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4434 else
4436 gcall *call = as_a<gcall *> (stmt);
4437 STMT_VINFO_REDUC_IDX (stmt_info)
4438 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4440 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4441 && (i == 1 || i == path.length () - 1));
4442 if ((op.code != code && !leading_conversion)
4443 /* We can only handle the final value in epilogue
4444 generation for reduction chains. */
4445 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4446 is_slp_reduc = false;
4447 /* For reduction chains we support a trailing/leading
4448 conversions. We do not store those in the actual chain. */
4449 if (leading_conversion)
4450 continue;
4451 reduc_chain.safe_push (stmt_info);
4453 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4455 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4457 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4458 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4460 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4461 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4463 /* Save the chain for further analysis in SLP detection. */
4464 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4465 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4467 *reduc_chain_p = true;
4468 if (dump_enabled_p ())
4469 dump_printf_loc (MSG_NOTE, vect_location,
4470 "reduction: detected reduction chain\n");
4472 else if (dump_enabled_p ())
4473 dump_printf_loc (MSG_NOTE, vect_location,
4474 "reduction: detected reduction\n");
4476 return def_stmt_info;
4479 if (dump_enabled_p ())
4480 dump_printf_loc (MSG_NOTE, vect_location,
4481 "reduction: unknown pattern\n");
4483 return NULL;
4486 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4487 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4488 or -1 if not known. */
4490 static int
4491 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4493 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4494 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4496 if (dump_enabled_p ())
4497 dump_printf_loc (MSG_NOTE, vect_location,
4498 "cost model: epilogue peel iters set to vf/2 "
4499 "because loop iterations are unknown .\n");
4500 return assumed_vf / 2;
4502 else
4504 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4505 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4506 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4507 /* If we need to peel for gaps, but no peeling is required, we have to
4508 peel VF iterations. */
4509 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4510 peel_iters_epilogue = assumed_vf;
4511 return peel_iters_epilogue;
4515 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4517 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4518 int *peel_iters_epilogue,
4519 stmt_vector_for_cost *scalar_cost_vec,
4520 stmt_vector_for_cost *prologue_cost_vec,
4521 stmt_vector_for_cost *epilogue_cost_vec)
4523 int retval = 0;
4525 *peel_iters_epilogue
4526 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4528 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4530 /* If peeled iterations are known but number of scalar loop
4531 iterations are unknown, count a taken branch per peeled loop. */
4532 if (peel_iters_prologue > 0)
4533 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4534 vect_prologue);
4535 if (*peel_iters_epilogue > 0)
4536 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4537 vect_epilogue);
4540 stmt_info_for_cost *si;
4541 int j;
4542 if (peel_iters_prologue)
4543 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4544 retval += record_stmt_cost (prologue_cost_vec,
4545 si->count * peel_iters_prologue,
4546 si->kind, si->stmt_info, si->misalign,
4547 vect_prologue);
4548 if (*peel_iters_epilogue)
4549 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4550 retval += record_stmt_cost (epilogue_cost_vec,
4551 si->count * *peel_iters_epilogue,
4552 si->kind, si->stmt_info, si->misalign,
4553 vect_epilogue);
4555 return retval;
4558 /* Function vect_estimate_min_profitable_iters
4560 Return the number of iterations required for the vector version of the
4561 loop to be profitable relative to the cost of the scalar version of the
4562 loop.
4564 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4565 of iterations for vectorization. -1 value means loop vectorization
4566 is not profitable. This returned value may be used for dynamic
4567 profitability check.
4569 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4570 for static check against estimated number of iterations. */
4572 static void
4573 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4574 int *ret_min_profitable_niters,
4575 int *ret_min_profitable_estimate,
4576 unsigned *suggested_unroll_factor)
4578 int min_profitable_iters;
4579 int min_profitable_estimate;
4580 int peel_iters_prologue;
4581 int peel_iters_epilogue;
4582 unsigned vec_inside_cost = 0;
4583 int vec_outside_cost = 0;
4584 unsigned vec_prologue_cost = 0;
4585 unsigned vec_epilogue_cost = 0;
4586 int scalar_single_iter_cost = 0;
4587 int scalar_outside_cost = 0;
4588 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4589 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4590 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4592 /* Cost model disabled. */
4593 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4595 if (dump_enabled_p ())
4596 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4597 *ret_min_profitable_niters = 0;
4598 *ret_min_profitable_estimate = 0;
4599 return;
4602 /* Requires loop versioning tests to handle misalignment. */
4603 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4605 /* FIXME: Make cost depend on complexity of individual check. */
4606 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4607 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4608 if (dump_enabled_p ())
4609 dump_printf (MSG_NOTE,
4610 "cost model: Adding cost of checks for loop "
4611 "versioning to treat misalignment.\n");
4614 /* Requires loop versioning with alias checks. */
4615 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4617 /* FIXME: Make cost depend on complexity of individual check. */
4618 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4619 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4620 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4621 if (len)
4622 /* Count LEN - 1 ANDs and LEN comparisons. */
4623 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4624 scalar_stmt, vect_prologue);
4625 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4626 if (len)
4628 /* Count LEN - 1 ANDs and LEN comparisons. */
4629 unsigned int nstmts = len * 2 - 1;
4630 /* +1 for each bias that needs adding. */
4631 for (unsigned int i = 0; i < len; ++i)
4632 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4633 nstmts += 1;
4634 (void) add_stmt_cost (target_cost_data, nstmts,
4635 scalar_stmt, vect_prologue);
4637 if (dump_enabled_p ())
4638 dump_printf (MSG_NOTE,
4639 "cost model: Adding cost of checks for loop "
4640 "versioning aliasing.\n");
4643 /* Requires loop versioning with niter checks. */
4644 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4646 /* FIXME: Make cost depend on complexity of individual check. */
4647 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4648 NULL, NULL, NULL_TREE, 0, vect_prologue);
4649 if (dump_enabled_p ())
4650 dump_printf (MSG_NOTE,
4651 "cost model: Adding cost of checks for loop "
4652 "versioning niters.\n");
4655 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4656 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4657 vect_prologue);
4659 /* Count statements in scalar loop. Using this as scalar cost for a single
4660 iteration for now.
4662 TODO: Add outer loop support.
4664 TODO: Consider assigning different costs to different scalar
4665 statements. */
4667 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4669 /* Add additional cost for the peeled instructions in prologue and epilogue
4670 loop. (For fully-masked loops there will be no peeling.)
4672 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4673 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4675 TODO: Build an expression that represents peel_iters for prologue and
4676 epilogue to be used in a run-time test. */
4678 bool prologue_need_br_taken_cost = false;
4679 bool prologue_need_br_not_taken_cost = false;
4681 /* Calculate peel_iters_prologue. */
4682 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4683 peel_iters_prologue = 0;
4684 else if (npeel < 0)
4686 peel_iters_prologue = assumed_vf / 2;
4687 if (dump_enabled_p ())
4688 dump_printf (MSG_NOTE, "cost model: "
4689 "prologue peel iters set to vf/2.\n");
4691 /* If peeled iterations are unknown, count a taken branch and a not taken
4692 branch per peeled loop. Even if scalar loop iterations are known,
4693 vector iterations are not known since peeled prologue iterations are
4694 not known. Hence guards remain the same. */
4695 prologue_need_br_taken_cost = true;
4696 prologue_need_br_not_taken_cost = true;
4698 else
4700 peel_iters_prologue = npeel;
4701 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4702 /* If peeled iterations are known but number of scalar loop
4703 iterations are unknown, count a taken branch per peeled loop. */
4704 prologue_need_br_taken_cost = true;
4707 bool epilogue_need_br_taken_cost = false;
4708 bool epilogue_need_br_not_taken_cost = false;
4710 /* Calculate peel_iters_epilogue. */
4711 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4712 /* We need to peel exactly one iteration for gaps. */
4713 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4714 else if (npeel < 0)
4716 /* If peeling for alignment is unknown, loop bound of main loop
4717 becomes unknown. */
4718 peel_iters_epilogue = assumed_vf / 2;
4719 if (dump_enabled_p ())
4720 dump_printf (MSG_NOTE, "cost model: "
4721 "epilogue peel iters set to vf/2 because "
4722 "peeling for alignment is unknown.\n");
4724 /* See the same reason above in peel_iters_prologue calculation. */
4725 epilogue_need_br_taken_cost = true;
4726 epilogue_need_br_not_taken_cost = true;
4728 else
4730 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4731 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4732 /* If peeled iterations are known but number of scalar loop
4733 iterations are unknown, count a taken branch per peeled loop. */
4734 epilogue_need_br_taken_cost = true;
4737 stmt_info_for_cost *si;
4738 int j;
4739 /* Add costs associated with peel_iters_prologue. */
4740 if (peel_iters_prologue)
4741 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4743 (void) add_stmt_cost (target_cost_data,
4744 si->count * peel_iters_prologue, si->kind,
4745 si->stmt_info, si->node, si->vectype,
4746 si->misalign, vect_prologue);
4749 /* Add costs associated with peel_iters_epilogue. */
4750 if (peel_iters_epilogue)
4751 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4753 (void) add_stmt_cost (target_cost_data,
4754 si->count * peel_iters_epilogue, si->kind,
4755 si->stmt_info, si->node, si->vectype,
4756 si->misalign, vect_epilogue);
4759 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4761 if (prologue_need_br_taken_cost)
4762 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4763 vect_prologue);
4765 if (prologue_need_br_not_taken_cost)
4766 (void) add_stmt_cost (target_cost_data, 1,
4767 cond_branch_not_taken, vect_prologue);
4769 if (epilogue_need_br_taken_cost)
4770 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4771 vect_epilogue);
4773 if (epilogue_need_br_not_taken_cost)
4774 (void) add_stmt_cost (target_cost_data, 1,
4775 cond_branch_not_taken, vect_epilogue);
4777 /* Take care of special costs for rgroup controls of partial vectors. */
4778 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4779 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4780 == vect_partial_vectors_avx512))
4782 /* Calculate how many masks we need to generate. */
4783 unsigned int num_masks = 0;
4784 bool need_saturation = false;
4785 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4786 if (rgm.type)
4788 unsigned nvectors = rgm.factor;
4789 num_masks += nvectors;
4790 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4791 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4792 need_saturation = true;
4795 /* ??? The target isn't able to identify the costs below as
4796 producing masks so it cannot penaltize cases where we'd run
4797 out of mask registers for example. */
4799 /* ??? We are also failing to account for smaller vector masks
4800 we generate by splitting larger masks in vect_get_loop_mask. */
4802 /* In the worst case, we need to generate each mask in the prologue
4803 and in the loop body. We need one splat per group and one
4804 compare per mask.
4806 Sometimes the prologue mask will fold to a constant,
4807 so the actual prologue cost might be smaller. However, it's
4808 simpler and safer to use the worst-case cost; if this ends up
4809 being the tie-breaker between vectorizing or not, then it's
4810 probably better not to vectorize. */
4811 (void) add_stmt_cost (target_cost_data,
4812 num_masks
4813 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4814 vector_stmt, NULL, NULL, NULL_TREE, 0,
4815 vect_prologue);
4816 (void) add_stmt_cost (target_cost_data,
4817 num_masks
4818 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4819 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4821 /* When we need saturation we need it both in the prologue and
4822 the epilogue. */
4823 if (need_saturation)
4825 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4826 NULL, NULL, NULL_TREE, 0, vect_prologue);
4827 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4828 NULL, NULL, NULL_TREE, 0, vect_body);
4831 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4832 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4833 == vect_partial_vectors_while_ult))
4835 /* Calculate how many masks we need to generate. */
4836 unsigned int num_masks = 0;
4837 rgroup_controls *rgm;
4838 unsigned int num_vectors_m1;
4839 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4840 num_vectors_m1, rgm)
4841 if (rgm->type)
4842 num_masks += num_vectors_m1 + 1;
4843 gcc_assert (num_masks > 0);
4845 /* In the worst case, we need to generate each mask in the prologue
4846 and in the loop body. One of the loop body mask instructions
4847 replaces the comparison in the scalar loop, and since we don't
4848 count the scalar comparison against the scalar body, we shouldn't
4849 count that vector instruction against the vector body either.
4851 Sometimes we can use unpacks instead of generating prologue
4852 masks and sometimes the prologue mask will fold to a constant,
4853 so the actual prologue cost might be smaller. However, it's
4854 simpler and safer to use the worst-case cost; if this ends up
4855 being the tie-breaker between vectorizing or not, then it's
4856 probably better not to vectorize. */
4857 (void) add_stmt_cost (target_cost_data, num_masks,
4858 vector_stmt, NULL, NULL, NULL_TREE, 0,
4859 vect_prologue);
4860 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4861 vector_stmt, NULL, NULL, NULL_TREE, 0,
4862 vect_body);
4864 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4866 /* Referring to the functions vect_set_loop_condition_partial_vectors
4867 and vect_set_loop_controls_directly, we need to generate each
4868 length in the prologue and in the loop body if required. Although
4869 there are some possible optimizations, we consider the worst case
4870 here. */
4872 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4873 signed char partial_load_store_bias
4874 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4875 bool need_iterate_p
4876 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4877 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4879 /* Calculate how many statements to be added. */
4880 unsigned int prologue_stmts = 0;
4881 unsigned int body_stmts = 0;
4883 rgroup_controls *rgc;
4884 unsigned int num_vectors_m1;
4885 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4886 if (rgc->type)
4888 /* May need one SHIFT for nitems_total computation. */
4889 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4890 if (nitems != 1 && !niters_known_p)
4891 prologue_stmts += 1;
4893 /* May need one MAX and one MINUS for wrap around. */
4894 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4895 prologue_stmts += 2;
4897 /* Need one MAX and one MINUS for each batch limit excepting for
4898 the 1st one. */
4899 prologue_stmts += num_vectors_m1 * 2;
4901 unsigned int num_vectors = num_vectors_m1 + 1;
4903 /* Need to set up lengths in prologue, only one MIN required
4904 for each since start index is zero. */
4905 prologue_stmts += num_vectors;
4907 /* If we have a non-zero partial load bias, we need one PLUS
4908 to adjust the load length. */
4909 if (partial_load_store_bias != 0)
4910 body_stmts += 1;
4912 unsigned int length_update_cost = 0;
4913 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4914 /* For decrement IV style, Each only need a single SELECT_VL
4915 or MIN since beginning to calculate the number of elements
4916 need to be processed in current iteration. */
4917 length_update_cost = 1;
4918 else
4919 /* For increment IV stype, Each may need two MINs and one MINUS to
4920 update lengths in body for next iteration. */
4921 length_update_cost = 3;
4923 if (need_iterate_p)
4924 body_stmts += length_update_cost * num_vectors;
4927 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4928 scalar_stmt, vect_prologue);
4929 (void) add_stmt_cost (target_cost_data, body_stmts,
4930 scalar_stmt, vect_body);
4933 /* FORNOW: The scalar outside cost is incremented in one of the
4934 following ways:
4936 1. The vectorizer checks for alignment and aliasing and generates
4937 a condition that allows dynamic vectorization. A cost model
4938 check is ANDED with the versioning condition. Hence scalar code
4939 path now has the added cost of the versioning check.
4941 if (cost > th & versioning_check)
4942 jmp to vector code
4944 Hence run-time scalar is incremented by not-taken branch cost.
4946 2. The vectorizer then checks if a prologue is required. If the
4947 cost model check was not done before during versioning, it has to
4948 be done before the prologue check.
4950 if (cost <= th)
4951 prologue = scalar_iters
4952 if (prologue == 0)
4953 jmp to vector code
4954 else
4955 execute prologue
4956 if (prologue == num_iters)
4957 go to exit
4959 Hence the run-time scalar cost is incremented by a taken branch,
4960 plus a not-taken branch, plus a taken branch cost.
4962 3. The vectorizer then checks if an epilogue is required. If the
4963 cost model check was not done before during prologue check, it
4964 has to be done with the epilogue check.
4966 if (prologue == 0)
4967 jmp to vector code
4968 else
4969 execute prologue
4970 if (prologue == num_iters)
4971 go to exit
4972 vector code:
4973 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4974 jmp to epilogue
4976 Hence the run-time scalar cost should be incremented by 2 taken
4977 branches.
4979 TODO: The back end may reorder the BBS's differently and reverse
4980 conditions/branch directions. Change the estimates below to
4981 something more reasonable. */
4983 /* If the number of iterations is known and we do not do versioning, we can
4984 decide whether to vectorize at compile time. Hence the scalar version
4985 do not carry cost model guard costs. */
4986 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4987 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4989 /* Cost model check occurs at versioning. */
4990 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4991 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4992 else
4994 /* Cost model check occurs at prologue generation. */
4995 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4996 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4997 + vect_get_stmt_cost (cond_branch_not_taken);
4998 /* Cost model check occurs at epilogue generation. */
4999 else
5000 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5004 /* Complete the target-specific cost calculations. */
5005 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5006 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5007 suggested_unroll_factor);
5009 if (suggested_unroll_factor && *suggested_unroll_factor > 1
5010 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5011 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5012 *suggested_unroll_factor,
5013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5015 if (dump_enabled_p ())
5016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5017 "can't unroll as unrolled vectorization factor larger"
5018 " than maximum vectorization factor: "
5019 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5020 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5021 *suggested_unroll_factor = 1;
5024 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5026 if (dump_enabled_p ())
5028 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5029 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5030 vec_inside_cost);
5031 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5032 vec_prologue_cost);
5033 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5034 vec_epilogue_cost);
5035 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5036 scalar_single_iter_cost);
5037 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5038 scalar_outside_cost);
5039 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5040 vec_outside_cost);
5041 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5042 peel_iters_prologue);
5043 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5044 peel_iters_epilogue);
5047 /* Calculate number of iterations required to make the vector version
5048 profitable, relative to the loop bodies only. The following condition
5049 must hold true:
5050 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5051 where
5052 SIC = scalar iteration cost, VIC = vector iteration cost,
5053 VOC = vector outside cost, VF = vectorization factor,
5054 NPEEL = prologue iterations + epilogue iterations,
5055 SOC = scalar outside cost for run time cost model check. */
5057 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5058 - vec_inside_cost);
5059 if (saving_per_viter <= 0)
5061 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5062 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5063 "vectorization did not happen for a simd loop");
5065 if (dump_enabled_p ())
5066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5067 "cost model: the vector iteration cost = %d "
5068 "divided by the scalar iteration cost = %d "
5069 "is greater or equal to the vectorization factor = %d"
5070 ".\n",
5071 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5072 *ret_min_profitable_niters = -1;
5073 *ret_min_profitable_estimate = -1;
5074 return;
5077 /* ??? The "if" arm is written to handle all cases; see below for what
5078 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5079 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5081 /* Rewriting the condition above in terms of the number of
5082 vector iterations (vniters) rather than the number of
5083 scalar iterations (niters) gives:
5085 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5087 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5089 For integer N, X and Y when X > 0:
5091 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5092 int outside_overhead = (vec_outside_cost
5093 - scalar_single_iter_cost * peel_iters_prologue
5094 - scalar_single_iter_cost * peel_iters_epilogue
5095 - scalar_outside_cost);
5096 /* We're only interested in cases that require at least one
5097 vector iteration. */
5098 int min_vec_niters = 1;
5099 if (outside_overhead > 0)
5100 min_vec_niters = outside_overhead / saving_per_viter + 1;
5102 if (dump_enabled_p ())
5103 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5104 min_vec_niters);
5106 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5108 /* Now that we know the minimum number of vector iterations,
5109 find the minimum niters for which the scalar cost is larger:
5111 SIC * niters > VIC * vniters + VOC - SOC
5113 We know that the minimum niters is no more than
5114 vniters * VF + NPEEL, but it might be (and often is) less
5115 than that if a partial vector iteration is cheaper than the
5116 equivalent scalar code. */
5117 int threshold = (vec_inside_cost * min_vec_niters
5118 + vec_outside_cost
5119 - scalar_outside_cost);
5120 if (threshold <= 0)
5121 min_profitable_iters = 1;
5122 else
5123 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5125 else
5126 /* Convert the number of vector iterations into a number of
5127 scalar iterations. */
5128 min_profitable_iters = (min_vec_niters * assumed_vf
5129 + peel_iters_prologue
5130 + peel_iters_epilogue);
5132 else
5134 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5135 * assumed_vf
5136 - vec_inside_cost * peel_iters_prologue
5137 - vec_inside_cost * peel_iters_epilogue);
5138 if (min_profitable_iters <= 0)
5139 min_profitable_iters = 0;
5140 else
5142 min_profitable_iters /= saving_per_viter;
5144 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5145 <= (((int) vec_inside_cost * min_profitable_iters)
5146 + (((int) vec_outside_cost - scalar_outside_cost)
5147 * assumed_vf)))
5148 min_profitable_iters++;
5152 if (dump_enabled_p ())
5153 dump_printf (MSG_NOTE,
5154 " Calculated minimum iters for profitability: %d\n",
5155 min_profitable_iters);
5157 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5158 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5159 /* We want the vectorized loop to execute at least once. */
5160 min_profitable_iters = assumed_vf + peel_iters_prologue;
5161 else if (min_profitable_iters < peel_iters_prologue)
5162 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5163 vectorized loop executes at least once. */
5164 min_profitable_iters = peel_iters_prologue;
5166 if (dump_enabled_p ())
5167 dump_printf_loc (MSG_NOTE, vect_location,
5168 " Runtime profitability threshold = %d\n",
5169 min_profitable_iters);
5171 *ret_min_profitable_niters = min_profitable_iters;
5173 /* Calculate number of iterations required to make the vector version
5174 profitable, relative to the loop bodies only.
5176 Non-vectorized variant is SIC * niters and it must win over vector
5177 variant on the expected loop trip count. The following condition must hold true:
5178 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5180 if (vec_outside_cost <= 0)
5181 min_profitable_estimate = 0;
5182 /* ??? This "else if" arm is written to handle all cases; see below for
5183 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5184 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5186 /* This is a repeat of the code above, but with + SOC rather
5187 than - SOC. */
5188 int outside_overhead = (vec_outside_cost
5189 - scalar_single_iter_cost * peel_iters_prologue
5190 - scalar_single_iter_cost * peel_iters_epilogue
5191 + scalar_outside_cost);
5192 int min_vec_niters = 1;
5193 if (outside_overhead > 0)
5194 min_vec_niters = outside_overhead / saving_per_viter + 1;
5196 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5198 int threshold = (vec_inside_cost * min_vec_niters
5199 + vec_outside_cost
5200 + scalar_outside_cost);
5201 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5203 else
5204 min_profitable_estimate = (min_vec_niters * assumed_vf
5205 + peel_iters_prologue
5206 + peel_iters_epilogue);
5208 else
5210 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5211 * assumed_vf
5212 - vec_inside_cost * peel_iters_prologue
5213 - vec_inside_cost * peel_iters_epilogue)
5214 / ((scalar_single_iter_cost * assumed_vf)
5215 - vec_inside_cost);
5217 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5218 if (dump_enabled_p ())
5219 dump_printf_loc (MSG_NOTE, vect_location,
5220 " Static estimate profitability threshold = %d\n",
5221 min_profitable_estimate);
5223 *ret_min_profitable_estimate = min_profitable_estimate;
5226 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5227 vector elements (not bits) for a vector with NELT elements. */
5228 static void
5229 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5230 vec_perm_builder *sel)
5232 /* The encoding is a single stepped pattern. Any wrap-around is handled
5233 by vec_perm_indices. */
5234 sel->new_vector (nelt, 1, 3);
5235 for (unsigned int i = 0; i < 3; i++)
5236 sel->quick_push (i + offset);
5239 /* Checks whether the target supports whole-vector shifts for vectors of mode
5240 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5241 it supports vec_perm_const with masks for all necessary shift amounts. */
5242 static bool
5243 have_whole_vector_shift (machine_mode mode)
5245 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5246 return true;
5248 /* Variable-length vectors should be handled via the optab. */
5249 unsigned int nelt;
5250 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5251 return false;
5253 vec_perm_builder sel;
5254 vec_perm_indices indices;
5255 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5257 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5258 indices.new_vector (sel, 2, nelt);
5259 if (!can_vec_perm_const_p (mode, mode, indices, false))
5260 return false;
5262 return true;
5265 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5266 multiplication operands have differing signs and (b) we intend
5267 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5268 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5270 static bool
5271 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5272 stmt_vec_info stmt_info)
5274 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5275 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5276 return false;
5278 tree rhs1 = gimple_assign_rhs1 (assign);
5279 tree rhs2 = gimple_assign_rhs2 (assign);
5280 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5281 return false;
5283 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5284 gcc_assert (reduc_info->is_reduc_info);
5285 return !directly_supported_p (DOT_PROD_EXPR,
5286 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5287 optab_vector_mixed_sign);
5290 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5291 functions. Design better to avoid maintenance issues. */
5293 /* Function vect_model_reduction_cost.
5295 Models cost for a reduction operation, including the vector ops
5296 generated within the strip-mine loop in some cases, the initial
5297 definition before the loop, and the epilogue code that must be generated. */
5299 static void
5300 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5301 stmt_vec_info stmt_info, internal_fn reduc_fn,
5302 vect_reduction_type reduction_type,
5303 int ncopies, stmt_vector_for_cost *cost_vec)
5305 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5306 tree vectype;
5307 machine_mode mode;
5308 class loop *loop = NULL;
5310 if (loop_vinfo)
5311 loop = LOOP_VINFO_LOOP (loop_vinfo);
5313 /* Condition reductions generate two reductions in the loop. */
5314 if (reduction_type == COND_REDUCTION)
5315 ncopies *= 2;
5317 vectype = STMT_VINFO_VECTYPE (stmt_info);
5318 mode = TYPE_MODE (vectype);
5319 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5321 gimple_match_op op;
5322 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5323 gcc_unreachable ();
5325 bool emulated_mixed_dot_prod
5326 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5327 if (reduction_type == EXTRACT_LAST_REDUCTION)
5328 /* No extra instructions are needed in the prologue. The loop body
5329 operations are costed in vectorizable_condition. */
5330 inside_cost = 0;
5331 else if (reduction_type == FOLD_LEFT_REDUCTION)
5333 /* No extra instructions needed in the prologue. */
5334 prologue_cost = 0;
5336 if (reduc_fn != IFN_LAST)
5337 /* Count one reduction-like operation per vector. */
5338 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5339 stmt_info, 0, vect_body);
5340 else
5342 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5343 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5344 inside_cost = record_stmt_cost (cost_vec, nelements,
5345 vec_to_scalar, stmt_info, 0,
5346 vect_body);
5347 inside_cost += record_stmt_cost (cost_vec, nelements,
5348 scalar_stmt, stmt_info, 0,
5349 vect_body);
5352 else
5354 /* Add in the cost of the initial definitions. */
5355 int prologue_stmts;
5356 if (reduction_type == COND_REDUCTION)
5357 /* For cond reductions we have four vectors: initial index, step,
5358 initial result of the data reduction, initial value of the index
5359 reduction. */
5360 prologue_stmts = 4;
5361 else if (emulated_mixed_dot_prod)
5362 /* We need the initial reduction value and two invariants:
5363 one that contains the minimum signed value and one that
5364 contains half of its negative. */
5365 prologue_stmts = 3;
5366 else
5367 prologue_stmts = 1;
5368 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5369 scalar_to_vec, stmt_info, 0,
5370 vect_prologue);
5373 /* Determine cost of epilogue code.
5375 We have a reduction operator that will reduce the vector in one statement.
5376 Also requires scalar extract. */
5378 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5380 if (reduc_fn != IFN_LAST)
5382 if (reduction_type == COND_REDUCTION)
5384 /* An EQ stmt and an COND_EXPR stmt. */
5385 epilogue_cost += record_stmt_cost (cost_vec, 2,
5386 vector_stmt, stmt_info, 0,
5387 vect_epilogue);
5388 /* Reduction of the max index and a reduction of the found
5389 values. */
5390 epilogue_cost += record_stmt_cost (cost_vec, 2,
5391 vec_to_scalar, stmt_info, 0,
5392 vect_epilogue);
5393 /* A broadcast of the max value. */
5394 epilogue_cost += record_stmt_cost (cost_vec, 1,
5395 scalar_to_vec, stmt_info, 0,
5396 vect_epilogue);
5398 else
5400 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5401 stmt_info, 0, vect_epilogue);
5402 epilogue_cost += record_stmt_cost (cost_vec, 1,
5403 vec_to_scalar, stmt_info, 0,
5404 vect_epilogue);
5407 else if (reduction_type == COND_REDUCTION)
5409 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5410 /* Extraction of scalar elements. */
5411 epilogue_cost += record_stmt_cost (cost_vec,
5412 2 * estimated_nunits,
5413 vec_to_scalar, stmt_info, 0,
5414 vect_epilogue);
5415 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5416 epilogue_cost += record_stmt_cost (cost_vec,
5417 2 * estimated_nunits - 3,
5418 scalar_stmt, stmt_info, 0,
5419 vect_epilogue);
5421 else if (reduction_type == EXTRACT_LAST_REDUCTION
5422 || reduction_type == FOLD_LEFT_REDUCTION)
5423 /* No extra instructions need in the epilogue. */
5425 else
5427 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5428 tree bitsize = TYPE_SIZE (op.type);
5429 int element_bitsize = tree_to_uhwi (bitsize);
5430 int nelements = vec_size_in_bits / element_bitsize;
5432 if (op.code == COND_EXPR)
5433 op.code = MAX_EXPR;
5435 /* We have a whole vector shift available. */
5436 if (VECTOR_MODE_P (mode)
5437 && directly_supported_p (op.code, vectype)
5438 && have_whole_vector_shift (mode))
5440 /* Final reduction via vector shifts and the reduction operator.
5441 Also requires scalar extract. */
5442 epilogue_cost += record_stmt_cost (cost_vec,
5443 exact_log2 (nelements) * 2,
5444 vector_stmt, stmt_info, 0,
5445 vect_epilogue);
5446 epilogue_cost += record_stmt_cost (cost_vec, 1,
5447 vec_to_scalar, stmt_info, 0,
5448 vect_epilogue);
5450 else
5451 /* Use extracts and reduction op for final reduction. For N
5452 elements, we have N extracts and N-1 reduction ops. */
5453 epilogue_cost += record_stmt_cost (cost_vec,
5454 nelements + nelements - 1,
5455 vector_stmt, stmt_info, 0,
5456 vect_epilogue);
5460 if (dump_enabled_p ())
5461 dump_printf (MSG_NOTE,
5462 "vect_model_reduction_cost: inside_cost = %d, "
5463 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5464 prologue_cost, epilogue_cost);
5467 /* SEQ is a sequence of instructions that initialize the reduction
5468 described by REDUC_INFO. Emit them in the appropriate place. */
5470 static void
5471 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5472 stmt_vec_info reduc_info, gimple *seq)
5474 if (reduc_info->reused_accumulator)
5476 /* When reusing an accumulator from the main loop, we only need
5477 initialization instructions if the main loop can be skipped.
5478 In that case, emit the initialization instructions at the end
5479 of the guard block that does the skip. */
5480 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5481 gcc_assert (skip_edge);
5482 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5483 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5485 else
5487 /* The normal case: emit the initialization instructions on the
5488 preheader edge. */
5489 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5490 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5494 /* Function get_initial_def_for_reduction
5496 Input:
5497 REDUC_INFO - the info_for_reduction
5498 INIT_VAL - the initial value of the reduction variable
5499 NEUTRAL_OP - a value that has no effect on the reduction, as per
5500 neutral_op_for_reduction
5502 Output:
5503 Return a vector variable, initialized according to the operation that
5504 STMT_VINFO performs. This vector will be used as the initial value
5505 of the vector of partial results.
5507 The value we need is a vector in which element 0 has value INIT_VAL
5508 and every other element has value NEUTRAL_OP. */
5510 static tree
5511 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5512 stmt_vec_info reduc_info,
5513 tree init_val, tree neutral_op)
5515 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5516 tree scalar_type = TREE_TYPE (init_val);
5517 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5518 tree init_def;
5519 gimple_seq stmts = NULL;
5521 gcc_assert (vectype);
5523 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5524 || SCALAR_FLOAT_TYPE_P (scalar_type));
5526 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5527 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5529 if (operand_equal_p (init_val, neutral_op))
5531 /* If both elements are equal then the vector described above is
5532 just a splat. */
5533 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5534 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5536 else
5538 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5539 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5540 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5542 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5543 element 0. */
5544 init_def = gimple_build_vector_from_val (&stmts, vectype,
5545 neutral_op);
5546 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5547 vectype, init_def, init_val);
5549 else
5551 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5552 tree_vector_builder elts (vectype, 1, 2);
5553 elts.quick_push (init_val);
5554 elts.quick_push (neutral_op);
5555 init_def = gimple_build_vector (&stmts, &elts);
5559 if (stmts)
5560 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5561 return init_def;
5564 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5565 which performs a reduction involving GROUP_SIZE scalar statements.
5566 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5567 is nonnull, introducing extra elements of that value will not change the
5568 result. */
5570 static void
5571 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5572 stmt_vec_info reduc_info,
5573 vec<tree> *vec_oprnds,
5574 unsigned int number_of_vectors,
5575 unsigned int group_size, tree neutral_op)
5577 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5578 unsigned HOST_WIDE_INT nunits;
5579 unsigned j, number_of_places_left_in_vector;
5580 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5581 unsigned int i;
5583 gcc_assert (group_size == initial_values.length () || neutral_op);
5585 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5586 created vectors. It is greater than 1 if unrolling is performed.
5588 For example, we have two scalar operands, s1 and s2 (e.g., group of
5589 strided accesses of size two), while NUNITS is four (i.e., four scalars
5590 of this type can be packed in a vector). The output vector will contain
5591 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5592 will be 2).
5594 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5595 vectors containing the operands.
5597 For example, NUNITS is four as before, and the group size is 8
5598 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5599 {s5, s6, s7, s8}. */
5601 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5602 nunits = group_size;
5604 number_of_places_left_in_vector = nunits;
5605 bool constant_p = true;
5606 tree_vector_builder elts (vector_type, nunits, 1);
5607 elts.quick_grow (nunits);
5608 gimple_seq ctor_seq = NULL;
5609 for (j = 0; j < nunits * number_of_vectors; ++j)
5611 tree op;
5612 i = j % group_size;
5614 /* Get the def before the loop. In reduction chain we have only
5615 one initial value. Else we have as many as PHIs in the group. */
5616 if (i >= initial_values.length () || (j > i && neutral_op))
5617 op = neutral_op;
5618 else
5619 op = initial_values[i];
5621 /* Create 'vect_ = {op0,op1,...,opn}'. */
5622 number_of_places_left_in_vector--;
5623 elts[nunits - number_of_places_left_in_vector - 1] = op;
5624 if (!CONSTANT_CLASS_P (op))
5625 constant_p = false;
5627 if (number_of_places_left_in_vector == 0)
5629 tree init;
5630 if (constant_p && !neutral_op
5631 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5632 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5633 /* Build the vector directly from ELTS. */
5634 init = gimple_build_vector (&ctor_seq, &elts);
5635 else if (neutral_op)
5637 /* Build a vector of the neutral value and shift the
5638 other elements into place. */
5639 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5640 neutral_op);
5641 int k = nunits;
5642 while (k > 0 && elts[k - 1] == neutral_op)
5643 k -= 1;
5644 while (k > 0)
5646 k -= 1;
5647 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5648 vector_type, init, elts[k]);
5651 else
5653 /* First time round, duplicate ELTS to fill the
5654 required number of vectors. */
5655 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5656 elts, number_of_vectors, *vec_oprnds);
5657 break;
5659 vec_oprnds->quick_push (init);
5661 number_of_places_left_in_vector = nunits;
5662 elts.new_vector (vector_type, nunits, 1);
5663 elts.quick_grow (nunits);
5664 constant_p = true;
5667 if (ctor_seq != NULL)
5668 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5671 /* For a statement STMT_INFO taking part in a reduction operation return
5672 the stmt_vec_info the meta information is stored on. */
5674 stmt_vec_info
5675 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5677 stmt_info = vect_orig_stmt (stmt_info);
5678 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5679 if (!is_a <gphi *> (stmt_info->stmt)
5680 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5681 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5682 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5683 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5685 if (gimple_phi_num_args (phi) == 1)
5686 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5688 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5690 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5691 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5692 stmt_info = info;
5694 return stmt_info;
5697 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5698 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5699 return false. */
5701 static bool
5702 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5703 stmt_vec_info reduc_info)
5705 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5706 if (!main_loop_vinfo)
5707 return false;
5709 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5710 return false;
5712 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5713 auto_vec<tree, 16> main_loop_results (num_phis);
5714 auto_vec<tree, 16> initial_values (num_phis);
5715 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5717 /* The epilogue loop can be entered either from the main loop or
5718 from an earlier guard block. */
5719 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5720 for (tree incoming_value : reduc_info->reduc_initial_values)
5722 /* Look for:
5724 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5725 INITIAL_VALUE(guard block)>. */
5726 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5728 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5729 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5731 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5732 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5734 main_loop_results.quick_push (from_main_loop);
5735 initial_values.quick_push (from_skip);
5738 else
5739 /* The main loop dominates the epilogue loop. */
5740 main_loop_results.splice (reduc_info->reduc_initial_values);
5742 /* See if the main loop has the kind of accumulator we need. */
5743 vect_reusable_accumulator *accumulator
5744 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5745 if (!accumulator
5746 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5747 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5748 accumulator->reduc_info->reduc_scalar_results.begin ()))
5749 return false;
5751 /* Handle the case where we can reduce wider vectors to narrower ones. */
5752 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5753 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5754 unsigned HOST_WIDE_INT m;
5755 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5756 TYPE_VECTOR_SUBPARTS (vectype), &m))
5757 return false;
5758 /* Check the intermediate vector types and operations are available. */
5759 tree prev_vectype = old_vectype;
5760 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5761 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5763 intermediate_nunits = exact_div (intermediate_nunits, 2);
5764 tree intermediate_vectype = get_related_vectype_for_scalar_type
5765 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5766 if (!intermediate_vectype
5767 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5768 intermediate_vectype)
5769 || !can_vec_extract (TYPE_MODE (prev_vectype),
5770 TYPE_MODE (intermediate_vectype)))
5771 return false;
5772 prev_vectype = intermediate_vectype;
5775 /* Non-SLP reductions might apply an adjustment after the reduction
5776 operation, in order to simplify the initialization of the accumulator.
5777 If the epilogue loop carries on from where the main loop left off,
5778 it should apply the same adjustment to the final reduction result.
5780 If the epilogue loop can also be entered directly (rather than via
5781 the main loop), we need to be able to handle that case in the same way,
5782 with the same adjustment. (In principle we could add a PHI node
5783 to select the correct adjustment, but in practice that shouldn't be
5784 necessary.) */
5785 tree main_adjustment
5786 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5787 if (loop_vinfo->main_loop_edge && main_adjustment)
5789 gcc_assert (num_phis == 1);
5790 tree initial_value = initial_values[0];
5791 /* Check that we can use INITIAL_VALUE as the adjustment and
5792 initialize the accumulator with a neutral value instead. */
5793 if (!operand_equal_p (initial_value, main_adjustment))
5794 return false;
5795 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5796 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5797 code, initial_value);
5799 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5800 reduc_info->reduc_initial_values.truncate (0);
5801 reduc_info->reduc_initial_values.splice (initial_values);
5802 reduc_info->reused_accumulator = accumulator;
5803 return true;
5806 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5807 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5809 static tree
5810 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5811 gimple_seq *seq)
5813 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5814 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5815 tree stype = TREE_TYPE (vectype);
5816 tree new_temp = vec_def;
5817 while (nunits > nunits1)
5819 nunits /= 2;
5820 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5821 stype, nunits);
5822 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5824 /* The target has to make sure we support lowpart/highpart
5825 extraction, either via direct vector extract or through
5826 an integer mode punning. */
5827 tree dst1, dst2;
5828 gimple *epilog_stmt;
5829 if (convert_optab_handler (vec_extract_optab,
5830 TYPE_MODE (TREE_TYPE (new_temp)),
5831 TYPE_MODE (vectype1))
5832 != CODE_FOR_nothing)
5834 /* Extract sub-vectors directly once vec_extract becomes
5835 a conversion optab. */
5836 dst1 = make_ssa_name (vectype1);
5837 epilog_stmt
5838 = gimple_build_assign (dst1, BIT_FIELD_REF,
5839 build3 (BIT_FIELD_REF, vectype1,
5840 new_temp, TYPE_SIZE (vectype1),
5841 bitsize_int (0)));
5842 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5843 dst2 = make_ssa_name (vectype1);
5844 epilog_stmt
5845 = gimple_build_assign (dst2, BIT_FIELD_REF,
5846 build3 (BIT_FIELD_REF, vectype1,
5847 new_temp, TYPE_SIZE (vectype1),
5848 bitsize_int (bitsize)));
5849 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5851 else
5853 /* Extract via punning to appropriately sized integer mode
5854 vector. */
5855 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5856 tree etype = build_vector_type (eltype, 2);
5857 gcc_assert (convert_optab_handler (vec_extract_optab,
5858 TYPE_MODE (etype),
5859 TYPE_MODE (eltype))
5860 != CODE_FOR_nothing);
5861 tree tem = make_ssa_name (etype);
5862 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5863 build1 (VIEW_CONVERT_EXPR,
5864 etype, new_temp));
5865 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5866 new_temp = tem;
5867 tem = make_ssa_name (eltype);
5868 epilog_stmt
5869 = gimple_build_assign (tem, BIT_FIELD_REF,
5870 build3 (BIT_FIELD_REF, eltype,
5871 new_temp, TYPE_SIZE (eltype),
5872 bitsize_int (0)));
5873 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5874 dst1 = make_ssa_name (vectype1);
5875 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5876 build1 (VIEW_CONVERT_EXPR,
5877 vectype1, tem));
5878 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5879 tem = make_ssa_name (eltype);
5880 epilog_stmt
5881 = gimple_build_assign (tem, BIT_FIELD_REF,
5882 build3 (BIT_FIELD_REF, eltype,
5883 new_temp, TYPE_SIZE (eltype),
5884 bitsize_int (bitsize)));
5885 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5886 dst2 = make_ssa_name (vectype1);
5887 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5888 build1 (VIEW_CONVERT_EXPR,
5889 vectype1, tem));
5890 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5893 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5896 return new_temp;
5899 /* Retrieves the definining statement to be used for a reduction.
5900 For LAST_VAL_REDUC_P we use the current VEC_STMTs which correspond to the
5901 final value after vectorization and otherwise we look at the reduction
5902 definitions to get the first. */
5904 tree
5905 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5906 slp_instance slp_node_instance, bool last_val_reduc_p,
5907 unsigned i, vec <gimple *> &vec_stmts)
5909 tree def;
5911 if (slp_node)
5913 if (!last_val_reduc_p)
5914 slp_node = slp_node_instance->reduc_phis;
5915 def = vect_get_slp_vect_def (slp_node, i);
5917 else
5919 if (!last_val_reduc_p)
5920 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5921 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5922 def = gimple_get_lhs (vec_stmts[0]);
5925 return def;
5928 /* Function vect_create_epilog_for_reduction
5930 Create code at the loop-epilog to finalize the result of a reduction
5931 computation.
5933 STMT_INFO is the scalar reduction stmt that is being vectorized.
5934 SLP_NODE is an SLP node containing a group of reduction statements. The
5935 first one in this group is STMT_INFO.
5936 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5937 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5938 (counting from 0)
5939 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5940 exit this edge is always the main loop exit.
5942 This function:
5943 1. Completes the reduction def-use cycles.
5944 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5945 by calling the function specified by REDUC_FN if available, or by
5946 other means (whole-vector shifts or a scalar loop).
5947 The function also creates a new phi node at the loop exit to preserve
5948 loop-closed form, as illustrated below.
5950 The flow at the entry to this function:
5952 loop:
5953 vec_def = phi <vec_init, null> # REDUCTION_PHI
5954 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5955 s_loop = scalar_stmt # (scalar) STMT_INFO
5956 loop_exit:
5957 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5958 use <s_out0>
5959 use <s_out0>
5961 The above is transformed by this function into:
5963 loop:
5964 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5965 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5966 s_loop = scalar_stmt # (scalar) STMT_INFO
5967 loop_exit:
5968 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5969 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5970 v_out2 = reduce <v_out1>
5971 s_out3 = extract_field <v_out2, 0>
5972 s_out4 = adjust_result <s_out3>
5973 use <s_out4>
5974 use <s_out4>
5977 static void
5978 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5979 stmt_vec_info stmt_info,
5980 slp_tree slp_node,
5981 slp_instance slp_node_instance,
5982 edge loop_exit)
5984 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5985 gcc_assert (reduc_info->is_reduc_info);
5986 /* For double reductions we need to get at the inner loop reduction
5987 stmt which has the meta info attached. Our stmt_info is that of the
5988 loop-closed PHI of the inner loop which we remember as
5989 def for the reduction PHI generation. */
5990 bool double_reduc = false;
5991 bool last_val_reduc_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit
5992 && !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
5993 stmt_vec_info rdef_info = stmt_info;
5994 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5996 gcc_assert (!slp_node);
5997 double_reduc = true;
5998 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5999 (stmt_info->stmt, 0));
6000 stmt_info = vect_stmt_to_vectorize (stmt_info);
6002 gphi *reduc_def_stmt
6003 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
6004 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6005 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6006 tree vectype;
6007 machine_mode mode;
6008 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6009 basic_block exit_bb;
6010 tree scalar_dest;
6011 tree scalar_type;
6012 gimple *new_phi = NULL, *phi = NULL;
6013 gimple_stmt_iterator exit_gsi;
6014 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6015 gimple *epilog_stmt = NULL;
6016 gimple *exit_phi;
6017 tree bitsize;
6018 tree def;
6019 tree orig_name, scalar_result;
6020 imm_use_iterator imm_iter, phi_imm_iter;
6021 use_operand_p use_p, phi_use_p;
6022 gimple *use_stmt;
6023 auto_vec<tree> reduc_inputs;
6024 int j, i;
6025 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6026 unsigned int group_size = 1, k;
6027 /* SLP reduction without reduction chain, e.g.,
6028 # a1 = phi <a2, a0>
6029 # b1 = phi <b2, b0>
6030 a2 = operation (a1)
6031 b2 = operation (b1) */
6032 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6033 bool direct_slp_reduc;
6034 tree induction_index = NULL_TREE;
6036 if (slp_node)
6037 group_size = SLP_TREE_LANES (slp_node);
6039 if (nested_in_vect_loop_p (loop, stmt_info))
6041 outer_loop = loop;
6042 loop = loop->inner;
6043 gcc_assert (!slp_node && double_reduc);
6046 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6047 gcc_assert (vectype);
6048 mode = TYPE_MODE (vectype);
6050 tree induc_val = NULL_TREE;
6051 tree adjustment_def = NULL;
6052 if (slp_node)
6054 else
6056 /* Optimize: for induction condition reduction, if we can't use zero
6057 for induc_val, use initial_def. */
6058 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6059 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6060 else if (double_reduc)
6062 else
6063 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6066 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6067 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6068 if (slp_reduc)
6069 /* All statements produce live-out values. */
6070 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6071 else if (slp_node)
6073 /* The last statement in the reduction chain produces the live-out
6074 value. Note SLP optimization can shuffle scalar stmts to
6075 optimize permutations so we have to search for the last stmt. */
6076 for (k = 0; k < group_size; ++k)
6077 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6079 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6080 break;
6084 unsigned vec_num;
6085 int ncopies;
6086 if (slp_node)
6088 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6089 ncopies = 1;
6091 else
6093 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6094 vec_num = 1;
6095 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6098 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6099 which is updated with the current index of the loop for every match of
6100 the original loop's cond_expr (VEC_STMT). This results in a vector
6101 containing the last time the condition passed for that vector lane.
6102 The first match will be a 1 to allow 0 to be used for non-matching
6103 indexes. If there are no matches at all then the vector will be all
6104 zeroes.
6106 PR92772: This algorithm is broken for architectures that support
6107 masked vectors, but do not provide fold_extract_last. */
6108 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6110 auto_vec<std::pair<tree, bool>, 2> ccompares;
6111 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6112 cond_info = vect_stmt_to_vectorize (cond_info);
6113 while (cond_info != reduc_info)
6115 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6117 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6118 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6119 ccompares.safe_push
6120 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6121 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6123 cond_info
6124 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6125 1 + STMT_VINFO_REDUC_IDX
6126 (cond_info)));
6127 cond_info = vect_stmt_to_vectorize (cond_info);
6129 gcc_assert (ccompares.length () != 0);
6131 tree indx_before_incr, indx_after_incr;
6132 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6133 int scalar_precision
6134 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6135 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6136 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6137 (TYPE_MODE (vectype), cr_index_scalar_type,
6138 TYPE_VECTOR_SUBPARTS (vectype));
6140 /* First we create a simple vector induction variable which starts
6141 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6142 vector size (STEP). */
6144 /* Create a {1,2,3,...} vector. */
6145 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6147 /* Create a vector of the step value. */
6148 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6149 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6151 /* Create an induction variable. */
6152 gimple_stmt_iterator incr_gsi;
6153 bool insert_after;
6154 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6155 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6156 insert_after, &indx_before_incr, &indx_after_incr);
6158 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6159 filled with zeros (VEC_ZERO). */
6161 /* Create a vector of 0s. */
6162 tree zero = build_zero_cst (cr_index_scalar_type);
6163 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6165 /* Create a vector phi node. */
6166 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6167 new_phi = create_phi_node (new_phi_tree, loop->header);
6168 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6169 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6171 /* Now take the condition from the loops original cond_exprs
6172 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6173 every match uses values from the induction variable
6174 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6175 (NEW_PHI_TREE).
6176 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6177 the new cond_expr (INDEX_COND_EXPR). */
6178 gimple_seq stmts = NULL;
6179 for (int i = ccompares.length () - 1; i != -1; --i)
6181 tree ccompare = ccompares[i].first;
6182 if (ccompares[i].second)
6183 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6184 cr_index_vector_type,
6185 ccompare,
6186 indx_before_incr, new_phi_tree);
6187 else
6188 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6189 cr_index_vector_type,
6190 ccompare,
6191 new_phi_tree, indx_before_incr);
6193 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6195 /* Update the phi with the vec cond. */
6196 induction_index = new_phi_tree;
6197 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6198 loop_latch_edge (loop), UNKNOWN_LOCATION);
6201 /* 2. Create epilog code.
6202 The reduction epilog code operates across the elements of the vector
6203 of partial results computed by the vectorized loop.
6204 The reduction epilog code consists of:
6206 step 1: compute the scalar result in a vector (v_out2)
6207 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6208 step 3: adjust the scalar result (s_out3) if needed.
6210 Step 1 can be accomplished using one the following three schemes:
6211 (scheme 1) using reduc_fn, if available.
6212 (scheme 2) using whole-vector shifts, if available.
6213 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6214 combined.
6216 The overall epilog code looks like this:
6218 s_out0 = phi <s_loop> # original EXIT_PHI
6219 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6220 v_out2 = reduce <v_out1> # step 1
6221 s_out3 = extract_field <v_out2, 0> # step 2
6222 s_out4 = adjust_result <s_out3> # step 3
6224 (step 3 is optional, and steps 1 and 2 may be combined).
6225 Lastly, the uses of s_out0 are replaced by s_out4. */
6228 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6229 v_out1 = phi <VECT_DEF>
6230 Store them in NEW_PHIS. */
6231 if (double_reduc)
6232 loop = outer_loop;
6233 /* We need to reduce values in all exits. */
6234 exit_bb = loop_exit->dest;
6235 exit_gsi = gsi_after_labels (exit_bb);
6236 reduc_inputs.create (slp_node ? vec_num : ncopies);
6237 vec <gimple *> vec_stmts = vNULL;
6238 for (unsigned i = 0; i < vec_num; i++)
6240 gimple_seq stmts = NULL;
6241 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6242 last_val_reduc_p, i, vec_stmts);
6243 for (j = 0; j < ncopies; j++)
6245 tree new_def = copy_ssa_name (def);
6246 phi = create_phi_node (new_def, exit_bb);
6247 if (j)
6248 def = gimple_get_lhs (vec_stmts[j]);
6249 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6250 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6251 else
6253 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6254 SET_PHI_ARG_DEF (phi, k, def);
6256 new_def = gimple_convert (&stmts, vectype, new_def);
6257 reduc_inputs.quick_push (new_def);
6259 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6262 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6263 (i.e. when reduc_fn is not available) and in the final adjustment
6264 code (if needed). Also get the original scalar reduction variable as
6265 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6266 represents a reduction pattern), the tree-code and scalar-def are
6267 taken from the original stmt that the pattern-stmt (STMT) replaces.
6268 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6269 are taken from STMT. */
6271 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6272 if (orig_stmt_info != stmt_info)
6274 /* Reduction pattern */
6275 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6276 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6279 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6280 scalar_type = TREE_TYPE (scalar_dest);
6281 scalar_results.truncate (0);
6282 scalar_results.reserve_exact (group_size);
6283 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6284 bitsize = TYPE_SIZE (scalar_type);
6286 /* True if we should implement SLP_REDUC using native reduction operations
6287 instead of scalar operations. */
6288 direct_slp_reduc = (reduc_fn != IFN_LAST
6289 && slp_reduc
6290 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6292 /* In case of reduction chain, e.g.,
6293 # a1 = phi <a3, a0>
6294 a2 = operation (a1)
6295 a3 = operation (a2),
6297 we may end up with more than one vector result. Here we reduce them
6298 to one vector.
6300 The same is true for a SLP reduction, e.g.,
6301 # a1 = phi <a2, a0>
6302 # b1 = phi <b2, b0>
6303 a2 = operation (a1)
6304 b2 = operation (a2),
6306 where we can end up with more than one vector as well. We can
6307 easily accumulate vectors when the number of vector elements is
6308 a multiple of the SLP group size.
6310 The same is true if we couldn't use a single defuse cycle. */
6311 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6312 || direct_slp_reduc
6313 || (slp_reduc
6314 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6315 || ncopies > 1)
6317 gimple_seq stmts = NULL;
6318 tree single_input = reduc_inputs[0];
6319 for (k = 1; k < reduc_inputs.length (); k++)
6320 single_input = gimple_build (&stmts, code, vectype,
6321 single_input, reduc_inputs[k]);
6322 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6324 reduc_inputs.truncate (0);
6325 reduc_inputs.safe_push (single_input);
6328 tree orig_reduc_input = reduc_inputs[0];
6330 /* If this loop is an epilogue loop that can be skipped after the
6331 main loop, we can only share a reduction operation between the
6332 main loop and the epilogue if we put it at the target of the
6333 skip edge.
6335 We can still reuse accumulators if this check fails. Doing so has
6336 the minor(?) benefit of making the epilogue loop's scalar result
6337 independent of the main loop's scalar result. */
6338 bool unify_with_main_loop_p = false;
6339 if (reduc_info->reused_accumulator
6340 && loop_vinfo->skip_this_loop_edge
6341 && single_succ_p (exit_bb)
6342 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6344 unify_with_main_loop_p = true;
6346 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6347 reduc_inputs[0] = make_ssa_name (vectype);
6348 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6349 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6350 UNKNOWN_LOCATION);
6351 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6352 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6353 exit_gsi = gsi_after_labels (reduc_block);
6356 /* Shouldn't be used beyond this point. */
6357 exit_bb = nullptr;
6359 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6360 && reduc_fn != IFN_LAST)
6362 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6363 various data values where the condition matched and another vector
6364 (INDUCTION_INDEX) containing all the indexes of those matches. We
6365 need to extract the last matching index (which will be the index with
6366 highest value) and use this to index into the data vector.
6367 For the case where there were no matches, the data vector will contain
6368 all default values and the index vector will be all zeros. */
6370 /* Get various versions of the type of the vector of indexes. */
6371 tree index_vec_type = TREE_TYPE (induction_index);
6372 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6373 tree index_scalar_type = TREE_TYPE (index_vec_type);
6374 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6376 /* Get an unsigned integer version of the type of the data vector. */
6377 int scalar_precision
6378 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6379 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6380 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6381 vectype);
6383 /* First we need to create a vector (ZERO_VEC) of zeros and another
6384 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6385 can create using a MAX reduction and then expanding.
6386 In the case where the loop never made any matches, the max index will
6387 be zero. */
6389 /* Vector of {0, 0, 0,...}. */
6390 tree zero_vec = build_zero_cst (vectype);
6392 /* Find maximum value from the vector of found indexes. */
6393 tree max_index = make_ssa_name (index_scalar_type);
6394 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6395 1, induction_index);
6396 gimple_call_set_lhs (max_index_stmt, max_index);
6397 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6399 /* Vector of {max_index, max_index, max_index,...}. */
6400 tree max_index_vec = make_ssa_name (index_vec_type);
6401 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6402 max_index);
6403 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6404 max_index_vec_rhs);
6405 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6407 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6408 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6409 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6410 otherwise. Only one value should match, resulting in a vector
6411 (VEC_COND) with one data value and the rest zeros.
6412 In the case where the loop never made any matches, every index will
6413 match, resulting in a vector with all data values (which will all be
6414 the default value). */
6416 /* Compare the max index vector to the vector of found indexes to find
6417 the position of the max value. */
6418 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6419 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6420 induction_index,
6421 max_index_vec);
6422 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6424 /* Use the compare to choose either values from the data vector or
6425 zero. */
6426 tree vec_cond = make_ssa_name (vectype);
6427 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6428 vec_compare,
6429 reduc_inputs[0],
6430 zero_vec);
6431 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6433 /* Finally we need to extract the data value from the vector (VEC_COND)
6434 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6435 reduction, but because this doesn't exist, we can use a MAX reduction
6436 instead. The data value might be signed or a float so we need to cast
6437 it first.
6438 In the case where the loop never made any matches, the data values are
6439 all identical, and so will reduce down correctly. */
6441 /* Make the matched data values unsigned. */
6442 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6443 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6444 vec_cond);
6445 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6446 VIEW_CONVERT_EXPR,
6447 vec_cond_cast_rhs);
6448 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6450 /* Reduce down to a scalar value. */
6451 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6452 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6453 1, vec_cond_cast);
6454 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6455 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6457 /* Convert the reduced value back to the result type and set as the
6458 result. */
6459 gimple_seq stmts = NULL;
6460 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6461 data_reduc);
6462 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6463 scalar_results.safe_push (new_temp);
6465 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6466 && reduc_fn == IFN_LAST)
6468 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6469 idx = 0;
6470 idx_val = induction_index[0];
6471 val = data_reduc[0];
6472 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6473 if (induction_index[i] > idx_val)
6474 val = data_reduc[i], idx_val = induction_index[i];
6475 return val; */
6477 tree data_eltype = TREE_TYPE (vectype);
6478 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6479 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6480 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6481 /* Enforced by vectorizable_reduction, which ensures we have target
6482 support before allowing a conditional reduction on variable-length
6483 vectors. */
6484 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6485 tree idx_val = NULL_TREE, val = NULL_TREE;
6486 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6488 tree old_idx_val = idx_val;
6489 tree old_val = val;
6490 idx_val = make_ssa_name (idx_eltype);
6491 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6492 build3 (BIT_FIELD_REF, idx_eltype,
6493 induction_index,
6494 bitsize_int (el_size),
6495 bitsize_int (off)));
6496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6497 val = make_ssa_name (data_eltype);
6498 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6499 build3 (BIT_FIELD_REF,
6500 data_eltype,
6501 reduc_inputs[0],
6502 bitsize_int (el_size),
6503 bitsize_int (off)));
6504 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6505 if (off != 0)
6507 tree new_idx_val = idx_val;
6508 if (off != v_size - el_size)
6510 new_idx_val = make_ssa_name (idx_eltype);
6511 epilog_stmt = gimple_build_assign (new_idx_val,
6512 MAX_EXPR, idx_val,
6513 old_idx_val);
6514 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6516 tree cond = make_ssa_name (boolean_type_node);
6517 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6518 idx_val, old_idx_val);
6519 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6520 tree new_val = make_ssa_name (data_eltype);
6521 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6522 cond, val, old_val);
6523 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6524 idx_val = new_idx_val;
6525 val = new_val;
6528 /* Convert the reduced value back to the result type and set as the
6529 result. */
6530 gimple_seq stmts = NULL;
6531 val = gimple_convert (&stmts, scalar_type, val);
6532 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6533 scalar_results.safe_push (val);
6536 /* 2.3 Create the reduction code, using one of the three schemes described
6537 above. In SLP we simply need to extract all the elements from the
6538 vector (without reducing them), so we use scalar shifts. */
6539 else if (reduc_fn != IFN_LAST && !slp_reduc)
6541 tree tmp;
6542 tree vec_elem_type;
6544 /* Case 1: Create:
6545 v_out2 = reduc_expr <v_out1> */
6547 if (dump_enabled_p ())
6548 dump_printf_loc (MSG_NOTE, vect_location,
6549 "Reduce using direct vector reduction.\n");
6551 gimple_seq stmts = NULL;
6552 vec_elem_type = TREE_TYPE (vectype);
6553 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6554 vec_elem_type, reduc_inputs[0]);
6555 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6556 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6558 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6559 && induc_val)
6561 /* Earlier we set the initial value to be a vector if induc_val
6562 values. Check the result and if it is induc_val then replace
6563 with the original initial value, unless induc_val is
6564 the same as initial_def already. */
6565 tree zcompare = make_ssa_name (boolean_type_node);
6566 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6567 new_temp, induc_val);
6568 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6569 tree initial_def = reduc_info->reduc_initial_values[0];
6570 tmp = make_ssa_name (new_scalar_dest);
6571 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6572 initial_def, new_temp);
6573 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6574 new_temp = tmp;
6577 scalar_results.safe_push (new_temp);
6579 else if (direct_slp_reduc)
6581 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6582 with the elements for other SLP statements replaced with the
6583 neutral value. We can then do a normal reduction on each vector. */
6585 /* Enforced by vectorizable_reduction. */
6586 gcc_assert (reduc_inputs.length () == 1);
6587 gcc_assert (pow2p_hwi (group_size));
6589 gimple_seq seq = NULL;
6591 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6592 and the same element size as VECTYPE. */
6593 tree index = build_index_vector (vectype, 0, 1);
6594 tree index_type = TREE_TYPE (index);
6595 tree index_elt_type = TREE_TYPE (index_type);
6596 tree mask_type = truth_type_for (index_type);
6598 /* Create a vector that, for each element, identifies which of
6599 the REDUC_GROUP_SIZE results should use it. */
6600 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6601 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6602 build_vector_from_val (index_type, index_mask));
6604 /* Get a neutral vector value. This is simply a splat of the neutral
6605 scalar value if we have one, otherwise the initial scalar value
6606 is itself a neutral value. */
6607 tree vector_identity = NULL_TREE;
6608 tree neutral_op = NULL_TREE;
6609 if (slp_node)
6611 tree initial_value = NULL_TREE;
6612 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6613 initial_value = reduc_info->reduc_initial_values[0];
6614 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6615 initial_value, false);
6617 if (neutral_op)
6618 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6619 neutral_op);
6620 for (unsigned int i = 0; i < group_size; ++i)
6622 /* If there's no univeral neutral value, we can use the
6623 initial scalar value from the original PHI. This is used
6624 for MIN and MAX reduction, for example. */
6625 if (!neutral_op)
6627 tree scalar_value = reduc_info->reduc_initial_values[i];
6628 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6629 scalar_value);
6630 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6631 scalar_value);
6634 /* Calculate the equivalent of:
6636 sel[j] = (index[j] == i);
6638 which selects the elements of REDUC_INPUTS[0] that should
6639 be included in the result. */
6640 tree compare_val = build_int_cst (index_elt_type, i);
6641 compare_val = build_vector_from_val (index_type, compare_val);
6642 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6643 index, compare_val);
6645 /* Calculate the equivalent of:
6647 vec = seq ? reduc_inputs[0] : vector_identity;
6649 VEC is now suitable for a full vector reduction. */
6650 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6651 sel, reduc_inputs[0], vector_identity);
6653 /* Do the reduction and convert it to the appropriate type. */
6654 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6655 TREE_TYPE (vectype), vec);
6656 scalar = gimple_convert (&seq, scalar_type, scalar);
6657 scalar_results.safe_push (scalar);
6659 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6661 else
6663 bool reduce_with_shift;
6664 tree vec_temp;
6666 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6668 /* See if the target wants to do the final (shift) reduction
6669 in a vector mode of smaller size and first reduce upper/lower
6670 halves against each other. */
6671 enum machine_mode mode1 = mode;
6672 tree stype = TREE_TYPE (vectype);
6673 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6674 unsigned nunits1 = nunits;
6675 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6676 && reduc_inputs.length () == 1)
6678 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6679 /* For SLP reductions we have to make sure lanes match up, but
6680 since we're doing individual element final reduction reducing
6681 vector width here is even more important.
6682 ??? We can also separate lanes with permutes, for the common
6683 case of power-of-two group-size odd/even extracts would work. */
6684 if (slp_reduc && nunits != nunits1)
6686 nunits1 = least_common_multiple (nunits1, group_size);
6687 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6690 if (!slp_reduc
6691 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6692 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6694 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6695 stype, nunits1);
6696 reduce_with_shift = have_whole_vector_shift (mode1);
6697 if (!VECTOR_MODE_P (mode1)
6698 || !directly_supported_p (code, vectype1))
6699 reduce_with_shift = false;
6701 /* First reduce the vector to the desired vector size we should
6702 do shift reduction on by combining upper and lower halves. */
6703 gimple_seq stmts = NULL;
6704 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6705 code, &stmts);
6706 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6707 reduc_inputs[0] = new_temp;
6709 if (reduce_with_shift && !slp_reduc)
6711 int element_bitsize = tree_to_uhwi (bitsize);
6712 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6713 for variable-length vectors and also requires direct target support
6714 for loop reductions. */
6715 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6716 int nelements = vec_size_in_bits / element_bitsize;
6717 vec_perm_builder sel;
6718 vec_perm_indices indices;
6720 int elt_offset;
6722 tree zero_vec = build_zero_cst (vectype1);
6723 /* Case 2: Create:
6724 for (offset = nelements/2; offset >= 1; offset/=2)
6726 Create: va' = vec_shift <va, offset>
6727 Create: va = vop <va, va'>
6728 } */
6730 tree rhs;
6732 if (dump_enabled_p ())
6733 dump_printf_loc (MSG_NOTE, vect_location,
6734 "Reduce using vector shifts\n");
6736 gimple_seq stmts = NULL;
6737 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6738 for (elt_offset = nelements / 2;
6739 elt_offset >= 1;
6740 elt_offset /= 2)
6742 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6743 indices.new_vector (sel, 2, nelements);
6744 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6745 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6746 new_temp, zero_vec, mask);
6747 new_temp = gimple_build (&stmts, code,
6748 vectype1, new_name, new_temp);
6750 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6752 /* 2.4 Extract the final scalar result. Create:
6753 s_out3 = extract_field <v_out2, bitpos> */
6755 if (dump_enabled_p ())
6756 dump_printf_loc (MSG_NOTE, vect_location,
6757 "extract scalar result\n");
6759 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6760 bitsize, bitsize_zero_node);
6761 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6762 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6763 gimple_assign_set_lhs (epilog_stmt, new_temp);
6764 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6765 scalar_results.safe_push (new_temp);
6767 else
6769 /* Case 3: Create:
6770 s = extract_field <v_out2, 0>
6771 for (offset = element_size;
6772 offset < vector_size;
6773 offset += element_size;)
6775 Create: s' = extract_field <v_out2, offset>
6776 Create: s = op <s, s'> // For non SLP cases
6777 } */
6779 if (dump_enabled_p ())
6780 dump_printf_loc (MSG_NOTE, vect_location,
6781 "Reduce using scalar code.\n");
6783 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6784 int element_bitsize = tree_to_uhwi (bitsize);
6785 tree compute_type = TREE_TYPE (vectype);
6786 gimple_seq stmts = NULL;
6787 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6789 int bit_offset;
6790 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6791 vec_temp, bitsize, bitsize_zero_node);
6793 /* In SLP we don't need to apply reduction operation, so we just
6794 collect s' values in SCALAR_RESULTS. */
6795 if (slp_reduc)
6796 scalar_results.safe_push (new_temp);
6798 for (bit_offset = element_bitsize;
6799 bit_offset < vec_size_in_bits;
6800 bit_offset += element_bitsize)
6802 tree bitpos = bitsize_int (bit_offset);
6803 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6804 compute_type, vec_temp,
6805 bitsize, bitpos);
6806 if (slp_reduc)
6808 /* In SLP we don't need to apply reduction operation, so
6809 we just collect s' values in SCALAR_RESULTS. */
6810 new_temp = new_name;
6811 scalar_results.safe_push (new_name);
6813 else
6814 new_temp = gimple_build (&stmts, code, compute_type,
6815 new_name, new_temp);
6819 /* The only case where we need to reduce scalar results in SLP, is
6820 unrolling. If the size of SCALAR_RESULTS is greater than
6821 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6822 REDUC_GROUP_SIZE. */
6823 if (slp_reduc)
6825 tree res, first_res, new_res;
6827 /* Reduce multiple scalar results in case of SLP unrolling. */
6828 for (j = group_size; scalar_results.iterate (j, &res);
6829 j++)
6831 first_res = scalar_results[j % group_size];
6832 new_res = gimple_build (&stmts, code, compute_type,
6833 first_res, res);
6834 scalar_results[j % group_size] = new_res;
6836 scalar_results.truncate (group_size);
6837 for (k = 0; k < group_size; k++)
6838 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6839 scalar_results[k]);
6841 else
6843 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6844 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6845 scalar_results.safe_push (new_temp);
6848 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6851 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6852 && induc_val)
6854 /* Earlier we set the initial value to be a vector if induc_val
6855 values. Check the result and if it is induc_val then replace
6856 with the original initial value, unless induc_val is
6857 the same as initial_def already. */
6858 tree zcompare = make_ssa_name (boolean_type_node);
6859 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6860 induc_val);
6861 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6862 tree initial_def = reduc_info->reduc_initial_values[0];
6863 tree tmp = make_ssa_name (new_scalar_dest);
6864 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6865 initial_def, new_temp);
6866 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6867 scalar_results[0] = tmp;
6871 /* 2.5 Adjust the final result by the initial value of the reduction
6872 variable. (When such adjustment is not needed, then
6873 'adjustment_def' is zero). For example, if code is PLUS we create:
6874 new_temp = loop_exit_def + adjustment_def */
6876 if (adjustment_def)
6878 gcc_assert (!slp_reduc);
6879 gimple_seq stmts = NULL;
6880 if (double_reduc)
6882 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6883 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6884 new_temp = gimple_build (&stmts, code, vectype,
6885 reduc_inputs[0], adjustment_def);
6887 else
6889 new_temp = scalar_results[0];
6890 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6891 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6892 adjustment_def);
6893 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6894 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6895 new_temp, adjustment_def);
6896 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6899 epilog_stmt = gimple_seq_last_stmt (stmts);
6900 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6901 scalar_results[0] = new_temp;
6904 /* Record this operation if it could be reused by the epilogue loop. */
6905 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6906 && reduc_inputs.length () == 1)
6907 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6908 { orig_reduc_input, reduc_info });
6910 if (double_reduc)
6911 loop = outer_loop;
6913 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6914 phis with new adjusted scalar results, i.e., replace use <s_out0>
6915 with use <s_out4>.
6917 Transform:
6918 loop_exit:
6919 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6920 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6921 v_out2 = reduce <v_out1>
6922 s_out3 = extract_field <v_out2, 0>
6923 s_out4 = adjust_result <s_out3>
6924 use <s_out0>
6925 use <s_out0>
6927 into:
6929 loop_exit:
6930 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6931 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6932 v_out2 = reduce <v_out1>
6933 s_out3 = extract_field <v_out2, 0>
6934 s_out4 = adjust_result <s_out3>
6935 use <s_out4>
6936 use <s_out4> */
6938 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6939 auto_vec<gimple *> phis;
6940 for (k = 0; k < live_out_stmts.size (); k++)
6942 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6943 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6945 /* Find the loop-closed-use at the loop exit of the original scalar
6946 result. (The reduction result is expected to have two immediate uses,
6947 one at the latch block, and one at the loop exit). For double
6948 reductions we are looking for exit phis of the outer loop. */
6949 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6951 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6953 if (!is_gimple_debug (USE_STMT (use_p)))
6954 phis.safe_push (USE_STMT (use_p));
6956 else
6958 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6960 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6962 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6964 if (!flow_bb_inside_loop_p (loop,
6965 gimple_bb (USE_STMT (phi_use_p)))
6966 && !is_gimple_debug (USE_STMT (phi_use_p)))
6967 phis.safe_push (USE_STMT (phi_use_p));
6973 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6975 /* Replace the uses: */
6976 orig_name = PHI_RESULT (exit_phi);
6978 /* Look for a single use at the target of the skip edge. */
6979 if (unify_with_main_loop_p)
6981 use_operand_p use_p;
6982 gimple *user;
6983 if (!single_imm_use (orig_name, &use_p, &user))
6984 gcc_unreachable ();
6985 orig_name = gimple_get_lhs (user);
6988 scalar_result = scalar_results[k];
6989 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6991 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6992 SET_USE (use_p, scalar_result);
6993 update_stmt (use_stmt);
6997 phis.truncate (0);
7001 /* Return a vector of type VECTYPE that is equal to the vector select
7002 operation "MASK ? VEC : IDENTITY". Insert the select statements
7003 before GSI. */
7005 static tree
7006 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7007 tree vec, tree identity)
7009 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7010 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7011 mask, vec, identity);
7012 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7013 return cond;
7016 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7017 order, starting with LHS. Insert the extraction statements before GSI and
7018 associate the new scalar SSA names with variable SCALAR_DEST.
7019 If MASK is nonzero mask the input and then operate on it unconditionally.
7020 Return the SSA name for the result. */
7022 static tree
7023 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7024 tree_code code, tree lhs, tree vector_rhs,
7025 tree mask)
7027 tree vectype = TREE_TYPE (vector_rhs);
7028 tree scalar_type = TREE_TYPE (vectype);
7029 tree bitsize = TYPE_SIZE (scalar_type);
7030 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7031 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7033 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7034 to perform an unconditional element-wise reduction of it. */
7035 if (mask)
7037 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7038 "masked_vector_rhs");
7039 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7040 false);
7041 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7042 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7043 mask, vector_rhs, vector_identity);
7044 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7045 vector_rhs = masked_vector_rhs;
7048 for (unsigned HOST_WIDE_INT bit_offset = 0;
7049 bit_offset < vec_size_in_bits;
7050 bit_offset += element_bitsize)
7052 tree bitpos = bitsize_int (bit_offset);
7053 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7054 bitsize, bitpos);
7056 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7057 rhs = make_ssa_name (scalar_dest, stmt);
7058 gimple_assign_set_lhs (stmt, rhs);
7059 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7061 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7062 tree new_name = make_ssa_name (scalar_dest, stmt);
7063 gimple_assign_set_lhs (stmt, new_name);
7064 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7065 lhs = new_name;
7067 return lhs;
7070 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7071 type of the vector input. */
7073 static internal_fn
7074 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7076 internal_fn mask_reduc_fn;
7077 internal_fn mask_len_reduc_fn;
7079 switch (reduc_fn)
7081 case IFN_FOLD_LEFT_PLUS:
7082 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7083 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7084 break;
7086 default:
7087 return IFN_LAST;
7090 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7091 OPTIMIZE_FOR_SPEED))
7092 return mask_reduc_fn;
7093 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7094 OPTIMIZE_FOR_SPEED))
7095 return mask_len_reduc_fn;
7096 return IFN_LAST;
7099 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7100 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7101 statement. CODE is the operation performed by STMT_INFO and OPS are
7102 its scalar operands. REDUC_INDEX is the index of the operand in
7103 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7104 implements in-order reduction, or IFN_LAST if we should open-code it.
7105 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7106 that should be used to control the operation in a fully-masked loop. */
7108 static bool
7109 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7110 stmt_vec_info stmt_info,
7111 gimple_stmt_iterator *gsi,
7112 gimple **vec_stmt, slp_tree slp_node,
7113 gimple *reduc_def_stmt,
7114 code_helper code, internal_fn reduc_fn,
7115 tree *ops, int num_ops, tree vectype_in,
7116 int reduc_index, vec_loop_masks *masks,
7117 vec_loop_lens *lens)
7119 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7120 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7121 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7123 int ncopies;
7124 if (slp_node)
7125 ncopies = 1;
7126 else
7127 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7129 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7130 gcc_assert (ncopies == 1);
7132 bool is_cond_op = false;
7133 if (!code.is_tree_code ())
7135 code = conditional_internal_fn_code (internal_fn (code));
7136 gcc_assert (code != ERROR_MARK);
7137 is_cond_op = true;
7140 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7142 if (slp_node)
7144 if (is_cond_op)
7146 if (dump_enabled_p ())
7147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7148 "fold-left reduction on SLP not supported.\n");
7149 return false;
7152 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7153 TYPE_VECTOR_SUBPARTS (vectype_in)));
7156 /* The operands either come from a binary operation or an IFN_COND operation.
7157 The former is a gimple assign with binary rhs and the latter is a
7158 gimple call with four arguments. */
7159 gcc_assert (num_ops == 2 || num_ops == 4);
7160 tree op0, opmask;
7161 if (!is_cond_op)
7162 op0 = ops[1 - reduc_index];
7163 else
7165 op0 = ops[2 + (1 - reduc_index)];
7166 opmask = ops[0];
7167 gcc_assert (!slp_node);
7170 int group_size = 1;
7171 stmt_vec_info scalar_dest_def_info;
7172 auto_vec<tree> vec_oprnds0, vec_opmask;
7173 if (slp_node)
7175 auto_vec<vec<tree> > vec_defs (2);
7176 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7177 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7178 vec_defs[0].release ();
7179 vec_defs[1].release ();
7180 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7181 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7183 else
7185 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7186 op0, &vec_oprnds0);
7187 scalar_dest_def_info = stmt_info;
7189 /* For an IFN_COND_OP we also need the vector mask operand. */
7190 if (is_cond_op)
7191 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7192 opmask, &vec_opmask);
7195 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7196 tree scalar_dest = gimple_get_lhs (sdef);
7197 tree scalar_type = TREE_TYPE (scalar_dest);
7198 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7200 int vec_num = vec_oprnds0.length ();
7201 gcc_assert (vec_num == 1 || slp_node);
7202 tree vec_elem_type = TREE_TYPE (vectype_out);
7203 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7205 tree vector_identity = NULL_TREE;
7206 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7208 vector_identity = build_zero_cst (vectype_out);
7209 if (!HONOR_SIGNED_ZEROS (vectype_out))
7211 else
7213 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7214 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7215 vector_identity);
7219 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7220 int i;
7221 tree def0;
7222 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7224 gimple *new_stmt;
7225 tree mask = NULL_TREE;
7226 tree len = NULL_TREE;
7227 tree bias = NULL_TREE;
7228 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7229 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7230 else if (is_cond_op)
7231 mask = vec_opmask[0];
7232 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7234 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7235 i, 1);
7236 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7237 bias = build_int_cst (intQI_type_node, biasval);
7238 if (!is_cond_op)
7239 mask = build_minus_one_cst (truth_type_for (vectype_in));
7242 /* Handle MINUS by adding the negative. */
7243 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7245 tree negated = make_ssa_name (vectype_out);
7246 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7247 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7248 def0 = negated;
7251 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7252 && mask && mask_reduc_fn == IFN_LAST)
7253 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7254 vector_identity);
7256 /* On the first iteration the input is simply the scalar phi
7257 result, and for subsequent iterations it is the output of
7258 the preceding operation. */
7259 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7261 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7262 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7263 def0, mask, len, bias);
7264 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7265 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7266 def0, mask);
7267 else
7268 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7269 def0);
7270 /* For chained SLP reductions the output of the previous reduction
7271 operation serves as the input of the next. For the final statement
7272 the output cannot be a temporary - we reuse the original
7273 scalar destination of the last statement. */
7274 if (i != vec_num - 1)
7276 gimple_set_lhs (new_stmt, scalar_dest_var);
7277 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7278 gimple_set_lhs (new_stmt, reduc_var);
7281 else
7283 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7284 tree_code (code), reduc_var, def0,
7285 mask);
7286 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7287 /* Remove the statement, so that we can use the same code paths
7288 as for statements that we've just created. */
7289 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7290 gsi_remove (&tmp_gsi, true);
7293 if (i == vec_num - 1)
7295 gimple_set_lhs (new_stmt, scalar_dest);
7296 vect_finish_replace_stmt (loop_vinfo,
7297 scalar_dest_def_info,
7298 new_stmt);
7300 else
7301 vect_finish_stmt_generation (loop_vinfo,
7302 scalar_dest_def_info,
7303 new_stmt, gsi);
7305 if (slp_node)
7306 slp_node->push_vec_def (new_stmt);
7307 else
7309 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7310 *vec_stmt = new_stmt;
7314 return true;
7317 /* Function is_nonwrapping_integer_induction.
7319 Check if STMT_VINO (which is part of loop LOOP) both increments and
7320 does not cause overflow. */
7322 static bool
7323 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7325 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7326 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7327 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7328 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7329 widest_int ni, max_loop_value, lhs_max;
7330 wi::overflow_type overflow = wi::OVF_NONE;
7332 /* Make sure the loop is integer based. */
7333 if (TREE_CODE (base) != INTEGER_CST
7334 || TREE_CODE (step) != INTEGER_CST)
7335 return false;
7337 /* Check that the max size of the loop will not wrap. */
7339 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7340 return true;
7342 if (! max_stmt_executions (loop, &ni))
7343 return false;
7345 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7346 &overflow);
7347 if (overflow)
7348 return false;
7350 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7351 TYPE_SIGN (lhs_type), &overflow);
7352 if (overflow)
7353 return false;
7355 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7356 <= TYPE_PRECISION (lhs_type));
7359 /* Check if masking can be supported by inserting a conditional expression.
7360 CODE is the code for the operation. COND_FN is the conditional internal
7361 function, if it exists. VECTYPE_IN is the type of the vector input. */
7362 static bool
7363 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7364 tree vectype_in)
7366 if (cond_fn != IFN_LAST
7367 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7368 OPTIMIZE_FOR_SPEED))
7369 return false;
7371 if (code.is_tree_code ())
7372 switch (tree_code (code))
7374 case DOT_PROD_EXPR:
7375 case SAD_EXPR:
7376 return true;
7378 default:
7379 break;
7381 return false;
7384 /* Insert a conditional expression to enable masked vectorization. CODE is the
7385 code for the operation. VOP is the array of operands. MASK is the loop
7386 mask. GSI is a statement iterator used to place the new conditional
7387 expression. */
7388 static void
7389 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7390 gimple_stmt_iterator *gsi)
7392 switch (tree_code (code))
7394 case DOT_PROD_EXPR:
7396 tree vectype = TREE_TYPE (vop[1]);
7397 tree zero = build_zero_cst (vectype);
7398 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7399 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7400 mask, vop[1], zero);
7401 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7402 vop[1] = masked_op1;
7403 break;
7406 case SAD_EXPR:
7408 tree vectype = TREE_TYPE (vop[1]);
7409 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7410 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7411 mask, vop[1], vop[0]);
7412 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7413 vop[1] = masked_op1;
7414 break;
7417 default:
7418 gcc_unreachable ();
7422 /* Function vectorizable_reduction.
7424 Check if STMT_INFO performs a reduction operation that can be vectorized.
7425 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7426 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7427 Return true if STMT_INFO is vectorizable in this way.
7429 This function also handles reduction idioms (patterns) that have been
7430 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7431 may be of this form:
7432 X = pattern_expr (arg0, arg1, ..., X)
7433 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7434 sequence that had been detected and replaced by the pattern-stmt
7435 (STMT_INFO).
7437 This function also handles reduction of condition expressions, for example:
7438 for (int i = 0; i < N; i++)
7439 if (a[i] < value)
7440 last = a[i];
7441 This is handled by vectorising the loop and creating an additional vector
7442 containing the loop indexes for which "a[i] < value" was true. In the
7443 function epilogue this is reduced to a single max value and then used to
7444 index into the vector of results.
7446 In some cases of reduction patterns, the type of the reduction variable X is
7447 different than the type of the other arguments of STMT_INFO.
7448 In such cases, the vectype that is used when transforming STMT_INFO into
7449 a vector stmt is different than the vectype that is used to determine the
7450 vectorization factor, because it consists of a different number of elements
7451 than the actual number of elements that are being operated upon in parallel.
7453 For example, consider an accumulation of shorts into an int accumulator.
7454 On some targets it's possible to vectorize this pattern operating on 8
7455 shorts at a time (hence, the vectype for purposes of determining the
7456 vectorization factor should be V8HI); on the other hand, the vectype that
7457 is used to create the vector form is actually V4SI (the type of the result).
7459 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7460 indicates what is the actual level of parallelism (V8HI in the example), so
7461 that the right vectorization factor would be derived. This vectype
7462 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7463 be used to create the vectorized stmt. The right vectype for the vectorized
7464 stmt is obtained from the type of the result X:
7465 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7467 This means that, contrary to "regular" reductions (or "regular" stmts in
7468 general), the following equation:
7469 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7470 does *NOT* necessarily hold for reduction patterns. */
7472 bool
7473 vectorizable_reduction (loop_vec_info loop_vinfo,
7474 stmt_vec_info stmt_info, slp_tree slp_node,
7475 slp_instance slp_node_instance,
7476 stmt_vector_for_cost *cost_vec)
7478 tree vectype_in = NULL_TREE;
7479 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7480 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7481 stmt_vec_info cond_stmt_vinfo = NULL;
7482 int i;
7483 int ncopies;
7484 bool single_defuse_cycle = false;
7485 bool nested_cycle = false;
7486 bool double_reduc = false;
7487 int vec_num;
7488 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7489 tree cond_reduc_val = NULL_TREE;
7491 /* Make sure it was already recognized as a reduction computation. */
7492 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7493 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7494 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7495 return false;
7497 /* The stmt we store reduction analysis meta on. */
7498 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7499 reduc_info->is_reduc_info = true;
7501 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7503 if (is_a <gphi *> (stmt_info->stmt))
7505 if (slp_node)
7507 /* We eventually need to set a vector type on invariant
7508 arguments. */
7509 unsigned j;
7510 slp_tree child;
7511 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7512 if (!vect_maybe_update_slp_op_vectype
7513 (child, SLP_TREE_VECTYPE (slp_node)))
7515 if (dump_enabled_p ())
7516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7517 "incompatible vector types for "
7518 "invariants\n");
7519 return false;
7522 /* Analysis for double-reduction is done on the outer
7523 loop PHI, nested cycles have no further restrictions. */
7524 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7526 else
7527 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7528 return true;
7531 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7532 stmt_vec_info phi_info = stmt_info;
7533 if (!is_a <gphi *> (stmt_info->stmt))
7535 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7536 return true;
7538 if (slp_node)
7540 slp_node_instance->reduc_phis = slp_node;
7541 /* ??? We're leaving slp_node to point to the PHIs, we only
7542 need it to get at the number of vector stmts which wasn't
7543 yet initialized for the instance root. */
7545 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7547 use_operand_p use_p;
7548 gimple *use_stmt;
7549 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7550 &use_p, &use_stmt);
7551 gcc_assert (res);
7552 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7555 /* PHIs should not participate in patterns. */
7556 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7557 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7559 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7560 and compute the reduction chain length. Discover the real
7561 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7562 tree reduc_def
7563 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7564 loop_latch_edge
7565 (gimple_bb (reduc_def_phi)->loop_father));
7566 unsigned reduc_chain_length = 0;
7567 bool only_slp_reduc_chain = true;
7568 stmt_info = NULL;
7569 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7570 while (reduc_def != PHI_RESULT (reduc_def_phi))
7572 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7573 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7574 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7576 if (dump_enabled_p ())
7577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7578 "reduction chain broken by patterns.\n");
7579 return false;
7581 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7582 only_slp_reduc_chain = false;
7583 /* For epilogue generation live members of the chain need
7584 to point back to the PHI via their original stmt for
7585 info_for_reduction to work. For SLP we need to look at
7586 all lanes here - even though we only will vectorize from
7587 the SLP node with live lane zero the other live lanes also
7588 need to be identified as part of a reduction to be able
7589 to skip code generation for them. */
7590 if (slp_for_stmt_info)
7592 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7593 if (STMT_VINFO_LIVE_P (s))
7594 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7596 else if (STMT_VINFO_LIVE_P (vdef))
7597 STMT_VINFO_REDUC_DEF (def) = phi_info;
7598 gimple_match_op op;
7599 if (!gimple_extract_op (vdef->stmt, &op))
7601 if (dump_enabled_p ())
7602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7603 "reduction chain includes unsupported"
7604 " statement type.\n");
7605 return false;
7607 if (CONVERT_EXPR_CODE_P (op.code))
7609 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7611 if (dump_enabled_p ())
7612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7613 "conversion in the reduction chain.\n");
7614 return false;
7617 else if (!stmt_info)
7618 /* First non-conversion stmt. */
7619 stmt_info = vdef;
7620 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7621 reduc_chain_length++;
7622 if (!stmt_info && slp_node)
7623 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7625 /* PHIs should not participate in patterns. */
7626 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7628 if (nested_in_vect_loop_p (loop, stmt_info))
7630 loop = loop->inner;
7631 nested_cycle = true;
7634 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7635 element. */
7636 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7638 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7639 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7641 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7642 gcc_assert (slp_node
7643 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7645 /* 1. Is vectorizable reduction? */
7646 /* Not supportable if the reduction variable is used in the loop, unless
7647 it's a reduction chain. */
7648 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7649 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7650 return false;
7652 /* Reductions that are not used even in an enclosing outer-loop,
7653 are expected to be "live" (used out of the loop). */
7654 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7655 && !STMT_VINFO_LIVE_P (stmt_info))
7656 return false;
7658 /* 2. Has this been recognized as a reduction pattern?
7660 Check if STMT represents a pattern that has been recognized
7661 in earlier analysis stages. For stmts that represent a pattern,
7662 the STMT_VINFO_RELATED_STMT field records the last stmt in
7663 the original sequence that constitutes the pattern. */
7665 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7666 if (orig_stmt_info)
7668 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7669 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7672 /* 3. Check the operands of the operation. The first operands are defined
7673 inside the loop body. The last operand is the reduction variable,
7674 which is defined by the loop-header-phi. */
7676 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7677 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7678 gimple_match_op op;
7679 if (!gimple_extract_op (stmt_info->stmt, &op))
7680 gcc_unreachable ();
7681 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7682 || op.code == WIDEN_SUM_EXPR
7683 || op.code == SAD_EXPR);
7685 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7686 && !SCALAR_FLOAT_TYPE_P (op.type))
7687 return false;
7689 /* Do not try to vectorize bit-precision reductions. */
7690 if (!type_has_mode_precision_p (op.type))
7691 return false;
7693 /* For lane-reducing ops we're reducing the number of reduction PHIs
7694 which means the only use of that may be in the lane-reducing operation. */
7695 if (lane_reduc_code_p
7696 && reduc_chain_length != 1
7697 && !only_slp_reduc_chain)
7699 if (dump_enabled_p ())
7700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7701 "lane-reducing reduction with extra stmts.\n");
7702 return false;
7705 /* All uses but the last are expected to be defined in the loop.
7706 The last use is the reduction variable. In case of nested cycle this
7707 assumption is not true: we use reduc_index to record the index of the
7708 reduction variable. */
7709 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7710 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7711 /* We need to skip an extra operand for COND_EXPRs with embedded
7712 comparison. */
7713 unsigned opno_adjust = 0;
7714 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7715 opno_adjust = 1;
7716 for (i = 0; i < (int) op.num_ops; i++)
7718 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7719 if (i == 0 && op.code == COND_EXPR)
7720 continue;
7722 stmt_vec_info def_stmt_info;
7723 enum vect_def_type dt;
7724 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7725 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7726 &vectype_op[i], &def_stmt_info))
7728 if (dump_enabled_p ())
7729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7730 "use not simple.\n");
7731 return false;
7733 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7734 continue;
7736 /* For an IFN_COND_OP we might hit the reduction definition operand
7737 twice (once as definition, once as else). */
7738 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7739 continue;
7741 /* There should be only one cycle def in the stmt, the one
7742 leading to reduc_def. */
7743 if (VECTORIZABLE_CYCLE_DEF (dt))
7744 return false;
7746 if (!vectype_op[i])
7747 vectype_op[i]
7748 = get_vectype_for_scalar_type (loop_vinfo,
7749 TREE_TYPE (op.ops[i]), slp_op[i]);
7751 /* To properly compute ncopies we are interested in the widest
7752 non-reduction input type in case we're looking at a widening
7753 accumulation that we later handle in vect_transform_reduction. */
7754 if (lane_reduc_code_p
7755 && vectype_op[i]
7756 && (!vectype_in
7757 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7758 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7759 vectype_in = vectype_op[i];
7761 if (op.code == COND_EXPR)
7763 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7764 if (dt == vect_constant_def)
7766 cond_reduc_dt = dt;
7767 cond_reduc_val = op.ops[i];
7769 if (dt == vect_induction_def
7770 && def_stmt_info
7771 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7773 cond_reduc_dt = dt;
7774 cond_stmt_vinfo = def_stmt_info;
7778 if (!vectype_in)
7779 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7780 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7782 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7783 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7784 /* If we have a condition reduction, see if we can simplify it further. */
7785 if (v_reduc_type == COND_REDUCTION)
7787 if (slp_node)
7788 return false;
7790 /* When the condition uses the reduction value in the condition, fail. */
7791 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7793 if (dump_enabled_p ())
7794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7795 "condition depends on previous iteration\n");
7796 return false;
7799 if (reduc_chain_length == 1
7800 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7801 OPTIMIZE_FOR_SPEED)
7802 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7803 vectype_in,
7804 OPTIMIZE_FOR_SPEED)))
7806 if (dump_enabled_p ())
7807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7808 "optimizing condition reduction with"
7809 " FOLD_EXTRACT_LAST.\n");
7810 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7812 else if (cond_reduc_dt == vect_induction_def)
7814 tree base
7815 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7816 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7818 gcc_assert (TREE_CODE (base) == INTEGER_CST
7819 && TREE_CODE (step) == INTEGER_CST);
7820 cond_reduc_val = NULL_TREE;
7821 enum tree_code cond_reduc_op_code = ERROR_MARK;
7822 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7823 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7825 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7826 above base; punt if base is the minimum value of the type for
7827 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7828 else if (tree_int_cst_sgn (step) == -1)
7830 cond_reduc_op_code = MIN_EXPR;
7831 if (tree_int_cst_sgn (base) == -1)
7832 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7833 else if (tree_int_cst_lt (base,
7834 TYPE_MAX_VALUE (TREE_TYPE (base))))
7835 cond_reduc_val
7836 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7838 else
7840 cond_reduc_op_code = MAX_EXPR;
7841 if (tree_int_cst_sgn (base) == 1)
7842 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7843 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7844 base))
7845 cond_reduc_val
7846 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7848 if (cond_reduc_val)
7850 if (dump_enabled_p ())
7851 dump_printf_loc (MSG_NOTE, vect_location,
7852 "condition expression based on "
7853 "integer induction.\n");
7854 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7855 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7856 = cond_reduc_val;
7857 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7860 else if (cond_reduc_dt == vect_constant_def)
7862 enum vect_def_type cond_initial_dt;
7863 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7864 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7865 if (cond_initial_dt == vect_constant_def
7866 && types_compatible_p (TREE_TYPE (cond_initial_val),
7867 TREE_TYPE (cond_reduc_val)))
7869 tree e = fold_binary (LE_EXPR, boolean_type_node,
7870 cond_initial_val, cond_reduc_val);
7871 if (e && (integer_onep (e) || integer_zerop (e)))
7873 if (dump_enabled_p ())
7874 dump_printf_loc (MSG_NOTE, vect_location,
7875 "condition expression based on "
7876 "compile time constant.\n");
7877 /* Record reduction code at analysis stage. */
7878 STMT_VINFO_REDUC_CODE (reduc_info)
7879 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7880 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7886 if (STMT_VINFO_LIVE_P (phi_info))
7887 return false;
7889 if (slp_node)
7890 ncopies = 1;
7891 else
7892 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7894 gcc_assert (ncopies >= 1);
7896 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7898 if (nested_cycle)
7900 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7901 == vect_double_reduction_def);
7902 double_reduc = true;
7905 /* 4.2. Check support for the epilog operation.
7907 If STMT represents a reduction pattern, then the type of the
7908 reduction variable may be different than the type of the rest
7909 of the arguments. For example, consider the case of accumulation
7910 of shorts into an int accumulator; The original code:
7911 S1: int_a = (int) short_a;
7912 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7914 was replaced with:
7915 STMT: int_acc = widen_sum <short_a, int_acc>
7917 This means that:
7918 1. The tree-code that is used to create the vector operation in the
7919 epilog code (that reduces the partial results) is not the
7920 tree-code of STMT, but is rather the tree-code of the original
7921 stmt from the pattern that STMT is replacing. I.e, in the example
7922 above we want to use 'widen_sum' in the loop, but 'plus' in the
7923 epilog.
7924 2. The type (mode) we use to check available target support
7925 for the vector operation to be created in the *epilog*, is
7926 determined by the type of the reduction variable (in the example
7927 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7928 However the type (mode) we use to check available target support
7929 for the vector operation to be created *inside the loop*, is
7930 determined by the type of the other arguments to STMT (in the
7931 example we'd check this: optab_handler (widen_sum_optab,
7932 vect_short_mode)).
7934 This is contrary to "regular" reductions, in which the types of all
7935 the arguments are the same as the type of the reduction variable.
7936 For "regular" reductions we can therefore use the same vector type
7937 (and also the same tree-code) when generating the epilog code and
7938 when generating the code inside the loop. */
7940 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7942 /* If conversion might have created a conditional operation like
7943 IFN_COND_ADD already. Use the internal code for the following checks. */
7944 if (orig_code.is_internal_fn ())
7946 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7947 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7950 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7952 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7953 if (reduction_type == TREE_CODE_REDUCTION)
7955 /* Check whether it's ok to change the order of the computation.
7956 Generally, when vectorizing a reduction we change the order of the
7957 computation. This may change the behavior of the program in some
7958 cases, so we need to check that this is ok. One exception is when
7959 vectorizing an outer-loop: the inner-loop is executed sequentially,
7960 and therefore vectorizing reductions in the inner-loop during
7961 outer-loop vectorization is safe. Likewise when we are vectorizing
7962 a series of reductions using SLP and the VF is one the reductions
7963 are performed in scalar order. */
7964 if (slp_node
7965 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7966 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7968 else if (needs_fold_left_reduction_p (op.type, orig_code))
7970 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7971 is not directy used in stmt. */
7972 if (!only_slp_reduc_chain
7973 && reduc_chain_length != 1)
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 "in-order reduction chain without SLP.\n");
7978 return false;
7980 STMT_VINFO_REDUC_TYPE (reduc_info)
7981 = reduction_type = FOLD_LEFT_REDUCTION;
7983 else if (!commutative_binary_op_p (orig_code, op.type)
7984 || !associative_binary_op_p (orig_code, op.type))
7986 if (dump_enabled_p ())
7987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7988 "reduction: not commutative/associative\n");
7989 return false;
7993 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7994 && ncopies > 1)
7996 if (dump_enabled_p ())
7997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7998 "multiple types in double reduction or condition "
7999 "reduction or fold-left reduction.\n");
8000 return false;
8003 internal_fn reduc_fn = IFN_LAST;
8004 if (reduction_type == TREE_CODE_REDUCTION
8005 || reduction_type == FOLD_LEFT_REDUCTION
8006 || reduction_type == INTEGER_INDUC_COND_REDUCTION
8007 || reduction_type == CONST_COND_REDUCTION)
8009 if (reduction_type == FOLD_LEFT_REDUCTION
8010 ? fold_left_reduction_fn (orig_code, &reduc_fn)
8011 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8013 if (reduc_fn != IFN_LAST
8014 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8015 OPTIMIZE_FOR_SPEED))
8017 if (dump_enabled_p ())
8018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8019 "reduc op not supported by target.\n");
8021 reduc_fn = IFN_LAST;
8024 else
8026 if (!nested_cycle || double_reduc)
8028 if (dump_enabled_p ())
8029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8030 "no reduc code for scalar code.\n");
8032 return false;
8036 else if (reduction_type == COND_REDUCTION)
8038 int scalar_precision
8039 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8040 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8041 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8042 vectype_out);
8044 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8045 OPTIMIZE_FOR_SPEED))
8046 reduc_fn = IFN_REDUC_MAX;
8048 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8050 if (reduction_type != EXTRACT_LAST_REDUCTION
8051 && (!nested_cycle || double_reduc)
8052 && reduc_fn == IFN_LAST
8053 && !nunits_out.is_constant ())
8055 if (dump_enabled_p ())
8056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8057 "missing target support for reduction on"
8058 " variable-length vectors.\n");
8059 return false;
8062 /* For SLP reductions, see if there is a neutral value we can use. */
8063 tree neutral_op = NULL_TREE;
8064 if (slp_node)
8066 tree initial_value = NULL_TREE;
8067 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8068 initial_value = vect_phi_initial_value (reduc_def_phi);
8069 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8070 orig_code, initial_value);
8073 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8075 /* We can't support in-order reductions of code such as this:
8077 for (int i = 0; i < n1; ++i)
8078 for (int j = 0; j < n2; ++j)
8079 l += a[j];
8081 since GCC effectively transforms the loop when vectorizing:
8083 for (int i = 0; i < n1 / VF; ++i)
8084 for (int j = 0; j < n2; ++j)
8085 for (int k = 0; k < VF; ++k)
8086 l += a[j];
8088 which is a reassociation of the original operation. */
8089 if (dump_enabled_p ())
8090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8091 "in-order double reduction not supported.\n");
8093 return false;
8096 if (reduction_type == FOLD_LEFT_REDUCTION
8097 && slp_node
8098 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8100 /* We cannot use in-order reductions in this case because there is
8101 an implicit reassociation of the operations involved. */
8102 if (dump_enabled_p ())
8103 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8104 "in-order unchained SLP reductions not supported.\n");
8105 return false;
8108 /* For double reductions, and for SLP reductions with a neutral value,
8109 we construct a variable-length initial vector by loading a vector
8110 full of the neutral value and then shift-and-inserting the start
8111 values into the low-numbered elements. */
8112 if ((double_reduc || neutral_op)
8113 && !nunits_out.is_constant ()
8114 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8115 vectype_out, OPTIMIZE_FOR_SPEED))
8117 if (dump_enabled_p ())
8118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8119 "reduction on variable-length vectors requires"
8120 " target support for a vector-shift-and-insert"
8121 " operation.\n");
8122 return false;
8125 /* Check extra constraints for variable-length unchained SLP reductions. */
8126 if (slp_node
8127 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8128 && !nunits_out.is_constant ())
8130 /* We checked above that we could build the initial vector when
8131 there's a neutral element value. Check here for the case in
8132 which each SLP statement has its own initial value and in which
8133 that value needs to be repeated for every instance of the
8134 statement within the initial vector. */
8135 unsigned int group_size = SLP_TREE_LANES (slp_node);
8136 if (!neutral_op
8137 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8138 TREE_TYPE (vectype_out)))
8140 if (dump_enabled_p ())
8141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8142 "unsupported form of SLP reduction for"
8143 " variable-length vectors: cannot build"
8144 " initial vector.\n");
8145 return false;
8147 /* The epilogue code relies on the number of elements being a multiple
8148 of the group size. The duplicate-and-interleave approach to setting
8149 up the initial vector does too. */
8150 if (!multiple_p (nunits_out, group_size))
8152 if (dump_enabled_p ())
8153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8154 "unsupported form of SLP reduction for"
8155 " variable-length vectors: the vector size"
8156 " is not a multiple of the number of results.\n");
8157 return false;
8161 if (reduction_type == COND_REDUCTION)
8163 widest_int ni;
8165 if (! max_loop_iterations (loop, &ni))
8167 if (dump_enabled_p ())
8168 dump_printf_loc (MSG_NOTE, vect_location,
8169 "loop count not known, cannot create cond "
8170 "reduction.\n");
8171 return false;
8173 /* Convert backedges to iterations. */
8174 ni += 1;
8176 /* The additional index will be the same type as the condition. Check
8177 that the loop can fit into this less one (because we'll use up the
8178 zero slot for when there are no matches). */
8179 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8180 if (wi::geu_p (ni, wi::to_widest (max_index)))
8182 if (dump_enabled_p ())
8183 dump_printf_loc (MSG_NOTE, vect_location,
8184 "loop size is greater than data size.\n");
8185 return false;
8189 /* In case the vectorization factor (VF) is bigger than the number
8190 of elements that we can fit in a vectype (nunits), we have to generate
8191 more than one vector stmt - i.e - we need to "unroll" the
8192 vector stmt by a factor VF/nunits. For more details see documentation
8193 in vectorizable_operation. */
8195 /* If the reduction is used in an outer loop we need to generate
8196 VF intermediate results, like so (e.g. for ncopies=2):
8197 r0 = phi (init, r0)
8198 r1 = phi (init, r1)
8199 r0 = x0 + r0;
8200 r1 = x1 + r1;
8201 (i.e. we generate VF results in 2 registers).
8202 In this case we have a separate def-use cycle for each copy, and therefore
8203 for each copy we get the vector def for the reduction variable from the
8204 respective phi node created for this copy.
8206 Otherwise (the reduction is unused in the loop nest), we can combine
8207 together intermediate results, like so (e.g. for ncopies=2):
8208 r = phi (init, r)
8209 r = x0 + r;
8210 r = x1 + r;
8211 (i.e. we generate VF/2 results in a single register).
8212 In this case for each copy we get the vector def for the reduction variable
8213 from the vectorized reduction operation generated in the previous iteration.
8215 This only works when we see both the reduction PHI and its only consumer
8216 in vectorizable_reduction and there are no intermediate stmts
8217 participating. When unrolling we want each unrolled iteration to have its
8218 own reduction accumulator since one of the main goals of unrolling a
8219 reduction is to reduce the aggregate loop-carried latency. */
8220 if (ncopies > 1
8221 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8222 && reduc_chain_length == 1
8223 && loop_vinfo->suggested_unroll_factor == 1)
8224 single_defuse_cycle = true;
8226 if (single_defuse_cycle || lane_reduc_code_p)
8228 gcc_assert (op.code != COND_EXPR);
8230 /* 4. Supportable by target? */
8231 bool ok = true;
8233 /* 4.1. check support for the operation in the loop
8235 This isn't necessary for the lane reduction codes, since they
8236 can only be produced by pattern matching, and it's up to the
8237 pattern matcher to test for support. The main reason for
8238 specifically skipping this step is to avoid rechecking whether
8239 mixed-sign dot-products can be implemented using signed
8240 dot-products. */
8241 machine_mode vec_mode = TYPE_MODE (vectype_in);
8242 if (!lane_reduc_code_p
8243 && !directly_supported_p (op.code, vectype_in, optab_vector))
8245 if (dump_enabled_p ())
8246 dump_printf (MSG_NOTE, "op not supported by target.\n");
8247 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8248 || !vect_can_vectorize_without_simd_p (op.code))
8249 ok = false;
8250 else
8251 if (dump_enabled_p ())
8252 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8255 if (vect_emulated_vector_p (vectype_in)
8256 && !vect_can_vectorize_without_simd_p (op.code))
8258 if (dump_enabled_p ())
8259 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8260 return false;
8263 /* lane-reducing operations have to go through vect_transform_reduction.
8264 For the other cases try without the single cycle optimization. */
8265 if (!ok)
8267 if (lane_reduc_code_p)
8268 return false;
8269 else
8270 single_defuse_cycle = false;
8273 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8275 /* If the reduction stmt is one of the patterns that have lane
8276 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8277 if ((ncopies > 1 && ! single_defuse_cycle)
8278 && lane_reduc_code_p)
8280 if (dump_enabled_p ())
8281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8282 "multi def-use cycle not possible for lane-reducing "
8283 "reduction operation\n");
8284 return false;
8287 if (slp_node
8288 && !(!single_defuse_cycle
8289 && !lane_reduc_code_p
8290 && reduction_type != FOLD_LEFT_REDUCTION))
8291 for (i = 0; i < (int) op.num_ops; i++)
8292 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8294 if (dump_enabled_p ())
8295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8296 "incompatible vector types for invariants\n");
8297 return false;
8300 if (slp_node)
8301 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8302 else
8303 vec_num = 1;
8305 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8306 reduction_type, ncopies, cost_vec);
8307 /* Cost the reduction op inside the loop if transformed via
8308 vect_transform_reduction. Otherwise this is costed by the
8309 separate vectorizable_* routines. */
8310 if (single_defuse_cycle || lane_reduc_code_p)
8312 int factor = 1;
8313 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8314 /* Three dot-products and a subtraction. */
8315 factor = 4;
8316 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8317 stmt_info, 0, vect_body);
8320 if (dump_enabled_p ()
8321 && reduction_type == FOLD_LEFT_REDUCTION)
8322 dump_printf_loc (MSG_NOTE, vect_location,
8323 "using an in-order (fold-left) reduction.\n");
8324 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8325 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8326 reductions go through their own vectorizable_* routines. */
8327 if (!single_defuse_cycle
8328 && !lane_reduc_code_p
8329 && reduction_type != FOLD_LEFT_REDUCTION)
8331 stmt_vec_info tem
8332 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8333 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8335 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8336 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8338 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8339 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8341 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8343 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8344 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8345 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8347 if (reduction_type != FOLD_LEFT_REDUCTION
8348 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8349 && (cond_fn == IFN_LAST
8350 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8351 OPTIMIZE_FOR_SPEED)))
8353 if (dump_enabled_p ())
8354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8355 "can't operate on partial vectors because"
8356 " no conditional operation is available.\n");
8357 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8359 else if (reduction_type == FOLD_LEFT_REDUCTION
8360 && reduc_fn == IFN_LAST
8361 && !expand_vec_cond_expr_p (vectype_in,
8362 truth_type_for (vectype_in),
8363 SSA_NAME))
8365 if (dump_enabled_p ())
8366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8367 "can't operate on partial vectors because"
8368 " no conditional operation is available.\n");
8369 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8371 else if (reduction_type == FOLD_LEFT_REDUCTION
8372 && internal_fn_mask_index (reduc_fn) == -1
8373 && FLOAT_TYPE_P (vectype_in)
8374 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8376 if (dump_enabled_p ())
8377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8378 "can't operate on partial vectors because"
8379 " signed zeros cannot be preserved.\n");
8380 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8382 else
8384 internal_fn mask_reduc_fn
8385 = get_masked_reduction_fn (reduc_fn, vectype_in);
8387 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8388 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8389 vectype_in, 1);
8390 else
8391 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8392 vectype_in, NULL);
8395 return true;
8398 /* STMT_INFO is a dot-product reduction whose multiplication operands
8399 have different signs. Emit a sequence to emulate the operation
8400 using a series of signed DOT_PROD_EXPRs and return the last
8401 statement generated. VEC_DEST is the result of the vector operation
8402 and VOP lists its inputs. */
8404 static gassign *
8405 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8406 gimple_stmt_iterator *gsi, tree vec_dest,
8407 tree vop[3])
8409 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8410 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8411 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8412 gimple *new_stmt;
8414 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8415 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8416 std::swap (vop[0], vop[1]);
8418 /* Convert all inputs to signed types. */
8419 for (int i = 0; i < 3; ++i)
8420 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8422 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8423 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8424 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8425 vop[i] = tmp;
8428 /* In the comments below we assume 8-bit inputs for simplicity,
8429 but the approach works for any full integer type. */
8431 /* Create a vector of -128. */
8432 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8433 tree min_narrow = build_vector_from_val (narrow_vectype,
8434 min_narrow_elttype);
8436 /* Create a vector of 64. */
8437 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8438 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8439 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8441 /* Emit: SUB_RES = VOP[0] - 128. */
8442 tree sub_res = make_ssa_name (narrow_vectype);
8443 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8444 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8446 /* Emit:
8448 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8449 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8450 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8452 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8453 Doing the two 64 * y steps first allows more time to compute x. */
8454 tree stage1 = make_ssa_name (wide_vectype);
8455 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8456 vop[1], half_narrow, vop[2]);
8457 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8459 tree stage2 = make_ssa_name (wide_vectype);
8460 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8461 vop[1], half_narrow, stage1);
8462 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8464 tree stage3 = make_ssa_name (wide_vectype);
8465 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8466 sub_res, vop[1], stage2);
8467 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8469 /* Convert STAGE3 to the reduction type. */
8470 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8473 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8474 value. */
8476 bool
8477 vect_transform_reduction (loop_vec_info loop_vinfo,
8478 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8479 gimple **vec_stmt, slp_tree slp_node)
8481 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8482 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8483 int i;
8484 int ncopies;
8485 int vec_num;
8487 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8488 gcc_assert (reduc_info->is_reduc_info);
8490 if (nested_in_vect_loop_p (loop, stmt_info))
8492 loop = loop->inner;
8493 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8496 gimple_match_op op;
8497 if (!gimple_extract_op (stmt_info->stmt, &op))
8498 gcc_unreachable ();
8500 /* All uses but the last are expected to be defined in the loop.
8501 The last use is the reduction variable. In case of nested cycle this
8502 assumption is not true: we use reduc_index to record the index of the
8503 reduction variable. */
8504 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8505 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8506 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8507 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8509 if (slp_node)
8511 ncopies = 1;
8512 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8514 else
8516 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8517 vec_num = 1;
8520 code_helper code = canonicalize_code (op.code, op.type);
8521 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8523 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8524 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8525 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8527 /* Transform. */
8528 tree new_temp = NULL_TREE;
8529 auto_vec<tree> vec_oprnds0;
8530 auto_vec<tree> vec_oprnds1;
8531 auto_vec<tree> vec_oprnds2;
8532 tree def0;
8534 if (dump_enabled_p ())
8535 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8537 /* FORNOW: Multiple types are not supported for condition. */
8538 if (code == COND_EXPR)
8539 gcc_assert (ncopies == 1);
8541 /* A binary COND_OP reduction must have the same definition and else
8542 value. */
8543 bool cond_fn_p = code.is_internal_fn ()
8544 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8545 if (cond_fn_p)
8547 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8548 || code == IFN_COND_MUL || code == IFN_COND_AND
8549 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8550 gcc_assert (op.num_ops == 4
8551 && (op.ops[reduc_index]
8552 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8555 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8557 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8558 if (reduction_type == FOLD_LEFT_REDUCTION)
8560 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8561 gcc_assert (code.is_tree_code () || cond_fn_p);
8562 return vectorize_fold_left_reduction
8563 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8564 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8565 reduc_index, masks, lens);
8568 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8569 gcc_assert (single_defuse_cycle
8570 || code == DOT_PROD_EXPR
8571 || code == WIDEN_SUM_EXPR
8572 || code == SAD_EXPR);
8574 /* Create the destination vector */
8575 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8576 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8578 /* Get NCOPIES vector definitions for all operands except the reduction
8579 definition. */
8580 if (!cond_fn_p)
8582 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8583 single_defuse_cycle && reduc_index == 0
8584 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8585 single_defuse_cycle && reduc_index == 1
8586 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8587 op.num_ops == 3
8588 && !(single_defuse_cycle && reduc_index == 2)
8589 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8591 else
8593 /* For a conditional operation pass the truth type as mask
8594 vectype. */
8595 gcc_assert (single_defuse_cycle
8596 && (reduc_index == 1 || reduc_index == 2));
8597 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8598 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8599 reduc_index == 1 ? NULL_TREE : op.ops[1],
8600 NULL_TREE, &vec_oprnds1,
8601 reduc_index == 2 ? NULL_TREE : op.ops[2],
8602 NULL_TREE, &vec_oprnds2);
8605 /* For single def-use cycles get one copy of the vectorized reduction
8606 definition. */
8607 if (single_defuse_cycle)
8609 gcc_assert (!slp_node);
8610 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8611 op.ops[reduc_index],
8612 reduc_index == 0 ? &vec_oprnds0
8613 : (reduc_index == 1 ? &vec_oprnds1
8614 : &vec_oprnds2));
8617 bool emulated_mixed_dot_prod
8618 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8619 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8621 gimple *new_stmt;
8622 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8623 if (masked_loop_p && !mask_by_cond_expr)
8625 /* No conditional ifns have been defined for dot-product yet. */
8626 gcc_assert (code != DOT_PROD_EXPR);
8628 /* Make sure that the reduction accumulator is vop[0]. */
8629 if (reduc_index == 1)
8631 gcc_assert (commutative_binary_op_p (code, op.type));
8632 std::swap (vop[0], vop[1]);
8634 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8635 vec_num * ncopies, vectype_in, i);
8636 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8637 vop[0], vop[1], vop[0]);
8638 new_temp = make_ssa_name (vec_dest, call);
8639 gimple_call_set_lhs (call, new_temp);
8640 gimple_call_set_nothrow (call, true);
8641 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8642 new_stmt = call;
8644 else
8646 if (op.num_ops >= 3)
8647 vop[2] = vec_oprnds2[i];
8649 if (masked_loop_p && mask_by_cond_expr)
8651 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8652 vec_num * ncopies, vectype_in, i);
8653 build_vect_cond_expr (code, vop, mask, gsi);
8656 if (emulated_mixed_dot_prod)
8657 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8658 vec_dest, vop);
8660 else if (code.is_internal_fn () && !cond_fn_p)
8661 new_stmt = gimple_build_call_internal (internal_fn (code),
8662 op.num_ops,
8663 vop[0], vop[1], vop[2]);
8664 else if (code.is_internal_fn () && cond_fn_p)
8665 new_stmt = gimple_build_call_internal (internal_fn (code),
8666 op.num_ops,
8667 vop[0], vop[1], vop[2],
8668 vop[1]);
8669 else
8670 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8671 vop[0], vop[1], vop[2]);
8672 new_temp = make_ssa_name (vec_dest, new_stmt);
8673 gimple_set_lhs (new_stmt, new_temp);
8674 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8677 if (slp_node)
8678 slp_node->push_vec_def (new_stmt);
8679 else if (single_defuse_cycle
8680 && i < ncopies - 1)
8682 if (reduc_index == 0)
8683 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8684 else if (reduc_index == 1)
8685 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8686 else if (reduc_index == 2)
8687 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8689 else
8690 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8693 if (!slp_node)
8694 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8696 return true;
8699 /* Transform phase of a cycle PHI. */
8701 bool
8702 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8703 stmt_vec_info stmt_info, gimple **vec_stmt,
8704 slp_tree slp_node, slp_instance slp_node_instance)
8706 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8707 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8708 int i;
8709 int ncopies;
8710 int j;
8711 bool nested_cycle = false;
8712 int vec_num;
8714 if (nested_in_vect_loop_p (loop, stmt_info))
8716 loop = loop->inner;
8717 nested_cycle = true;
8720 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8721 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8722 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8723 gcc_assert (reduc_info->is_reduc_info);
8725 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8726 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8727 /* Leave the scalar phi in place. */
8728 return true;
8730 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8731 /* For a nested cycle we do not fill the above. */
8732 if (!vectype_in)
8733 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8734 gcc_assert (vectype_in);
8736 if (slp_node)
8738 /* The size vect_schedule_slp_instance computes is off for us. */
8739 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8740 * SLP_TREE_LANES (slp_node), vectype_in);
8741 ncopies = 1;
8743 else
8745 vec_num = 1;
8746 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8749 /* Check whether we should use a single PHI node and accumulate
8750 vectors to one before the backedge. */
8751 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8752 ncopies = 1;
8754 /* Create the destination vector */
8755 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8756 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8757 vectype_out);
8759 /* Get the loop-entry arguments. */
8760 tree vec_initial_def = NULL_TREE;
8761 auto_vec<tree> vec_initial_defs;
8762 if (slp_node)
8764 vec_initial_defs.reserve (vec_num);
8765 if (nested_cycle)
8767 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8768 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8769 &vec_initial_defs);
8771 else
8773 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8774 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8775 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8777 unsigned int num_phis = stmts.length ();
8778 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8779 num_phis = 1;
8780 initial_values.reserve (num_phis);
8781 for (unsigned int i = 0; i < num_phis; ++i)
8783 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8784 initial_values.quick_push (vect_phi_initial_value (this_phi));
8786 if (vec_num == 1)
8787 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8788 if (!initial_values.is_empty ())
8790 tree initial_value
8791 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8792 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8793 tree neutral_op
8794 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8795 code, initial_value);
8796 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8797 &vec_initial_defs, vec_num,
8798 stmts.length (), neutral_op);
8802 else
8804 /* Get at the scalar def before the loop, that defines the initial
8805 value of the reduction variable. */
8806 tree initial_def = vect_phi_initial_value (phi);
8807 reduc_info->reduc_initial_values.safe_push (initial_def);
8808 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8809 and we can't use zero for induc_val, use initial_def. Similarly
8810 for REDUC_MIN and initial_def larger than the base. */
8811 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8813 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8814 if (TREE_CODE (initial_def) == INTEGER_CST
8815 && !integer_zerop (induc_val)
8816 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8817 && tree_int_cst_lt (initial_def, induc_val))
8818 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8819 && tree_int_cst_lt (induc_val, initial_def))))
8821 induc_val = initial_def;
8822 /* Communicate we used the initial_def to epilouge
8823 generation. */
8824 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8826 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8828 else if (nested_cycle)
8830 /* Do not use an adjustment def as that case is not supported
8831 correctly if ncopies is not one. */
8832 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8833 ncopies, initial_def,
8834 &vec_initial_defs);
8836 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8837 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8838 /* Fill the initial vector with the initial scalar value. */
8839 vec_initial_def
8840 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8841 initial_def, initial_def);
8842 else
8844 if (ncopies == 1)
8845 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8846 if (!reduc_info->reduc_initial_values.is_empty ())
8848 initial_def = reduc_info->reduc_initial_values[0];
8849 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8850 tree neutral_op
8851 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8852 code, initial_def);
8853 gcc_assert (neutral_op);
8854 /* Try to simplify the vector initialization by applying an
8855 adjustment after the reduction has been performed. */
8856 if (!reduc_info->reused_accumulator
8857 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8858 && !operand_equal_p (neutral_op, initial_def))
8860 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8861 = initial_def;
8862 initial_def = neutral_op;
8864 vec_initial_def
8865 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8866 initial_def, neutral_op);
8871 if (vec_initial_def)
8873 vec_initial_defs.create (ncopies);
8874 for (i = 0; i < ncopies; ++i)
8875 vec_initial_defs.quick_push (vec_initial_def);
8878 if (auto *accumulator = reduc_info->reused_accumulator)
8880 tree def = accumulator->reduc_input;
8881 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8883 unsigned int nreduc;
8884 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8885 (TREE_TYPE (def)),
8886 TYPE_VECTOR_SUBPARTS (vectype_out),
8887 &nreduc);
8888 gcc_assert (res);
8889 gimple_seq stmts = NULL;
8890 /* Reduce the single vector to a smaller one. */
8891 if (nreduc != 1)
8893 /* Perform the reduction in the appropriate type. */
8894 tree rvectype = vectype_out;
8895 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8896 TREE_TYPE (TREE_TYPE (def))))
8897 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8898 TYPE_VECTOR_SUBPARTS
8899 (vectype_out));
8900 def = vect_create_partial_epilog (def, rvectype,
8901 STMT_VINFO_REDUC_CODE
8902 (reduc_info),
8903 &stmts);
8905 /* The epilogue loop might use a different vector mode, like
8906 VNx2DI vs. V2DI. */
8907 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8909 tree reduc_type = build_vector_type_for_mode
8910 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8911 def = gimple_convert (&stmts, reduc_type, def);
8913 /* Adjust the input so we pick up the partially reduced value
8914 for the skip edge in vect_create_epilog_for_reduction. */
8915 accumulator->reduc_input = def;
8916 /* And the reduction could be carried out using a different sign. */
8917 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8918 def = gimple_convert (&stmts, vectype_out, def);
8919 if (loop_vinfo->main_loop_edge)
8921 /* While we'd like to insert on the edge this will split
8922 blocks and disturb bookkeeping, we also will eventually
8923 need this on the skip edge. Rely on sinking to
8924 fixup optimal placement and insert in the pred. */
8925 gimple_stmt_iterator gsi
8926 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8927 /* Insert before a cond that eventually skips the
8928 epilogue. */
8929 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8930 gsi_prev (&gsi);
8931 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8933 else
8934 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8935 stmts);
8937 if (loop_vinfo->main_loop_edge)
8938 vec_initial_defs[0]
8939 = vect_get_main_loop_result (loop_vinfo, def,
8940 vec_initial_defs[0]);
8941 else
8942 vec_initial_defs.safe_push (def);
8945 /* Generate the reduction PHIs upfront. */
8946 for (i = 0; i < vec_num; i++)
8948 tree vec_init_def = vec_initial_defs[i];
8949 for (j = 0; j < ncopies; j++)
8951 /* Create the reduction-phi that defines the reduction
8952 operand. */
8953 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8955 /* Set the loop-entry arg of the reduction-phi. */
8956 if (j != 0 && nested_cycle)
8957 vec_init_def = vec_initial_defs[j];
8958 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8959 UNKNOWN_LOCATION);
8961 /* The loop-latch arg is set in epilogue processing. */
8963 if (slp_node)
8964 slp_node->push_vec_def (new_phi);
8965 else
8967 if (j == 0)
8968 *vec_stmt = new_phi;
8969 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8974 return true;
8977 /* Vectorizes LC PHIs. */
8979 bool
8980 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8981 stmt_vec_info stmt_info, gimple **vec_stmt,
8982 slp_tree slp_node)
8984 if (!loop_vinfo
8985 || !is_a <gphi *> (stmt_info->stmt)
8986 || gimple_phi_num_args (stmt_info->stmt) != 1)
8987 return false;
8989 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8990 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8991 return false;
8993 if (!vec_stmt) /* transformation not required. */
8995 /* Deal with copies from externs or constants that disguise as
8996 loop-closed PHI nodes (PR97886). */
8997 if (slp_node
8998 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8999 SLP_TREE_VECTYPE (slp_node)))
9001 if (dump_enabled_p ())
9002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9003 "incompatible vector types for invariants\n");
9004 return false;
9006 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9007 return true;
9010 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9011 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9012 basic_block bb = gimple_bb (stmt_info->stmt);
9013 edge e = single_pred_edge (bb);
9014 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9015 auto_vec<tree> vec_oprnds;
9016 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9017 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9018 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9019 for (unsigned i = 0; i < vec_oprnds.length (); i++)
9021 /* Create the vectorized LC PHI node. */
9022 gphi *new_phi = create_phi_node (vec_dest, bb);
9023 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9024 if (slp_node)
9025 slp_node->push_vec_def (new_phi);
9026 else
9027 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9029 if (!slp_node)
9030 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9032 return true;
9035 /* Vectorizes PHIs. */
9037 bool
9038 vectorizable_phi (vec_info *,
9039 stmt_vec_info stmt_info, gimple **vec_stmt,
9040 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9042 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9043 return false;
9045 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9046 return false;
9048 tree vectype = SLP_TREE_VECTYPE (slp_node);
9050 if (!vec_stmt) /* transformation not required. */
9052 slp_tree child;
9053 unsigned i;
9054 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9055 if (!child)
9057 if (dump_enabled_p ())
9058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9059 "PHI node with unvectorized backedge def\n");
9060 return false;
9062 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9064 if (dump_enabled_p ())
9065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9066 "incompatible vector types for invariants\n");
9067 return false;
9069 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9070 && !useless_type_conversion_p (vectype,
9071 SLP_TREE_VECTYPE (child)))
9073 /* With bools we can have mask and non-mask precision vectors
9074 or different non-mask precisions. while pattern recog is
9075 supposed to guarantee consistency here bugs in it can cause
9076 mismatches (PR103489 and PR103800 for example).
9077 Deal with them here instead of ICEing later. */
9078 if (dump_enabled_p ())
9079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9080 "incompatible vector type setup from "
9081 "bool pattern detection\n");
9082 return false;
9085 /* For single-argument PHIs assume coalescing which means zero cost
9086 for the scalar and the vector PHIs. This avoids artificially
9087 favoring the vector path (but may pessimize it in some cases). */
9088 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9089 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9090 vector_stmt, stmt_info, vectype, 0, vect_body);
9091 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9092 return true;
9095 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9096 basic_block bb = gimple_bb (stmt_info->stmt);
9097 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9098 auto_vec<gphi *> new_phis;
9099 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9101 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9103 /* Skip not yet vectorized defs. */
9104 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9105 && SLP_TREE_VEC_DEFS (child).is_empty ())
9106 continue;
9108 auto_vec<tree> vec_oprnds;
9109 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9110 if (!new_phis.exists ())
9112 new_phis.create (vec_oprnds.length ());
9113 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9115 /* Create the vectorized LC PHI node. */
9116 new_phis.quick_push (create_phi_node (vec_dest, bb));
9117 slp_node->push_vec_def (new_phis[j]);
9120 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9121 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9122 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9124 /* We should have at least one already vectorized child. */
9125 gcc_assert (new_phis.exists ());
9127 return true;
9130 /* Vectorizes first order recurrences. An overview of the transformation
9131 is described below. Suppose we have the following loop.
9133 int t = 0;
9134 for (int i = 0; i < n; ++i)
9136 b[i] = a[i] - t;
9137 t = a[i];
9140 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9141 looks (simplified) like:
9143 scalar.preheader:
9144 init = 0;
9146 scalar.body:
9147 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9148 _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9149 _1 = a[i]
9150 b[i] = _1 - _2
9151 if (i < n) goto scalar.body
9153 In this example, _2 is a recurrence because it's value depends on the
9154 previous iteration. We vectorize this as (VF = 4)
9156 vector.preheader:
9157 vect_init = vect_cst(..., ..., ..., 0)
9159 vector.body
9160 i = PHI <0(vector.preheader), i+4(vector.body)>
9161 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9162 vect_2 = a[i, i+1, i+2, i+3];
9163 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9164 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9165 if (..) goto vector.body
9167 In this function, vectorizable_recurr, we code generate both the
9168 vector PHI node and the permute since those together compute the
9169 vectorized value of the scalar PHI. We do not yet have the
9170 backedge value to fill in there nor into the vec_perm. Those
9171 are filled in maybe_set_vectorized_backedge_value and
9172 vect_schedule_scc.
9174 TODO: Since the scalar loop does not have a use of the recurrence
9175 outside of the loop the natural way to implement peeling via
9176 vectorizing the live value doesn't work. For now peeling of loops
9177 with a recurrence is not implemented. For SLP the supported cases
9178 are restricted to those requiring a single vector recurrence PHI. */
9180 bool
9181 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9182 gimple **vec_stmt, slp_tree slp_node,
9183 stmt_vector_for_cost *cost_vec)
9185 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9186 return false;
9188 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9190 /* So far we only support first-order recurrence auto-vectorization. */
9191 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9192 return false;
9194 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9195 unsigned ncopies;
9196 if (slp_node)
9197 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9198 else
9199 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9200 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9201 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9202 /* We need to be able to make progress with a single vector. */
9203 if (maybe_gt (dist * 2, nunits))
9205 if (dump_enabled_p ())
9206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9207 "first order recurrence exceeds half of "
9208 "a vector\n");
9209 return false;
9212 /* First-order recurrence autovectorization needs to handle permutation
9213 with indices = [nunits-1, nunits, nunits+1, ...]. */
9214 vec_perm_builder sel (nunits, 1, 3);
9215 for (int i = 0; i < 3; ++i)
9216 sel.quick_push (nunits - dist + i);
9217 vec_perm_indices indices (sel, 2, nunits);
9219 if (!vec_stmt) /* transformation not required. */
9221 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9222 indices))
9223 return false;
9225 if (slp_node)
9227 /* We eventually need to set a vector type on invariant
9228 arguments. */
9229 unsigned j;
9230 slp_tree child;
9231 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9232 if (!vect_maybe_update_slp_op_vectype
9233 (child, SLP_TREE_VECTYPE (slp_node)))
9235 if (dump_enabled_p ())
9236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9237 "incompatible vector types for "
9238 "invariants\n");
9239 return false;
9242 /* The recurrence costs the initialization vector and one permute
9243 for each copy. */
9244 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9245 stmt_info, 0, vect_prologue);
9246 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9247 stmt_info, 0, vect_body);
9248 if (dump_enabled_p ())
9249 dump_printf_loc (MSG_NOTE, vect_location,
9250 "vectorizable_recurr: inside_cost = %d, "
9251 "prologue_cost = %d .\n", inside_cost,
9252 prologue_cost);
9254 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9255 return true;
9258 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9259 basic_block bb = gimple_bb (phi);
9260 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9261 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9263 gimple_seq stmts = NULL;
9264 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9265 gsi_insert_seq_on_edge_immediate (pe, stmts);
9267 tree vec_init = build_vector_from_val (vectype, preheader);
9268 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9270 /* Create the vectorized first-order PHI node. */
9271 tree vec_dest = vect_get_new_vect_var (vectype,
9272 vect_simple_var, "vec_recur_");
9273 gphi *new_phi = create_phi_node (vec_dest, bb);
9274 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9276 /* Insert shuffles the first-order recurrence autovectorization.
9277 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9278 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9280 /* Insert the required permute after the latch definition. The
9281 second and later operands are tentative and will be updated when we have
9282 vectorized the latch definition. */
9283 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9284 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9285 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9286 gsi_next (&gsi2);
9288 for (unsigned i = 0; i < ncopies; ++i)
9290 vec_dest = make_ssa_name (vectype);
9291 gassign *vperm
9292 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9293 i == 0 ? gimple_phi_result (new_phi) : NULL,
9294 NULL, perm);
9295 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9297 if (slp_node)
9298 slp_node->push_vec_def (vperm);
9299 else
9300 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9303 if (!slp_node)
9304 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9305 return true;
9308 /* Return true if VECTYPE represents a vector that requires lowering
9309 by the vector lowering pass. */
9311 bool
9312 vect_emulated_vector_p (tree vectype)
9314 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9315 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9316 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9319 /* Return true if we can emulate CODE on an integer mode representation
9320 of a vector. */
9322 bool
9323 vect_can_vectorize_without_simd_p (tree_code code)
9325 switch (code)
9327 case PLUS_EXPR:
9328 case MINUS_EXPR:
9329 case NEGATE_EXPR:
9330 case BIT_AND_EXPR:
9331 case BIT_IOR_EXPR:
9332 case BIT_XOR_EXPR:
9333 case BIT_NOT_EXPR:
9334 return true;
9336 default:
9337 return false;
9341 /* Likewise, but taking a code_helper. */
9343 bool
9344 vect_can_vectorize_without_simd_p (code_helper code)
9346 return (code.is_tree_code ()
9347 && vect_can_vectorize_without_simd_p (tree_code (code)));
9350 /* Create vector init for vectorized iv. */
9351 static tree
9352 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9353 tree step_expr, poly_uint64 nunits,
9354 tree vectype,
9355 enum vect_induction_op_type induction_type)
9357 unsigned HOST_WIDE_INT const_nunits;
9358 tree vec_shift, vec_init, new_name;
9359 unsigned i;
9360 tree itype = TREE_TYPE (vectype);
9362 /* iv_loop is the loop to be vectorized. Create:
9363 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9364 new_name = gimple_convert (stmts, itype, init_expr);
9365 switch (induction_type)
9367 case vect_step_op_shr:
9368 case vect_step_op_shl:
9369 /* Build the Initial value from shift_expr. */
9370 vec_init = gimple_build_vector_from_val (stmts,
9371 vectype,
9372 new_name);
9373 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9374 build_zero_cst (itype), step_expr);
9375 vec_init = gimple_build (stmts,
9376 (induction_type == vect_step_op_shr
9377 ? RSHIFT_EXPR : LSHIFT_EXPR),
9378 vectype, vec_init, vec_shift);
9379 break;
9381 case vect_step_op_neg:
9383 vec_init = gimple_build_vector_from_val (stmts,
9384 vectype,
9385 new_name);
9386 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9387 vectype, vec_init);
9388 /* The encoding has 2 interleaved stepped patterns. */
9389 vec_perm_builder sel (nunits, 2, 3);
9390 sel.quick_grow (6);
9391 for (i = 0; i < 3; i++)
9393 sel[2 * i] = i;
9394 sel[2 * i + 1] = i + nunits;
9396 vec_perm_indices indices (sel, 2, nunits);
9397 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9398 fail when vec_init is const vector. In that situation vec_perm is not
9399 really needed. */
9400 tree perm_mask_even
9401 = vect_gen_perm_mask_any (vectype, indices);
9402 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9403 vectype,
9404 vec_init, vec_neg,
9405 perm_mask_even);
9407 break;
9409 case vect_step_op_mul:
9411 /* Use unsigned mult to avoid UD integer overflow. */
9412 gcc_assert (nunits.is_constant (&const_nunits));
9413 tree utype = unsigned_type_for (itype);
9414 tree uvectype = build_vector_type (utype,
9415 TYPE_VECTOR_SUBPARTS (vectype));
9416 new_name = gimple_convert (stmts, utype, new_name);
9417 vec_init = gimple_build_vector_from_val (stmts,
9418 uvectype,
9419 new_name);
9420 tree_vector_builder elts (uvectype, const_nunits, 1);
9421 tree elt_step = build_one_cst (utype);
9423 elts.quick_push (elt_step);
9424 for (i = 1; i < const_nunits; i++)
9426 /* Create: new_name_i = new_name + step_expr. */
9427 elt_step = gimple_build (stmts, MULT_EXPR,
9428 utype, elt_step, step_expr);
9429 elts.quick_push (elt_step);
9431 /* Create a vector from [new_name_0, new_name_1, ...,
9432 new_name_nunits-1]. */
9433 tree vec_mul = gimple_build_vector (stmts, &elts);
9434 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9435 vec_init, vec_mul);
9436 vec_init = gimple_convert (stmts, vectype, vec_init);
9438 break;
9440 default:
9441 gcc_unreachable ();
9444 return vec_init;
9447 /* Peel init_expr by skip_niter for induction_type. */
9448 tree
9449 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9450 tree skip_niters, tree step_expr,
9451 enum vect_induction_op_type induction_type)
9453 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9454 tree type = TREE_TYPE (init_expr);
9455 unsigned prec = TYPE_PRECISION (type);
9456 switch (induction_type)
9458 case vect_step_op_neg:
9459 if (TREE_INT_CST_LOW (skip_niters) % 2)
9460 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9461 /* else no change. */
9462 break;
9464 case vect_step_op_shr:
9465 case vect_step_op_shl:
9466 skip_niters = gimple_convert (stmts, type, skip_niters);
9467 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9468 /* When shift mount >= precision, need to avoid UD.
9469 In the original loop, there's no UD, and according to semantic,
9470 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9471 if (!tree_fits_uhwi_p (step_expr)
9472 || tree_to_uhwi (step_expr) >= prec)
9474 if (induction_type == vect_step_op_shl
9475 || TYPE_UNSIGNED (type))
9476 init_expr = build_zero_cst (type);
9477 else
9478 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9479 init_expr,
9480 wide_int_to_tree (type, prec - 1));
9482 else
9483 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9484 ? RSHIFT_EXPR : LSHIFT_EXPR),
9485 type, init_expr, step_expr);
9486 break;
9488 case vect_step_op_mul:
9490 tree utype = unsigned_type_for (type);
9491 init_expr = gimple_convert (stmts, utype, init_expr);
9492 wide_int skipn = wi::to_wide (skip_niters);
9493 wide_int begin = wi::to_wide (step_expr);
9494 auto_mpz base, exp, mod, res;
9495 wi::to_mpz (begin, base, TYPE_SIGN (type));
9496 wi::to_mpz (skipn, exp, UNSIGNED);
9497 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9498 mpz_powm (res, base, exp, mod);
9499 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9500 tree mult_expr = wide_int_to_tree (utype, begin);
9501 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9502 init_expr, mult_expr);
9503 init_expr = gimple_convert (stmts, type, init_expr);
9505 break;
9507 default:
9508 gcc_unreachable ();
9511 return init_expr;
9514 /* Create vector step for vectorized iv. */
9515 static tree
9516 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9517 poly_uint64 vf,
9518 enum vect_induction_op_type induction_type)
9520 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9521 tree new_name = NULL;
9522 /* Step should be pow (step, vf) for mult induction. */
9523 if (induction_type == vect_step_op_mul)
9525 gcc_assert (vf.is_constant ());
9526 wide_int begin = wi::to_wide (step_expr);
9528 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9529 begin = wi::mul (begin, wi::to_wide (step_expr));
9531 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9533 else if (induction_type == vect_step_op_neg)
9534 /* Do nothing. */
9536 else
9537 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9538 expr, step_expr);
9539 return new_name;
9542 static tree
9543 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9544 stmt_vec_info stmt_info,
9545 tree new_name, tree vectype,
9546 enum vect_induction_op_type induction_type)
9548 /* No step is needed for neg induction. */
9549 if (induction_type == vect_step_op_neg)
9550 return NULL;
9552 tree t = unshare_expr (new_name);
9553 gcc_assert (CONSTANT_CLASS_P (new_name)
9554 || TREE_CODE (new_name) == SSA_NAME);
9555 tree new_vec = build_vector_from_val (vectype, t);
9556 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9557 new_vec, vectype, NULL);
9558 return vec_step;
9561 /* Update vectorized iv with vect_step, induc_def is init. */
9562 static tree
9563 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9564 tree induc_def, tree vec_step,
9565 enum vect_induction_op_type induction_type)
9567 tree vec_def = induc_def;
9568 switch (induction_type)
9570 case vect_step_op_mul:
9572 /* Use unsigned mult to avoid UD integer overflow. */
9573 tree uvectype
9574 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9575 TYPE_VECTOR_SUBPARTS (vectype));
9576 vec_def = gimple_convert (stmts, uvectype, vec_def);
9577 vec_step = gimple_convert (stmts, uvectype, vec_step);
9578 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9579 vec_def, vec_step);
9580 vec_def = gimple_convert (stmts, vectype, vec_def);
9582 break;
9584 case vect_step_op_shr:
9585 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9586 vec_def, vec_step);
9587 break;
9589 case vect_step_op_shl:
9590 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9591 vec_def, vec_step);
9592 break;
9593 case vect_step_op_neg:
9594 vec_def = induc_def;
9595 /* Do nothing. */
9596 break;
9597 default:
9598 gcc_unreachable ();
9601 return vec_def;
9605 /* Function vectorizable_induction
9607 Check if STMT_INFO performs an nonlinear induction computation that can be
9608 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9609 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9610 basic block.
9611 Return true if STMT_INFO is vectorizable in this way. */
9613 static bool
9614 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9615 stmt_vec_info stmt_info,
9616 gimple **vec_stmt, slp_tree slp_node,
9617 stmt_vector_for_cost *cost_vec)
9619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9620 unsigned ncopies;
9621 bool nested_in_vect_loop = false;
9622 class loop *iv_loop;
9623 tree vec_def;
9624 edge pe = loop_preheader_edge (loop);
9625 basic_block new_bb;
9626 tree vec_init, vec_step;
9627 tree new_name;
9628 gimple *new_stmt;
9629 gphi *induction_phi;
9630 tree induc_def, vec_dest;
9631 tree init_expr, step_expr;
9632 tree niters_skip;
9633 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9634 unsigned i;
9635 gimple_stmt_iterator si;
9637 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9639 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9640 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9641 enum vect_induction_op_type induction_type
9642 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9644 gcc_assert (induction_type > vect_step_op_add);
9646 if (slp_node)
9647 ncopies = 1;
9648 else
9649 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9650 gcc_assert (ncopies >= 1);
9652 /* FORNOW. Only handle nonlinear induction in the same loop. */
9653 if (nested_in_vect_loop_p (loop, stmt_info))
9655 if (dump_enabled_p ())
9656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9657 "nonlinear induction in nested loop.\n");
9658 return false;
9661 iv_loop = loop;
9662 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9664 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9665 update for each iv and a permutation to generate wanted vector iv. */
9666 if (slp_node)
9668 if (dump_enabled_p ())
9669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9670 "SLP induction not supported for nonlinear"
9671 " induction.\n");
9672 return false;
9675 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9677 if (dump_enabled_p ())
9678 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9679 "floating point nonlinear induction vectorization"
9680 " not supported.\n");
9681 return false;
9684 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9685 init_expr = vect_phi_initial_value (phi);
9686 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9687 && TREE_CODE (step_expr) == INTEGER_CST);
9688 /* step_expr should be aligned with init_expr,
9689 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9690 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9692 if (TREE_CODE (init_expr) == INTEGER_CST)
9693 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9694 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9696 /* INIT_EXPR could be a bit_field, bail out for such case. */
9697 if (dump_enabled_p ())
9698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9699 "nonlinear induction vectorization failed:"
9700 " component type of vectype is not a nop conversion"
9701 " from type of init_expr.\n");
9702 return false;
9705 switch (induction_type)
9707 case vect_step_op_neg:
9708 if (TREE_CODE (init_expr) != INTEGER_CST
9709 && TREE_CODE (init_expr) != REAL_CST)
9711 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9712 if (!directly_supported_p (NEGATE_EXPR, vectype))
9713 return false;
9715 /* The encoding has 2 interleaved stepped patterns. */
9716 vec_perm_builder sel (nunits, 2, 3);
9717 machine_mode mode = TYPE_MODE (vectype);
9718 sel.quick_grow (6);
9719 for (i = 0; i < 3; i++)
9721 sel[i * 2] = i;
9722 sel[i * 2 + 1] = i + nunits;
9724 vec_perm_indices indices (sel, 2, nunits);
9725 if (!can_vec_perm_const_p (mode, mode, indices))
9726 return false;
9728 break;
9730 case vect_step_op_mul:
9732 /* Check for backend support of MULT_EXPR. */
9733 if (!directly_supported_p (MULT_EXPR, vectype))
9734 return false;
9736 /* ?? How to construct vector step for variable number vector.
9737 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9738 if (!vf.is_constant ())
9739 return false;
9741 break;
9743 case vect_step_op_shr:
9744 /* Check for backend support of RSHIFT_EXPR. */
9745 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9746 return false;
9748 /* Don't shift more than type precision to avoid UD. */
9749 if (!tree_fits_uhwi_p (step_expr)
9750 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9751 TYPE_PRECISION (TREE_TYPE (init_expr))))
9752 return false;
9753 break;
9755 case vect_step_op_shl:
9756 /* Check for backend support of RSHIFT_EXPR. */
9757 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9758 return false;
9760 /* Don't shift more than type precision to avoid UD. */
9761 if (!tree_fits_uhwi_p (step_expr)
9762 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9763 TYPE_PRECISION (TREE_TYPE (init_expr))))
9764 return false;
9766 break;
9768 default:
9769 gcc_unreachable ();
9772 if (!vec_stmt) /* transformation not required. */
9774 unsigned inside_cost = 0, prologue_cost = 0;
9775 /* loop cost for vec_loop. Neg induction doesn't have any
9776 inside_cost. */
9777 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9778 stmt_info, 0, vect_body);
9780 /* loop cost for vec_loop. Neg induction doesn't have any
9781 inside_cost. */
9782 if (induction_type == vect_step_op_neg)
9783 inside_cost = 0;
9785 /* prologue cost for vec_init and vec_step. */
9786 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9787 stmt_info, 0, vect_prologue);
9789 if (dump_enabled_p ())
9790 dump_printf_loc (MSG_NOTE, vect_location,
9791 "vect_model_induction_cost: inside_cost = %d, "
9792 "prologue_cost = %d. \n", inside_cost,
9793 prologue_cost);
9795 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9796 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9797 return true;
9800 /* Transform. */
9802 /* Compute a vector variable, initialized with the first VF values of
9803 the induction variable. E.g., for an iv with IV_PHI='X' and
9804 evolution S, for a vector of 4 units, we want to compute:
9805 [X, X + S, X + 2*S, X + 3*S]. */
9807 if (dump_enabled_p ())
9808 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9810 pe = loop_preheader_edge (iv_loop);
9811 /* Find the first insertion point in the BB. */
9812 basic_block bb = gimple_bb (phi);
9813 si = gsi_after_labels (bb);
9815 gimple_seq stmts = NULL;
9817 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9818 /* If we are using the loop mask to "peel" for alignment then we need
9819 to adjust the start value here. */
9820 if (niters_skip != NULL_TREE)
9821 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9822 step_expr, induction_type);
9824 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9825 step_expr, nunits, vectype,
9826 induction_type);
9827 if (stmts)
9829 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9830 gcc_assert (!new_bb);
9833 stmts = NULL;
9834 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9835 vf, induction_type);
9836 if (stmts)
9838 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9839 gcc_assert (!new_bb);
9842 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9843 new_name, vectype,
9844 induction_type);
9845 /* Create the following def-use cycle:
9846 loop prolog:
9847 vec_init = ...
9848 vec_step = ...
9849 loop:
9850 vec_iv = PHI <vec_init, vec_loop>
9852 STMT
9854 vec_loop = vec_iv + vec_step; */
9856 /* Create the induction-phi that defines the induction-operand. */
9857 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9858 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9859 induc_def = PHI_RESULT (induction_phi);
9861 /* Create the iv update inside the loop. */
9862 stmts = NULL;
9863 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9864 induc_def, vec_step,
9865 induction_type);
9867 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9868 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9870 /* Set the arguments of the phi node: */
9871 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9872 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9873 UNKNOWN_LOCATION);
9875 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9876 *vec_stmt = induction_phi;
9878 /* In case that vectorization factor (VF) is bigger than the number
9879 of elements that we can fit in a vectype (nunits), we have to generate
9880 more than one vector stmt - i.e - we need to "unroll" the
9881 vector stmt by a factor VF/nunits. For more details see documentation
9882 in vectorizable_operation. */
9884 if (ncopies > 1)
9886 stmts = NULL;
9887 /* FORNOW. This restriction should be relaxed. */
9888 gcc_assert (!nested_in_vect_loop);
9890 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9891 nunits, induction_type);
9893 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9894 new_name, vectype,
9895 induction_type);
9896 vec_def = induc_def;
9897 for (i = 1; i < ncopies; i++)
9899 /* vec_i = vec_prev + vec_step. */
9900 stmts = NULL;
9901 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9902 vec_def, vec_step,
9903 induction_type);
9904 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9905 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9906 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9910 if (dump_enabled_p ())
9911 dump_printf_loc (MSG_NOTE, vect_location,
9912 "transform induction: created def-use cycle: %G%G",
9913 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9915 return true;
9918 /* Function vectorizable_induction
9920 Check if STMT_INFO performs an induction computation that can be vectorized.
9921 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9922 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9923 Return true if STMT_INFO is vectorizable in this way. */
9925 bool
9926 vectorizable_induction (loop_vec_info loop_vinfo,
9927 stmt_vec_info stmt_info,
9928 gimple **vec_stmt, slp_tree slp_node,
9929 stmt_vector_for_cost *cost_vec)
9931 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9932 unsigned ncopies;
9933 bool nested_in_vect_loop = false;
9934 class loop *iv_loop;
9935 tree vec_def;
9936 edge pe = loop_preheader_edge (loop);
9937 basic_block new_bb;
9938 tree new_vec, vec_init, vec_step, t;
9939 tree new_name;
9940 gimple *new_stmt;
9941 gphi *induction_phi;
9942 tree induc_def, vec_dest;
9943 tree init_expr, step_expr;
9944 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9945 unsigned i;
9946 tree expr;
9947 gimple_stmt_iterator si;
9948 enum vect_induction_op_type induction_type
9949 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9951 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9952 if (!phi)
9953 return false;
9955 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9956 return false;
9958 /* Make sure it was recognized as induction computation. */
9959 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9960 return false;
9962 /* Handle nonlinear induction in a separate place. */
9963 if (induction_type != vect_step_op_add)
9964 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9965 vec_stmt, slp_node, cost_vec);
9967 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9968 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9970 if (slp_node)
9971 ncopies = 1;
9972 else
9973 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9974 gcc_assert (ncopies >= 1);
9976 /* FORNOW. These restrictions should be relaxed. */
9977 if (nested_in_vect_loop_p (loop, stmt_info))
9979 imm_use_iterator imm_iter;
9980 use_operand_p use_p;
9981 gimple *exit_phi;
9982 edge latch_e;
9983 tree loop_arg;
9985 if (ncopies > 1)
9987 if (dump_enabled_p ())
9988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9989 "multiple types in nested loop.\n");
9990 return false;
9993 exit_phi = NULL;
9994 latch_e = loop_latch_edge (loop->inner);
9995 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9996 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9998 gimple *use_stmt = USE_STMT (use_p);
9999 if (is_gimple_debug (use_stmt))
10000 continue;
10002 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10004 exit_phi = use_stmt;
10005 break;
10008 if (exit_phi)
10010 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10011 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10012 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10014 if (dump_enabled_p ())
10015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10016 "inner-loop induction only used outside "
10017 "of the outer vectorized loop.\n");
10018 return false;
10022 nested_in_vect_loop = true;
10023 iv_loop = loop->inner;
10025 else
10026 iv_loop = loop;
10027 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10029 if (slp_node && !nunits.is_constant ())
10031 /* The current SLP code creates the step value element-by-element. */
10032 if (dump_enabled_p ())
10033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10034 "SLP induction not supported for variable-length"
10035 " vectors.\n");
10036 return false;
10039 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10041 if (dump_enabled_p ())
10042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10043 "floating point induction vectorization disabled\n");
10044 return false;
10047 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10048 gcc_assert (step_expr != NULL_TREE);
10049 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10050 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10052 if (dump_enabled_p ())
10053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10054 "bit-precision induction vectorization not "
10055 "supported.\n");
10056 return false;
10058 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10060 /* Check for backend support of PLUS/MINUS_EXPR. */
10061 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10062 || !directly_supported_p (MINUS_EXPR, step_vectype))
10063 return false;
10065 if (!vec_stmt) /* transformation not required. */
10067 unsigned inside_cost = 0, prologue_cost = 0;
10068 if (slp_node)
10070 /* We eventually need to set a vector type on invariant
10071 arguments. */
10072 unsigned j;
10073 slp_tree child;
10074 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10075 if (!vect_maybe_update_slp_op_vectype
10076 (child, SLP_TREE_VECTYPE (slp_node)))
10078 if (dump_enabled_p ())
10079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10080 "incompatible vector types for "
10081 "invariants\n");
10082 return false;
10084 /* loop cost for vec_loop. */
10085 inside_cost
10086 = record_stmt_cost (cost_vec,
10087 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10088 vector_stmt, stmt_info, 0, vect_body);
10089 /* prologue cost for vec_init (if not nested) and step. */
10090 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10091 scalar_to_vec,
10092 stmt_info, 0, vect_prologue);
10094 else /* if (!slp_node) */
10096 /* loop cost for vec_loop. */
10097 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10098 stmt_info, 0, vect_body);
10099 /* prologue cost for vec_init and vec_step. */
10100 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10101 stmt_info, 0, vect_prologue);
10103 if (dump_enabled_p ())
10104 dump_printf_loc (MSG_NOTE, vect_location,
10105 "vect_model_induction_cost: inside_cost = %d, "
10106 "prologue_cost = %d .\n", inside_cost,
10107 prologue_cost);
10109 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10110 DUMP_VECT_SCOPE ("vectorizable_induction");
10111 return true;
10114 /* Transform. */
10116 /* Compute a vector variable, initialized with the first VF values of
10117 the induction variable. E.g., for an iv with IV_PHI='X' and
10118 evolution S, for a vector of 4 units, we want to compute:
10119 [X, X + S, X + 2*S, X + 3*S]. */
10121 if (dump_enabled_p ())
10122 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10124 pe = loop_preheader_edge (iv_loop);
10125 /* Find the first insertion point in the BB. */
10126 basic_block bb = gimple_bb (phi);
10127 si = gsi_after_labels (bb);
10129 /* For SLP induction we have to generate several IVs as for example
10130 with group size 3 we need
10131 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10132 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10133 if (slp_node)
10135 /* Enforced above. */
10136 unsigned int const_nunits = nunits.to_constant ();
10138 /* The initial values are vectorized, but any lanes > group_size
10139 need adjustment. */
10140 slp_tree init_node
10141 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10143 /* Gather steps. Since we do not vectorize inductions as
10144 cycles we have to reconstruct the step from SCEV data. */
10145 unsigned group_size = SLP_TREE_LANES (slp_node);
10146 tree *steps = XALLOCAVEC (tree, group_size);
10147 tree *inits = XALLOCAVEC (tree, group_size);
10148 stmt_vec_info phi_info;
10149 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10151 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10152 if (!init_node)
10153 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10154 pe->dest_idx);
10157 /* Now generate the IVs. */
10158 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10159 gcc_assert ((const_nunits * nvects) % group_size == 0);
10160 unsigned nivs;
10161 if (nested_in_vect_loop)
10162 nivs = nvects;
10163 else
10165 /* Compute the number of distinct IVs we need. First reduce
10166 group_size if it is a multiple of const_nunits so we get
10167 one IV for a group_size of 4 but const_nunits 2. */
10168 unsigned group_sizep = group_size;
10169 if (group_sizep % const_nunits == 0)
10170 group_sizep = group_sizep / const_nunits;
10171 nivs = least_common_multiple (group_sizep,
10172 const_nunits) / const_nunits;
10174 tree stept = TREE_TYPE (step_vectype);
10175 tree lupdate_mul = NULL_TREE;
10176 if (!nested_in_vect_loop)
10178 /* The number of iterations covered in one vector iteration. */
10179 unsigned lup_mul = (nvects * const_nunits) / group_size;
10180 lupdate_mul
10181 = build_vector_from_val (step_vectype,
10182 SCALAR_FLOAT_TYPE_P (stept)
10183 ? build_real_from_wide (stept, lup_mul,
10184 UNSIGNED)
10185 : build_int_cstu (stept, lup_mul));
10187 tree peel_mul = NULL_TREE;
10188 gimple_seq init_stmts = NULL;
10189 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10191 if (SCALAR_FLOAT_TYPE_P (stept))
10192 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10193 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10194 else
10195 peel_mul = gimple_convert (&init_stmts, stept,
10196 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10197 peel_mul = gimple_build_vector_from_val (&init_stmts,
10198 step_vectype, peel_mul);
10200 unsigned ivn;
10201 auto_vec<tree> vec_steps;
10202 for (ivn = 0; ivn < nivs; ++ivn)
10204 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10205 tree_vector_builder init_elts (vectype, const_nunits, 1);
10206 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10207 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10209 /* The scalar steps of the IVs. */
10210 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10211 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10212 step_elts.quick_push (elt);
10213 if (!init_node)
10215 /* The scalar inits of the IVs if not vectorized. */
10216 elt = inits[(ivn*const_nunits + eltn) % group_size];
10217 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10218 TREE_TYPE (elt)))
10219 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10220 TREE_TYPE (vectype), elt);
10221 init_elts.quick_push (elt);
10223 /* The number of steps to add to the initial values. */
10224 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10225 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10226 ? build_real_from_wide (stept,
10227 mul_elt, UNSIGNED)
10228 : build_int_cstu (stept, mul_elt));
10230 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10231 vec_steps.safe_push (vec_step);
10232 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10233 if (peel_mul)
10234 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10235 step_mul, peel_mul);
10236 if (!init_node)
10237 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10239 /* Create the induction-phi that defines the induction-operand. */
10240 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10241 "vec_iv_");
10242 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10243 induc_def = PHI_RESULT (induction_phi);
10245 /* Create the iv update inside the loop */
10246 tree up = vec_step;
10247 if (lupdate_mul)
10248 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10249 vec_step, lupdate_mul);
10250 gimple_seq stmts = NULL;
10251 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10252 vec_def = gimple_build (&stmts,
10253 PLUS_EXPR, step_vectype, vec_def, up);
10254 vec_def = gimple_convert (&stmts, vectype, vec_def);
10255 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10256 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10257 UNKNOWN_LOCATION);
10259 if (init_node)
10260 vec_init = vect_get_slp_vect_def (init_node, ivn);
10261 if (!nested_in_vect_loop
10262 && !integer_zerop (step_mul))
10264 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10265 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10266 vec_step, step_mul);
10267 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10268 vec_def, up);
10269 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10272 /* Set the arguments of the phi node: */
10273 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10275 slp_node->push_vec_def (induction_phi);
10277 if (!nested_in_vect_loop)
10279 /* Fill up to the number of vectors we need for the whole group. */
10280 nivs = least_common_multiple (group_size,
10281 const_nunits) / const_nunits;
10282 vec_steps.reserve (nivs-ivn);
10283 for (; ivn < nivs; ++ivn)
10285 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10286 vec_steps.quick_push (vec_steps[0]);
10290 /* Re-use IVs when we can. We are generating further vector
10291 stmts by adding VF' * stride to the IVs generated above. */
10292 if (ivn < nvects)
10294 unsigned vfp
10295 = least_common_multiple (group_size, const_nunits) / group_size;
10296 tree lupdate_mul
10297 = build_vector_from_val (step_vectype,
10298 SCALAR_FLOAT_TYPE_P (stept)
10299 ? build_real_from_wide (stept,
10300 vfp, UNSIGNED)
10301 : build_int_cstu (stept, vfp));
10302 for (; ivn < nvects; ++ivn)
10304 gimple *iv
10305 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10306 tree def = gimple_get_lhs (iv);
10307 if (ivn < 2*nivs)
10308 vec_steps[ivn - nivs]
10309 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10310 vec_steps[ivn - nivs], lupdate_mul);
10311 gimple_seq stmts = NULL;
10312 def = gimple_convert (&stmts, step_vectype, def);
10313 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10314 def, vec_steps[ivn % nivs]);
10315 def = gimple_convert (&stmts, vectype, def);
10316 if (gimple_code (iv) == GIMPLE_PHI)
10317 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10318 else
10320 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10321 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10323 slp_node->push_vec_def (def);
10327 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10328 gcc_assert (!new_bb);
10330 return true;
10333 init_expr = vect_phi_initial_value (phi);
10335 gimple_seq stmts = NULL;
10336 if (!nested_in_vect_loop)
10338 /* Convert the initial value to the IV update type. */
10339 tree new_type = TREE_TYPE (step_expr);
10340 init_expr = gimple_convert (&stmts, new_type, init_expr);
10342 /* If we are using the loop mask to "peel" for alignment then we need
10343 to adjust the start value here. */
10344 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10345 if (skip_niters != NULL_TREE)
10347 if (FLOAT_TYPE_P (vectype))
10348 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10349 skip_niters);
10350 else
10351 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10352 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10353 skip_niters, step_expr);
10354 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10355 init_expr, skip_step);
10359 if (stmts)
10361 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10362 gcc_assert (!new_bb);
10365 /* Create the vector that holds the initial_value of the induction. */
10366 if (nested_in_vect_loop)
10368 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10369 been created during vectorization of previous stmts. We obtain it
10370 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10371 auto_vec<tree> vec_inits;
10372 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10373 init_expr, &vec_inits);
10374 vec_init = vec_inits[0];
10375 /* If the initial value is not of proper type, convert it. */
10376 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10378 new_stmt
10379 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10380 vect_simple_var,
10381 "vec_iv_"),
10382 VIEW_CONVERT_EXPR,
10383 build1 (VIEW_CONVERT_EXPR, vectype,
10384 vec_init));
10385 vec_init = gimple_assign_lhs (new_stmt);
10386 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10387 new_stmt);
10388 gcc_assert (!new_bb);
10391 else
10393 /* iv_loop is the loop to be vectorized. Create:
10394 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10395 stmts = NULL;
10396 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10398 unsigned HOST_WIDE_INT const_nunits;
10399 if (nunits.is_constant (&const_nunits))
10401 tree_vector_builder elts (step_vectype, const_nunits, 1);
10402 elts.quick_push (new_name);
10403 for (i = 1; i < const_nunits; i++)
10405 /* Create: new_name_i = new_name + step_expr */
10406 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10407 new_name, step_expr);
10408 elts.quick_push (new_name);
10410 /* Create a vector from [new_name_0, new_name_1, ...,
10411 new_name_nunits-1] */
10412 vec_init = gimple_build_vector (&stmts, &elts);
10414 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10415 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10416 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10417 new_name, step_expr);
10418 else
10420 /* Build:
10421 [base, base, base, ...]
10422 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10423 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10424 gcc_assert (flag_associative_math);
10425 tree index = build_index_vector (step_vectype, 0, 1);
10426 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10427 new_name);
10428 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10429 step_expr);
10430 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10431 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10432 vec_init, step_vec);
10433 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10434 vec_init, base_vec);
10436 vec_init = gimple_convert (&stmts, vectype, vec_init);
10438 if (stmts)
10440 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10441 gcc_assert (!new_bb);
10446 /* Create the vector that holds the step of the induction. */
10447 gimple_stmt_iterator *step_iv_si = NULL;
10448 if (nested_in_vect_loop)
10449 /* iv_loop is nested in the loop to be vectorized. Generate:
10450 vec_step = [S, S, S, S] */
10451 new_name = step_expr;
10452 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10454 /* When we're using loop_len produced by SELEC_VL, the non-final
10455 iterations are not always processing VF elements. So vectorize
10456 induction variable instead of
10458 _21 = vect_vec_iv_.6_22 + { VF, ... };
10460 We should generate:
10462 _35 = .SELECT_VL (ivtmp_33, VF);
10463 vect_cst__22 = [vec_duplicate_expr] _35;
10464 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10465 gcc_assert (!slp_node);
10466 gimple_seq seq = NULL;
10467 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10468 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10469 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10470 unshare_expr (len)),
10471 &seq, true, NULL_TREE);
10472 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10473 step_expr);
10474 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10475 step_iv_si = &si;
10477 else
10479 /* iv_loop is the loop to be vectorized. Generate:
10480 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10481 gimple_seq seq = NULL;
10482 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10484 expr = build_int_cst (integer_type_node, vf);
10485 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10487 else
10488 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10489 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10490 expr, step_expr);
10491 if (seq)
10493 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10494 gcc_assert (!new_bb);
10498 t = unshare_expr (new_name);
10499 gcc_assert (CONSTANT_CLASS_P (new_name)
10500 || TREE_CODE (new_name) == SSA_NAME);
10501 new_vec = build_vector_from_val (step_vectype, t);
10502 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10503 new_vec, step_vectype, step_iv_si);
10506 /* Create the following def-use cycle:
10507 loop prolog:
10508 vec_init = ...
10509 vec_step = ...
10510 loop:
10511 vec_iv = PHI <vec_init, vec_loop>
10513 STMT
10515 vec_loop = vec_iv + vec_step; */
10517 /* Create the induction-phi that defines the induction-operand. */
10518 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10519 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10520 induc_def = PHI_RESULT (induction_phi);
10522 /* Create the iv update inside the loop */
10523 stmts = NULL;
10524 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10525 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10526 vec_def = gimple_convert (&stmts, vectype, vec_def);
10527 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10528 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10530 /* Set the arguments of the phi node: */
10531 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10532 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10533 UNKNOWN_LOCATION);
10535 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10536 *vec_stmt = induction_phi;
10538 /* In case that vectorization factor (VF) is bigger than the number
10539 of elements that we can fit in a vectype (nunits), we have to generate
10540 more than one vector stmt - i.e - we need to "unroll" the
10541 vector stmt by a factor VF/nunits. For more details see documentation
10542 in vectorizable_operation. */
10544 if (ncopies > 1)
10546 gimple_seq seq = NULL;
10547 /* FORNOW. This restriction should be relaxed. */
10548 gcc_assert (!nested_in_vect_loop);
10549 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10550 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10552 /* Create the vector that holds the step of the induction. */
10553 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10555 expr = build_int_cst (integer_type_node, nunits);
10556 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10558 else
10559 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10560 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10561 expr, step_expr);
10562 if (seq)
10564 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10565 gcc_assert (!new_bb);
10568 t = unshare_expr (new_name);
10569 gcc_assert (CONSTANT_CLASS_P (new_name)
10570 || TREE_CODE (new_name) == SSA_NAME);
10571 new_vec = build_vector_from_val (step_vectype, t);
10572 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10573 new_vec, step_vectype, NULL);
10575 vec_def = induc_def;
10576 for (i = 1; i < ncopies + 1; i++)
10578 /* vec_i = vec_prev + vec_step */
10579 gimple_seq stmts = NULL;
10580 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10581 vec_def = gimple_build (&stmts,
10582 PLUS_EXPR, step_vectype, vec_def, vec_step);
10583 vec_def = gimple_convert (&stmts, vectype, vec_def);
10585 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10586 if (i < ncopies)
10588 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10589 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10591 else
10593 /* vec_1 = vec_iv + (VF/n * S)
10594 vec_2 = vec_1 + (VF/n * S)
10596 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10598 vec_n is used as vec_loop to save the large step register and
10599 related operations. */
10600 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10601 UNKNOWN_LOCATION);
10606 if (dump_enabled_p ())
10607 dump_printf_loc (MSG_NOTE, vect_location,
10608 "transform induction: created def-use cycle: %G%G",
10609 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10611 return true;
10614 /* Function vectorizable_live_operation_1.
10616 helper function for vectorizable_live_operation. */
10618 static tree
10619 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10620 stmt_vec_info stmt_info, basic_block exit_bb,
10621 tree vectype, int ncopies, slp_tree slp_node,
10622 tree bitsize, tree bitstart, tree vec_lhs,
10623 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10625 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10627 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10628 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10629 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10630 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10632 gimple_seq stmts = NULL;
10633 tree new_tree;
10635 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10636 if (integer_zerop (bitstart))
10638 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10639 vec_lhs_phi, bitsize, bitstart);
10641 /* Convert the extracted vector element to the scalar type. */
10642 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10644 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10646 /* Emit:
10648 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10650 where VEC_LHS is the vectorized live-out result and MASK is
10651 the loop mask for the final iteration. */
10652 gcc_assert (ncopies == 1 && !slp_node);
10653 gimple_seq tem = NULL;
10654 gimple_stmt_iterator gsi = gsi_last (tem);
10655 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10656 &LOOP_VINFO_LENS (loop_vinfo),
10657 1, vectype, 0, 0);
10659 /* BIAS - 1. */
10660 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10661 tree bias_minus_one
10662 = int_const_binop (MINUS_EXPR,
10663 build_int_cst (TREE_TYPE (len), biasval),
10664 build_one_cst (TREE_TYPE (len)));
10666 /* LAST_INDEX = LEN + (BIAS - 1). */
10667 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10668 len, bias_minus_one);
10670 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10671 tree scalar_res
10672 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10673 vec_lhs_phi, last_index);
10675 /* Convert the extracted vector element to the scalar type. */
10676 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10678 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10680 /* Emit:
10682 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10684 where VEC_LHS is the vectorized live-out result and MASK is
10685 the loop mask for the final iteration. */
10686 gcc_assert (!slp_node);
10687 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10688 gimple_seq tem = NULL;
10689 gimple_stmt_iterator gsi = gsi_last (tem);
10690 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10691 &LOOP_VINFO_MASKS (loop_vinfo),
10692 1, vectype, 0);
10693 tree scalar_res;
10694 gimple_seq_add_seq (&stmts, tem);
10696 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10697 mask, vec_lhs_phi);
10699 /* Convert the extracted vector element to the scalar type. */
10700 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10702 else
10704 tree bftype = TREE_TYPE (vectype);
10705 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10706 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10707 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10708 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10709 &stmts, true, NULL_TREE);
10712 *exit_gsi = gsi_after_labels (exit_bb);
10713 if (stmts)
10714 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10716 return new_tree;
10719 /* Function vectorizable_live_operation.
10721 STMT_INFO computes a value that is used outside the loop. Check if
10722 it can be supported. */
10724 bool
10725 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10726 slp_tree slp_node, slp_instance slp_node_instance,
10727 int slp_index, bool vec_stmt_p,
10728 stmt_vector_for_cost *cost_vec)
10730 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10731 imm_use_iterator imm_iter;
10732 tree lhs, lhs_type, bitsize;
10733 tree vectype = (slp_node
10734 ? SLP_TREE_VECTYPE (slp_node)
10735 : STMT_VINFO_VECTYPE (stmt_info));
10736 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10737 int ncopies;
10738 gimple *use_stmt;
10739 use_operand_p use_p;
10740 auto_vec<tree> vec_oprnds;
10741 int vec_entry = 0;
10742 poly_uint64 vec_index = 0;
10744 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10745 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10747 /* If a stmt of a reduction is live, vectorize it via
10748 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10749 validity so just trigger the transform here. */
10750 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10752 if (!vec_stmt_p)
10753 return true;
10754 if (slp_node)
10756 /* For reduction chains the meta-info is attached to
10757 the group leader. */
10758 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10759 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10760 /* For SLP reductions we vectorize the epilogue for
10761 all involved stmts together. */
10762 else if (slp_index != 0)
10763 return true;
10765 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10766 gcc_assert (reduc_info->is_reduc_info);
10767 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10768 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10769 return true;
10771 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10772 slp_node_instance,
10773 LOOP_VINFO_IV_EXIT (loop_vinfo));
10775 /* If early break we only have to materialize the reduction on the merge
10776 block, but we have to find an alternate exit first. */
10777 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10779 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10780 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10782 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10783 slp_node, slp_node_instance,
10784 exit);
10785 break;
10789 return true;
10792 /* If STMT is not relevant and it is a simple assignment and its inputs are
10793 invariant then it can remain in place, unvectorized. The original last
10794 scalar value that it computes will be used. */
10795 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10797 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10798 if (dump_enabled_p ())
10799 dump_printf_loc (MSG_NOTE, vect_location,
10800 "statement is simple and uses invariant. Leaving in "
10801 "place.\n");
10802 return true;
10805 if (slp_node)
10806 ncopies = 1;
10807 else
10808 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10810 if (slp_node)
10812 gcc_assert (slp_index >= 0);
10814 /* Get the last occurrence of the scalar index from the concatenation of
10815 all the slp vectors. Calculate which slp vector it is and the index
10816 within. */
10817 int num_scalar = SLP_TREE_LANES (slp_node);
10818 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10819 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10821 /* Calculate which vector contains the result, and which lane of
10822 that vector we need. */
10823 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10825 if (dump_enabled_p ())
10826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10827 "Cannot determine which vector holds the"
10828 " final result.\n");
10829 return false;
10833 if (!vec_stmt_p)
10835 /* No transformation required. */
10836 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10838 if (slp_node)
10840 if (dump_enabled_p ())
10841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10842 "can't operate on partial vectors "
10843 "because an SLP statement is live after "
10844 "the loop.\n");
10845 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10847 else if (ncopies > 1)
10849 if (dump_enabled_p ())
10850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10851 "can't operate on partial vectors "
10852 "because ncopies is greater than 1.\n");
10853 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10855 else
10857 gcc_assert (ncopies == 1 && !slp_node);
10858 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10859 OPTIMIZE_FOR_SPEED))
10860 vect_record_loop_mask (loop_vinfo,
10861 &LOOP_VINFO_MASKS (loop_vinfo),
10862 1, vectype, NULL);
10863 else if (can_vec_extract_var_idx_p (
10864 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10865 vect_record_loop_len (loop_vinfo,
10866 &LOOP_VINFO_LENS (loop_vinfo),
10867 1, vectype, 1);
10868 else
10870 if (dump_enabled_p ())
10871 dump_printf_loc (
10872 MSG_MISSED_OPTIMIZATION, vect_location,
10873 "can't operate on partial vectors "
10874 "because the target doesn't support extract "
10875 "last reduction.\n");
10876 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10880 /* ??? Enable for loop costing as well. */
10881 if (!loop_vinfo)
10882 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10883 0, vect_epilogue);
10884 return true;
10887 /* Use the lhs of the original scalar statement. */
10888 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10889 if (dump_enabled_p ())
10890 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10891 "stmt %G", stmt);
10893 lhs = gimple_get_lhs (stmt);
10894 lhs_type = TREE_TYPE (lhs);
10896 bitsize = vector_element_bits_tree (vectype);
10898 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10899 tree vec_lhs, vec_lhs0, bitstart;
10900 gimple *vec_stmt, *vec_stmt0;
10901 if (slp_node)
10903 gcc_assert (!loop_vinfo
10904 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10905 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10907 /* Get the correct slp vectorized stmt. */
10908 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10909 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10911 /* In case we need to early break vectorize also get the first stmt. */
10912 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10913 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10915 /* Get entry to use. */
10916 bitstart = bitsize_int (vec_index);
10917 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10919 else
10921 /* For multiple copies, get the last copy. */
10922 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10923 vec_lhs = gimple_get_lhs (vec_stmt);
10925 /* In case we need to early break vectorize also get the first stmt. */
10926 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10927 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10929 /* Get the last lane in the vector. */
10930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10933 if (loop_vinfo)
10935 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10936 requirement, insert one phi node for it. It looks like:
10937 loop;
10939 # lhs' = PHI <lhs>
10941 loop;
10943 # vec_lhs' = PHI <vec_lhs>
10944 new_tree = lane_extract <vec_lhs', ...>;
10945 lhs' = new_tree; */
10947 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10948 /* Check if we have a loop where the chosen exit is not the main exit,
10949 in these cases for an early break we restart the iteration the vector code
10950 did. For the live values we want the value at the start of the iteration
10951 rather than at the end. */
10952 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10953 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10954 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10955 if (!is_gimple_debug (use_stmt)
10956 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10957 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10959 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10960 phi_arg_index_from_use (use_p));
10961 gcc_assert (loop_exit_edge_p (loop, e));
10962 bool main_exit_edge = e == main_e;
10963 tree tmp_vec_lhs = vec_lhs;
10964 tree tmp_bitstart = bitstart;
10966 /* For early exit where the exit is not in the BB that leads
10967 to the latch then we're restarting the iteration in the
10968 scalar loop. So get the first live value. */
10969 restart_loop = restart_loop || !main_exit_edge;
10970 if (restart_loop
10971 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10973 tmp_vec_lhs = vec_lhs0;
10974 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10977 gimple_stmt_iterator exit_gsi;
10978 tree new_tree
10979 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10980 e->dest, vectype, ncopies,
10981 slp_node, bitsize,
10982 tmp_bitstart, tmp_vec_lhs,
10983 lhs_type, &exit_gsi);
10985 auto gsi = gsi_for_stmt (use_stmt);
10986 remove_phi_node (&gsi, false);
10987 tree lhs_phi = gimple_phi_result (use_stmt);
10988 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10989 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10990 break;
10993 /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10994 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10995 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10997 else
10999 /* For basic-block vectorization simply insert the lane-extraction. */
11000 tree bftype = TREE_TYPE (vectype);
11001 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11002 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11003 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11004 vec_lhs, bitsize, bitstart);
11005 gimple_seq stmts = NULL;
11006 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11007 &stmts, true, NULL_TREE);
11008 if (TREE_CODE (new_tree) == SSA_NAME
11009 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11010 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11011 if (is_a <gphi *> (vec_stmt))
11013 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11014 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11016 else
11018 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11019 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11022 /* Replace use of lhs with newly computed result. If the use stmt is a
11023 single arg PHI, just replace all uses of PHI result. It's necessary
11024 because lcssa PHI defining lhs may be before newly inserted stmt. */
11025 use_operand_p use_p;
11026 stmt_vec_info use_stmt_info;
11027 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11028 if (!is_gimple_debug (use_stmt)
11029 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11030 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11032 /* ??? This can happen when the live lane ends up being
11033 rooted in a vector construction code-generated by an
11034 external SLP node (and code-generation for that already
11035 happened). See gcc.dg/vect/bb-slp-47.c.
11036 Doing this is what would happen if that vector CTOR
11037 were not code-generated yet so it is not too bad.
11038 ??? In fact we'd likely want to avoid this situation
11039 in the first place. */
11040 if (TREE_CODE (new_tree) == SSA_NAME
11041 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11042 && gimple_code (use_stmt) != GIMPLE_PHI
11043 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11044 use_stmt))
11046 if (dump_enabled_p ())
11047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11048 "Using original scalar computation for "
11049 "live lane because use preceeds vector "
11050 "def\n");
11051 continue;
11053 /* ??? It can also happen that we end up pulling a def into
11054 a loop where replacing out-of-loop uses would require
11055 a new LC SSA PHI node. Retain the original scalar in
11056 those cases as well. PR98064. */
11057 if (TREE_CODE (new_tree) == SSA_NAME
11058 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11059 && (gimple_bb (use_stmt)->loop_father
11060 != gimple_bb (vec_stmt)->loop_father)
11061 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11062 gimple_bb (use_stmt)->loop_father))
11064 if (dump_enabled_p ())
11065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11066 "Using original scalar computation for "
11067 "live lane because there is an out-of-loop "
11068 "definition for it\n");
11069 continue;
11071 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11072 SET_USE (use_p, new_tree);
11073 update_stmt (use_stmt);
11077 return true;
11080 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11082 static void
11083 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11085 ssa_op_iter op_iter;
11086 imm_use_iterator imm_iter;
11087 def_operand_p def_p;
11088 gimple *ustmt;
11090 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11092 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11094 basic_block bb;
11096 if (!is_gimple_debug (ustmt))
11097 continue;
11099 bb = gimple_bb (ustmt);
11101 if (!flow_bb_inside_loop_p (loop, bb))
11103 if (gimple_debug_bind_p (ustmt))
11105 if (dump_enabled_p ())
11106 dump_printf_loc (MSG_NOTE, vect_location,
11107 "killing debug use\n");
11109 gimple_debug_bind_reset_value (ustmt);
11110 update_stmt (ustmt);
11112 else
11113 gcc_unreachable ();
11119 /* Given loop represented by LOOP_VINFO, return true if computation of
11120 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11121 otherwise. */
11123 static bool
11124 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11126 /* Constant case. */
11127 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11129 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11130 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11132 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11133 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11134 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11135 return true;
11138 widest_int max;
11139 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11140 /* Check the upper bound of loop niters. */
11141 if (get_max_loop_iterations (loop, &max))
11143 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11144 signop sgn = TYPE_SIGN (type);
11145 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11146 if (max < type_max)
11147 return true;
11149 return false;
11152 /* Return a mask type with half the number of elements as OLD_TYPE,
11153 given that it should have mode NEW_MODE. */
11155 tree
11156 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11158 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11159 return build_truth_vector_type_for_mode (nunits, new_mode);
11162 /* Return a mask type with twice as many elements as OLD_TYPE,
11163 given that it should have mode NEW_MODE. */
11165 tree
11166 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11168 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11169 return build_truth_vector_type_for_mode (nunits, new_mode);
11172 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11173 contain a sequence of NVECTORS masks that each control a vector of type
11174 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11175 these vector masks with the vector version of SCALAR_MASK. */
11177 void
11178 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11179 unsigned int nvectors, tree vectype, tree scalar_mask)
11181 gcc_assert (nvectors != 0);
11183 if (scalar_mask)
11185 scalar_cond_masked_key cond (scalar_mask, nvectors);
11186 loop_vinfo->scalar_cond_masked_set.add (cond);
11189 masks->mask_set.add (std::make_pair (vectype, nvectors));
11192 /* Given a complete set of masks MASKS, extract mask number INDEX
11193 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11194 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11196 See the comment above vec_loop_masks for more details about the mask
11197 arrangement. */
11199 tree
11200 vect_get_loop_mask (loop_vec_info loop_vinfo,
11201 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11202 unsigned int nvectors, tree vectype, unsigned int index)
11204 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11205 == vect_partial_vectors_while_ult)
11207 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11208 tree mask_type = rgm->type;
11210 /* Populate the rgroup's mask array, if this is the first time we've
11211 used it. */
11212 if (rgm->controls.is_empty ())
11214 rgm->controls.safe_grow_cleared (nvectors, true);
11215 for (unsigned int i = 0; i < nvectors; ++i)
11217 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11218 /* Provide a dummy definition until the real one is available. */
11219 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11220 rgm->controls[i] = mask;
11224 tree mask = rgm->controls[index];
11225 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11226 TYPE_VECTOR_SUBPARTS (vectype)))
11228 /* A loop mask for data type X can be reused for data type Y
11229 if X has N times more elements than Y and if Y's elements
11230 are N times bigger than X's. In this case each sequence
11231 of N elements in the loop mask will be all-zero or all-one.
11232 We can then view-convert the mask so that each sequence of
11233 N elements is replaced by a single element. */
11234 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11235 TYPE_VECTOR_SUBPARTS (vectype)));
11236 gimple_seq seq = NULL;
11237 mask_type = truth_type_for (vectype);
11238 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11239 if (seq)
11240 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11242 return mask;
11244 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11245 == vect_partial_vectors_avx512)
11247 /* The number of scalars per iteration and the number of vectors are
11248 both compile-time constants. */
11249 unsigned int nscalars_per_iter
11250 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11251 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11253 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11255 /* The stored nV is dependent on the mask type produced. */
11256 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11257 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11258 == rgm->factor);
11259 nvectors = rgm->factor;
11261 /* Populate the rgroup's mask array, if this is the first time we've
11262 used it. */
11263 if (rgm->controls.is_empty ())
11265 rgm->controls.safe_grow_cleared (nvectors, true);
11266 for (unsigned int i = 0; i < nvectors; ++i)
11268 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11269 /* Provide a dummy definition until the real one is available. */
11270 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11271 rgm->controls[i] = mask;
11274 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11275 TYPE_VECTOR_SUBPARTS (vectype)))
11276 return rgm->controls[index];
11278 /* Split the vector if needed. Since we are dealing with integer mode
11279 masks with AVX512 we can operate on the integer representation
11280 performing the whole vector shifting. */
11281 unsigned HOST_WIDE_INT factor;
11282 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11283 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11284 gcc_assert (ok);
11285 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11286 tree mask_type = truth_type_for (vectype);
11287 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11288 unsigned vi = index / factor;
11289 unsigned vpart = index % factor;
11290 tree vec = rgm->controls[vi];
11291 gimple_seq seq = NULL;
11292 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11293 lang_hooks.types.type_for_mode
11294 (TYPE_MODE (rgm->type), 1), vec);
11295 /* For integer mode masks simply shift the right bits into position. */
11296 if (vpart != 0)
11297 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11298 build_int_cst (integer_type_node,
11299 (TYPE_VECTOR_SUBPARTS (vectype)
11300 * vpart)));
11301 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11302 (TYPE_MODE (mask_type), 1), vec);
11303 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11304 if (seq)
11305 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11306 return vec;
11308 else
11309 gcc_unreachable ();
11312 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11313 lengths for controlling an operation on VECTYPE. The operation splits
11314 each element of VECTYPE into FACTOR separate subelements, measuring the
11315 length as a number of these subelements. */
11317 void
11318 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11319 unsigned int nvectors, tree vectype, unsigned int factor)
11321 gcc_assert (nvectors != 0);
11322 if (lens->length () < nvectors)
11323 lens->safe_grow_cleared (nvectors, true);
11324 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11326 /* The number of scalars per iteration, scalar occupied bytes and
11327 the number of vectors are both compile-time constants. */
11328 unsigned int nscalars_per_iter
11329 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11330 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11332 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11334 /* For now, we only support cases in which all loads and stores fall back
11335 to VnQI or none do. */
11336 gcc_assert (!rgl->max_nscalars_per_iter
11337 || (rgl->factor == 1 && factor == 1)
11338 || (rgl->max_nscalars_per_iter * rgl->factor
11339 == nscalars_per_iter * factor));
11340 rgl->max_nscalars_per_iter = nscalars_per_iter;
11341 rgl->type = vectype;
11342 rgl->factor = factor;
11346 /* Given a complete set of lengths LENS, extract length number INDEX
11347 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11348 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11349 multipled by the number of elements that should be processed.
11350 Insert any set-up statements before GSI. */
11352 tree
11353 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11354 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11355 unsigned int index, unsigned int factor)
11357 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11358 bool use_bias_adjusted_len =
11359 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11361 /* Populate the rgroup's len array, if this is the first time we've
11362 used it. */
11363 if (rgl->controls.is_empty ())
11365 rgl->controls.safe_grow_cleared (nvectors, true);
11366 for (unsigned int i = 0; i < nvectors; ++i)
11368 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11369 gcc_assert (len_type != NULL_TREE);
11371 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11373 /* Provide a dummy definition until the real one is available. */
11374 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11375 rgl->controls[i] = len;
11377 if (use_bias_adjusted_len)
11379 gcc_assert (i == 0);
11380 tree adjusted_len =
11381 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11382 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11383 rgl->bias_adjusted_ctrl = adjusted_len;
11388 if (use_bias_adjusted_len)
11389 return rgl->bias_adjusted_ctrl;
11391 tree loop_len = rgl->controls[index];
11392 if (rgl->factor == 1 && factor == 1)
11394 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11395 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11396 if (maybe_ne (nunits1, nunits2))
11398 /* A loop len for data type X can be reused for data type Y
11399 if X has N times more elements than Y and if Y's elements
11400 are N times bigger than X's. */
11401 gcc_assert (multiple_p (nunits1, nunits2));
11402 factor = exact_div (nunits1, nunits2).to_constant ();
11403 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11404 gimple_seq seq = NULL;
11405 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11406 build_int_cst (iv_type, factor));
11407 if (seq)
11408 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11411 return loop_len;
11414 /* Scale profiling counters by estimation for LOOP which is vectorized
11415 by factor VF.
11416 If FLAT is true, the loop we started with had unrealistically flat
11417 profile. */
11419 static void
11420 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11422 /* For flat profiles do not scale down proportionally by VF and only
11423 cap by known iteration count bounds. */
11424 if (flat)
11426 if (dump_file && (dump_flags & TDF_DETAILS))
11427 fprintf (dump_file,
11428 "Vectorized loop profile seems flat; not scaling iteration "
11429 "count down by the vectorization factor %i\n", vf);
11430 scale_loop_profile (loop, profile_probability::always (),
11431 get_likely_max_loop_iterations_int (loop));
11432 return;
11434 /* Loop body executes VF fewer times and exit increases VF times. */
11435 profile_count entry_count = loop_preheader_edge (loop)->count ();
11437 /* If we have unreliable loop profile avoid dropping entry
11438 count bellow header count. This can happen since loops
11439 has unrealistically low trip counts. */
11440 while (vf > 1
11441 && loop->header->count > entry_count
11442 && loop->header->count < entry_count * vf)
11444 if (dump_file && (dump_flags & TDF_DETAILS))
11445 fprintf (dump_file,
11446 "Vectorization factor %i seems too large for profile "
11447 "prevoiusly believed to be consistent; reducing.\n", vf);
11448 vf /= 2;
11451 if (entry_count.nonzero_p ())
11452 set_edge_probability_and_rescale_others
11453 (exit_e,
11454 entry_count.probability_in (loop->header->count / vf));
11455 /* Avoid producing very large exit probability when we do not have
11456 sensible profile. */
11457 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11458 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11459 loop->latch->count = single_pred_edge (loop->latch)->count ();
11461 scale_loop_profile (loop, profile_probability::always () / vf,
11462 get_likely_max_loop_iterations_int (loop));
11465 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11466 latch edge values originally defined by it. */
11468 static void
11469 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11470 stmt_vec_info def_stmt_info)
11472 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11473 if (!def || TREE_CODE (def) != SSA_NAME)
11474 return;
11475 stmt_vec_info phi_info;
11476 imm_use_iterator iter;
11477 use_operand_p use_p;
11478 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11480 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11481 if (!phi)
11482 continue;
11483 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11484 && (phi_info = loop_vinfo->lookup_stmt (phi))
11485 && STMT_VINFO_RELEVANT_P (phi_info)))
11486 continue;
11487 loop_p loop = gimple_bb (phi)->loop_father;
11488 edge e = loop_latch_edge (loop);
11489 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11490 continue;
11492 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11493 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11494 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11496 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11497 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11498 gcc_assert (phi_defs.length () == latch_defs.length ());
11499 for (unsigned i = 0; i < phi_defs.length (); ++i)
11500 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11501 gimple_get_lhs (latch_defs[i]), e,
11502 gimple_phi_arg_location (phi, e->dest_idx));
11504 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11506 /* For first order recurrences we have to update both uses of
11507 the latch definition, the one in the PHI node and the one
11508 in the generated VEC_PERM_EXPR. */
11509 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11510 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11511 gcc_assert (phi_defs.length () == latch_defs.length ());
11512 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11513 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11514 for (unsigned i = 0; i < phi_defs.length (); ++i)
11516 gassign *perm = as_a <gassign *> (phi_defs[i]);
11517 if (i > 0)
11518 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11519 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11520 update_stmt (perm);
11522 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11523 gimple_phi_arg_location (phi, e->dest_idx));
11528 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11529 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11530 stmt_vec_info. */
11532 static bool
11533 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11534 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11536 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11537 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11539 if (dump_enabled_p ())
11540 dump_printf_loc (MSG_NOTE, vect_location,
11541 "------>vectorizing statement: %G", stmt_info->stmt);
11543 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11544 vect_loop_kill_debug_uses (loop, stmt_info);
11546 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11547 && !STMT_VINFO_LIVE_P (stmt_info))
11549 if (is_gimple_call (stmt_info->stmt)
11550 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11552 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11553 *seen_store = stmt_info;
11554 return false;
11556 return false;
11559 if (STMT_VINFO_VECTYPE (stmt_info))
11561 poly_uint64 nunits
11562 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11563 if (!STMT_SLP_TYPE (stmt_info)
11564 && maybe_ne (nunits, vf)
11565 && dump_enabled_p ())
11566 /* For SLP VF is set according to unrolling factor, and not
11567 to vector size, hence for SLP this print is not valid. */
11568 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11571 /* Pure SLP statements have already been vectorized. We still need
11572 to apply loop vectorization to hybrid SLP statements. */
11573 if (PURE_SLP_STMT (stmt_info))
11574 return false;
11576 if (dump_enabled_p ())
11577 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11579 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11580 *seen_store = stmt_info;
11582 return true;
11585 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11586 in the hash_map with its corresponding values. */
11588 static tree
11589 find_in_mapping (tree t, void *context)
11591 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11593 tree *value = mapping->get (t);
11594 return value ? *value : t;
11597 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11598 original loop that has now been vectorized.
11600 The inits of the data_references need to be advanced with the number of
11601 iterations of the main loop. This has been computed in vect_do_peeling and
11602 is stored in parameter ADVANCE. We first restore the data_references
11603 initial offset with the values recored in ORIG_DRS_INIT.
11605 Since the loop_vec_info of this EPILOGUE was constructed for the original
11606 loop, its stmt_vec_infos all point to the original statements. These need
11607 to be updated to point to their corresponding copies as well as the SSA_NAMES
11608 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11610 The data_reference's connections also need to be updated. Their
11611 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11612 stmt_vec_infos, their statements need to point to their corresponding copy,
11613 if they are gather loads or scatter stores then their reference needs to be
11614 updated to point to its corresponding copy and finally we set
11615 'base_misaligned' to false as we have already peeled for alignment in the
11616 prologue of the main loop. */
11618 static void
11619 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11621 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11622 auto_vec<gimple *> stmt_worklist;
11623 hash_map<tree,tree> mapping;
11624 gimple *orig_stmt, *new_stmt;
11625 gimple_stmt_iterator epilogue_gsi;
11626 gphi_iterator epilogue_phi_gsi;
11627 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11628 basic_block *epilogue_bbs = get_loop_body (epilogue);
11629 unsigned i;
11631 free (LOOP_VINFO_BBS (epilogue_vinfo));
11632 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11634 /* Advance data_reference's with the number of iterations of the previous
11635 loop and its prologue. */
11636 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11639 /* The EPILOGUE loop is a copy of the original loop so they share the same
11640 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11641 point to the copied statements. We also create a mapping of all LHS' in
11642 the original loop and all the LHS' in the EPILOGUE and create worklists to
11643 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11644 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11646 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11647 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11649 new_stmt = epilogue_phi_gsi.phi ();
11651 gcc_assert (gimple_uid (new_stmt) > 0);
11652 stmt_vinfo
11653 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11655 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11656 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11658 mapping.put (gimple_phi_result (orig_stmt),
11659 gimple_phi_result (new_stmt));
11660 /* PHI nodes can not have patterns or related statements. */
11661 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11662 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11665 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11666 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11668 new_stmt = gsi_stmt (epilogue_gsi);
11669 if (is_gimple_debug (new_stmt))
11670 continue;
11672 gcc_assert (gimple_uid (new_stmt) > 0);
11673 stmt_vinfo
11674 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11676 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11677 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11679 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11680 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11682 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11684 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11685 for (gimple_stmt_iterator gsi = gsi_start (seq);
11686 !gsi_end_p (gsi); gsi_next (&gsi))
11687 stmt_worklist.safe_push (gsi_stmt (gsi));
11690 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11691 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11693 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11694 stmt_worklist.safe_push (stmt);
11695 /* Set BB such that the assert in
11696 'get_initial_def_for_reduction' is able to determine that
11697 the BB of the related stmt is inside this loop. */
11698 gimple_set_bb (stmt,
11699 gimple_bb (new_stmt));
11700 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11701 gcc_assert (related_vinfo == NULL
11702 || related_vinfo == stmt_vinfo);
11707 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11708 using the original main loop and thus need to be updated to refer to the
11709 cloned variables used in the epilogue. */
11710 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11712 gimple *stmt = stmt_worklist[i];
11713 tree *new_op;
11715 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11717 tree op = gimple_op (stmt, j);
11718 if ((new_op = mapping.get(op)))
11719 gimple_set_op (stmt, j, *new_op);
11720 else
11722 /* PR92429: The last argument of simplify_replace_tree disables
11723 folding when replacing arguments. This is required as
11724 otherwise you might end up with different statements than the
11725 ones analyzed in vect_loop_analyze, leading to different
11726 vectorization. */
11727 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11728 &find_in_mapping, &mapping, false);
11729 gimple_set_op (stmt, j, op);
11734 struct data_reference *dr;
11735 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11736 FOR_EACH_VEC_ELT (datarefs, i, dr)
11738 orig_stmt = DR_STMT (dr);
11739 gcc_assert (gimple_uid (orig_stmt) > 0);
11740 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11741 /* Data references for gather loads and scatter stores do not use the
11742 updated offset we set using ADVANCE. Instead we have to make sure the
11743 reference in the data references point to the corresponding copy of
11744 the original in the epilogue. Make sure to update both
11745 gather/scatters recognized by dataref analysis and also other
11746 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11747 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11748 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11749 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11751 DR_REF (dr)
11752 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11753 &find_in_mapping, &mapping);
11754 DR_BASE_ADDRESS (dr)
11755 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11756 &find_in_mapping, &mapping);
11758 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11759 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11760 /* The vector size of the epilogue is smaller than that of the main loop
11761 so the alignment is either the same or lower. This means the dr will
11762 thus by definition be aligned. */
11763 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11766 epilogue_vinfo->shared->datarefs_copy.release ();
11767 epilogue_vinfo->shared->save_datarefs ();
11770 /* When vectorizing early break statements instructions that happen before
11771 the early break in the current BB need to be moved to after the early
11772 break. This function deals with that and assumes that any validity
11773 checks has already been performed.
11775 While moving the instructions if it encounters a VUSE or VDEF it then
11776 corrects the VUSES as it moves the statements along. GDEST is the location
11777 in which to insert the new statements. */
11779 static void
11780 move_early_exit_stmts (loop_vec_info loop_vinfo)
11782 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11784 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11785 return;
11787 /* Move all stmts that need moving. */
11788 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11789 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11791 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11793 /* Check to see if statement is still required for vect or has been
11794 elided. */
11795 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11796 if (!stmt_info)
11797 continue;
11799 if (dump_enabled_p ())
11800 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11802 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11803 gsi_move_before (&stmt_gsi, &dest_gsi);
11804 gsi_prev (&dest_gsi);
11807 /* Update all the stmts with their new reaching VUSES. */
11808 tree vuse
11809 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11810 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11812 if (dump_enabled_p ())
11813 dump_printf_loc (MSG_NOTE, vect_location,
11814 "updating vuse to %T for load %G", vuse, p);
11815 gimple_set_vuse (p, vuse);
11816 update_stmt (p);
11819 /* And update the LC PHIs on exits. */
11820 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11821 if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11822 if (gphi *phi = get_virtual_phi (e->dest))
11823 SET_PHI_ARG_DEF_ON_EDGE (phi, e, vuse);
11826 /* Function vect_transform_loop.
11828 The analysis phase has determined that the loop is vectorizable.
11829 Vectorize the loop - created vectorized stmts to replace the scalar
11830 stmts in the loop, and update the loop exit condition.
11831 Returns scalar epilogue loop if any. */
11833 class loop *
11834 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11836 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11837 class loop *epilogue = NULL;
11838 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11839 int nbbs = loop->num_nodes;
11840 int i;
11841 tree niters_vector = NULL_TREE;
11842 tree step_vector = NULL_TREE;
11843 tree niters_vector_mult_vf = NULL_TREE;
11844 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11845 unsigned int lowest_vf = constant_lower_bound (vf);
11846 gimple *stmt;
11847 bool check_profitability = false;
11848 unsigned int th;
11849 bool flat = maybe_flat_loop_profile (loop);
11851 DUMP_VECT_SCOPE ("vec_transform_loop");
11853 loop_vinfo->shared->check_datarefs ();
11855 /* Use the more conservative vectorization threshold. If the number
11856 of iterations is constant assume the cost check has been performed
11857 by our caller. If the threshold makes all loops profitable that
11858 run at least the (estimated) vectorization factor number of times
11859 checking is pointless, too. */
11860 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11861 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11863 if (dump_enabled_p ())
11864 dump_printf_loc (MSG_NOTE, vect_location,
11865 "Profitability threshold is %d loop iterations.\n",
11866 th);
11867 check_profitability = true;
11870 /* Make sure there exists a single-predecessor exit bb. Do this before
11871 versioning. */
11872 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11873 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11875 split_loop_exit_edge (e, true);
11876 if (dump_enabled_p ())
11877 dump_printf (MSG_NOTE, "split exit edge\n");
11880 /* Version the loop first, if required, so the profitability check
11881 comes first. */
11883 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11885 class loop *sloop
11886 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11887 sloop->force_vectorize = false;
11888 check_profitability = false;
11891 /* Make sure there exists a single-predecessor exit bb also on the
11892 scalar loop copy. Do this after versioning but before peeling
11893 so CFG structure is fine for both scalar and if-converted loop
11894 to make slpeel_duplicate_current_defs_from_edges face matched
11895 loop closed PHI nodes on the exit. */
11896 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11898 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11899 if (! single_pred_p (e->dest))
11901 split_loop_exit_edge (e, true);
11902 if (dump_enabled_p ())
11903 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11907 tree niters = vect_build_loop_niters (loop_vinfo);
11908 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11909 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11910 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11911 tree advance;
11912 drs_init_vec orig_drs_init;
11914 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11915 &step_vector, &niters_vector_mult_vf, th,
11916 check_profitability, niters_no_overflow,
11917 &advance);
11918 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11919 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11921 /* Ifcvt duplicates loop preheader, loop body and produces an basic
11922 block after loop exit. We need to scale all that. */
11923 basic_block preheader
11924 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11925 preheader->count
11926 = preheader->count.apply_probability
11927 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11928 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11929 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11930 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11933 if (niters_vector == NULL_TREE)
11935 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11936 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11937 && known_eq (lowest_vf, vf))
11939 niters_vector
11940 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11941 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11942 step_vector = build_one_cst (TREE_TYPE (niters));
11944 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11945 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11946 &step_vector, niters_no_overflow);
11947 else
11948 /* vect_do_peeling subtracted the number of peeled prologue
11949 iterations from LOOP_VINFO_NITERS. */
11950 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11951 &niters_vector, &step_vector,
11952 niters_no_overflow);
11955 /* 1) Make sure the loop header has exactly two entries
11956 2) Make sure we have a preheader basic block. */
11958 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11960 split_edge (loop_preheader_edge (loop));
11962 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11963 /* This will deal with any possible peeling. */
11964 vect_prepare_for_masked_peels (loop_vinfo);
11966 /* Handle any code motion that we need to for early-break vectorization after
11967 we've done peeling but just before we start vectorizing. */
11968 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11969 move_early_exit_stmts (loop_vinfo);
11971 /* Schedule the SLP instances first, then handle loop vectorization
11972 below. */
11973 if (!loop_vinfo->slp_instances.is_empty ())
11975 DUMP_VECT_SCOPE ("scheduling SLP instances");
11976 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11979 /* FORNOW: the vectorizer supports only loops which body consist
11980 of one basic block (header + empty latch). When the vectorizer will
11981 support more involved loop forms, the order by which the BBs are
11982 traversed need to be reconsidered. */
11984 for (i = 0; i < nbbs; i++)
11986 basic_block bb = bbs[i];
11987 stmt_vec_info stmt_info;
11989 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11990 gsi_next (&si))
11992 gphi *phi = si.phi ();
11993 if (dump_enabled_p ())
11994 dump_printf_loc (MSG_NOTE, vect_location,
11995 "------>vectorizing phi: %G", (gimple *) phi);
11996 stmt_info = loop_vinfo->lookup_stmt (phi);
11997 if (!stmt_info)
11998 continue;
12000 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12001 vect_loop_kill_debug_uses (loop, stmt_info);
12003 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12004 && !STMT_VINFO_LIVE_P (stmt_info))
12005 continue;
12007 if (STMT_VINFO_VECTYPE (stmt_info)
12008 && (maybe_ne
12009 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12010 && dump_enabled_p ())
12011 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12013 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12014 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12015 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12016 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12017 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12018 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12019 && ! PURE_SLP_STMT (stmt_info))
12021 if (dump_enabled_p ())
12022 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12023 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12027 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12028 gsi_next (&si))
12030 gphi *phi = si.phi ();
12031 stmt_info = loop_vinfo->lookup_stmt (phi);
12032 if (!stmt_info)
12033 continue;
12035 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12036 && !STMT_VINFO_LIVE_P (stmt_info))
12037 continue;
12039 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12040 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12041 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12042 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12043 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12044 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12045 && ! PURE_SLP_STMT (stmt_info))
12046 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12049 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12050 !gsi_end_p (si);)
12052 stmt = gsi_stmt (si);
12053 /* During vectorization remove existing clobber stmts. */
12054 if (gimple_clobber_p (stmt))
12056 unlink_stmt_vdef (stmt);
12057 gsi_remove (&si, true);
12058 release_defs (stmt);
12060 else
12062 /* Ignore vector stmts created in the outer loop. */
12063 stmt_info = loop_vinfo->lookup_stmt (stmt);
12065 /* vector stmts created in the outer-loop during vectorization of
12066 stmts in an inner-loop may not have a stmt_info, and do not
12067 need to be vectorized. */
12068 stmt_vec_info seen_store = NULL;
12069 if (stmt_info)
12071 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12073 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12074 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12075 !gsi_end_p (subsi); gsi_next (&subsi))
12077 stmt_vec_info pat_stmt_info
12078 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12079 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12080 &si, &seen_store);
12082 stmt_vec_info pat_stmt_info
12083 = STMT_VINFO_RELATED_STMT (stmt_info);
12084 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12085 &si, &seen_store))
12086 maybe_set_vectorized_backedge_value (loop_vinfo,
12087 pat_stmt_info);
12089 else
12091 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12092 &seen_store))
12093 maybe_set_vectorized_backedge_value (loop_vinfo,
12094 stmt_info);
12097 gsi_next (&si);
12098 if (seen_store)
12100 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12101 /* Interleaving. If IS_STORE is TRUE, the
12102 vectorization of the interleaving chain was
12103 completed - free all the stores in the chain. */
12104 vect_remove_stores (loop_vinfo,
12105 DR_GROUP_FIRST_ELEMENT (seen_store));
12106 else
12107 /* Free the attached stmt_vec_info and remove the stmt. */
12108 loop_vinfo->remove_stmt (stmt_info);
12113 /* Stub out scalar statements that must not survive vectorization.
12114 Doing this here helps with grouped statements, or statements that
12115 are involved in patterns. */
12116 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12117 !gsi_end_p (gsi); gsi_next (&gsi))
12119 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12120 if (!call || !gimple_call_internal_p (call))
12121 continue;
12122 internal_fn ifn = gimple_call_internal_fn (call);
12123 if (ifn == IFN_MASK_LOAD)
12125 tree lhs = gimple_get_lhs (call);
12126 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12128 tree zero = build_zero_cst (TREE_TYPE (lhs));
12129 gimple *new_stmt = gimple_build_assign (lhs, zero);
12130 gsi_replace (&gsi, new_stmt, true);
12133 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12135 tree lhs = gimple_get_lhs (call);
12136 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12138 tree else_arg
12139 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12140 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12141 gsi_replace (&gsi, new_stmt, true);
12145 } /* BBs in loop */
12147 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12148 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12149 if (integer_onep (step_vector))
12150 niters_no_overflow = true;
12151 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12152 niters_vector, step_vector, niters_vector_mult_vf,
12153 !niters_no_overflow);
12155 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12157 /* True if the final iteration might not handle a full vector's
12158 worth of scalar iterations. */
12159 bool final_iter_may_be_partial
12160 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12161 /* The minimum number of iterations performed by the epilogue. This
12162 is 1 when peeling for gaps because we always need a final scalar
12163 iteration. */
12164 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12165 /* +1 to convert latch counts to loop iteration counts,
12166 -min_epilogue_iters to remove iterations that cannot be performed
12167 by the vector code. */
12168 int bias_for_lowest = 1 - min_epilogue_iters;
12169 int bias_for_assumed = bias_for_lowest;
12170 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12171 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12173 /* When the amount of peeling is known at compile time, the first
12174 iteration will have exactly alignment_npeels active elements.
12175 In the worst case it will have at least one. */
12176 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12177 bias_for_lowest += lowest_vf - min_first_active;
12178 bias_for_assumed += assumed_vf - min_first_active;
12180 /* In these calculations the "- 1" converts loop iteration counts
12181 back to latch counts. */
12182 if (loop->any_upper_bound)
12184 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12185 loop->nb_iterations_upper_bound
12186 = (final_iter_may_be_partial
12187 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12188 lowest_vf) - 1
12189 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12190 lowest_vf) - 1);
12191 if (main_vinfo
12192 /* Both peeling for alignment and peeling for gaps can end up
12193 with the scalar epilogue running for more than VF-1 iterations. */
12194 && !main_vinfo->peeling_for_alignment
12195 && !main_vinfo->peeling_for_gaps)
12197 unsigned int bound;
12198 poly_uint64 main_iters
12199 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12200 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12201 main_iters
12202 = upper_bound (main_iters,
12203 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12204 if (can_div_away_from_zero_p (main_iters,
12205 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12206 &bound))
12207 loop->nb_iterations_upper_bound
12208 = wi::umin ((bound_wide_int) (bound - 1),
12209 loop->nb_iterations_upper_bound);
12212 if (loop->any_likely_upper_bound)
12213 loop->nb_iterations_likely_upper_bound
12214 = (final_iter_may_be_partial
12215 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12216 + bias_for_lowest, lowest_vf) - 1
12217 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12218 + bias_for_lowest, lowest_vf) - 1);
12219 if (loop->any_estimate)
12220 loop->nb_iterations_estimate
12221 = (final_iter_may_be_partial
12222 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12223 assumed_vf) - 1
12224 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12225 assumed_vf) - 1);
12226 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12227 assumed_vf, flat);
12229 if (dump_enabled_p ())
12231 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12233 dump_printf_loc (MSG_NOTE, vect_location,
12234 "LOOP VECTORIZED\n");
12235 if (loop->inner)
12236 dump_printf_loc (MSG_NOTE, vect_location,
12237 "OUTER LOOP VECTORIZED\n");
12238 dump_printf (MSG_NOTE, "\n");
12240 else
12241 dump_printf_loc (MSG_NOTE, vect_location,
12242 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12243 GET_MODE_NAME (loop_vinfo->vector_mode));
12246 /* Loops vectorized with a variable factor won't benefit from
12247 unrolling/peeling. */
12248 if (!vf.is_constant ())
12250 loop->unroll = 1;
12251 if (dump_enabled_p ())
12252 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12253 " variable-length vectorization factor\n");
12255 /* Free SLP instances here because otherwise stmt reference counting
12256 won't work. */
12257 slp_instance instance;
12258 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12259 vect_free_slp_instance (instance);
12260 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12261 /* Clear-up safelen field since its value is invalid after vectorization
12262 since vectorized loop can have loop-carried dependencies. */
12263 loop->safelen = 0;
12265 if (epilogue)
12267 update_epilogue_loop_vinfo (epilogue, advance);
12269 epilogue->simduid = loop->simduid;
12270 epilogue->force_vectorize = loop->force_vectorize;
12271 epilogue->dont_vectorize = false;
12274 return epilogue;
12277 /* The code below is trying to perform simple optimization - revert
12278 if-conversion for masked stores, i.e. if the mask of a store is zero
12279 do not perform it and all stored value producers also if possible.
12280 For example,
12281 for (i=0; i<n; i++)
12282 if (c[i])
12284 p1[i] += 1;
12285 p2[i] = p3[i] +2;
12287 this transformation will produce the following semi-hammock:
12289 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12291 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12292 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12293 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12294 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12295 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12296 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12300 void
12301 optimize_mask_stores (class loop *loop)
12303 basic_block *bbs = get_loop_body (loop);
12304 unsigned nbbs = loop->num_nodes;
12305 unsigned i;
12306 basic_block bb;
12307 class loop *bb_loop;
12308 gimple_stmt_iterator gsi;
12309 gimple *stmt;
12310 auto_vec<gimple *> worklist;
12311 auto_purge_vect_location sentinel;
12313 vect_location = find_loop_location (loop);
12314 /* Pick up all masked stores in loop if any. */
12315 for (i = 0; i < nbbs; i++)
12317 bb = bbs[i];
12318 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12319 gsi_next (&gsi))
12321 stmt = gsi_stmt (gsi);
12322 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12323 worklist.safe_push (stmt);
12327 free (bbs);
12328 if (worklist.is_empty ())
12329 return;
12331 /* Loop has masked stores. */
12332 while (!worklist.is_empty ())
12334 gimple *last, *last_store;
12335 edge e, efalse;
12336 tree mask;
12337 basic_block store_bb, join_bb;
12338 gimple_stmt_iterator gsi_to;
12339 tree vdef, new_vdef;
12340 gphi *phi;
12341 tree vectype;
12342 tree zero;
12344 last = worklist.pop ();
12345 mask = gimple_call_arg (last, 2);
12346 bb = gimple_bb (last);
12347 /* Create then_bb and if-then structure in CFG, then_bb belongs to
12348 the same loop as if_bb. It could be different to LOOP when two
12349 level loop-nest is vectorized and mask_store belongs to the inner
12350 one. */
12351 e = split_block (bb, last);
12352 bb_loop = bb->loop_father;
12353 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12354 join_bb = e->dest;
12355 store_bb = create_empty_bb (bb);
12356 add_bb_to_loop (store_bb, bb_loop);
12357 e->flags = EDGE_TRUE_VALUE;
12358 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12359 /* Put STORE_BB to likely part. */
12360 efalse->probability = profile_probability::likely ();
12361 e->probability = efalse->probability.invert ();
12362 store_bb->count = efalse->count ();
12363 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12364 if (dom_info_available_p (CDI_DOMINATORS))
12365 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12366 if (dump_enabled_p ())
12367 dump_printf_loc (MSG_NOTE, vect_location,
12368 "Create new block %d to sink mask stores.",
12369 store_bb->index);
12370 /* Create vector comparison with boolean result. */
12371 vectype = TREE_TYPE (mask);
12372 zero = build_zero_cst (vectype);
12373 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12374 gsi = gsi_last_bb (bb);
12375 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12376 /* Create new PHI node for vdef of the last masked store:
12377 .MEM_2 = VDEF <.MEM_1>
12378 will be converted to
12379 .MEM.3 = VDEF <.MEM_1>
12380 and new PHI node will be created in join bb
12381 .MEM_2 = PHI <.MEM_1, .MEM_3>
12383 vdef = gimple_vdef (last);
12384 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12385 gimple_set_vdef (last, new_vdef);
12386 phi = create_phi_node (vdef, join_bb);
12387 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12389 /* Put all masked stores with the same mask to STORE_BB if possible. */
12390 while (true)
12392 gimple_stmt_iterator gsi_from;
12393 gimple *stmt1 = NULL;
12395 /* Move masked store to STORE_BB. */
12396 last_store = last;
12397 gsi = gsi_for_stmt (last);
12398 gsi_from = gsi;
12399 /* Shift GSI to the previous stmt for further traversal. */
12400 gsi_prev (&gsi);
12401 gsi_to = gsi_start_bb (store_bb);
12402 gsi_move_before (&gsi_from, &gsi_to);
12403 /* Setup GSI_TO to the non-empty block start. */
12404 gsi_to = gsi_start_bb (store_bb);
12405 if (dump_enabled_p ())
12406 dump_printf_loc (MSG_NOTE, vect_location,
12407 "Move stmt to created bb\n%G", last);
12408 /* Move all stored value producers if possible. */
12409 while (!gsi_end_p (gsi))
12411 tree lhs;
12412 imm_use_iterator imm_iter;
12413 use_operand_p use_p;
12414 bool res;
12416 /* Skip debug statements. */
12417 if (is_gimple_debug (gsi_stmt (gsi)))
12419 gsi_prev (&gsi);
12420 continue;
12422 stmt1 = gsi_stmt (gsi);
12423 /* Do not consider statements writing to memory or having
12424 volatile operand. */
12425 if (gimple_vdef (stmt1)
12426 || gimple_has_volatile_ops (stmt1))
12427 break;
12428 gsi_from = gsi;
12429 gsi_prev (&gsi);
12430 lhs = gimple_get_lhs (stmt1);
12431 if (!lhs)
12432 break;
12434 /* LHS of vectorized stmt must be SSA_NAME. */
12435 if (TREE_CODE (lhs) != SSA_NAME)
12436 break;
12438 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12440 /* Remove dead scalar statement. */
12441 if (has_zero_uses (lhs))
12443 gsi_remove (&gsi_from, true);
12444 continue;
12448 /* Check that LHS does not have uses outside of STORE_BB. */
12449 res = true;
12450 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12452 gimple *use_stmt;
12453 use_stmt = USE_STMT (use_p);
12454 if (is_gimple_debug (use_stmt))
12455 continue;
12456 if (gimple_bb (use_stmt) != store_bb)
12458 res = false;
12459 break;
12462 if (!res)
12463 break;
12465 if (gimple_vuse (stmt1)
12466 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12467 break;
12469 /* Can move STMT1 to STORE_BB. */
12470 if (dump_enabled_p ())
12471 dump_printf_loc (MSG_NOTE, vect_location,
12472 "Move stmt to created bb\n%G", stmt1);
12473 gsi_move_before (&gsi_from, &gsi_to);
12474 /* Shift GSI_TO for further insertion. */
12475 gsi_prev (&gsi_to);
12477 /* Put other masked stores with the same mask to STORE_BB. */
12478 if (worklist.is_empty ()
12479 || gimple_call_arg (worklist.last (), 2) != mask
12480 || worklist.last () != stmt1)
12481 break;
12482 last = worklist.pop ();
12484 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12488 /* Decide whether it is possible to use a zero-based induction variable
12489 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12490 the value that the induction variable must be able to hold in order
12491 to ensure that the rgroups eventually have no active vector elements.
12492 Return -1 otherwise. */
12494 widest_int
12495 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12497 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12498 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12499 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12501 /* Calculate the value that the induction variable must be able
12502 to hit in order to ensure that we end the loop with an all-false mask.
12503 This involves adding the maximum number of inactive trailing scalar
12504 iterations. */
12505 widest_int iv_limit = -1;
12506 if (max_loop_iterations (loop, &iv_limit))
12508 if (niters_skip)
12510 /* Add the maximum number of skipped iterations to the
12511 maximum iteration count. */
12512 if (TREE_CODE (niters_skip) == INTEGER_CST)
12513 iv_limit += wi::to_widest (niters_skip);
12514 else
12515 iv_limit += max_vf - 1;
12517 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12518 /* Make a conservatively-correct assumption. */
12519 iv_limit += max_vf - 1;
12521 /* IV_LIMIT is the maximum number of latch iterations, which is also
12522 the maximum in-range IV value. Round this value down to the previous
12523 vector alignment boundary and then add an extra full iteration. */
12524 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12525 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12527 return iv_limit;
12530 /* For the given rgroup_controls RGC, check whether an induction variable
12531 would ever hit a value that produces a set of all-false masks or zero
12532 lengths before wrapping around. Return true if it's possible to wrap
12533 around before hitting the desirable value, otherwise return false. */
12535 bool
12536 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12538 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12540 if (iv_limit == -1)
12541 return true;
12543 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12544 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12545 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12547 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12548 return true;
12550 return false;