gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
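(For illustration: "consecutive" here means unit-stride accesses such as
a[i], a[i+1], ... across successive iterations, as opposed to, say, a
strided access like a[2*i].)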
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target-specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different vector sizes will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype has already been set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if it is worthwhile to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
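/* Illustrative sketch (hypothetical example): for an IV updated as
i = i + 4 in loop number LOOP_NB, scev analysis yields an access function
of the form {init, +, 4}_LOOP_NB, so this function returns *INIT == init
and *STEP == 4. A step that is itself a chrec (a polynomial of degree
>= 2) is rejected as not "simple". */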
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
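/* For illustration (hypothetical examples), the latch updates this function
classifies look like

x = -x; (neg)
x = x * 3; (mul by constant)
x = x << 1; or x = x >> 1; (shift by constant)

where x is the result of the loop-header PHI being analyzed. */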
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Return true if PHI is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
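/* For illustration (hypothetical example):

t = 0;
for (i = 0; i < n; i++) { b[i] = a[i] - t; t = a[i]; }

The value of t used in iteration i is the value defined in iteration i - 1;
vectorizing this requires a permute that combines the previous and current
vectors of t values (see the shuffle-vector note in the function body). */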
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs a shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle a nonlinear IV for the same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as a reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns, or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place the result
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
945 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 unshare_expr (niter),
947 build_int_cst (TREE_TYPE (niter), 1));
948 if (TREE_CODE (niter) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, make sure that
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
954 PR113210. */
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 build_minus_one_cst (TREE_TYPE (niter)));
960 *number_of_iterations = niter;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
966 return conds;
969 /* Determine the main loop exit for the vectorizer. */
971 edge
972 vec_init_loop_exit_info (class loop *loop)
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxiliary exits. */
976 auto_vec<edge> exits = get_loop_exit_edges (loop);
977 if (exits.length () == 1)
978 return exits[0];
980 /* If we have multiple exits we only support counting IV at the moment. Analyze
981 all exits and return one. */
982 class tree_niter_desc niter_desc;
983 edge candidate = NULL;
984 for (edge exit : exits)
986 if (!get_loop_exit_condition (exit))
987 continue;
989 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 && !chrec_contains_undetermined (niter_desc.niter))
992 tree may_be_zero = niter_desc.may_be_zero;
993 if (integer_zerop (may_be_zero)
994 && (!candidate
995 || dominated_by_p (CDI_DOMINATORS, exit->src,
996 candidate->src)))
997 candidate = exit;
1001 return candidate;
1004 /* Function bb_in_loop_p
1006 Used as predicate for dfs order traversal of the loop bbs. */
1008 static bool
1009 bb_in_loop_p (const_basic_block bb, const void *data)
1011 const class loop *const loop = (const class loop *)data;
1012 if (flow_bb_inside_loop_p (loop, bb))
1013 return true;
1014 return false;
1018 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1019 stmt_vec_info structs for all the stmts in LOOP_IN. */
1021 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1022 : vec_info (vec_info::loop, shared),
1023 loop (loop_in),
1024 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1025 num_itersm1 (NULL_TREE),
1026 num_iters (NULL_TREE),
1027 num_iters_unchanged (NULL_TREE),
1028 num_iters_assumptions (NULL_TREE),
1029 vector_costs (nullptr),
1030 scalar_costs (nullptr),
1031 th (0),
1032 versioning_threshold (0),
1033 vectorization_factor (0),
1034 main_loop_edge (nullptr),
1035 skip_main_loop_edge (nullptr),
1036 skip_this_loop_edge (nullptr),
1037 reusable_accumulators (),
1038 suggested_unroll_factor (1),
1039 max_vectorization_factor (0),
1040 mask_skip_niters (NULL_TREE),
1041 rgroup_compare_type (NULL_TREE),
1042 simd_if_cond (NULL_TREE),
1043 partial_vector_style (vect_partial_vectors_none),
1044 unaligned_dr (NULL),
1045 peeling_for_alignment (0),
1046 ptr_mask (0),
1047 ivexpr_map (NULL),
1048 scan_map (NULL),
1049 slp_unrolling_factor (1),
1050 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1051 vectorizable (false),
1052 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1053 using_partial_vectors_p (false),
1054 using_decrementing_iv_p (false),
1055 using_select_vl_p (false),
1056 epil_using_partial_vectors_p (false),
1057 partial_load_store_bias (0),
1058 peeling_for_gaps (false),
1059 peeling_for_niter (false),
1060 early_breaks (false),
1061 no_data_dependencies (false),
1062 has_mask_store (false),
1063 scalar_loop_scaling (profile_probability::uninitialized ()),
1064 scalar_loop (NULL),
1065 orig_loop_info (NULL),
1066 vec_loop_iv_exit (NULL),
1067 vec_epilogue_loop_iv_exit (NULL),
1068 scalar_loop_iv_exit (NULL)
1070 /* CHECKME: We want to visit all BBs before their successors (except for
1071 latch blocks, for which this assertion wouldn't hold). In the simple
1072 case of the loop forms we allow, a dfs order of the BBs would be the same
1073 as reversed postorder traversal, so we are safe. */
1075 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1076 bbs, loop->num_nodes, loop);
1077 gcc_assert (nbbs == loop->num_nodes);
1079 for (unsigned int i = 0; i < nbbs; i++)
1081 basic_block bb = bbs[i];
1082 gimple_stmt_iterator si;
1084 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1086 gimple *phi = gsi_stmt (si);
1087 gimple_set_uid (phi, 0);
1088 add_stmt (phi);
1091 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1093 gimple *stmt = gsi_stmt (si);
1094 gimple_set_uid (stmt, 0);
1095 if (is_gimple_debug (stmt))
1096 continue;
1097 add_stmt (stmt);
1098 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1099 third argument is the #pragma omp simd if (x) condition. When it is 0,
1100 the loop shouldn't be vectorized; when it is a non-zero constant, it
1101 should be vectorized normally; otherwise the loop is versioned, with the
1102 vectorized copy used if the condition is non-zero at runtime. */
1103 if (loop_in->simduid
1104 && is_gimple_call (stmt)
1105 && gimple_call_internal_p (stmt)
1106 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1107 && gimple_call_num_args (stmt) >= 3
1108 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1109 && (loop_in->simduid
1110 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1112 tree arg = gimple_call_arg (stmt, 2);
1113 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1114 simd_if_cond = arg;
1115 else
1116 gcc_assert (integer_nonzerop (arg));
1121 epilogue_vinfos.create (6);
1124 /* Free all levels of rgroup CONTROLS. */
1126 void
1127 release_vec_loop_controls (vec<rgroup_controls> *controls)
1129 rgroup_controls *rgc;
1130 unsigned int i;
1131 FOR_EACH_VEC_ELT (*controls, i, rgc)
1132 rgc->controls.release ();
1133 controls->release ();
1136 /* Free all memory used by the _loop_vec_info, as well as all the
1137 stmt_vec_info structs of all the stmts in the loop. */
1139 _loop_vec_info::~_loop_vec_info ()
1141 free (bbs);
1143 release_vec_loop_controls (&masks.rgc_vec);
1144 release_vec_loop_controls (&lens);
1145 delete ivexpr_map;
1146 delete scan_map;
1147 epilogue_vinfos.release ();
1148 delete scalar_costs;
1149 delete vector_costs;
1151 /* When we release an epilogue vinfo that we do not intend to use,
1152 avoid clearing AUX of the main loop which should continue to
1153 point to the main loop vinfo since otherwise we'll leak that. */
1154 if (loop->aux == this)
1155 loop->aux = NULL;
1158 /* Return an invariant or register for EXPR and emit necessary
1159 computations in the LOOP_VINFO loop preheader. */
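/* Usage note (descriptive): if EXPR is already an SSA name or invariant it
is returned unchanged; otherwise it is gimplified, the computation is
inserted on the preheader edge, and the result is cached in ivexpr_map so
that later calls with an equal expression reuse the same SSA name. */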
1161 tree
1162 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1164 if (is_gimple_reg (expr)
1165 || is_gimple_min_invariant (expr))
1166 return expr;
1168 if (! loop_vinfo->ivexpr_map)
1169 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1170 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1171 if (! cached)
1173 gimple_seq stmts = NULL;
1174 cached = force_gimple_operand (unshare_expr (expr),
1175 &stmts, true, NULL_TREE);
1176 if (stmts)
1178 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1179 gsi_insert_seq_on_edge_immediate (e, stmts);
1182 return cached;
1185 /* Return true if we can use CMP_TYPE as the comparison type to produce
1186 all masks required to mask LOOP_VINFO. */
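/* Background note: IFN_WHILE_ULT (A, B) yields a mask whose element I is set
iff A + I < B, so this asks whether the target can compute such masks with
CMP_TYPE operands for every mask type recorded in the rgroups (on AArch64
SVE, for example, this maps to the WHILELO instruction). */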
1188 static bool
1189 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1191 rgroup_controls *rgm;
1192 unsigned int i;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 if (rgm->type != NULL_TREE
1195 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1196 cmp_type, rgm->type,
1197 OPTIMIZE_FOR_SPEED))
1198 return false;
1199 return true;
1202 /* Calculate the maximum number of scalars per iteration for every
1203 rgroup in LOOP_VINFO. */
1205 static unsigned int
1206 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1208 unsigned int res = 1;
1209 unsigned int i;
1210 rgroup_controls *rgm;
1211 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1212 res = MAX (res, rgm->max_nscalars_per_iter);
1213 return res;
1216 /* Calculate the minimum precision necessary to represent:
1218 MAX_NITERS * FACTOR
1220 as an unsigned integer, where MAX_NITERS is the maximum number of
1221 loop header iterations for the original scalar form of LOOP_VINFO. */
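/* Worked example (sketch): with at most 1000 loop header iterations and
FACTOR == 4 the product is 4000, and representing 4000 as an unsigned
value needs wi::min_precision (4000, UNSIGNED) == 12 bits. */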
1223 static unsigned
1224 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1226 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1228 /* Get the maximum number of iterations that is representable
1229 in the counter type. */
1230 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1231 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1233 /* Get a more refined estimate for the number of iterations. */
1234 widest_int max_back_edges;
1235 if (max_loop_iterations (loop, &max_back_edges))
1236 max_ni = wi::smin (max_ni, max_back_edges + 1);
1238 /* Work out how many bits we need to represent the limit. */
1239 return wi::min_precision (max_ni * factor, UNSIGNED);
1242 /* True if the loop needs peeling or partial vectors when vectorized. */
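/* For example (sketch): with a known iteration count of 100 and VF == 8,
100 is not a multiple of 8, so peeling or partial vectors is needed;
with a count of 96 (and no peeling for gaps or alignment) it is not. */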
1244 static bool
1245 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1247 unsigned HOST_WIDE_INT const_vf;
1248 HOST_WIDE_INT max_niter
1249 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1251 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1252 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1253 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1254 (loop_vinfo));
1256 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1257 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1259 /* Work out the (constant) number of iterations that need to be
1260 peeled for reasons other than niters. */
1261 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1262 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1263 peel_niter += 1;
1264 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1265 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1266 return true;
1268 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1269 /* ??? When peeling for gaps but not alignment, we could
1270 try to check whether the (variable) niters is known to be
1271 VF * N + 1. That's something of a niche case though. */
1272 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1273 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1274 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1275 < (unsigned) exact_log2 (const_vf))
1276 /* In case of versioning, check if the maximum number of
1277 iterations is greater than th. If they are identical,
1278 the epilogue is unnecessary. */
1279 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1280 || ((unsigned HOST_WIDE_INT) max_niter
1281 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1282 but that's only computed later based on our result.
1283 The following is the most conservative approximation. */
1284 > (std::max ((unsigned HOST_WIDE_INT) th,
1285 const_vf) / const_vf) * const_vf))))
1286 return true;
1288 return false;
1291 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1292 whether we can actually generate the masks required. Return true if so,
1293 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1295 static bool
1296 vect_verify_full_masking (loop_vec_info loop_vinfo)
1298 unsigned int min_ni_width;
1300 /* Use a normal loop if there are no statements that need masking.
1301 This only happens in rare degenerate cases: it means that the loop
1302 has no loads, no stores, and no live-out values. */
1303 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1304 return false;
1306 /* Produce the rgroup controls. */
1307 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1309 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1310 tree vectype = mask.first;
1311 unsigned nvectors = mask.second;
1313 if (masks->rgc_vec.length () < nvectors)
1314 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1315 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1316 /* The number of scalars per iteration and the number of vectors are
1317 both compile-time constants. */
1318 unsigned int nscalars_per_iter
1319 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1320 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
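/* E.g. (sketch): with VF == 16, a mask vectype with 8 elements and
nvectors == 2 this yields nscalars_per_iter == (2 * 8) / 16 == 1. */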
1322 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1324 rgm->max_nscalars_per_iter = nscalars_per_iter;
1325 rgm->type = truth_type_for (vectype);
1326 rgm->factor = 1;
1330 unsigned int max_nscalars_per_iter
1331 = vect_get_max_nscalars_per_iter (loop_vinfo);
1333 /* Work out how many bits we need to represent the limit. */
1334 min_ni_width
1335 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1337 /* Find a scalar mode for which WHILE_ULT is supported. */
1338 opt_scalar_int_mode cmp_mode_iter;
1339 tree cmp_type = NULL_TREE;
1340 tree iv_type = NULL_TREE;
1341 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1342 unsigned int iv_precision = UINT_MAX;
1344 if (iv_limit != -1)
1345 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1346 UNSIGNED);
1348 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1350 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1351 if (cmp_bits >= min_ni_width
1352 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1354 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1355 if (this_type
1356 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1358 /* Although we could stop as soon as we find a valid mode,
1359 there are at least two reasons why that's not always the
1360 best choice:
1362 - An IV that's Pmode or wider is more likely to be reusable
1363 in address calculations than an IV that's narrower than
1364 Pmode.
1366 - Doing the comparison in IV_PRECISION or wider allows
1367 a natural 0-based IV, whereas using a narrower comparison
1368 type requires mitigations against wrap-around.
1370 Conversely, if the IV limit is variable, doing the comparison
1371 in a wider type than the original type can introduce
1372 unnecessary extensions, so picking the widest valid mode
1373 is not always a good choice either.
1375 Here we prefer the first IV type that's Pmode or wider,
1376 and the first comparison type that's IV_PRECISION or wider.
1377 (The comparison type must be no wider than the IV type,
1378 to avoid extensions in the vector loop.)
1380 ??? We might want to try continuing beyond Pmode for ILP32
1381 targets if CMP_BITS < IV_PRECISION. */
1382 iv_type = this_type;
1383 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1384 cmp_type = this_type;
1385 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1386 break;
1391 if (!cmp_type)
1393 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1394 return false;
1397 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1398 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1399 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1400 return true;
1403 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1404 whether we can actually generate AVX512 style masks. Return true if so,
1405 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
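/* Descriptive note: in this style the masks are integer-mode vectors of
single-bit elements, produced by an LT_EXPR vector comparison derived from
the remaining-iterations IV rather than by IFN_WHILE_ULT; the checks below
verify such a comparison can be expanded for every recorded mask type. */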
1407 static bool
1408 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1410 /* Produce a differently organized rgc_vec and check in a different
1411 way whether we can produce the masks. */
1413 /* Use a normal loop if there are no statements that need masking.
1414 This only happens in rare degenerate cases: it means that the loop
1415 has no loads, no stores, and no live-out values. */
1416 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1417 return false;
1419 /* For the decrementing IV we need to represent all values in
1420 [0, niter + niter_skip] where niter_skip is the elements we
1421 skip in the first iteration for prologue peeling. */
1422 tree iv_type = NULL_TREE;
1423 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1424 unsigned int iv_precision = UINT_MAX;
1425 if (iv_limit != -1)
1426 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1428 /* First compute the type for the IV we use to track the remaining
1429 scalar iterations. */
1430 opt_scalar_int_mode cmp_mode_iter;
1431 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1433 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1434 if (cmp_bits >= iv_precision
1435 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1437 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1438 if (iv_type)
1439 break;
1442 if (!iv_type)
1443 return false;
1445 /* Produce the rgroup controls. */
1446 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1448 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1449 tree vectype = mask.first;
1450 unsigned nvectors = mask.second;
1452 /* The number of scalars per iteration and the number of vectors are
1453 both compile-time constants. */
1454 unsigned int nscalars_per_iter
1455 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1456 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1458 /* We index the rgroup_controls vector with nscalars_per_iter
1459 which we keep constant and instead have a varying nvectors,
1460 remembering the vector mask with the fewest nV. */
1461 if (masks->rgc_vec.length () < nscalars_per_iter)
1462 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1463 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1465 if (!rgm->type || rgm->factor > nvectors)
1467 rgm->type = truth_type_for (vectype);
1468 rgm->compare_type = NULL_TREE;
1469 rgm->max_nscalars_per_iter = nscalars_per_iter;
1470 rgm->factor = nvectors;
1471 rgm->bias_adjusted_ctrl = NULL_TREE;
1475 /* There is no fixed compare type we are going to use but we have to
1476 be able to get at one for each mask group. */
1477 unsigned int min_ni_width
1478 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1480 bool ok = true;
1481 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1483 tree mask_type = rgc.type;
1484 if (!mask_type)
1485 continue;
1487 /* For now vect_get_loop_mask only supports integer mode masks
1488 when we need to split it. */
1489 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1490 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1492 ok = false;
1493 break;
1496 /* If iv_type is usable as compare type use that - we can elide the
1497 saturation in that case. */
1498 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1500 tree cmp_vectype
1501 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1502 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1503 rgc.compare_type = cmp_vectype;
1505 if (!rgc.compare_type)
1506 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1508 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1509 if (cmp_bits >= min_ni_width
1510 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1512 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1513 if (!cmp_type)
1514 continue;
1516 /* Check whether we can produce the mask with cmp_type. */
1517 tree cmp_vectype
1518 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1519 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1521 rgc.compare_type = cmp_vectype;
1522 break;
1526 if (!rgc.compare_type)
1528 ok = false;
1529 break;
1532 if (!ok)
1534 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1535 return false;
1538 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1539 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1540 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1541 return true;
1544 /* Check whether we can use vector access with length based on precision
1545 comparison. So far, to keep it simple, we only allow the case that the
1546 precision of the target supported length is larger than the precision
1547 required by loop niters. */
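/* Background note (illustrative): length-based partial vectors use
IFN_LEN_LOAD / IFN_LEN_STORE, which take an explicit length for each
access instead of a per-element mask (e.g. PowerPC's lxvl/stxvl); the -1
bias handled below exists for targets whose length operand is biased by
one, such as s390's vll/vstl. */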
1549 static bool
1550 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1552 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1553 return false;
1555 machine_mode len_load_mode, len_store_mode;
1556 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1557 .exists (&len_load_mode))
1558 return false;
1559 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1560 .exists (&len_store_mode))
1561 return false;
1563 signed char partial_load_bias = internal_len_load_store_bias
1564 (IFN_LEN_LOAD, len_load_mode);
1566 signed char partial_store_bias = internal_len_load_store_bias
1567 (IFN_LEN_STORE, len_store_mode);
1569 gcc_assert (partial_load_bias == partial_store_bias);
1571 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1572 return false;
1574 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1575 len_loads with a length of zero. In order to avoid that we prohibit
1576 more than one loop length here. */
1577 if (partial_load_bias == -1
1578 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1579 return false;
1581 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1583 unsigned int max_nitems_per_iter = 1;
1584 unsigned int i;
1585 rgroup_controls *rgl;
1586 /* Find the maximum number of items per iteration for every rgroup. */
1587 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1589 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1590 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1593 /* Work out how many bits we need to represent the length limit. */
1594 unsigned int min_ni_prec
1595 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1597 /* Now use the maximum of the precisions below for one suitable IV type:
1598 - the IV's natural precision
1599 - the precision needed to hold: the maximum number of scalar
1600 iterations multiplied by the scale factor (min_ni_prec above)
1601 - the Pmode precision
1603 If min_ni_prec is less than the precision of the current niters,
1604 we prefer to still use the niters type. Prefer to use Pmode and
1605 wider IV to avoid narrow conversions. */
1607 unsigned int ni_prec
1608 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1609 min_ni_prec = MAX (min_ni_prec, ni_prec);
1610 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1612 tree iv_type = NULL_TREE;
1613 opt_scalar_int_mode tmode_iter;
1614 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1616 scalar_mode tmode = tmode_iter.require ();
1617 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1619 /* ??? Do we really want to construct one IV whose precision exceeds
1620 BITS_PER_WORD? */
1621 if (tbits > BITS_PER_WORD)
1622 break;
1624 /* Find the first available standard integral type. */
1625 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1627 iv_type = build_nonstandard_integer_type (tbits, true);
1628 break;
1632 if (!iv_type)
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "can't vectorize with length-based partial vectors"
1637 " because there is no suitable iv type.\n");
1638 return false;
1641 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1642 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1643 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1645 return true;
1648 /* Calculate the cost of one scalar iteration of the loop. */
1649 static void
1650 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1652 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1653 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1654 int nbbs = loop->num_nodes, factor;
1655 int innerloop_iters, i;
1657 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1659 /* Gather costs for statements in the scalar loop. */
1661 /* FORNOW. */
1662 innerloop_iters = 1;
1663 if (loop->inner)
1664 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1666 for (i = 0; i < nbbs; i++)
1668 gimple_stmt_iterator si;
1669 basic_block bb = bbs[i];
1671 if (bb->loop_father == loop->inner)
1672 factor = innerloop_iters;
1673 else
1674 factor = 1;
1676 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1678 gimple *stmt = gsi_stmt (si);
1679 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1681 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1682 continue;
1684 /* Skip stmts that are not vectorized inside the loop. */
1685 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1686 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1687 && (!STMT_VINFO_LIVE_P (vstmt_info)
1688 || !VECTORIZABLE_CYCLE_DEF
1689 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1690 continue;
1692 vect_cost_for_stmt kind;
1693 if (STMT_VINFO_DATA_REF (stmt_info))
1695 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1696 kind = scalar_load;
1697 else
1698 kind = scalar_store;
1700 else if (vect_nop_conversion_p (stmt_info))
1701 continue;
1702 else
1703 kind = scalar_stmt;
1705 /* We are using vect_prologue here to avoid scaling twice
1706 by the inner loop factor. */
1707 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1708 factor, kind, stmt_info, 0, vect_prologue);
1712 /* Now accumulate cost. */
1713 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1714 add_stmt_costs (loop_vinfo->scalar_costs,
1715 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1716 loop_vinfo->scalar_costs->finish_cost (nullptr);
1719 /* Function vect_analyze_loop_form.
1721 Verify that certain CFG restrictions hold, including:
1722 - the loop has a pre-header
1723 - the loop has a single entry
1724 - nested loops can have only a single exit.
1725 - the loop exit condition is simple enough
1726 - the number of iterations can be analyzed, i.e., a countable loop. The
1727 niter could be analyzed under some assumptions. */
1729 opt_result
1730 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1732 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1734 edge exit_e = vec_init_loop_exit_info (loop);
1735 if (!exit_e)
1736 return opt_result::failure_at (vect_location,
1737 "not vectorized:"
1738 " could not determine main exit from"
1739 " loop with multiple exits.\n");
1740 info->loop_exit = exit_e;
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "using as main loop exit: %d -> %d [AUX: %p]\n",
1744 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1746 /* Check if we have any control flow that doesn't leave the loop. */
1747 class loop *v_loop = loop->inner ? loop->inner : loop;
1748 basic_block *bbs = get_loop_body (v_loop);
1749 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1750 if (EDGE_COUNT (bbs[i]->succs) != 1
1751 && (EDGE_COUNT (bbs[i]->succs) != 2
1752 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1753 return opt_result::failure_at (vect_location,
1754 "not vectorized:"
1755 " unsupported control flow in loop.\n");
1757 /* Different restrictions apply when we are considering an inner-most loop,
1758 vs. an outer (nested) loop.
1759 (FORNOW. May want to relax some of these restrictions in the future). */
1761 info->inner_loop_cond = NULL;
1762 if (!loop->inner)
1764 /* Inner-most loop. We currently require that the number of BBs is
1765 exactly 2 (the header and latch). Vectorizable inner-most loops
1766 look like this:
1768 (pre-header)
1770 header <--------+
1771 | | |
1772 | +--> latch --+
1774 (exit-bb) */
1776 if (empty_block_p (loop->header))
1777 return opt_result::failure_at (vect_location,
1778 "not vectorized: empty loop.\n");
1780 else
1782 class loop *innerloop = loop->inner;
1783 edge entryedge;
1785 /* Nested loop. We currently require that the loop is doubly-nested,
1786 contains a single inner loop, and the number of BBs is exactly 5.
1787 Vectorizable outer-loops look like this:
1789 (pre-header)
1791 header <---+
1793 inner-loop |
1795 tail ------+
1797 (exit-bb)
1799 The inner-loop has the properties expected of inner-most loops
1800 as described above. */
1802 if ((loop->inner)->inner || (loop->inner)->next)
1803 return opt_result::failure_at (vect_location,
1804 "not vectorized:"
1805 " multiple nested loops.\n");
1807 entryedge = loop_preheader_edge (innerloop);
1808 if (entryedge->src != loop->header
1809 || !single_exit (innerloop)
1810 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1811 return opt_result::failure_at (vect_location,
1812 "not vectorized:"
1813 " unsupported outerloop form.\n");
1815 /* Analyze the inner-loop. */
1816 vect_loop_form_info inner;
1817 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1818 if (!res)
1820 if (dump_enabled_p ())
1821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1822 "not vectorized: Bad inner loop.\n");
1823 return res;
1826 /* We don't support analyzing the niter under assumptions for the
1827 inner loop. */
1828 if (!integer_onep (inner.assumptions))
1829 return opt_result::failure_at (vect_location,
1830 "not vectorized: Bad inner loop.\n");
1832 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1833 return opt_result::failure_at (vect_location,
1834 "not vectorized: inner-loop count not"
1835 " invariant.\n");
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_NOTE, vect_location,
1839 "Considering outer-loop vectorization.\n");
1840 info->inner_loop_cond = inner.conds[0];
1843 if (EDGE_COUNT (loop->header->preds) != 2)
1844 return opt_result::failure_at (vect_location,
1845 "not vectorized:"
1846 " too many incoming edges.\n");
1848 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1849 that the loop is represented as a do-while (with a proper if-guard
1850 before the loop if needed), where the loop header contains all the
1851 executable statements, and the latch is empty. */
1852 if (!empty_block_p (loop->latch)
1853 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1854 return opt_result::failure_at (vect_location,
1855 "not vectorized: latch block not empty.\n");
1857 /* Make sure the exit is not abnormal. */
1858 auto_vec<edge> exits = get_loop_exit_edges (loop);
1859 for (edge e : exits)
1861 if (e->flags & EDGE_ABNORMAL)
1862 return opt_result::failure_at (vect_location,
1863 "not vectorized:"
1864 " abnormal loop exit edge.\n");
1867 info->conds
1868 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1869 &info->number_of_iterations,
1870 &info->number_of_iterationsm1);
1872 if (info->conds.is_empty ())
1873 return opt_result::failure_at
1874 (vect_location,
1875 "not vectorized: complicated exit condition.\n");
1877 /* Determine what the primary and alternate exit conds are. */
1878 for (unsigned i = 0; i < info->conds.length (); i++)
1880 gcond *cond = info->conds[i];
1881 if (exit_e->src == gimple_bb (cond))
1882 std::swap (info->conds[0], info->conds[i]);
1885 if (integer_zerop (info->assumptions)
1886 || !info->number_of_iterations
1887 || chrec_contains_undetermined (info->number_of_iterations))
1888 return opt_result::failure_at
1889 (info->conds[0],
1890 "not vectorized: number of iterations cannot be computed.\n");
1892 if (integer_zerop (info->number_of_iterations))
1893 return opt_result::failure_at
1894 (info->conds[0],
1895 "not vectorized: number of iterations = 0.\n");
1897 if (!(tree_fits_shwi_p (info->number_of_iterations)
1898 && tree_to_shwi (info->number_of_iterations) > 0))
1900 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_NOTE, vect_location,
1903 "Symbolic number of iterations is ");
1904 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1905 dump_printf (MSG_NOTE, "\n");
1909 return opt_result::success ();
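/* As an illustrative example (hypothetical, not from the sources): a counted
   loop such as

       for (i = 0; i < n; i++)
         a[i] = 0;

   passes the checks above with a symbolic number of iterations (n), whereas
   a search loop whose exit condition depends on values loaded inside the
   loop has no computable niter and fails with "number of iterations cannot
   be computed".  */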
1912 /* Create a loop_vec_info for LOOP with SHARED and the
1913 vect_analyze_loop_form result. */
1915 loop_vec_info
1916 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1917 const vect_loop_form_info *info,
1918 loop_vec_info main_loop_info)
1920 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1921 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1922 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1923 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1924 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1925 /* Also record the assumptions for versioning. */
1926 if (!integer_onep (info->assumptions) && !main_loop_info)
1927 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1929 for (gcond *cond : info->conds)
1931 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1932 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1933 /* Mark the statement as a condition. */
1934 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1937 for (unsigned i = 1; i < info->conds.length (); i ++)
1938 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1939 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1941 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1943 /* Check to see if we're vectorizing multiple exits. */
1944 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1945 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1947 if (info->inner_loop_cond)
1949 stmt_vec_info inner_loop_cond_info
1950 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1951 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1952 /* If we have an estimate on the number of iterations of the inner
1953 loop use that to limit the scale for costing, otherwise use
1954 --param vect-inner-loop-cost-factor literally. */
1955 widest_int nit;
1956 if (estimated_stmt_executions (loop->inner, &nit))
1957 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1958 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1961 return loop_vinfo;
1966 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1967 statements, update the vectorization factor. */
1969 static void
1970 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1972 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1973 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1974 int nbbs = loop->num_nodes;
1975 poly_uint64 vectorization_factor;
1976 int i;
1978 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1980 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1981 gcc_assert (known_ne (vectorization_factor, 0U));
1983 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1984 vectorization factor of the loop is the unrolling factor required by
1985 the SLP instances. If that unrolling factor is 1, we say that we
1986 perform pure SLP on the loop - cross-iteration parallelism is not
1987 exploited. */
1988 bool only_slp_in_loop = true;
1989 for (i = 0; i < nbbs; i++)
1991 basic_block bb = bbs[i];
1992 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1993 gsi_next (&si))
1995 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1996 if (!stmt_info)
1997 continue;
1998 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1999 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2000 && !PURE_SLP_STMT (stmt_info))
2001 /* STMT needs both SLP and loop-based vectorization. */
2002 only_slp_in_loop = false;
2004 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2005 gsi_next (&si))
2007 if (is_gimple_debug (gsi_stmt (si)))
2008 continue;
2009 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2010 stmt_info = vect_stmt_to_vectorize (stmt_info);
2011 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2012 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2013 && !PURE_SLP_STMT (stmt_info))
2014 /* STMT needs both SLP and loop-based vectorization. */
2015 only_slp_in_loop = false;
2019 if (only_slp_in_loop)
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_NOTE, vect_location,
2023 "Loop contains only SLP stmts\n");
2024 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2026 else
2028 if (dump_enabled_p ())
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "Loop contains SLP and non-SLP stmts\n");
2031 /* Both the vectorization factor and unroll factor have the form
2032 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2033 so they must have a common multiple. */
2034 vectorization_factor
2035 = force_common_multiple (vectorization_factor,
2036 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
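/* For illustration (hypothetical numbers): force_common_multiple computes a
   common multiple of the two factors (the least common multiple when both
   are constant), so a non-SLP factor of 4 and an SLP unrolling factor of 8
   combine to 8, while factors of 4 and 6 would combine to 12.  */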
2039 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2040 if (dump_enabled_p ())
2042 dump_printf_loc (MSG_NOTE, vect_location,
2043 "Updating vectorization factor to ");
2044 dump_dec (MSG_NOTE, vectorization_factor);
2045 dump_printf (MSG_NOTE, ".\n");
2049 /* Return true if STMT_INFO describes a double reduction phi and if
2050 the other phi in the reduction is also relevant for vectorization.
2051 This rejects cases such as:
2053 outer1:
2054 x_1 = PHI <x_3(outer2), ...>;
2057 inner:
2058 x_2 = ...;
2061 outer2:
2062 x_3 = PHI <x_2(inner)>;
2064 if nothing in x_2 or elsewhere makes x_1 relevant. */
2066 static bool
2067 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2069 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2070 return false;
2072 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2075 /* Function vect_analyze_loop_operations.
2077 Scan the loop stmts and make sure they are all vectorizable. */
2079 static opt_result
2080 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2082 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2083 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2084 int nbbs = loop->num_nodes;
2085 int i;
2086 stmt_vec_info stmt_info;
2087 bool need_to_vectorize = false;
2088 bool ok;
2090 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2092 auto_vec<stmt_info_for_cost> cost_vec;
2094 for (i = 0; i < nbbs; i++)
2096 basic_block bb = bbs[i];
2098 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2099 gsi_next (&si))
2101 gphi *phi = si.phi ();
2102 ok = true;
2104 stmt_info = loop_vinfo->lookup_stmt (phi);
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2107 (gimple *) phi);
2108 if (virtual_operand_p (gimple_phi_result (phi)))
2109 continue;
2111 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2112 (i.e., a phi in the tail of the outer-loop). */
2113 if (! is_loop_header_bb_p (bb))
2115 /* FORNOW: we currently don't support the case that these phis
2116 are not used in the outer loop (unless it is a double reduction,
2117 i.e., this phi is vect_reduction_def), because this case
2118 requires us to actually do something here. */
2119 if (STMT_VINFO_LIVE_P (stmt_info)
2120 && !vect_active_double_reduction_p (stmt_info))
2121 return opt_result::failure_at (phi,
2122 "Unsupported loop-closed phi"
2123 " in outer-loop.\n");
2125 /* If PHI is used in the outer loop, we check that its operand
2126 is defined in the inner loop. */
2127 if (STMT_VINFO_RELEVANT_P (stmt_info))
2129 tree phi_op;
2131 if (gimple_phi_num_args (phi) != 1)
2132 return opt_result::failure_at (phi, "unsupported phi");
2134 phi_op = PHI_ARG_DEF (phi, 0);
2135 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2136 if (!op_def_info)
2137 return opt_result::failure_at (phi, "unsupported phi\n");
2139 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2140 && (STMT_VINFO_RELEVANT (op_def_info)
2141 != vect_used_in_outer_by_reduction))
2142 return opt_result::failure_at (phi, "unsupported phi\n");
2144 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2145 || (STMT_VINFO_DEF_TYPE (stmt_info)
2146 == vect_double_reduction_def))
2147 && !vectorizable_lc_phi (loop_vinfo,
2148 stmt_info, NULL, NULL))
2149 return opt_result::failure_at (phi, "unsupported phi\n");
2152 continue;
2155 gcc_assert (stmt_info);
2157 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2158 || STMT_VINFO_LIVE_P (stmt_info))
2159 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2160 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2161 /* A scalar-dependence cycle that we don't support. */
2162 return opt_result::failure_at (phi,
2163 "not vectorized:"
2164 " scalar dependence cycle.\n");
2166 if (STMT_VINFO_RELEVANT_P (stmt_info))
2168 need_to_vectorize = true;
2169 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2170 && ! PURE_SLP_STMT (stmt_info))
2171 ok = vectorizable_induction (loop_vinfo,
2172 stmt_info, NULL, NULL,
2173 &cost_vec);
2174 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2175 || (STMT_VINFO_DEF_TYPE (stmt_info)
2176 == vect_double_reduction_def)
2177 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2178 && ! PURE_SLP_STMT (stmt_info))
2179 ok = vectorizable_reduction (loop_vinfo,
2180 stmt_info, NULL, NULL, &cost_vec);
2181 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2182 == vect_first_order_recurrence)
2183 && ! PURE_SLP_STMT (stmt_info))
2184 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2185 &cost_vec);
2188 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2189 if (ok
2190 && STMT_VINFO_LIVE_P (stmt_info)
2191 && !PURE_SLP_STMT (stmt_info))
2192 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2193 -1, false, &cost_vec);
2195 if (!ok)
2196 return opt_result::failure_at (phi,
2197 "not vectorized: relevant phi not "
2198 "supported: %G",
2199 static_cast <gimple *> (phi));
2202 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2203 gsi_next (&si))
2205 gimple *stmt = gsi_stmt (si);
2206 if (!gimple_clobber_p (stmt)
2207 && !is_gimple_debug (stmt))
2209 opt_result res
2210 = vect_analyze_stmt (loop_vinfo,
2211 loop_vinfo->lookup_stmt (stmt),
2212 &need_to_vectorize,
2213 NULL, NULL, &cost_vec);
2214 if (!res)
2215 return res;
2218 } /* bbs */
2220 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2222 /* All operations in the loop are either irrelevant (deal with loop
2223 control, or dead), or only used outside the loop and can be moved
2224 out of the loop (e.g. invariants, inductions). The loop can be
2225 optimized away by scalar optimizations. We're better off not
2226 touching this loop. */
2227 if (!need_to_vectorize)
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_NOTE, vect_location,
2231 "All the computation can be taken out of the loop.\n");
2232 return opt_result::failure_at
2233 (vect_location,
2234 "not vectorized: redundant loop. no profit to vectorize.\n");
2237 return opt_result::success ();
2240 /* Return true if we know that the iteration count is smaller than the
2241 vectorization factor. Return false if it isn't, or if we can't be sure
2242 either way. */
2244 static bool
2245 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2247 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2249 HOST_WIDE_INT max_niter;
2250 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2251 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2252 else
2253 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2255 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2256 return true;
2258 return false;
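/* For illustration (hypothetical numbers): with an assumed vectorization
   factor of 8, a loop whose iteration count is known (or bounded) to be at
   most 5 returns true here; a loop with an unknown or larger bound returns
   false.  */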
2261 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2262 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2263 definitely no, or -1 if it's worth retrying. */
2265 static int
2266 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2267 unsigned *suggested_unroll_factor)
2269 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2270 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2272 /* Only loops that can handle partially-populated vectors can have iteration
2273 counts less than the vectorization factor. */
2274 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2275 && vect_known_niters_smaller_than_vf (loop_vinfo))
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279 "not vectorized: iteration count smaller than "
2280 "vectorization factor.\n");
2281 return 0;
2284 /* If we know the number of iterations we can do better: for the
2285 epilogue we can also decide whether the main loop leaves us
2286 with enough iterations, preferring a smaller vector epilogue that is
2287 then also possibly used for the case in which we skip the vector loop. */
2288 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2290 widest_int scalar_niters
2291 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2292 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2294 loop_vec_info orig_loop_vinfo
2295 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2296 unsigned lowest_vf
2297 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2298 int prolog_peeling = 0;
2299 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2300 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2301 if (prolog_peeling >= 0
2302 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2303 lowest_vf))
2305 unsigned gap
2306 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2307 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2308 % lowest_vf + gap);
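/* For illustration (hypothetical numbers): with 100 scalar iterations, a
   main-loop VF of 8, 2 iterations peeled for alignment and 1 extra
   iteration peeled for gaps, the epilogue is left with
   (100 - 1 - 2) % 8 + 1 = 2 scalar iterations.  */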
2311 /* Reject vectorizing for a single scalar iteration, even if
2312 we could in principle implement that using partial vectors. */
2313 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2314 if (scalar_niters <= peeling_gap + 1)
2316 if (dump_enabled_p ())
2317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2318 "not vectorized: loop only has a single "
2319 "scalar iteration.\n");
2320 return 0;
2323 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2325 /* Check that the loop processes at least one full vector. */
2326 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2327 if (known_lt (scalar_niters, vf))
2329 if (dump_enabled_p ())
2330 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2331 "loop does not have enough iterations "
2332 "to support vectorization.\n");
2333 return 0;
2336 /* If we need to peel an extra epilogue iteration to handle data
2337 accesses with gaps, check that there are enough scalar iterations
2338 available.
2340 The check above is redundant with this one when peeling for gaps,
2341 but the distinction is useful for diagnostics. */
2342 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2343 && known_le (scalar_niters, vf))
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2347 "loop does not have enough iterations "
2348 "to support peeling for gaps.\n");
2349 return 0;
2354 /* If using the "very cheap" model, reject cases in which we'd keep
2355 a copy of the scalar code (even if we might be able to vectorize it). */
2356 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2357 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2358 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2359 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2361 if (dump_enabled_p ())
2362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363 "some scalar iterations would need to be peeled\n");
2364 return 0;
2367 int min_profitable_iters, min_profitable_estimate;
2368 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2369 &min_profitable_estimate,
2370 suggested_unroll_factor);
2372 if (min_profitable_iters < 0)
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376 "not vectorized: vectorization not profitable.\n");
2377 if (dump_enabled_p ())
2378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2379 "not vectorized: vector version will never be "
2380 "profitable.\n");
2381 return -1;
2384 int min_scalar_loop_bound = (param_min_vect_loop_bound
2385 * assumed_vf);
2387 /* Use the cost model only if it is more conservative than user specified
2388 threshold. */
2389 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2390 min_profitable_iters);
2392 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
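/* For illustration (hypothetical numbers): with --param min-vect-loop-bound=2
   and an assumed VF of 4, min_scalar_loop_bound is 8; if the cost model
   reports 11 as the minimum profitable iteration count, the threshold
   becomes MAX (8, 11) = 11.  */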
2394 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2395 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2397 if (dump_enabled_p ())
2398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2399 "not vectorized: vectorization not profitable.\n");
2400 if (dump_enabled_p ())
2401 dump_printf_loc (MSG_NOTE, vect_location,
2402 "not vectorized: iteration count smaller than user "
2403 "specified loop bound parameter or minimum profitable "
2404 "iterations (whichever is more conservative).\n");
2405 return 0;
2408 /* The static profitability threshold min_profitable_estimate includes
2409 the cost of having to check at runtime whether the scalar loop
2410 should be used instead. If it turns out that we don't need or want
2411 such a check, the threshold we should use for the static estimate
2412 is simply the point at which the vector loop becomes more profitable
2413 than the scalar loop. */
2414 if (min_profitable_estimate > min_profitable_iters
2415 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2416 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2417 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2418 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2420 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2422 " choice between the scalar and vector loops\n");
2423 min_profitable_estimate = min_profitable_iters;
2426 /* If the vector loop needs multiple iterations to be beneficial then
2427 things are probably too close to call, and the conservative thing
2428 would be to stick with the scalar code. */
2429 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2430 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2432 if (dump_enabled_p ())
2433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2434 "one iteration of the vector loop would be"
2435 " more expensive than the equivalent number of"
2436 " iterations of the scalar loop\n");
2437 return 0;
2440 HOST_WIDE_INT estimated_niter;
2442 /* If we are vectorizing an epilogue then we know the maximum number of
2443 scalar iterations it will cover is at least one lower than the
2444 vectorization factor of the main loop. */
2445 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2446 estimated_niter
2447 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2448 else
2450 estimated_niter = estimated_stmt_executions_int (loop);
2451 if (estimated_niter == -1)
2452 estimated_niter = likely_max_stmt_executions_int (loop);
2454 if (estimated_niter != -1
2455 && ((unsigned HOST_WIDE_INT) estimated_niter
2456 < MAX (th, (unsigned) min_profitable_estimate)))
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2460 "not vectorized: estimated iteration count too "
2461 "small.\n");
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_NOTE, vect_location,
2464 "not vectorized: estimated iteration count smaller "
2465 "than specified loop bound parameter or minimum "
2466 "profitable iterations (whichever is more "
2467 "conservative).\n");
2468 return -1;
2471 return 1;
2474 static opt_result
2475 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2476 vec<data_reference_p> *datarefs,
2477 unsigned int *n_stmts)
2479 *n_stmts = 0;
2480 for (unsigned i = 0; i < loop->num_nodes; i++)
2481 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2482 !gsi_end_p (gsi); gsi_next (&gsi))
2484 gimple *stmt = gsi_stmt (gsi);
2485 if (is_gimple_debug (stmt))
2486 continue;
2487 ++(*n_stmts);
2488 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2489 NULL, 0);
2490 if (!res)
2492 if (is_gimple_call (stmt) && loop->safelen)
2494 tree fndecl = gimple_call_fndecl (stmt), op;
2495 if (fndecl == NULL_TREE
2496 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2498 fndecl = gimple_call_arg (stmt, 0);
2499 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2500 fndecl = TREE_OPERAND (fndecl, 0);
2501 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2503 if (fndecl != NULL_TREE)
2505 cgraph_node *node = cgraph_node::get (fndecl);
2506 if (node != NULL && node->simd_clones != NULL)
2508 unsigned int j, n = gimple_call_num_args (stmt);
2509 for (j = 0; j < n; j++)
2511 op = gimple_call_arg (stmt, j);
2512 if (DECL_P (op)
2513 || (REFERENCE_CLASS_P (op)
2514 && get_base_address (op)))
2515 break;
2517 op = gimple_call_lhs (stmt);
2518 /* Ignore #pragma omp declare simd functions
2519 if they don't have data references in the
2520 call stmt itself. */
2521 if (j == n
2522 && !(op
2523 && (DECL_P (op)
2524 || (REFERENCE_CLASS_P (op)
2525 && get_base_address (op)))))
2526 continue;
2530 return res;
2532 /* If dependence analysis will give up due to the limit on the
2533 number of datarefs, stop here and fail fatally. */
2534 if (datarefs->length ()
2535 > (unsigned)param_loop_max_datarefs_for_datadeps)
2536 return opt_result::failure_at (stmt, "exceeded param "
2537 "loop-max-datarefs-for-datadeps\n");
2539 return opt_result::success ();
2542 /* Look for SLP-only access groups and turn each individual access into its own
2543 group. */
2544 static void
2545 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2547 unsigned int i;
2548 struct data_reference *dr;
2550 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2552 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2553 FOR_EACH_VEC_ELT (datarefs, i, dr)
2555 gcc_assert (DR_REF (dr));
2556 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2558 /* Check if the load is a part of an interleaving chain. */
2559 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2561 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2562 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2563 unsigned int group_size = DR_GROUP_SIZE (first_element);
2565 /* Check for SLP-only groups. */
2566 if (!STMT_SLP_TYPE (stmt_info)
2567 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2569 /* Dissolve the group. */
2570 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2572 stmt_vec_info vinfo = first_element;
2573 while (vinfo)
2575 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2576 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2577 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2578 DR_GROUP_SIZE (vinfo) = 1;
2579 if (STMT_VINFO_STRIDED_P (first_element)
2580 /* We cannot handle stores with gaps. */
2581 || DR_IS_WRITE (dr_info->dr))
2583 STMT_VINFO_STRIDED_P (vinfo) = true;
2584 DR_GROUP_GAP (vinfo) = 0;
2586 else
2587 DR_GROUP_GAP (vinfo) = group_size - 1;
2588 /* Duplicate and adjust the alignment info; it needs to
2589 be present on each group leader, see dr_misalignment. */
2590 if (vinfo != first_element)
2592 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2593 dr_info2->target_alignment = dr_info->target_alignment;
2594 int misalignment = dr_info->misalignment;
2595 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2597 HOST_WIDE_INT diff
2598 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2599 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2600 unsigned HOST_WIDE_INT align_c
2601 = dr_info->target_alignment.to_constant ();
2602 misalignment = (misalignment + diff) % align_c;
2604 dr_info2->misalignment = misalignment;
2606 vinfo = next;
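/* For illustration (hypothetical numbers): if the group leader has a target
   alignment of 16 bytes and a known misalignment of 0, a dissolved member
   whose DR_INIT is 4 bytes further along inherits misalignment
   (0 + 4) % 16 = 4.  */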
2613 /* Determine if operating on full vectors for LOOP_VINFO might leave
2614 some scalar iterations still to do. If so, decide how we should
2615 handle those scalar iterations. The possibilities are:
2617 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2618 In this case:
2620 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2621 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2622 LOOP_VINFO_PEELING_FOR_NITER == false
2624 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2625 to handle the remaining scalar iterations. In this case:
2627 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2628 LOOP_VINFO_PEELING_FOR_NITER == true
2630 There are two choices:
2632 (2a) Consider vectorizing the epilogue loop at the same VF as the
2633 main loop, but using partial vectors instead of full vectors.
2634 In this case:
2636 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2638 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2639 In this case:
2641 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
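/* An illustrative example of the possibilities above (hypothetical numbers):
   a loop with 1000 scalar iterations and a VF of 16 leaves 1000 % 16 = 8
   iterations over.  Under (1) the final vector iteration is masked (or
   length-limited) to cover those 8 iterations; under (2) the main loop runs
   62 full-vector iterations and an epilogue loop handles the remaining 8,
   with (2a) and (2b) describing how that epilogue may itself be
   vectorized.  */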
2644 opt_result
2645 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2647 /* Determine whether there would be any scalar iterations left over. */
2648 bool need_peeling_or_partial_vectors_p
2649 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2651 /* Decide whether to vectorize the loop with partial vectors. */
2652 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2654 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2655 && need_peeling_or_partial_vectors_p)
2657 /* For partial-vector-usage=1, try to push the handling of partial
2658 vectors to the epilogue, with the main loop continuing to operate
2659 on full vectors.
2661 If we are unrolling we also do not want to use partial vectors. This
2662 is to avoid the overhead of generating multiple masks and also to
2663 avoid having to execute entire iterations of FALSE masked instructions
2664 when dealing with one or fewer full iterations.
2666 ??? We could then end up failing to use partial vectors if we
2667 decide to peel iterations into a prologue, and if the main loop
2668 then ends up processing fewer than VF iterations. */
2669 if ((param_vect_partial_vector_usage == 1
2670 || loop_vinfo->suggested_unroll_factor > 1)
2671 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2672 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2673 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2674 else
2675 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2678 if (dump_enabled_p ())
2679 dump_printf_loc (MSG_NOTE, vect_location,
2680 "operating on %s vectors%s.\n",
2681 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2682 ? "partial" : "full",
2683 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2684 ? " for epilogue loop" : "");
2686 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2687 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2688 && need_peeling_or_partial_vectors_p);
2690 /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
2691 analysis, when we don't yet know whether the loop will be vectorized
2692 with partial vectors (see tree-vect-loop-manip.cc for more details).
2694 However, the SELECT_VL vectorization style should only be applied to
2695 partial vectorization, since SELECT_VL is the GIMPLE IR that calculates
2696 the number of elements to be processed in each iteration.
2698 After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
2699 if the loop is not vectorized with partial vectors. */
2700 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2701 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2703 return opt_result::success ();
2706 /* Function vect_analyze_loop_2.
2708 Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2709 analyses will record information in some members of LOOP_VINFO. FATAL
2710 indicates whether some analysis hits a fatal error. If the non-NULL pointer
2711 SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with the
2712 suggested unroll factor that is worked out, while a NULL pointer indicates
2713 that we are applying the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2714 holds the SLP decision made when the suggested unroll factor was worked
2715 out. */
2716 static opt_result
2717 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2718 unsigned *suggested_unroll_factor,
2719 bool& slp_done_for_suggested_uf)
2721 opt_result ok = opt_result::success ();
2722 int res;
2723 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2724 poly_uint64 min_vf = 2;
2725 loop_vec_info orig_loop_vinfo = NULL;
2727 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2728 loop_vec_info of the first vectorized loop. */
2729 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2730 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2731 else
2732 orig_loop_vinfo = loop_vinfo;
2733 gcc_assert (orig_loop_vinfo);
2735 /* The first group of checks is independent of the vector size. */
2736 fatal = true;
2738 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2739 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2740 return opt_result::failure_at (vect_location,
2741 "not vectorized: simd if(0)\n");
2743 /* Find all data references in the loop (which correspond to vdefs/vuses)
2744 and analyze their evolution in the loop. */
2746 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2748 /* Gather the data references and count stmts in the loop. */
2749 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2751 opt_result res
2752 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2753 &LOOP_VINFO_DATAREFS (loop_vinfo),
2754 &LOOP_VINFO_N_STMTS (loop_vinfo));
2755 if (!res)
2757 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2759 "not vectorized: loop contains function "
2760 "calls or data references that cannot "
2761 "be analyzed\n");
2762 return res;
2764 loop_vinfo->shared->save_datarefs ();
2766 else
2767 loop_vinfo->shared->check_datarefs ();
2769 /* Analyze the data references and also adjust the minimal
2770 vectorization factor according to the loads and stores. */
2772 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2773 if (!ok)
2775 if (dump_enabled_p ())
2776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2777 "bad data references.\n");
2778 return ok;
2781 /* Check if we are applying unroll factor now. */
2782 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2783 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2785 /* If the SLP decision was false when the suggested unroll factor was
2786 worked out, and we are now applying that suggested unroll factor, we
2787 can simply skip all SLP-related analyses this time. */
2788 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2790 /* Classify all cross-iteration scalar data-flow cycles.
2791 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2792 vect_analyze_scalar_cycles (loop_vinfo, slp);
2794 vect_pattern_recog (loop_vinfo);
2796 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2798 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2799 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2801 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2802 if (!ok)
2804 if (dump_enabled_p ())
2805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2806 "bad data access.\n");
2807 return ok;
2810 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2812 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2813 if (!ok)
2815 if (dump_enabled_p ())
2816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2817 "unexpected pattern.\n");
2818 return ok;
2821 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not fatal. */
2822 fatal = false;
2824 /* Analyze data dependences between the data-refs in the loop
2825 and adjust the maximum vectorization factor according to
2826 the dependences.
2827 FORNOW: fail at the first data dependence that we encounter. */
2829 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2830 if (!ok)
2832 if (dump_enabled_p ())
2833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2834 "bad data dependence.\n");
2835 return ok;
2837 if (max_vf != MAX_VECTORIZATION_FACTOR
2838 && maybe_lt (max_vf, min_vf))
2839 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2840 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2842 ok = vect_determine_vectorization_factor (loop_vinfo);
2843 if (!ok)
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2847 "can't determine vectorization factor.\n");
2848 return ok;
2851 /* Compute the scalar iteration cost. */
2852 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2854 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2856 if (slp)
2858 /* Check the SLP opportunities in the loop, analyze and build
2859 SLP trees. */
2860 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2861 if (!ok)
2862 return ok;
2864 /* If there are any SLP instances mark them as pure_slp. */
2865 slp = vect_make_slp_decision (loop_vinfo);
2866 if (slp)
2868 /* Find stmts that need to be both vectorized and SLPed. */
2869 vect_detect_hybrid_slp (loop_vinfo);
2871 /* Update the vectorization factor based on the SLP decision. */
2872 vect_update_vf_for_slp (loop_vinfo);
2874 /* Optimize the SLP graph with the vectorization factor fixed. */
2875 vect_optimize_slp (loop_vinfo);
2877 /* Gather the loads reachable from the SLP graph entries. */
2878 vect_gather_slp_loads (loop_vinfo);
2882 bool saved_can_use_partial_vectors_p
2883 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2885 /* We don't expect to have to roll back to anything other than an empty
2886 set of rgroups. */
2887 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2889 /* This is the point where we can re-start analysis with SLP forced off. */
2890 start_over:
2892 /* Apply the suggested unrolling factor; this was determined by the backend
2893 during finish_cost the first time we ran the analysis for this
2894 vector mode. */
2895 if (applying_suggested_uf)
2896 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2898 /* Now the vectorization factor is final. */
2899 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2900 gcc_assert (known_ne (vectorization_factor, 0U));
2902 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2904 dump_printf_loc (MSG_NOTE, vect_location,
2905 "vectorization_factor = ");
2906 dump_dec (MSG_NOTE, vectorization_factor);
2907 dump_printf (MSG_NOTE, ", niters = %wd\n",
2908 LOOP_VINFO_INT_NITERS (loop_vinfo));
2911 if (max_vf != MAX_VECTORIZATION_FACTOR
2912 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2913 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2915 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2917 /* Analyze the alignment of the data-refs in the loop.
2918 Fail if a data reference is found that cannot be vectorized. */
2920 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2921 if (!ok)
2923 if (dump_enabled_p ())
2924 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2925 "bad data alignment.\n");
2926 return ok;
2929 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2930 It is important to call pruning after vect_analyze_data_ref_accesses,
2931 since we use grouping information gathered by interleaving analysis. */
2932 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2933 if (!ok)
2934 return ok;
2936 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2937 vectorization, since we do not want to add extra peeling or
2938 add versioning for alignment. */
2939 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2940 /* This pass will decide on using loop versioning and/or loop peeling in
2941 order to enhance the alignment of data references in the loop. */
2942 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2943 if (!ok)
2944 return ok;
2946 if (slp)
2948 /* Analyze operations in the SLP instances. Note this may
2949 remove unsupported SLP instances which makes the above
2950 SLP kind detection invalid. */
2951 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2952 vect_slp_analyze_operations (loop_vinfo);
2953 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2955 ok = opt_result::failure_at (vect_location,
2956 "unsupported SLP instances\n");
2957 goto again;
2960 /* Check whether any load in ALL SLP instances is possibly permuted. */
2961 slp_tree load_node, slp_root;
2962 unsigned i, x;
2963 slp_instance instance;
2964 bool can_use_lanes = true;
2965 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2967 slp_root = SLP_INSTANCE_TREE (instance);
2968 int group_size = SLP_TREE_LANES (slp_root);
2969 tree vectype = SLP_TREE_VECTYPE (slp_root);
2970 bool loads_permuted = false;
2971 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2973 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2974 continue;
2975 unsigned j;
2976 stmt_vec_info load_info;
2977 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2978 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2980 loads_permuted = true;
2981 break;
2985 /* If the loads and stores can be handled with load/store-lane
2986 instructions record it and move on to the next instance. */
2987 if (loads_permuted
2988 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2989 && vect_store_lanes_supported (vectype, group_size, false)
2990 != IFN_LAST)
2992 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2993 if (STMT_VINFO_GROUPED_ACCESS
2994 (SLP_TREE_REPRESENTATIVE (load_node)))
2996 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2997 (SLP_TREE_REPRESENTATIVE (load_node));
2998 /* Use SLP for strided accesses (or if we can't
2999 load-lanes). */
3000 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3001 || vect_load_lanes_supported
3002 (STMT_VINFO_VECTYPE (stmt_vinfo),
3003 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3004 break;
3007 can_use_lanes
3008 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3010 if (can_use_lanes && dump_enabled_p ())
3011 dump_printf_loc (MSG_NOTE, vect_location,
3012 "SLP instance %p can use load/store-lanes\n",
3013 (void *) instance);
3015 else
3017 can_use_lanes = false;
3018 break;
3022 /* If all SLP instances can use load/store-lanes abort SLP and try again
3023 with SLP disabled. */
3024 if (can_use_lanes)
3026 ok = opt_result::failure_at (vect_location,
3027 "Built SLP cancelled: can use "
3028 "load/store-lanes\n");
3029 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3031 "Built SLP cancelled: all SLP instances support "
3032 "load/store-lanes\n");
3033 goto again;
3037 /* Dissolve SLP-only groups. */
3038 vect_dissolve_slp_only_groups (loop_vinfo);
3040 /* Scan all the remaining operations in the loop that are not subject
3041 to SLP and make sure they are vectorizable. */
3042 ok = vect_analyze_loop_operations (loop_vinfo);
3043 if (!ok)
3045 if (dump_enabled_p ())
3046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3047 "bad operation or unsupported loop bound.\n");
3048 return ok;
3051 /* For now, we don't expect to mix both the masking and length approaches for
3052 one loop, so disable the use of partial vectors if both are recorded. */
3053 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3054 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3055 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3057 if (dump_enabled_p ())
3058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3059 "can't vectorize a loop with partial vectors"
3060 " because we don't expect to mix different"
3061 " approaches with partial vectors for the"
3062 " same loop.\n");
3063 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3066 /* If we still have the option of using partial vectors,
3067 check whether we can generate the necessary loop controls. */
3068 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3070 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3072 if (!vect_verify_full_masking (loop_vinfo)
3073 && !vect_verify_full_masking_avx512 (loop_vinfo))
3074 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3076 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3077 if (!vect_verify_loop_lens (loop_vinfo))
3078 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3081 /* If we're vectorizing a loop that uses length "controls" and
3082 can iterate more than once, we apply the decrementing IV approach
3083 to the loop control. */
3084 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3085 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3086 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3087 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3088 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3089 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3090 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3092 /* If a loop uses length controls and has a decrementing loop control IV,
3093 we will normally pass that IV through a MIN_EXPR to calculate the
3094 basis for the length controls. E.g. in a loop that processes one
3095 element per scalar iteration, the number of elements would be
3096 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3098 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3099 step, since only the final iteration of the vector loop can have
3100 inactive lanes.
3102 However, some targets have a dedicated instruction for calculating the
3103 preferred length, given the total number of elements that still need to
3104 be processed. This is encapsulated in the SELECT_VL internal function.
3106 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3107 to determine the basis for the length controls. However, unlike the
3108 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3109 lanes inactive in any iteration of the vector loop, not just the last
3110 iteration. This SELECT_VL approach therefore requires us to use pointer
3111 IVs with variable steps.
3113 Once we've decided how many elements should be processed by one
3114 iteration of the vector loop, we need to populate the rgroup controls.
3115 If a loop has multiple rgroups, we need to make sure that those rgroups
3116 "line up" (that is, they must be consistent about which elements are
3117 active and which aren't). This is done by vect_adjust_loop_lens_control.
3119 In principle, it would be possible to use vect_adjust_loop_lens_control
3120 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3121 However:
3123 (1) In practice, it only makes sense to use SELECT_VL when a vector
3124 operation will be controlled directly by the result. It is not
3125 worth using SELECT_VL if it would only be the input to other
3126 calculations.
3128 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3129 pointer IV will need N updates by a variable amount (N-1 updates
3130 within the iteration and 1 update to move to the next iteration).
3132 Because of this, we prefer to use the MIN_EXPR approach whenever there
3133 is more than one length control.
3135 In addition, SELECT_VL always operates to a granularity of 1 unit.
3136 If we wanted to use it to control an SLP operation on N consecutive
3137 elements, we would need to make the SELECT_VL inputs measure scalar
3138 iterations (rather than elements) and then multiply the SELECT_VL
3139 result by N. But using SELECT_VL this way is inefficient because
3140 of (1) above.
3142 In addition, we don't apply SELECT_VL to a single rgroup when both (1)
3143 and (2) are satisfied:
3145 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3146 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3148 Since SELECT_VL (with its variable step) would make SCEV analysis fail and
3149 we would then lose the benefits of the following unroll optimizations, we
3150 prefer using the MIN_EXPR approach in this situation. */
3151 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3153 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3154 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3155 OPTIMIZE_FOR_SPEED)
3156 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3157 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3158 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3159 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3160 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
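/* A minimal sketch of the two styles described above (illustrative GIMPLE,
   not taken from a real dump), for a single length control and
   vectorization factor VF:

       MIN_EXPR approach:   len = MIN_EXPR <left, VF>;
       SELECT_VL approach:  len = .SELECT_VL (left, VF);

   followed in both cases by  left = left - len.  With MIN_EXPR only the
   final iteration can have inactive lanes, so pointer IVs can use an
   invariant step; with SELECT_VL any iteration may process fewer than VF
   elements, so pointer IVs must advance by the variable LEN.  */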
3163 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3164 assuming that the loop will be used as a main loop. We will redo
3165 this analysis later if we instead decide to use the loop as an
3166 epilogue loop. */
3167 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3168 if (!ok)
3169 return ok;
3171 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3172 to be able to handle fewer than VF scalars, or needs to have a lower VF
3173 than the main loop. */
3174 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3175 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3177 poly_uint64 unscaled_vf
3178 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3179 orig_loop_vinfo->suggested_unroll_factor);
3180 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3181 return opt_result::failure_at (vect_location,
3182 "Vectorization factor too high for"
3183 " epilogue loop.\n");
3186 /* Check the costings of the loop make vectorizing worthwhile. */
3187 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3188 if (res < 0)
3190 ok = opt_result::failure_at (vect_location,
3191 "Loop costings may not be worthwhile.\n");
3192 goto again;
3194 if (!res)
3195 return opt_result::failure_at (vect_location,
3196 "Loop costings not worthwhile.\n");
3198 /* If an epilogue loop is required make sure we can create one. */
3199 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3200 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3201 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3203 if (dump_enabled_p ())
3204 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3205 if (!vect_can_advance_ivs_p (loop_vinfo)
3206 || !slpeel_can_duplicate_loop_p (loop,
3207 LOOP_VINFO_IV_EXIT (loop_vinfo),
3208 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3210 ok = opt_result::failure_at (vect_location,
3211 "not vectorized: can't create required "
3212 "epilog loop\n");
3213 goto again;
3217 /* During peeling, we need to check if number of loop iterations is
3218 enough for both peeled prolog loop and vector loop. This check
3219 can be merged along with threshold check of loop versioning, so
3220 increase threshold for this case if necessary.
3222 If we are analyzing an epilogue we still want to check what its
3223 versioning threshold would be. If we decide to vectorize the epilogues we
3224 will want to use the lowest versioning threshold of all epilogues and main
3225 loop. This will enable us to enter a vectorized epilogue even when
3226 versioning the loop. We can't simply check whether the epilogue requires
3227 versioning though since we may have skipped some versioning checks when
3228 analyzing the epilogue. For instance, checks for alias versioning will be
3229 skipped when dealing with epilogues as we assume we already checked them
3230 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3231 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3233 poly_uint64 niters_th = 0;
3234 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3236 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3238 /* Niters for peeled prolog loop. */
3239 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3241 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3242 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3243 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3245 else
3246 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3249 /* Niters for at least one iteration of vectorized loop. */
3250 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3251 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3252 /* One additional iteration because of peeling for gap. */
3253 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3254 niters_th += 1;
3256 /* Use the same condition as vect_transform_loop to decide when to use
3257 the cost to determine a versioning threshold. */
3258 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3259 && ordered_p (th, niters_th))
3260 niters_th = ordered_max (poly_uint64 (th), niters_th);
3262 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
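/* For illustration (hypothetical numbers): with 3 iterations peeled for
   alignment, a VF of 8 and peeling for gaps, niters_th starts as
   3 + 8 + 1 = 12, and is then raised to the cost-model threshold TH if
   that is larger.  */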
3265 gcc_assert (known_eq (vectorization_factor,
3266 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3268 slp_done_for_suggested_uf = slp;
3270 /* Ok to vectorize! */
3271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3272 return opt_result::success ();
3274 again:
3275 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3276 gcc_assert (!ok);
3278 /* Try again with SLP forced off but if we didn't do any SLP there is
3279 no point in re-trying. */
3280 if (!slp)
3281 return ok;
3283 /* If the SLP decision was true when the suggested unroll factor was
3284 worked out, and we are now applying that suggested unroll factor, we
3285 don't need to re-try any more. */
3286 if (applying_suggested_uf && slp_done_for_suggested_uf)
3287 return ok;
3289 /* If there are reduction chains re-trying will fail anyway. */
3290 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3291 return ok;
3293 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3294 via interleaving or lane instructions. */
3295 slp_instance instance;
3296 slp_tree node;
3297 unsigned i, j;
3298 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3300 stmt_vec_info vinfo;
3301 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3302 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3303 continue;
3304 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3305 unsigned int size = DR_GROUP_SIZE (vinfo);
3306 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3307 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3308 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3309 && ! vect_grouped_store_supported (vectype, size))
3310 return opt_result::failure_at (vinfo->stmt,
3311 "unsupported grouped store\n");
3312 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3314 vinfo = SLP_TREE_REPRESENTATIVE (node);
3315 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3317 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3318 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3319 size = DR_GROUP_SIZE (vinfo);
3320 vectype = STMT_VINFO_VECTYPE (vinfo);
3321 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3322 && ! vect_grouped_load_supported (vectype, single_element_p,
3323 size))
3324 return opt_result::failure_at (vinfo->stmt,
3325 "unsupported grouped load\n");
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_NOTE, vect_location,
3332 "re-trying with SLP disabled\n");
3334 /* Roll back state appropriately. No SLP this time. */
3335 slp = false;
3336 /* Restore vectorization factor as it were without SLP. */
3337 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3338 /* Free the SLP instances. */
3339 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3340 vect_free_slp_instance (instance);
3341 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3342 /* Reset SLP type to loop_vect on all stmts. */
3343 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3345 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3346 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3347 !gsi_end_p (si); gsi_next (&si))
3349 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3350 STMT_SLP_TYPE (stmt_info) = loop_vect;
3351 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3352 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3354 /* vectorizable_reduction adjusts reduction stmt def-types,
3355 restore them to that of the PHI. */
3356 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3357 = STMT_VINFO_DEF_TYPE (stmt_info);
3358 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3359 (STMT_VINFO_REDUC_DEF (stmt_info)))
3360 = STMT_VINFO_DEF_TYPE (stmt_info);
3363 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3364 !gsi_end_p (si); gsi_next (&si))
3366 if (is_gimple_debug (gsi_stmt (si)))
3367 continue;
3368 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3369 STMT_SLP_TYPE (stmt_info) = loop_vect;
3370 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3372 stmt_vec_info pattern_stmt_info
3373 = STMT_VINFO_RELATED_STMT (stmt_info);
3374 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3375 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3377 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3378 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3379 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3380 !gsi_end_p (pi); gsi_next (&pi))
3381 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3382 = loop_vect;
3386 /* Free optimized alias test DDRS. */
3387 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3388 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3389 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3390 /* Reset target cost data. */
3391 delete loop_vinfo->vector_costs;
3392 loop_vinfo->vector_costs = nullptr;
3393 /* Reset accumulated rgroup information. */
3394 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3395 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3396 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3397 /* Reset assorted flags. */
3398 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3399 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3400 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3401 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3402 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3403 = saved_can_use_partial_vectors_p;
3405 goto start_over;
3408 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3409 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3410 OLD_LOOP_VINFO is better unless something specifically indicates
3411 otherwise.
3413 Note that this deliberately isn't a partial order. */
3415 static bool
3416 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3417 loop_vec_info old_loop_vinfo)
3419 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3420 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3422 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3423 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3425 /* Always prefer a VF of loop->simdlen over any other VF. */
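/* For example (illustrative loop, not from this file): under
     #pragma omp simd simdlen(8)
   a candidate with VF exactly 8 is preferred over one with VF 4 or 16,
   before any cost comparison below is consulted.  */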
3426 if (loop->simdlen)
3428 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3429 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3430 if (new_simdlen_p != old_simdlen_p)
3431 return new_simdlen_p;
3434 const auto *old_costs = old_loop_vinfo->vector_costs;
3435 const auto *new_costs = new_loop_vinfo->vector_costs;
3436 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3437 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3439 return new_costs->better_main_loop_than_p (old_costs);
3442 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3443 true if we should. */
3445 static bool
3446 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3447 loop_vec_info old_loop_vinfo)
3449 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3450 return false;
3452 if (dump_enabled_p ())
3453 dump_printf_loc (MSG_NOTE, vect_location,
3454 "***** Preferring vector mode %s to vector mode %s\n",
3455 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3456 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3457 return true;
3460 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3461 not NULL. Set AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is
3462 VOIDmode and advance MODE_I to the next mode useful to analyze.
3463 Return the loop_vinfo on success and wrapped null on failure. */
3465 static opt_loop_vec_info
3466 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3467 const vect_loop_form_info *loop_form_info,
3468 loop_vec_info main_loop_vinfo,
3469 const vector_modes &vector_modes, unsigned &mode_i,
3470 machine_mode &autodetected_vector_mode,
3471 bool &fatal)
3473 loop_vec_info loop_vinfo
3474 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3476 machine_mode vector_mode = vector_modes[mode_i];
3477 loop_vinfo->vector_mode = vector_mode;
3478 unsigned int suggested_unroll_factor = 1;
3479 bool slp_done_for_suggested_uf = false;
3481 /* Run the main analysis. */
3482 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3483 &suggested_unroll_factor,
3484 slp_done_for_suggested_uf);
3485 if (dump_enabled_p ())
3486 dump_printf_loc (MSG_NOTE, vect_location,
3487 "***** Analysis %s with vector mode %s\n",
3488 res ? "succeeded" : "failed",
3489 GET_MODE_NAME (loop_vinfo->vector_mode));
3491 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3493 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_NOTE, vect_location,
3495 "***** Re-trying analysis for unrolling"
3496 " with unroll factor %d and slp %s.\n",
3497 suggested_unroll_factor,
3498 slp_done_for_suggested_uf ? "on" : "off");
3499 loop_vec_info unroll_vinfo
3500 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3501 unroll_vinfo->vector_mode = vector_mode;
3502 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3503 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3504 slp_done_for_suggested_uf);
3505 if (new_res)
3507 delete loop_vinfo;
3508 loop_vinfo = unroll_vinfo;
3510 else
3511 delete unroll_vinfo;
3514 /* Remember the autodetected vector mode. */
3515 if (vector_mode == VOIDmode)
3516 autodetected_vector_mode = loop_vinfo->vector_mode;
3518 /* Advance mode_i, first skipping modes that would result in the
3519 same analysis result. */
3520 while (mode_i + 1 < vector_modes.length ()
3521 && vect_chooses_same_modes_p (loop_vinfo,
3522 vector_modes[mode_i + 1]))
3524 if (dump_enabled_p ())
3525 dump_printf_loc (MSG_NOTE, vect_location,
3526 "***** The result for vector mode %s would"
3527 " be the same\n",
3528 GET_MODE_NAME (vector_modes[mode_i + 1]));
3529 mode_i += 1;
3531 if (mode_i + 1 < vector_modes.length ()
3532 && VECTOR_MODE_P (autodetected_vector_mode)
3533 && (related_vector_mode (vector_modes[mode_i + 1],
3534 GET_MODE_INNER (autodetected_vector_mode))
3535 == autodetected_vector_mode)
3536 && (related_vector_mode (autodetected_vector_mode,
3537 GET_MODE_INNER (vector_modes[mode_i + 1]))
3538 == vector_modes[mode_i + 1]))
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "***** Skipping vector mode %s, which would"
3543 " repeat the analysis for %s\n",
3544 GET_MODE_NAME (vector_modes[mode_i + 1]),
3545 GET_MODE_NAME (autodetected_vector_mode));
3546 mode_i += 1;
3548 mode_i++;
3550 if (!res)
3552 delete loop_vinfo;
3553 if (fatal)
3554 gcc_checking_assert (main_loop_vinfo == NULL);
3555 return opt_loop_vec_info::propagate_failure (res);
3558 return opt_loop_vec_info::success (loop_vinfo);
3561 /* Function vect_analyze_loop.
3563 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3564 for it. The different analyses will record information in the
3565 loop_vec_info struct. */
3566 opt_loop_vec_info
3567 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3569 DUMP_VECT_SCOPE ("analyze_loop_nest");
3571 if (loop_outer (loop)
3572 && loop_vec_info_for_loop (loop_outer (loop))
3573 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3574 return opt_loop_vec_info::failure_at (vect_location,
3575 "outer-loop already vectorized.\n");
3577 if (!find_loop_nest (loop, &shared->loop_nest))
3578 return opt_loop_vec_info::failure_at
3579 (vect_location,
3580 "not vectorized: loop nest containing two or more consecutive inner"
3581 " loops cannot be vectorized\n");
3583 /* Analyze the loop form. */
3584 vect_loop_form_info loop_form_info;
3585 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3586 if (!res)
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "bad loop form.\n");
3591 return opt_loop_vec_info::propagate_failure (res);
3593 if (!integer_onep (loop_form_info.assumptions))
3595 /* We consider vectorizing this loop by versioning it under
3596 some assumptions. In order to do this, we need to clear
3597 existing information computed by the scev and niter analyzers. */
3598 scev_reset_htab ();
3599 free_numbers_of_iterations_estimates (loop);
3600 /* Also set a flag for this loop so that the following scev and niter
3601 analyses are done under the assumptions. */
3602 loop_constraint_set (loop, LOOP_C_FINITE);
3604 else
3605 /* Clear the existing niter information to make sure the nonwrapping flag
3606 will be calculated and set appropriately. */
3607 free_numbers_of_iterations_estimates (loop);
3609 auto_vector_modes vector_modes;
3610 /* Autodetect first vector size we try. */
3611 vector_modes.safe_push (VOIDmode);
3612 unsigned int autovec_flags
3613 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3614 loop->simdlen != 0);
3615 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3616 && !unlimited_cost_model (loop));
3617 machine_mode autodetected_vector_mode = VOIDmode;
3618 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3619 unsigned int mode_i = 0;
3620 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3622 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3623 a mode has not been analyzed. */
3624 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3625 for (unsigned i = 0; i < vector_modes.length (); ++i)
3626 cached_vf_per_mode.safe_push (0);
3628 /* First determine the main loop vectorization mode, either the first
3629 one that works, starting with auto-detecting the vector mode and then
3630 following the target's order of preference, or the one with the
3631 lowest cost if pick_lowest_cost_p. */
3632 while (1)
3634 bool fatal;
3635 unsigned int last_mode_i = mode_i;
3636 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3637 failed. */
3638 cached_vf_per_mode[last_mode_i] = -1;
3639 opt_loop_vec_info loop_vinfo
3640 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3641 NULL, vector_modes, mode_i,
3642 autodetected_vector_mode, fatal);
3643 if (fatal)
3644 break;
3646 if (loop_vinfo)
3648 /* Analysis has been successful so update the VF value. The
3649 VF should always be a multiple of unroll_factor and we want to
3650 capture the original VF here. */
3651 cached_vf_per_mode[last_mode_i]
3652 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3653 loop_vinfo->suggested_unroll_factor);
3654 /* Once we hit the desired simdlen for the first time,
3655 discard any previous attempts. */
3656 if (simdlen
3657 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3659 delete first_loop_vinfo;
3660 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3661 simdlen = 0;
3663 else if (pick_lowest_cost_p
3664 && first_loop_vinfo
3665 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3667 /* Pick loop_vinfo over first_loop_vinfo. */
3668 delete first_loop_vinfo;
3669 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3671 if (first_loop_vinfo == NULL)
3672 first_loop_vinfo = loop_vinfo;
3673 else
3675 delete loop_vinfo;
3676 loop_vinfo = opt_loop_vec_info::success (NULL);
3679 /* Commit to first_loop_vinfo if we have no reason to try
3680 alternatives. */
3681 if (!simdlen && !pick_lowest_cost_p)
3682 break;
3684 if (mode_i == vector_modes.length ()
3685 || autodetected_vector_mode == VOIDmode)
3686 break;
3688 /* Try the next biggest vector size. */
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "***** Re-trying analysis with vector mode %s\n",
3692 GET_MODE_NAME (vector_modes[mode_i]));
3694 if (!first_loop_vinfo)
3695 return opt_loop_vec_info::propagate_failure (res);
3697 if (dump_enabled_p ())
3698 dump_printf_loc (MSG_NOTE, vect_location,
3699 "***** Choosing vector mode %s\n",
3700 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3702 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3703 enabled, SIMDUID is not set, it is the innermost loop and we have
3704 either already found the loop's SIMDLEN or there was no SIMDLEN to
3705 begin with.
3706 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3707 bool vect_epilogues = (!simdlen
3708 && loop->inner == NULL
3709 && param_vect_epilogues_nomask
3710 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3711 /* There is no code motion support for multiple epilogues, so for
3712 now this is not supported for loops with multiple exits. */
3713 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3714 && !loop->simduid);
3715 if (!vect_epilogues)
3716 return first_loop_vinfo;
3718 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3719 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3721 /* For epilogues start the analysis from the first mode. The motivation
3722 behind starting from the beginning comes from cases where the VECTOR_MODES
3723 array may contain length-agnostic and length-specific modes. Their
3724 ordering is not guaranteed, so we could end up picking a mode for the main
3725 loop that is after the epilogue's optimal mode. */
3726 vector_modes[0] = autodetected_vector_mode;
3727 mode_i = 0;
3729 bool supports_partial_vectors =
3730 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3731 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3733 while (1)
3735 /* If the target does not support partial vectors we can shorten the
3736 number of modes to analyze for the epilogue as we know we can't pick a
3737 mode that would lead to a VF at least as big as the
3738 FIRST_VINFO_VF. */
3739 if (!supports_partial_vectors
3740 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3742 mode_i++;
3743 if (mode_i == vector_modes.length ())
3744 break;
3745 continue;
3748 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_NOTE, vect_location,
3750 "***** Re-trying epilogue analysis with vector "
3751 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3753 bool fatal;
3754 opt_loop_vec_info loop_vinfo
3755 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3756 first_loop_vinfo,
3757 vector_modes, mode_i,
3758 autodetected_vector_mode, fatal);
3759 if (fatal)
3760 break;
3762 if (loop_vinfo)
3764 if (pick_lowest_cost_p)
3766 /* Keep trying to roll back vectorization attempts while the
3767 loop_vec_infos they produced were worse than this one. */
3768 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3769 while (!vinfos.is_empty ()
3770 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3772 gcc_assert (vect_epilogues);
3773 delete vinfos.pop ();
3776 /* For now only allow one epilogue loop. */
3777 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3779 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3780 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3781 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3782 || maybe_ne (lowest_th, 0U));
3783 /* Keep track of the known smallest versioning
3784 threshold. */
3785 if (ordered_p (lowest_th, th))
3786 lowest_th = ordered_min (lowest_th, th);
3788 else
3790 delete loop_vinfo;
3791 loop_vinfo = opt_loop_vec_info::success (NULL);
3794 /* For now only allow one epilogue loop, but allow
3795 pick_lowest_cost_p to replace it, so commit to the
3796 first epilogue if we have no reason to try alternatives. */
3797 if (!pick_lowest_cost_p)
3798 break;
3801 if (mode_i == vector_modes.length ())
3802 break;
3806 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3808 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3809 if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE, vect_location,
3811 "***** Choosing epilogue vector mode %s\n",
3812 GET_MODE_NAME
3813 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3816 return first_loop_vinfo;
3819 /* Return true if there is an in-order reduction function for CODE, storing
3820 it in *REDUC_FN if so. */
3822 static bool
3823 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3825 /* We support MINUS_EXPR by negating the operand. This also preserves an
3826 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3827 (-0.0) = -0.0. */
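/* As an illustrative example, an in-order reduction such as

     double s = -0.0;
     for (int i = 0; i < n; i++)
       s -= a[i];

   is handled as IFN_FOLD_LEFT_PLUS over the negated elements, i.e. as
   s += -a[i]; the negation itself is emitted during transformation.  */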
3828 if (code == PLUS_EXPR || code == MINUS_EXPR)
3830 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3831 return true;
3833 return false;
3836 /* Function reduction_fn_for_scalar_code
3838 Input:
3839 CODE - tree_code of a reduction operation.
3841 Output:
3842 REDUC_FN - the corresponding internal function to be used to reduce the
3843 vector of partial results into a single scalar result, or IFN_LAST
3844 if the operation is a supported reduction operation, but does not have
3845 such an internal function.
3847 Return FALSE if CODE currently cannot be vectorized as a reduction. */
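/* For instance, a PLUS_EXPR reduction over V4SI maps to IFN_REDUC_PLUS,
   which reduces {1, 2, 3, 4} to the scalar 10 in the epilogue, whereas
   MULT_EXPR is a supported reduction with no such internal function, so
   *REDUC_FN is set to IFN_LAST and the epilogue is built from generic
   element shuffles instead (illustrative sketch, not target-specific).  */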
3849 bool
3850 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3852 if (code.is_tree_code ())
3853 switch (tree_code (code))
3855 case MAX_EXPR:
3856 *reduc_fn = IFN_REDUC_MAX;
3857 return true;
3859 case MIN_EXPR:
3860 *reduc_fn = IFN_REDUC_MIN;
3861 return true;
3863 case PLUS_EXPR:
3864 *reduc_fn = IFN_REDUC_PLUS;
3865 return true;
3867 case BIT_AND_EXPR:
3868 *reduc_fn = IFN_REDUC_AND;
3869 return true;
3871 case BIT_IOR_EXPR:
3872 *reduc_fn = IFN_REDUC_IOR;
3873 return true;
3875 case BIT_XOR_EXPR:
3876 *reduc_fn = IFN_REDUC_XOR;
3877 return true;
3879 case MULT_EXPR:
3880 case MINUS_EXPR:
3881 *reduc_fn = IFN_LAST;
3882 return true;
3884 default:
3885 return false;
3887 else
3888 switch (combined_fn (code))
3890 CASE_CFN_FMAX:
3891 *reduc_fn = IFN_REDUC_FMAX;
3892 return true;
3894 CASE_CFN_FMIN:
3895 *reduc_fn = IFN_REDUC_FMIN;
3896 return true;
3898 default:
3899 return false;
3903 /* If there is a neutral value X such that a reduction would not be affected
3904 by the introduction of additional X elements, return that X, otherwise
3905 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3906 of the scalar elements. If the reduction has just a single initial value
3907 then INITIAL_VALUE is that value, otherwise it is null.
3908 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3909 In that case no signed zero is returned. */
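/* For example, for a PLUS_EXPR reduction on a float type that honors
   signed zeros, and when the value is not used as the initial value, the
   neutral element is -0.0: x + (-0.0) == x for every x including -0.0,
   whereas padding with +0.0 would turn a -0.0 result into +0.0. For
   MIN_EXPR/MAX_EXPR the only safe padding is the initial value itself.
   (Illustrative restatement of the cases handled below.)  */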
3911 tree
3912 neutral_op_for_reduction (tree scalar_type, code_helper code,
3913 tree initial_value, bool as_initial)
3915 if (code.is_tree_code ())
3916 switch (tree_code (code))
3918 case DOT_PROD_EXPR:
3919 case SAD_EXPR:
3920 case MINUS_EXPR:
3921 case BIT_IOR_EXPR:
3922 case BIT_XOR_EXPR:
3923 return build_zero_cst (scalar_type);
3924 case WIDEN_SUM_EXPR:
3925 case PLUS_EXPR:
3926 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3927 return build_real (scalar_type, dconstm0);
3928 else
3929 return build_zero_cst (scalar_type);
3931 case MULT_EXPR:
3932 return build_one_cst (scalar_type);
3934 case BIT_AND_EXPR:
3935 return build_all_ones_cst (scalar_type);
3937 case MAX_EXPR:
3938 case MIN_EXPR:
3939 return initial_value;
3941 default:
3942 return NULL_TREE;
3944 else
3945 switch (combined_fn (code))
3947 CASE_CFN_FMIN:
3948 CASE_CFN_FMAX:
3949 return initial_value;
3951 default:
3952 return NULL_TREE;
3956 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3957 STMT is printed with a message MSG. */
3959 static void
3960 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3962 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3965 /* Return true if we need an in-order reduction for operation CODE
3966 on type TYPE. */
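/* As an illustrative example, a float accumulation

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   needs a fold-left (in-order) reduction unless -fassociative-math is in
   effect, because FP addition is not associative, whereas MIN/MAX and
   FMIN/FMAX are insensitive to the association order and do not.  */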
3969 bool
3970 needs_fold_left_reduction_p (tree type, code_helper code)
3972 /* CHECKME: check for !flag_finite_math_only too? */
3973 if (SCALAR_FLOAT_TYPE_P (type))
3975 if (code.is_tree_code ())
3976 switch (tree_code (code))
3978 case MIN_EXPR:
3979 case MAX_EXPR:
3980 return false;
3982 default:
3983 return !flag_associative_math;
3985 else
3986 switch (combined_fn (code))
3988 CASE_CFN_FMIN:
3989 CASE_CFN_FMAX:
3990 return false;
3992 default:
3993 return !flag_associative_math;
3997 if (INTEGRAL_TYPE_P (type))
3998 return (!code.is_tree_code ()
3999 || !operation_no_trapping_overflow (type, tree_code (code)));
4001 if (SAT_FIXED_POINT_TYPE_P (type))
4002 return true;
4004 return false;
4007 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
4008 has a handled computation expression. Store the main reduction
4009 operation in *CODE. */
4011 static bool
4012 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4013 tree loop_arg, code_helper *code,
4014 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4016 auto_bitmap visited;
4017 tree lookfor = PHI_RESULT (phi);
4018 ssa_op_iter curri;
4019 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4020 while (USE_FROM_PTR (curr) != loop_arg)
4021 curr = op_iter_next_use (&curri);
4022 curri.i = curri.numops;
4025 path.safe_push (std::make_pair (curri, curr));
4026 tree use = USE_FROM_PTR (curr);
4027 if (use == lookfor)
4028 break;
4029 gimple *def = SSA_NAME_DEF_STMT (use);
4030 if (gimple_nop_p (def)
4031 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4033 pop:
4036 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4037 curri = x.first;
4038 curr = x.second;
4040 curr = op_iter_next_use (&curri);
4041 /* Skip already visited or non-SSA operands (from iterating
4042 over PHI args). */
4043 while (curr != NULL_USE_OPERAND_P
4044 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4045 || ! bitmap_set_bit (visited,
4046 SSA_NAME_VERSION
4047 (USE_FROM_PTR (curr)))));
4049 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4050 if (curr == NULL_USE_OPERAND_P)
4051 break;
4053 else
4055 if (gimple_code (def) == GIMPLE_PHI)
4056 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4057 else
4058 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4059 while (curr != NULL_USE_OPERAND_P
4060 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4061 || ! bitmap_set_bit (visited,
4062 SSA_NAME_VERSION
4063 (USE_FROM_PTR (curr)))))
4064 curr = op_iter_next_use (&curri);
4065 if (curr == NULL_USE_OPERAND_P)
4066 goto pop;
4069 while (1);
4070 if (dump_file && (dump_flags & TDF_DETAILS))
4072 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4073 unsigned i;
4074 std::pair<ssa_op_iter, use_operand_p> *x;
4075 FOR_EACH_VEC_ELT (path, i, x)
4076 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4077 dump_printf (MSG_NOTE, "\n");
4080 /* Check whether the reduction path detected is valid. */
4081 bool fail = path.length () == 0;
4082 bool neg = false;
4083 int sign = -1;
4084 *code = ERROR_MARK;
4085 for (unsigned i = 1; i < path.length (); ++i)
4087 gimple *use_stmt = USE_STMT (path[i].second);
4088 gimple_match_op op;
4089 if (!gimple_extract_op (use_stmt, &op))
4091 fail = true;
4092 break;
4094 unsigned int opi = op.num_ops;
4095 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4097 /* The following makes sure we can compute the operand index
4098 easily, plus it mostly disallows chaining via COND_EXPR condition
4099 operands. */
4100 for (opi = 0; opi < op.num_ops; ++opi)
4101 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4102 break;
4104 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4106 for (opi = 0; opi < op.num_ops; ++opi)
4107 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4108 break;
4110 if (opi == op.num_ops)
4112 fail = true;
4113 break;
4115 op.code = canonicalize_code (op.code, op.type);
4116 if (op.code == MINUS_EXPR)
4118 op.code = PLUS_EXPR;
4119 /* Track whether we negate the reduction value each iteration. */
4120 if (op.ops[1] == op.ops[opi])
4121 neg = ! neg;
4123 else if (op.code == IFN_COND_SUB)
4125 op.code = IFN_COND_ADD;
4126 /* Track whether we negate the reduction value each iteration. */
4127 if (op.ops[2] == op.ops[opi])
4128 neg = ! neg;
4130 if (CONVERT_EXPR_CODE_P (op.code)
4131 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4133 else if (*code == ERROR_MARK)
4135 *code = op.code;
4136 sign = TYPE_SIGN (op.type);
4138 else if (op.code != *code)
4140 fail = true;
4141 break;
4143 else if ((op.code == MIN_EXPR
4144 || op.code == MAX_EXPR)
4145 && sign != TYPE_SIGN (op.type))
4147 fail = true;
4148 break;
4150 /* Check there's only a single stmt the op is used on. For the
4151 non-value-changing tail and the last stmt allow out-of-loop uses.
4152 ??? We could relax this and handle arbitrary live stmts by
4153 forcing a scalar epilogue for example. */
4154 imm_use_iterator imm_iter;
4155 use_operand_p use_p;
4156 gimple *op_use_stmt;
4157 unsigned cnt = 0;
4158 bool cond_fn_p = op.code.is_internal_fn ()
4159 && (conditional_internal_fn_code (internal_fn (op.code))
4160 != ERROR_MARK);
4162 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4164 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4165 op1 twice (once as definition, once as else) in the same operation.
4166 Allow this. */
4167 if (cond_fn_p && op_use_stmt == use_stmt)
4169 gcall *call = as_a<gcall *> (use_stmt);
4170 unsigned else_pos
4171 = internal_fn_else_index (internal_fn (op.code));
4173 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4175 if (j == else_pos)
4176 continue;
4177 if (gimple_call_arg (call, j) == op.ops[opi])
4178 cnt++;
4181 else if (!is_gimple_debug (op_use_stmt)
4182 && (*code != ERROR_MARK
4183 || flow_bb_inside_loop_p (loop,
4184 gimple_bb (op_use_stmt))))
4185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4186 cnt++;
4189 if (cnt != 1)
4191 fail = true;
4192 break;
4195 return ! fail && ! neg && *code != ERROR_MARK;
4198 bool
4199 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4200 tree loop_arg, enum tree_code code)
4202 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4203 code_helper code_;
4204 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4205 && code_ == code);
4210 /* Function vect_is_simple_reduction
4212 (1) Detect a cross-iteration def-use cycle that represents a simple
4213 reduction computation. We look for the following pattern:
4215 loop_header:
4216 a1 = phi < a0, a2 >
4217 a3 = ...
4218 a2 = operation (a3, a1)

 or

4222 a3 = ...
4223 loop_header:
4224 a1 = phi < a0, a2 >
4225 a2 = operation (a3, a1)
4227 such that:
4228 1. operation is commutative and associative and it is safe to
4229 change the order of the computation
4230 2. no uses for a2 in the loop (a2 is used out of the loop)
4231 3. no uses of a1 in the loop besides the reduction operation
4232 4. no uses of a1 outside the loop.
4234 Conditions 1,4 are tested here.
4235 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4237 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4238 nested cycles.
4240 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4241 reductions:
4243 a1 = phi < a0, a2 >
4244 inner loop (def of a3)
4245 a2 = phi < a3 >
4247 (4) Detect condition expressions, i.e.:
4248 for (int i = 0; i < N; i++)
4249 if (a[i] < val)
4250 ret_val = a[i];
4254 static stmt_vec_info
4255 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4256 bool *double_reduc, bool *reduc_chain_p, bool slp)
4258 gphi *phi = as_a <gphi *> (phi_info->stmt);
4259 gimple *phi_use_stmt = NULL;
4260 imm_use_iterator imm_iter;
4261 use_operand_p use_p;
4263 *double_reduc = false;
4264 *reduc_chain_p = false;
4265 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4267 tree phi_name = PHI_RESULT (phi);
4268 /* ??? If there are no uses of the PHI result the inner loop reduction
4269 won't be detected as possibly double-reduction by vectorizable_reduction
4270 because that tries to walk the PHI arg from the preheader edge which
4271 can be constant. See PR60382. */
4272 if (has_zero_uses (phi_name))
4273 return NULL;
4274 class loop *loop = (gimple_bb (phi))->loop_father;
4275 unsigned nphi_def_loop_uses = 0;
4276 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4278 gimple *use_stmt = USE_STMT (use_p);
4279 if (is_gimple_debug (use_stmt))
4280 continue;
4282 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4284 if (dump_enabled_p ())
4285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4286 "intermediate value used outside loop.\n");
4288 return NULL;
4291 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4292 op1 twice (once as definition, once as else) in the same operation.
4293 Only count it as one. */
4294 if (use_stmt != phi_use_stmt)
4296 nphi_def_loop_uses++;
4297 phi_use_stmt = use_stmt;
4301 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4302 if (TREE_CODE (latch_def) != SSA_NAME)
4304 if (dump_enabled_p ())
4305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4306 "reduction: not ssa_name: %T\n", latch_def);
4307 return NULL;
4310 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4311 if (!def_stmt_info
4312 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4313 return NULL;
4315 bool nested_in_vect_loop
4316 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4317 unsigned nlatch_def_loop_uses = 0;
4318 auto_vec<gphi *, 3> lcphis;
4319 bool inner_loop_of_double_reduc = false;
4320 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4322 gimple *use_stmt = USE_STMT (use_p);
4323 if (is_gimple_debug (use_stmt))
4324 continue;
4325 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4326 nlatch_def_loop_uses++;
4327 else
4329 /* We can have more than one loop-closed PHI. */
4330 lcphis.safe_push (as_a <gphi *> (use_stmt));
4331 if (nested_in_vect_loop
4332 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4333 == vect_double_reduction_def))
4334 inner_loop_of_double_reduc = true;
4338 /* If we are vectorizing an inner reduction we are executing that
4339 in the original order only in case we are not dealing with a
4340 double reduction. */
4341 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4343 if (dump_enabled_p ())
4344 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4345 "detected nested cycle: ");
4346 return def_stmt_info;
4349 /* When the inner loop of a double reduction ends up with more than
4350 one loop-closed PHI we have failed to classify alternate such
4351 PHIs as double reduction, leading to wrong code. See PR103237. */
4352 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4356 "unhandle double reduction\n");
4357 return NULL;
4360 /* If this isn't a nested cycle or if the nested cycle reduction value
4361 is used outside of the inner loop we cannot handle uses of the reduction
4362 value. */
4363 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4365 if (dump_enabled_p ())
4366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4367 "reduction used in loop.\n");
4368 return NULL;
4371 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4372 defined in the inner loop. */
4373 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4375 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4376 if (gimple_phi_num_args (def_stmt) != 1
4377 || TREE_CODE (op1) != SSA_NAME)
4379 if (dump_enabled_p ())
4380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4381 "unsupported phi node definition.\n");
4383 return NULL;
4386 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4387 and the latch definition op1. */
4388 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4389 if (gimple_bb (def1)
4390 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4391 && loop->inner
4392 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4393 && (is_gimple_assign (def1) || is_gimple_call (def1))
4394 && is_a <gphi *> (phi_use_stmt)
4395 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4396 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4397 loop_latch_edge (loop->inner))))
4399 if (dump_enabled_p ())
4400 report_vect_op (MSG_NOTE, def_stmt,
4401 "detected double reduction: ");
4403 *double_reduc = true;
4404 return def_stmt_info;
4407 return NULL;
4410 /* Look for the expression computing latch_def from the loop PHI result. */
4411 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4412 code_helper code;
4413 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4414 path))
4416 STMT_VINFO_REDUC_CODE (phi_info) = code;
4417 if (code == COND_EXPR && !nested_in_vect_loop)
4418 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4420 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4421 reduction chain for which the additional restriction is that
4422 all operations in the chain are the same. */
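/* For example (illustrative), a loop body

     sum += a[4*i+0];
     sum += a[4*i+1];
     sum += a[4*i+2];
     sum += a[4*i+3];

   forms a reduction chain of four PLUS_EXPR statements, each feeding the
   next and the last one feeding the loop PHI.  */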
4423 auto_vec<stmt_vec_info, 8> reduc_chain;
4424 unsigned i;
4425 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4426 for (i = path.length () - 1; i >= 1; --i)
4428 gimple *stmt = USE_STMT (path[i].second);
4429 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4430 gimple_match_op op;
4431 if (!gimple_extract_op (stmt, &op))
4432 gcc_unreachable ();
4433 if (gassign *assign = dyn_cast<gassign *> (stmt))
4434 STMT_VINFO_REDUC_IDX (stmt_info)
4435 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4436 else
4438 gcall *call = as_a<gcall *> (stmt);
4439 STMT_VINFO_REDUC_IDX (stmt_info)
4440 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4442 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4443 && (i == 1 || i == path.length () - 1));
4444 if ((op.code != code && !leading_conversion)
4445 /* We can only handle the final value in epilogue
4446 generation for reduction chains. */
4447 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4448 is_slp_reduc = false;
4449 /* For reduction chains we support trailing/leading
4450 conversions. We do not store those in the actual chain. */
4451 if (leading_conversion)
4452 continue;
4453 reduc_chain.safe_push (stmt_info);
4455 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4457 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4459 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4460 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4462 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4463 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4465 /* Save the chain for further analysis in SLP detection. */
4466 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4467 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4469 *reduc_chain_p = true;
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "reduction: detected reduction chain\n");
4474 else if (dump_enabled_p ())
4475 dump_printf_loc (MSG_NOTE, vect_location,
4476 "reduction: detected reduction\n");
4478 return def_stmt_info;
4481 if (dump_enabled_p ())
4482 dump_printf_loc (MSG_NOTE, vect_location,
4483 "reduction: unknown pattern\n");
4485 return NULL;
4488 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4489 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4490 or -1 if not known. */
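/* As a worked example with made-up numbers: for known niters == 100,
   an assumed VF of 8 and 3 peeled prologue iterations the epilogue gets
   (100 - 3) % 8 == 1 iteration; had that remainder been 0 while peeling
   for gaps, a full VF (8 iterations) would be peeled instead. With
   unknown niters the estimate is simply VF / 2.  */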
4492 static int
4493 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4495 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4496 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4498 if (dump_enabled_p ())
4499 dump_printf_loc (MSG_NOTE, vect_location,
4500 "cost model: epilogue peel iters set to vf/2 "
4501 "because loop iterations are unknown .\n");
4502 return assumed_vf / 2;
4504 else
4506 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4507 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4508 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4509 /* If we need to peel for gaps, but no epilogue peeling turned out to be
4510 required, we have to peel VF iterations. */
4511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4512 peel_iters_epilogue = assumed_vf;
4513 return peel_iters_epilogue;
4517 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4519 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4520 int *peel_iters_epilogue,
4521 stmt_vector_for_cost *scalar_cost_vec,
4522 stmt_vector_for_cost *prologue_cost_vec,
4523 stmt_vector_for_cost *epilogue_cost_vec)
4525 int retval = 0;
4527 *peel_iters_epilogue
4528 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4530 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4532 /* If peeled iterations are known but the number of scalar loop
4533 iterations is unknown, count a taken branch per peeled loop. */
4534 if (peel_iters_prologue > 0)
4535 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4536 vect_prologue);
4537 if (*peel_iters_epilogue > 0)
4538 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4539 vect_epilogue);
4542 stmt_info_for_cost *si;
4543 int j;
4544 if (peel_iters_prologue)
4545 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4546 retval += record_stmt_cost (prologue_cost_vec,
4547 si->count * peel_iters_prologue,
4548 si->kind, si->stmt_info, si->misalign,
4549 vect_prologue);
4550 if (*peel_iters_epilogue)
4551 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4552 retval += record_stmt_cost (epilogue_cost_vec,
4553 si->count * *peel_iters_epilogue,
4554 si->kind, si->stmt_info, si->misalign,
4555 vect_epilogue);
4557 return retval;
4560 /* Function vect_estimate_min_profitable_iters
4562 Return the number of iterations required for the vector version of the
4563 loop to be profitable relative to the cost of the scalar version of the
4564 loop.
4566 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4567 of iterations for vectorization. -1 value means loop vectorization
4568 is not profitable. This returned value may be used for dynamic
4569 profitability check.
4571 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4572 for static check against estimated number of iterations. */
4574 static void
4575 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4576 int *ret_min_profitable_niters,
4577 int *ret_min_profitable_estimate,
4578 unsigned *suggested_unroll_factor)
4580 int min_profitable_iters;
4581 int min_profitable_estimate;
4582 int peel_iters_prologue;
4583 int peel_iters_epilogue;
4584 unsigned vec_inside_cost = 0;
4585 int vec_outside_cost = 0;
4586 unsigned vec_prologue_cost = 0;
4587 unsigned vec_epilogue_cost = 0;
4588 int scalar_single_iter_cost = 0;
4589 int scalar_outside_cost = 0;
4590 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4591 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4592 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4594 /* Cost model disabled. */
4595 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4597 if (dump_enabled_p ())
4598 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4599 *ret_min_profitable_niters = 0;
4600 *ret_min_profitable_estimate = 0;
4601 return;
4604 /* Requires loop versioning tests to handle misalignment. */
4605 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4607 /* FIXME: Make cost depend on complexity of individual check. */
4608 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4609 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4610 if (dump_enabled_p ())
4611 dump_printf (MSG_NOTE,
4612 "cost model: Adding cost of checks for loop "
4613 "versioning to treat misalignment.\n");
4616 /* Requires loop versioning with alias checks. */
4617 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4619 /* FIXME: Make cost depend on complexity of individual check. */
4620 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4621 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4622 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4623 if (len)
4624 /* Count LEN - 1 ANDs and LEN comparisons. */
4625 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4626 scalar_stmt, vect_prologue);
4627 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4628 if (len)
4630 /* Count LEN - 1 ANDs and LEN comparisons. */
4631 unsigned int nstmts = len * 2 - 1;
4632 /* +1 for each bias that needs adding. */
4633 for (unsigned int i = 0; i < len; ++i)
4634 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4635 nstmts += 1;
4636 (void) add_stmt_cost (target_cost_data, nstmts,
4637 scalar_stmt, vect_prologue);
4639 if (dump_enabled_p ())
4640 dump_printf (MSG_NOTE,
4641 "cost model: Adding cost of checks for loop "
4642 "versioning aliasing.\n");
4645 /* Requires loop versioning with niter checks. */
4646 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4648 /* FIXME: Make cost depend on complexity of individual check. */
4649 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4650 NULL, NULL, NULL_TREE, 0, vect_prologue);
4651 if (dump_enabled_p ())
4652 dump_printf (MSG_NOTE,
4653 "cost model: Adding cost of checks for loop "
4654 "versioning niters.\n");
4657 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4658 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4659 vect_prologue);
4661 /* Count statements in scalar loop. Using this as scalar cost for a single
4662 iteration for now.
4664 TODO: Add outer loop support.
4666 TODO: Consider assigning different costs to different scalar
4667 statements. */
4669 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4671 /* Add additional cost for the peeled instructions in prologue and epilogue
4672 loop. (For fully-masked loops there will be no peeling.)
4674 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4675 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4677 TODO: Build an expression that represents peel_iters for prologue and
4678 epilogue to be used in a run-time test. */
4680 bool prologue_need_br_taken_cost = false;
4681 bool prologue_need_br_not_taken_cost = false;
4683 /* Calculate peel_iters_prologue. */
4684 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4685 peel_iters_prologue = 0;
4686 else if (npeel < 0)
4688 peel_iters_prologue = assumed_vf / 2;
4689 if (dump_enabled_p ())
4690 dump_printf (MSG_NOTE, "cost model: "
4691 "prologue peel iters set to vf/2.\n");
4693 /* If peeled iterations are unknown, count a taken branch and a not taken
4694 branch per peeled loop. Even if scalar loop iterations are known,
4695 vector iterations are not known since peeled prologue iterations are
4696 not known. Hence guards remain the same. */
4697 prologue_need_br_taken_cost = true;
4698 prologue_need_br_not_taken_cost = true;
4700 else
4702 peel_iters_prologue = npeel;
4703 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4704 /* If peeled iterations are known but the number of scalar loop
4705 iterations is unknown, count a taken branch per peeled loop. */
4706 prologue_need_br_taken_cost = true;
4709 bool epilogue_need_br_taken_cost = false;
4710 bool epilogue_need_br_not_taken_cost = false;
4712 /* Calculate peel_iters_epilogue. */
4713 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4714 /* We need to peel exactly one iteration for gaps. */
4715 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4716 else if (npeel < 0)
4718 /* If peeling for alignment is unknown, the loop bound of the main
4719 loop becomes unknown. */
4720 peel_iters_epilogue = assumed_vf / 2;
4721 if (dump_enabled_p ())
4722 dump_printf (MSG_NOTE, "cost model: "
4723 "epilogue peel iters set to vf/2 because "
4724 "peeling for alignment is unknown.\n");
4726 /* See the same reason above in peel_iters_prologue calculation. */
4727 epilogue_need_br_taken_cost = true;
4728 epilogue_need_br_not_taken_cost = true;
4730 else
4732 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4733 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4734 /* If peeled iterations are known but the number of scalar loop
4735 iterations is unknown, count a taken branch per peeled loop. */
4736 epilogue_need_br_taken_cost = true;
4739 stmt_info_for_cost *si;
4740 int j;
4741 /* Add costs associated with peel_iters_prologue. */
4742 if (peel_iters_prologue)
4743 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4745 (void) add_stmt_cost (target_cost_data,
4746 si->count * peel_iters_prologue, si->kind,
4747 si->stmt_info, si->node, si->vectype,
4748 si->misalign, vect_prologue);
4751 /* Add costs associated with peel_iters_epilogue. */
4752 if (peel_iters_epilogue)
4753 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4755 (void) add_stmt_cost (target_cost_data,
4756 si->count * peel_iters_epilogue, si->kind,
4757 si->stmt_info, si->node, si->vectype,
4758 si->misalign, vect_epilogue);
4761 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4763 if (prologue_need_br_taken_cost)
4764 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4765 vect_prologue);
4767 if (prologue_need_br_not_taken_cost)
4768 (void) add_stmt_cost (target_cost_data, 1,
4769 cond_branch_not_taken, vect_prologue);
4771 if (epilogue_need_br_taken_cost)
4772 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4773 vect_epilogue);
4775 if (epilogue_need_br_not_taken_cost)
4776 (void) add_stmt_cost (target_cost_data, 1,
4777 cond_branch_not_taken, vect_epilogue);
4779 /* Take care of special costs for rgroup controls of partial vectors. */
4780 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4781 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4782 == vect_partial_vectors_avx512))
4784 /* Calculate how many masks we need to generate. */
4785 unsigned int num_masks = 0;
4786 bool need_saturation = false;
4787 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4788 if (rgm.type)
4790 unsigned nvectors = rgm.factor;
4791 num_masks += nvectors;
4792 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4793 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4794 need_saturation = true;
4797 /* ??? The target isn't able to identify the costs below as
4798 producing masks so it cannot penalize cases where we'd run
4799 out of mask registers for example. */
4801 /* ??? We are also failing to account for smaller vector masks
4802 we generate by splitting larger masks in vect_get_loop_mask. */
4804 /* In the worst case, we need to generate each mask in the prologue
4805 and in the loop body. We need one splat per group and one
4806 compare per mask.
4808 Sometimes the prologue mask will fold to a constant,
4809 so the actual prologue cost might be smaller. However, it's
4810 simpler and safer to use the worst-case cost; if this ends up
4811 being the tie-breaker between vectorizing or not, then it's
4812 probably better not to vectorize. */
4813 (void) add_stmt_cost (target_cost_data,
4814 num_masks
4815 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4816 vector_stmt, NULL, NULL, NULL_TREE, 0,
4817 vect_prologue);
4818 (void) add_stmt_cost (target_cost_data,
4819 num_masks
4820 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4821 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4823 /* When we need saturation we need it both in the prologue and
4824 the loop body. */
4825 if (need_saturation)
4827 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4828 NULL, NULL, NULL_TREE, 0, vect_prologue);
4829 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4830 NULL, NULL, NULL_TREE, 0, vect_body);
4833 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4834 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4835 == vect_partial_vectors_while_ult))
4837 /* Calculate how many masks we need to generate. */
4838 unsigned int num_masks = 0;
4839 rgroup_controls *rgm;
4840 unsigned int num_vectors_m1;
4841 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4842 num_vectors_m1, rgm)
4843 if (rgm->type)
4844 num_masks += num_vectors_m1 + 1;
4845 gcc_assert (num_masks > 0);
4847 /* In the worst case, we need to generate each mask in the prologue
4848 and in the loop body. One of the loop body mask instructions
4849 replaces the comparison in the scalar loop, and since we don't
4850 count the scalar comparison against the scalar body, we shouldn't
4851 count that vector instruction against the vector body either.
4853 Sometimes we can use unpacks instead of generating prologue
4854 masks and sometimes the prologue mask will fold to a constant,
4855 so the actual prologue cost might be smaller. However, it's
4856 simpler and safer to use the worst-case cost; if this ends up
4857 being the tie-breaker between vectorizing or not, then it's
4858 probably better not to vectorize. */
4859 (void) add_stmt_cost (target_cost_data, num_masks,
4860 vector_stmt, NULL, NULL, NULL_TREE, 0,
4861 vect_prologue);
4862 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4863 vector_stmt, NULL, NULL, NULL_TREE, 0,
4864 vect_body);
4866 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4868 /* Referring to the functions vect_set_loop_condition_partial_vectors
4869 and vect_set_loop_controls_directly, we need to generate each
4870 length in the prologue and in the loop body if required. Although
4871 there are some possible optimizations, we consider the worst case
4872 here. */
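/* As a made-up illustration of the counting below: a single length
   rgroup (num_vectors == 1) with nitems == 4, unknown niters and a zero
   partial load/store bias contributes at most 1 SHIFT + 2 (MAX/MINUS for
   wrap-around) + 1 MIN == 4 prologue statements, plus, when the loop has
   to iterate, one SELECT_VL/MIN per iteration in the body for a
   decrementing IV (or 3 statements for an incrementing IV).  */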
4874 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4875 signed char partial_load_store_bias
4876 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4877 bool need_iterate_p
4878 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4879 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4881 /* Calculate how many statements need to be added. */
4882 unsigned int prologue_stmts = 0;
4883 unsigned int body_stmts = 0;
4885 rgroup_controls *rgc;
4886 unsigned int num_vectors_m1;
4887 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4888 if (rgc->type)
4890 /* May need one SHIFT for nitems_total computation. */
4891 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4892 if (nitems != 1 && !niters_known_p)
4893 prologue_stmts += 1;
4895 /* May need one MAX and one MINUS for wrap around. */
4896 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4897 prologue_stmts += 2;
4899 /* Need one MAX and one MINUS for each batch limit except for
4900 the first one. */
4901 prologue_stmts += num_vectors_m1 * 2;
4903 unsigned int num_vectors = num_vectors_m1 + 1;
4905 /* Need to set up lengths in prologue, only one MIN required
4906 for each since start index is zero. */
4907 prologue_stmts += num_vectors;
4909 /* If we have a non-zero partial load bias, we need one PLUS
4910 to adjust the load length. */
4911 if (partial_load_store_bias != 0)
4912 body_stmts += 1;
4914 unsigned int length_update_cost = 0;
4915 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4916 /* For decrement IV style, each only needs a single SELECT_VL
4917 or MIN at the beginning to calculate the number of elements
4918 to be processed in the current iteration. */
4919 length_update_cost = 1;
4920 else
4921 /* For increment IV style, each may need two MINs and one MINUS to
4922 update the lengths in the body for the next iteration. */
4923 length_update_cost = 3;
4925 if (need_iterate_p)
4926 body_stmts += length_update_cost * num_vectors;
4929 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4930 scalar_stmt, vect_prologue);
4931 (void) add_stmt_cost (target_cost_data, body_stmts,
4932 scalar_stmt, vect_body);
4935 /* FORNOW: The scalar outside cost is incremented in one of the
4936 following ways:
4938 1. The vectorizer checks for alignment and aliasing and generates
4939 a condition that allows dynamic vectorization. A cost model
4940 check is ANDED with the versioning condition. Hence scalar code
4941 path now has the added cost of the versioning check.
4943 if (cost > th & versioning_check)
4944 jmp to vector code
4946 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4948 2. The vectorizer then checks if a prologue is required. If the
4949 cost model check was not done before during versioning, it has to
4950 be done before the prologue check.
4952 if (cost <= th)
4953 prologue = scalar_iters
4954 if (prologue == 0)
4955 jmp to vector code
4956 else
4957 execute prologue
4958 if (prologue == num_iters)
4959 go to exit
4961 Hence the run-time scalar cost is incremented by a taken branch,
4962 plus a not-taken branch, plus a taken branch cost.
4964 3. The vectorizer then checks if an epilogue is required. If the
4965 cost model check was not done before during prologue check, it
4966 has to be done with the epilogue check.
4968 if (prologue == 0)
4969 jmp to vector code
4970 else
4971 execute prologue
4972 if (prologue == num_iters)
4973 go to exit
4974 vector code:
4975 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4976 jmp to epilogue
4978 Hence the run-time scalar cost should be incremented by 2 taken
4979 branches.
4981 TODO: The back end may reorder the BBs differently and reverse
4982 conditions/branch directions. Change the estimates below to
4983 something more reasonable. */
4985 /* If the number of iterations is known and we do not do versioning, we can
4986 decide whether to vectorize at compile time. Hence the scalar version
4987 does not carry cost model guard costs. */
4988 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4989 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4991 /* Cost model check occurs at versioning. */
4992 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4993 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4994 else
4996 /* Cost model check occurs at prologue generation. */
4997 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4998 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4999 + vect_get_stmt_cost (cond_branch_not_taken);
5000 /* Cost model check occurs at epilogue generation. */
5001 else
5002 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
5006 /* Complete the target-specific cost calculations. */
5007 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5008 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5009 suggested_unroll_factor);
5011 if (suggested_unroll_factor && *suggested_unroll_factor > 1
5012 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5013 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5014 *suggested_unroll_factor,
5015 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5017 if (dump_enabled_p ())
5018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5019 "can't unroll as unrolled vectorization factor larger"
5020 " than maximum vectorization factor: "
5021 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5022 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5023 *suggested_unroll_factor = 1;
5026 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5028 if (dump_enabled_p ())
5030 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5031 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5032 vec_inside_cost);
5033 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5034 vec_prologue_cost);
5035 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5036 vec_epilogue_cost);
5037 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5038 scalar_single_iter_cost);
5039 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5040 scalar_outside_cost);
5041 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5042 vec_outside_cost);
5043 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5044 peel_iters_prologue);
5045 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5046 peel_iters_epilogue);
5049 /* Calculate number of iterations required to make the vector version
5050 profitable, relative to the loop bodies only. The following condition
5051 must hold true:
5052 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5053 where
5054 SIC = scalar iteration cost, VIC = vector iteration cost,
5055 VOC = vector outside cost, VF = vectorization factor,
5056 NPEEL = prologue iterations + epilogue iterations,
5057 SOC = scalar outside cost for run time cost model check. */
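/* As a worked example with made-up costs: SIC = 4, VIC = 10, VF = 4,
   VOC = 20, SOC = 6 and NPEEL = 0 give a per-vector-iteration saving of
   SIC * VF - VIC = 6, and the condition above, 4 * niters + 6 >
   10 * (niters / 4) + 20, first holds at niters >= 10; the code below
   derives the same kind of threshold, with additional rounding and
   partial-vector adjustments.  */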
5059 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5060 - vec_inside_cost);
5061 if (saving_per_viter <= 0)
5063 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5064 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5065 "vectorization did not happen for a simd loop");
5067 if (dump_enabled_p ())
5068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5069 "cost model: the vector iteration cost = %d "
5070 "divided by the scalar iteration cost = %d "
5071 "is greater or equal to the vectorization factor = %d"
5072 ".\n",
5073 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5074 *ret_min_profitable_niters = -1;
5075 *ret_min_profitable_estimate = -1;
5076 return;
5079 /* ??? The "if" arm is written to handle all cases; see below for what
5080 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5081 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5083 /* Rewriting the condition above in terms of the number of
5084 vector iterations (vniters) rather than the number of
5085 scalar iterations (niters) gives:
5087 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5089 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5091 For integer N, X and Y when X > 0:
5093 N * X > Y <==> N >= (Y /[floor] X) + 1. */
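  /* Continuing the hypothetical numbers above: outside_overhead
     = 10 - 0 - 0 - 0 = 10 and saving_per_viter = 10, so by the
     identity above min_vec_niters = 10 / 10 + 1 = 2, i.e.
     vniters * 10 > 10 needs at least two vector iterations.  */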
5094 int outside_overhead = (vec_outside_cost
5095 - scalar_single_iter_cost * peel_iters_prologue
5096 - scalar_single_iter_cost * peel_iters_epilogue
5097 - scalar_outside_cost);
5098 /* We're only interested in cases that require at least one
5099 vector iteration. */
5100 int min_vec_niters = 1;
5101 if (outside_overhead > 0)
5102 min_vec_niters = outside_overhead / saving_per_viter + 1;
5104 if (dump_enabled_p ())
5105 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5106 min_vec_niters);
5108 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5110 /* Now that we know the minimum number of vector iterations,
5111 find the minimum niters for which the scalar cost is larger:
5113 SIC * niters > VIC * vniters + VOC - SOC
5115 We know that the minimum niters is no more than
5116 vniters * VF + NPEEL, but it might be (and often is) less
5117 than that if a partial vector iteration is cheaper than the
5118 equivalent scalar code. */
5119 int threshold = (vec_inside_cost * min_vec_niters
5120 + vec_outside_cost
5121 - scalar_outside_cost);
5122 if (threshold <= 0)
5123 min_profitable_iters = 1;
5124 else
5125 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5127 else
5128 /* Convert the number of vector iterations into a number of
5129 scalar iterations. */
5130 min_profitable_iters = (min_vec_niters * assumed_vf
5131 + peel_iters_prologue
5132 + peel_iters_epilogue);
5134 else
5136 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5137 * assumed_vf
5138 - vec_inside_cost * peel_iters_prologue
5139 - vec_inside_cost * peel_iters_epilogue);
5140 if (min_profitable_iters <= 0)
5141 min_profitable_iters = 0;
5142 else
5144 min_profitable_iters /= saving_per_viter;
5146 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5147 <= (((int) vec_inside_cost * min_profitable_iters)
5148 + (((int) vec_outside_cost - scalar_outside_cost)
5149 * assumed_vf)))
5150 min_profitable_iters++;
5154 if (dump_enabled_p ())
5155 dump_printf (MSG_NOTE,
5156 " Calculated minimum iters for profitability: %d\n",
5157 min_profitable_iters);
5159 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5160 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5161 /* We want the vectorized loop to execute at least once. */
5162 min_profitable_iters = assumed_vf + peel_iters_prologue;
5163 else if (min_profitable_iters < peel_iters_prologue)
5164 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5165 vectorized loop executes at least once. */
5166 min_profitable_iters = peel_iters_prologue;
5168 if (dump_enabled_p ())
5169 dump_printf_loc (MSG_NOTE, vect_location,
5170 " Runtime profitability threshold = %d\n",
5171 min_profitable_iters);
5173 *ret_min_profitable_niters = min_profitable_iters;
5175 /* Calculate number of iterations required to make the vector version
5176 profitable, relative to the loop bodies only.
5178 The non-vectorized variant costs SIC * niters and must win over the
5179 vector variant at the expected loop trip count. The following condition must hold true:
5180 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
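  /* Note the sign of SOC relative to the runtime condition above: there
     the scalar path has already paid for the runtime check, whereas here
     the vector variant must also beat a scalar loop that performs no
     check.  With the hypothetical numbers used earlier but SOC = 20, the
     overhead to recover in the code below becomes 10 - 0 - 0 + 20 = 30,
     giving a minimum of 30 / 10 + 1 = 4 vector iterations, so the static
     estimate is at least as large as the runtime threshold.  */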
5182 if (vec_outside_cost <= 0)
5183 min_profitable_estimate = 0;
5184 /* ??? This "else if" arm is written to handle all cases; see below for
5185 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5186 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5188 /* This is a repeat of the code above, but with + SOC rather
5189 than - SOC. */
5190 int outside_overhead = (vec_outside_cost
5191 - scalar_single_iter_cost * peel_iters_prologue
5192 - scalar_single_iter_cost * peel_iters_epilogue
5193 + scalar_outside_cost);
5194 int min_vec_niters = 1;
5195 if (outside_overhead > 0)
5196 min_vec_niters = outside_overhead / saving_per_viter + 1;
5198 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5200 int threshold = (vec_inside_cost * min_vec_niters
5201 + vec_outside_cost
5202 + scalar_outside_cost);
5203 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5205 else
5206 min_profitable_estimate = (min_vec_niters * assumed_vf
5207 + peel_iters_prologue
5208 + peel_iters_epilogue);
5210 else
5212 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5213 * assumed_vf
5214 - vec_inside_cost * peel_iters_prologue
5215 - vec_inside_cost * peel_iters_epilogue)
5216 / ((scalar_single_iter_cost * assumed_vf)
5217 - vec_inside_cost);
5219 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5220 if (dump_enabled_p ())
5221 dump_printf_loc (MSG_NOTE, vect_location,
5222 " Static estimate profitability threshold = %d\n",
5223 min_profitable_estimate);
5225 *ret_min_profitable_estimate = min_profitable_estimate;
5228 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5229 vector elements (not bits) for a vector with NELT elements. */
5230 static void
5231 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5232 vec_perm_builder *sel)
5234 /* The encoding is a single stepped pattern. Any wrap-around is handled
5235 by vec_perm_indices. */
5236 sel->new_vector (nelt, 1, 3);
5237 for (unsigned int i = 0; i < 3; i++)
5238 sel->quick_push (i + offset);
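  /* For example, OFFSET = 2 and NELT = 8 push the selector {2, 3, 4};
     vec_perm_indices extends the single stepped pattern to
     {2, 3, 4, 5, 6, 7, 8, 9}, i.e. a shift down by two elements, with
     the wrapped indices 8 and 9 taken from the second permute input.  */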
5241 /* Checks whether the target supports whole-vector shifts for vectors of mode
5242 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5243 it supports vec_perm_const with masks for all necessary shift amounts. */
5244 static bool
5245 have_whole_vector_shift (machine_mode mode)
5247 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5248 return true;
5250 /* Variable-length vectors should be handled via the optab. */
5251 unsigned int nelt;
5252 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5253 return false;
5255 vec_perm_builder sel;
5256 vec_perm_indices indices;
5257 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5259 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5260 indices.new_vector (sel, 2, nelt);
5261 if (!can_vec_perm_const_p (mode, mode, indices, false))
5262 return false;
5264 return true;
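  /* E.g. for an 8-element vector the loop above checks shifts by 4, 2
     and 1 elements, which are exactly the halving steps the shift-based
     reduction epilogue below relies on.  */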
5267 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5268 multiplication operands have differing signs and (b) we intend
5269 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5270 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5272 static bool
5273 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5274 stmt_vec_info stmt_info)
5276 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5277 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5278 return false;
5280 tree rhs1 = gimple_assign_rhs1 (assign);
5281 tree rhs2 = gimple_assign_rhs2 (assign);
5282 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5283 return false;
5285 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5286 gcc_assert (reduc_info->is_reduc_info);
5287 return !directly_supported_p (DOT_PROD_EXPR,
5288 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5289 optab_vector_mixed_sign);
5292 /* TODO: There is a close dependency between the vect_model_*_cost and
5293 vectorizable_* functions. Rework the design to avoid maintenance issues. */
5295 /* Function vect_model_reduction_cost.
5297 Models cost for a reduction operation, including the vector ops
5298 generated within the strip-mine loop in some cases, the initial
5299 definition before the loop, and the epilogue code that must be generated. */
5301 static void
5302 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5303 stmt_vec_info stmt_info, internal_fn reduc_fn,
5304 vect_reduction_type reduction_type,
5305 int ncopies, stmt_vector_for_cost *cost_vec)
5307 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5308 tree vectype;
5309 machine_mode mode;
5310 class loop *loop = NULL;
5312 if (loop_vinfo)
5313 loop = LOOP_VINFO_LOOP (loop_vinfo);
5315 /* Condition reductions generate two reductions in the loop. */
5316 if (reduction_type == COND_REDUCTION)
5317 ncopies *= 2;
5319 vectype = STMT_VINFO_VECTYPE (stmt_info);
5320 mode = TYPE_MODE (vectype);
5321 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5323 gimple_match_op op;
5324 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5325 gcc_unreachable ();
5327 bool emulated_mixed_dot_prod
5328 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5329 if (reduction_type == EXTRACT_LAST_REDUCTION)
5330 /* No extra instructions are needed in the prologue. The loop body
5331 operations are costed in vectorizable_condition. */
5332 inside_cost = 0;
5333 else if (reduction_type == FOLD_LEFT_REDUCTION)
5335 /* No extra instructions needed in the prologue. */
5336 prologue_cost = 0;
5338 if (reduc_fn != IFN_LAST)
5339 /* Count one reduction-like operation per vector. */
5340 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5341 stmt_info, 0, vect_body);
5342 else
5344 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5345 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5346 inside_cost = record_stmt_cost (cost_vec, nelements,
5347 vec_to_scalar, stmt_info, 0,
5348 vect_body);
5349 inside_cost += record_stmt_cost (cost_vec, nelements,
5350 scalar_stmt, stmt_info, 0,
5351 vect_body);
5354 else
5356 /* Add in the cost of the initial definitions. */
5357 int prologue_stmts;
5358 if (reduction_type == COND_REDUCTION)
5359 /* For cond reductions we have four vectors: initial index, step,
5360 initial result of the data reduction, initial value of the index
5361 reduction. */
5362 prologue_stmts = 4;
5363 else if (emulated_mixed_dot_prod)
5364 /* We need the initial reduction value and two invariants:
5365 one that contains the minimum signed value and one that
5366 contains half of its negative. */
5367 prologue_stmts = 3;
5368 else
5369 prologue_stmts = 1;
5370 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5371 scalar_to_vec, stmt_info, 0,
5372 vect_prologue);
5375 /* Determine cost of epilogue code.
5377 We have a reduction operator that will reduce the vector in one statement.
5378 Also requires scalar extract. */
5380 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5382 if (reduc_fn != IFN_LAST)
5384 if (reduction_type == COND_REDUCTION)
5386 /* An EQ stmt and a COND_EXPR stmt. */
5387 epilogue_cost += record_stmt_cost (cost_vec, 2,
5388 vector_stmt, stmt_info, 0,
5389 vect_epilogue);
5390 /* Reduction of the max index and a reduction of the found
5391 values. */
5392 epilogue_cost += record_stmt_cost (cost_vec, 2,
5393 vec_to_scalar, stmt_info, 0,
5394 vect_epilogue);
5395 /* A broadcast of the max value. */
5396 epilogue_cost += record_stmt_cost (cost_vec, 1,
5397 scalar_to_vec, stmt_info, 0,
5398 vect_epilogue);
5400 else
5402 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5403 stmt_info, 0, vect_epilogue);
5404 epilogue_cost += record_stmt_cost (cost_vec, 1,
5405 vec_to_scalar, stmt_info, 0,
5406 vect_epilogue);
5409 else if (reduction_type == COND_REDUCTION)
5411 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5412 /* Extraction of scalar elements. */
5413 epilogue_cost += record_stmt_cost (cost_vec,
5414 2 * estimated_nunits,
5415 vec_to_scalar, stmt_info, 0,
5416 vect_epilogue);
5417 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5418 epilogue_cost += record_stmt_cost (cost_vec,
5419 2 * estimated_nunits - 3,
5420 scalar_stmt, stmt_info, 0,
5421 vect_epilogue);
5423 else if (reduction_type == EXTRACT_LAST_REDUCTION
5424 || reduction_type == FOLD_LEFT_REDUCTION)
5425 /* No extra instructions are needed in the epilogue. */
5427 else
5429 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5430 tree bitsize = TYPE_SIZE (op.type);
5431 int element_bitsize = tree_to_uhwi (bitsize);
5432 int nelements = vec_size_in_bits / element_bitsize;
5434 if (op.code == COND_EXPR)
5435 op.code = MAX_EXPR;
5437 /* We have a whole vector shift available. */
5438 if (VECTOR_MODE_P (mode)
5439 && directly_supported_p (op.code, vectype)
5440 && have_whole_vector_shift (mode))
5442 /* Final reduction via vector shifts and the reduction operator.
5443 Also requires scalar extract. */
5444 epilogue_cost += record_stmt_cost (cost_vec,
5445 exact_log2 (nelements) * 2,
5446 vector_stmt, stmt_info, 0,
5447 vect_epilogue);
5448 epilogue_cost += record_stmt_cost (cost_vec, 1,
5449 vec_to_scalar, stmt_info, 0,
5450 vect_epilogue);
5452 else
5453 /* Use extracts and reduction op for final reduction. For N
5454 elements, we have N extracts and N-1 reduction ops. */
5455 epilogue_cost += record_stmt_cost (cost_vec,
5456 nelements + nelements - 1,
5457 vector_stmt, stmt_info, 0,
5458 vect_epilogue);
5462 if (dump_enabled_p ())
5463 dump_printf (MSG_NOTE,
5464 "vect_model_reduction_cost: inside_cost = %d, "
5465 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5466 prologue_cost, epilogue_cost);
5469 /* SEQ is a sequence of instructions that initialize the reduction
5470 described by REDUC_INFO. Emit them in the appropriate place. */
5472 static void
5473 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5474 stmt_vec_info reduc_info, gimple *seq)
5476 if (reduc_info->reused_accumulator)
5478 /* When reusing an accumulator from the main loop, we only need
5479 initialization instructions if the main loop can be skipped.
5480 In that case, emit the initialization instructions at the end
5481 of the guard block that does the skip. */
5482 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5483 gcc_assert (skip_edge);
5484 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5485 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5487 else
5489 /* The normal case: emit the initialization instructions on the
5490 preheader edge. */
5491 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5492 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5496 /* Function get_initial_def_for_reduction
5498 Input:
5499 REDUC_INFO - the info_for_reduction
5500 INIT_VAL - the initial value of the reduction variable
5501 NEUTRAL_OP - a value that has no effect on the reduction, as per
5502 neutral_op_for_reduction
5504 Output:
5505 Return a vector variable, initialized according to the operation that
5506 STMT_VINFO performs. This vector will be used as the initial value
5507 of the vector of partial results.
5509 The value we need is a vector in which element 0 has value INIT_VAL
5510 and every other element has value NEUTRAL_OP. */
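  /* E.g. for a sum reduction over V4SI with INIT_VAL s and NEUTRAL_OP 0
     this is {s, 0, 0, 0}; for a MIN or MAX reduction NEUTRAL_OP is
     INIT_VAL itself, so the vector degenerates to a splat of s.  */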
5512 static tree
5513 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5514 stmt_vec_info reduc_info,
5515 tree init_val, tree neutral_op)
5517 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5518 tree scalar_type = TREE_TYPE (init_val);
5519 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5520 tree init_def;
5521 gimple_seq stmts = NULL;
5523 gcc_assert (vectype);
5525 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5526 || SCALAR_FLOAT_TYPE_P (scalar_type));
5528 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5529 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5531 if (operand_equal_p (init_val, neutral_op))
5533 /* If both elements are equal then the vector described above is
5534 just a splat. */
5535 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5536 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5538 else
5540 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5541 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5542 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5544 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5545 element 0. */
5546 init_def = gimple_build_vector_from_val (&stmts, vectype,
5547 neutral_op);
5548 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5549 vectype, init_def, init_val);
5551 else
5553 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5554 tree_vector_builder elts (vectype, 1, 2);
5555 elts.quick_push (init_val);
5556 elts.quick_push (neutral_op);
5557 init_def = gimple_build_vector (&stmts, &elts);
5561 if (stmts)
5562 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5563 return init_def;
5566 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5567 which performs a reduction involving GROUP_SIZE scalar statements.
5568 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5569 is nonnull, introducing extra elements of that value will not change the
5570 result. */
5572 static void
5573 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5574 stmt_vec_info reduc_info,
5575 vec<tree> *vec_oprnds,
5576 unsigned int number_of_vectors,
5577 unsigned int group_size, tree neutral_op)
5579 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5580 unsigned HOST_WIDE_INT nunits;
5581 unsigned j, number_of_places_left_in_vector;
5582 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5583 unsigned int i;
5585 gcc_assert (group_size == initial_values.length () || neutral_op);
5587 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5588 created vectors. It is greater than 1 if unrolling is performed.
5590 For example, we have two scalar operands, s1 and s2 (e.g., group of
5591 strided accesses of size two), while NUNITS is four (i.e., four scalars
5592 of this type can be packed in a vector). The output vector will contain
5593 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5594 will be 2).
5596 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5597 vectors containing the operands.
5599 For example, NUNITS is four as before, and the group size is 8
5600 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5601 {s5, s6, s7, s8}. */
5603 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5604 nunits = group_size;
5606 number_of_places_left_in_vector = nunits;
5607 bool constant_p = true;
5608 tree_vector_builder elts (vector_type, nunits, 1);
5609 elts.quick_grow (nunits);
5610 gimple_seq ctor_seq = NULL;
5611 for (j = 0; j < nunits * number_of_vectors; ++j)
5613 tree op;
5614 i = j % group_size;
5616 /* Get the def before the loop. In reduction chain we have only
5617 one initial value. Else we have as many as PHIs in the group. */
5618 if (i >= initial_values.length () || (j > i && neutral_op))
5619 op = neutral_op;
5620 else
5621 op = initial_values[i];
5623 /* Create 'vect_ = {op0,op1,...,opn}'. */
5624 number_of_places_left_in_vector--;
5625 elts[nunits - number_of_places_left_in_vector - 1] = op;
5626 if (!CONSTANT_CLASS_P (op))
5627 constant_p = false;
5629 if (number_of_places_left_in_vector == 0)
5631 tree init;
5632 if (constant_p && !neutral_op
5633 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5634 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5635 /* Build the vector directly from ELTS. */
5636 init = gimple_build_vector (&ctor_seq, &elts);
5637 else if (neutral_op)
5639 /* Build a vector of the neutral value and shift the
5640 other elements into place. */
5641 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5642 neutral_op);
5643 int k = nunits;
5644 while (k > 0 && elts[k - 1] == neutral_op)
5645 k -= 1;
5646 while (k > 0)
5648 k -= 1;
5649 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5650 vector_type, init, elts[k]);
5653 else
5655 /* First time round, duplicate ELTS to fill the
5656 required number of vectors. */
5657 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5658 elts, number_of_vectors, *vec_oprnds);
5659 break;
5661 vec_oprnds->quick_push (init);
5663 number_of_places_left_in_vector = nunits;
5664 elts.new_vector (vector_type, nunits, 1);
5665 elts.quick_grow (nunits);
5666 constant_p = true;
5669 if (ctor_seq != NULL)
5670 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5673 /* For a statement STMT_INFO taking part in a reduction operation return
5674 the stmt_vec_info the meta information is stored on. */
5676 stmt_vec_info
5677 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5679 stmt_info = vect_orig_stmt (stmt_info);
5680 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5681 if (!is_a <gphi *> (stmt_info->stmt)
5682 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5683 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5684 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5685 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5687 if (gimple_phi_num_args (phi) == 1)
5688 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5690 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5692 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5693 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5694 stmt_info = info;
5696 return stmt_info;
5699 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5700 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5701 return false. */
5703 static bool
5704 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5705 stmt_vec_info reduc_info)
5707 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5708 if (!main_loop_vinfo)
5709 return false;
5711 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5712 return false;
5714 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5715 auto_vec<tree, 16> main_loop_results (num_phis);
5716 auto_vec<tree, 16> initial_values (num_phis);
5717 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5719 /* The epilogue loop can be entered either from the main loop or
5720 from an earlier guard block. */
5721 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5722 for (tree incoming_value : reduc_info->reduc_initial_values)
5724 /* Look for:
5726 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5727 INITIAL_VALUE(guard block)>. */
5728 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5730 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5731 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5733 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5734 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5736 main_loop_results.quick_push (from_main_loop);
5737 initial_values.quick_push (from_skip);
5740 else
5741 /* The main loop dominates the epilogue loop. */
5742 main_loop_results.splice (reduc_info->reduc_initial_values);
5744 /* See if the main loop has the kind of accumulator we need. */
5745 vect_reusable_accumulator *accumulator
5746 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5747 if (!accumulator
5748 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5749 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5750 accumulator->reduc_info->reduc_scalar_results.begin ()))
5751 return false;
5753 /* Handle the case where we can reduce wider vectors to narrower ones. */
5754 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5755 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5756 unsigned HOST_WIDE_INT m;
5757 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5758 TYPE_VECTOR_SUBPARTS (vectype), &m))
5759 return false;
5760 /* Check the intermediate vector types and operations are available. */
5761 tree prev_vectype = old_vectype;
5762 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5763 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5765 intermediate_nunits = exact_div (intermediate_nunits, 2);
5766 tree intermediate_vectype = get_related_vectype_for_scalar_type
5767 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5768 if (!intermediate_vectype
5769 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5770 intermediate_vectype)
5771 || !can_vec_extract (TYPE_MODE (prev_vectype),
5772 TYPE_MODE (intermediate_vectype)))
5773 return false;
5774 prev_vectype = intermediate_vectype;
5777 /* Non-SLP reductions might apply an adjustment after the reduction
5778 operation, in order to simplify the initialization of the accumulator.
5779 If the epilogue loop carries on from where the main loop left off,
5780 it should apply the same adjustment to the final reduction result.
5782 If the epilogue loop can also be entered directly (rather than via
5783 the main loop), we need to be able to handle that case in the same way,
5784 with the same adjustment. (In principle we could add a PHI node
5785 to select the correct adjustment, but in practice that shouldn't be
5786 necessary.) */
5787 tree main_adjustment
5788 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5789 if (loop_vinfo->main_loop_edge && main_adjustment)
5791 gcc_assert (num_phis == 1);
5792 tree initial_value = initial_values[0];
5793 /* Check that we can use INITIAL_VALUE as the adjustment and
5794 initialize the accumulator with a neutral value instead. */
5795 if (!operand_equal_p (initial_value, main_adjustment))
5796 return false;
5797 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5798 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5799 code, initial_value);
5801 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5802 reduc_info->reduc_initial_values.truncate (0);
5803 reduc_info->reduc_initial_values.splice (initial_values);
5804 reduc_info->reused_accumulator = accumulator;
5805 return true;
5808 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5809 CODE, appending the generated stmts to SEQ. Returns a vector def of VECTYPE. */
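  /* E.g. reducing a 16-element vector down to a 4-element VECTYPE takes
     two halving steps, 16 -> 8 -> 4, each extracting the low and high
     halves of the previous vector and combining them with CODE.  */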
5811 static tree
5812 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5813 gimple_seq *seq)
5815 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5816 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5817 tree stype = TREE_TYPE (vectype);
5818 tree new_temp = vec_def;
5819 while (nunits > nunits1)
5821 nunits /= 2;
5822 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5823 stype, nunits);
5824 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5826 /* The target has to make sure we support lowpart/highpart
5827 extraction, either via direct vector extract or through
5828 integer mode punning. */
5829 tree dst1, dst2;
5830 gimple *epilog_stmt;
5831 if (convert_optab_handler (vec_extract_optab,
5832 TYPE_MODE (TREE_TYPE (new_temp)),
5833 TYPE_MODE (vectype1))
5834 != CODE_FOR_nothing)
5836 /* Extract sub-vectors directly once vec_extract becomes
5837 a conversion optab. */
5838 dst1 = make_ssa_name (vectype1);
5839 epilog_stmt
5840 = gimple_build_assign (dst1, BIT_FIELD_REF,
5841 build3 (BIT_FIELD_REF, vectype1,
5842 new_temp, TYPE_SIZE (vectype1),
5843 bitsize_int (0)));
5844 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5845 dst2 = make_ssa_name (vectype1);
5846 epilog_stmt
5847 = gimple_build_assign (dst2, BIT_FIELD_REF,
5848 build3 (BIT_FIELD_REF, vectype1,
5849 new_temp, TYPE_SIZE (vectype1),
5850 bitsize_int (bitsize)));
5851 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5853 else
5855 /* Extract via punning to appropriately sized integer mode
5856 vector. */
5857 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5858 tree etype = build_vector_type (eltype, 2);
5859 gcc_assert (convert_optab_handler (vec_extract_optab,
5860 TYPE_MODE (etype),
5861 TYPE_MODE (eltype))
5862 != CODE_FOR_nothing);
5863 tree tem = make_ssa_name (etype);
5864 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5865 build1 (VIEW_CONVERT_EXPR,
5866 etype, new_temp));
5867 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5868 new_temp = tem;
5869 tem = make_ssa_name (eltype);
5870 epilog_stmt
5871 = gimple_build_assign (tem, BIT_FIELD_REF,
5872 build3 (BIT_FIELD_REF, eltype,
5873 new_temp, TYPE_SIZE (eltype),
5874 bitsize_int (0)));
5875 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5876 dst1 = make_ssa_name (vectype1);
5877 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5878 build1 (VIEW_CONVERT_EXPR,
5879 vectype1, tem));
5880 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5881 tem = make_ssa_name (eltype);
5882 epilog_stmt
5883 = gimple_build_assign (tem, BIT_FIELD_REF,
5884 build3 (BIT_FIELD_REF, eltype,
5885 new_temp, TYPE_SIZE (eltype),
5886 bitsize_int (bitsize)));
5887 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5888 dst2 = make_ssa_name (vectype1);
5889 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5890 build1 (VIEW_CONVERT_EXPR,
5891 vectype1, tem));
5892 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5895 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5898 return new_temp;
5901 /* Retrieves the defining statement to be used for a reduction.
5902 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5903 the reduction definitions. */
5905 tree
5906 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5907 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5908 vec <gimple *> &vec_stmts)
5910 tree def;
5912 if (slp_node)
5914 if (!main_exit_p)
5915 slp_node = slp_node_instance->reduc_phis;
5916 def = vect_get_slp_vect_def (slp_node, i);
5918 else
5920 if (!main_exit_p)
5921 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5922 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5923 def = gimple_get_lhs (vec_stmts[0]);
5926 return def;
5929 /* Function vect_create_epilog_for_reduction
5931 Create code at the loop-epilog to finalize the result of a reduction
5932 computation.
5934 STMT_INFO is the scalar reduction stmt that is being vectorized.
5935 SLP_NODE is an SLP node containing a group of reduction statements. The
5936 first one in this group is STMT_INFO.
5937 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5938 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5939 (counting from 0)
5940 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5941 exit this edge is always the main loop exit.
5943 This function:
5944 1. Completes the reduction def-use cycles.
5945 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5946 by calling the function specified by REDUC_FN if available, or by
5947 other means (whole-vector shifts or a scalar loop).
5948 The function also creates a new phi node at the loop exit to preserve
5949 loop-closed form, as illustrated below.
5951 The flow at the entry to this function:
5953 loop:
5954 vec_def = phi <vec_init, null> # REDUCTION_PHI
5955 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5956 s_loop = scalar_stmt # (scalar) STMT_INFO
5957 loop_exit:
5958 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5959 use <s_out0>
5960 use <s_out0>
5962 The above is transformed by this function into:
5964 loop:
5965 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5966 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5967 s_loop = scalar_stmt # (scalar) STMT_INFO
5968 loop_exit:
5969 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5970 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5971 v_out2 = reduce <v_out1>
5972 s_out3 = extract_field <v_out2, 0>
5973 s_out4 = adjust_result <s_out3>
5974 use <s_out4>
5975 use <s_out4>
5978 static void
5979 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5980 stmt_vec_info stmt_info,
5981 slp_tree slp_node,
5982 slp_instance slp_node_instance,
5983 edge loop_exit)
5985 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5986 gcc_assert (reduc_info->is_reduc_info);
5987 /* For double reductions we need to get at the inner loop reduction
5988 stmt which has the meta info attached. Our stmt_info is that of the
5989 loop-closed PHI of the inner loop which we remember as
5990 def for the reduction PHI generation. */
5991 bool double_reduc = false;
5992 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5993 stmt_vec_info rdef_info = stmt_info;
5994 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5996 gcc_assert (!slp_node);
5997 double_reduc = true;
5998 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5999 (stmt_info->stmt, 0));
6000 stmt_info = vect_stmt_to_vectorize (stmt_info);
6002 gphi *reduc_def_stmt
6003 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
6004 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
6005 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6006 tree vectype;
6007 machine_mode mode;
6008 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6009 basic_block exit_bb;
6010 tree scalar_dest;
6011 tree scalar_type;
6012 gimple *new_phi = NULL, *phi = NULL;
6013 gimple_stmt_iterator exit_gsi;
6014 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6015 gimple *epilog_stmt = NULL;
6016 gimple *exit_phi;
6017 tree bitsize;
6018 tree def;
6019 tree orig_name, scalar_result;
6020 imm_use_iterator imm_iter, phi_imm_iter;
6021 use_operand_p use_p, phi_use_p;
6022 gimple *use_stmt;
6023 auto_vec<tree> reduc_inputs;
6024 int j, i;
6025 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6026 unsigned int group_size = 1, k;
6027 auto_vec<gimple *> phis;
6028 /* SLP reduction without reduction chain, e.g.,
6029 # a1 = phi <a2, a0>
6030 # b1 = phi <b2, b0>
6031 a2 = operation (a1)
6032 b2 = operation (b1) */
6033 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6034 bool direct_slp_reduc;
6035 tree induction_index = NULL_TREE;
6037 if (slp_node)
6038 group_size = SLP_TREE_LANES (slp_node);
6040 if (nested_in_vect_loop_p (loop, stmt_info))
6042 outer_loop = loop;
6043 loop = loop->inner;
6044 gcc_assert (!slp_node && double_reduc);
6047 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6048 gcc_assert (vectype);
6049 mode = TYPE_MODE (vectype);
6051 tree induc_val = NULL_TREE;
6052 tree adjustment_def = NULL;
6053 if (slp_node)
6055 else
6057 /* Optimize: for induction condition reduction, if we can't use zero
6058 for induc_val, use initial_def. */
6059 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6060 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6061 else if (double_reduc)
6063 else
6064 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6067 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6068 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6069 if (slp_reduc)
6070 /* All statements produce live-out values. */
6071 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6072 else if (slp_node)
6074 /* The last statement in the reduction chain produces the live-out
6075 value. Note SLP optimization can shuffle scalar stmts to
6076 optimize permutations so we have to search for the last stmt. */
6077 for (k = 0; k < group_size; ++k)
6078 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6080 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6081 break;
6085 unsigned vec_num;
6086 int ncopies;
6087 if (slp_node)
6089 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6090 ncopies = 1;
6092 else
6094 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6095 vec_num = 1;
6096 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6099 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6100 which is updated with the current index of the loop for every match of
6101 the original loop's cond_expr (VEC_STMT). This results in a vector
6102 containing the last time the condition passed for that vector lane.
6103 The first match will be a 1 to allow 0 to be used for non-matching
6104 indexes. If there are no matches at all then the vector will be all
6105 zeroes.
6107 PR92772: This algorithm is broken for architectures that support
6108 masked vectors, but do not provide fold_extract_last. */
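  /* Illustration with VF = 4: the index IV takes the values
     {1, 2, 3, 4}, {5, 6, 7, 8}, ... per vector iteration.  If the
     condition only holds for the scalar iterations with indexes 3 and 6,
     the phi vector is {0, 0, 3, 0} after the first iteration and
     {0, 6, 3, 0} after the second; the largest element, 6, then
     identifies the last match in the epilogue below.  */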
6109 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6111 auto_vec<std::pair<tree, bool>, 2> ccompares;
6112 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6113 cond_info = vect_stmt_to_vectorize (cond_info);
6114 while (cond_info != reduc_info)
6116 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6118 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6119 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6120 ccompares.safe_push
6121 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6122 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6124 cond_info
6125 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6126 1 + STMT_VINFO_REDUC_IDX
6127 (cond_info)));
6128 cond_info = vect_stmt_to_vectorize (cond_info);
6130 gcc_assert (ccompares.length () != 0);
6132 tree indx_before_incr, indx_after_incr;
6133 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6134 int scalar_precision
6135 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6136 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6137 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6138 (TYPE_MODE (vectype), cr_index_scalar_type,
6139 TYPE_VECTOR_SUBPARTS (vectype));
6141 /* First we create a simple vector induction variable which starts
6142 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6143 vector size (STEP). */
6145 /* Create a {1,2,3,...} vector. */
6146 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6148 /* Create a vector of the step value. */
6149 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6150 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6152 /* Create an induction variable. */
6153 gimple_stmt_iterator incr_gsi;
6154 bool insert_after;
6155 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6156 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6157 insert_after, &indx_before_incr, &indx_after_incr);
6159 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6160 filled with zeros (VEC_ZERO). */
6162 /* Create a vector of 0s. */
6163 tree zero = build_zero_cst (cr_index_scalar_type);
6164 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6166 /* Create a vector phi node. */
6167 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6168 new_phi = create_phi_node (new_phi_tree, loop->header);
6169 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6170 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6172 /* Now take the condition from the loops original cond_exprs
6173 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6174 every match uses values from the induction variable
6175 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6176 (NEW_PHI_TREE).
6177 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6178 the new cond_expr (INDEX_COND_EXPR). */
6179 gimple_seq stmts = NULL;
6180 for (int i = ccompares.length () - 1; i != -1; --i)
6182 tree ccompare = ccompares[i].first;
6183 if (ccompares[i].second)
6184 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6185 cr_index_vector_type,
6186 ccompare,
6187 indx_before_incr, new_phi_tree);
6188 else
6189 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6190 cr_index_vector_type,
6191 ccompare,
6192 new_phi_tree, indx_before_incr);
6194 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6196 /* Update the phi with the vec cond. */
6197 induction_index = new_phi_tree;
6198 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6199 loop_latch_edge (loop), UNKNOWN_LOCATION);
6202 /* 2. Create epilog code.
6203 The reduction epilog code operates across the elements of the vector
6204 of partial results computed by the vectorized loop.
6205 The reduction epilog code consists of:
6207 step 1: compute the scalar result in a vector (v_out2)
6208 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6209 step 3: adjust the scalar result (s_out3) if needed.
6211 Step 1 can be accomplished using one of the following three schemes:
6212 (scheme 1) using reduc_fn, if available.
6213 (scheme 2) using whole-vector shifts, if available.
6214 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6215 combined.
6217 The overall epilog code looks like this:
6219 s_out0 = phi <s_loop> # original EXIT_PHI
6220 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6221 v_out2 = reduce <v_out1> # step 1
6222 s_out3 = extract_field <v_out2, 0> # step 2
6223 s_out4 = adjust_result <s_out3> # step 3
6225 (step 3 is optional, and steps 1 and 2 may be combined).
6226 Lastly, the uses of s_out0 are replaced by s_out4. */
6229 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6230 v_out1 = phi <VECT_DEF>
6231 Store them in NEW_PHIS. */
6232 if (double_reduc)
6233 loop = outer_loop;
6234 /* We need to reduce values in all exits. */
6235 exit_bb = loop_exit->dest;
6236 exit_gsi = gsi_after_labels (exit_bb);
6237 reduc_inputs.create (slp_node ? vec_num : ncopies);
6238 vec <gimple *> vec_stmts = vNULL;
6239 for (unsigned i = 0; i < vec_num; i++)
6241 gimple_seq stmts = NULL;
6242 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6243 main_exit_p, i, vec_stmts);
6244 for (j = 0; j < ncopies; j++)
6246 tree new_def = copy_ssa_name (def);
6247 phi = create_phi_node (new_def, exit_bb);
6248 if (j)
6249 def = gimple_get_lhs (vec_stmts[j]);
6250 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6251 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6252 else
6254 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6255 SET_PHI_ARG_DEF (phi, k, def);
6257 new_def = gimple_convert (&stmts, vectype, new_def);
6258 reduc_inputs.quick_push (new_def);
6260 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6263 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6264 (i.e. when reduc_fn is not available) and in the final adjustment
6265 code (if needed). Also get the original scalar reduction variable as
6266 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6267 represents a reduction pattern), the tree-code and scalar-def are
6268 taken from the original stmt that the pattern-stmt (STMT) replaces.
6269 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6270 are taken from STMT. */
6272 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6273 if (orig_stmt_info != stmt_info)
6275 /* Reduction pattern */
6276 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6277 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6280 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6281 scalar_type = TREE_TYPE (scalar_dest);
6282 scalar_results.truncate (0);
6283 scalar_results.reserve_exact (group_size);
6284 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6285 bitsize = TYPE_SIZE (scalar_type);
6287 /* True if we should implement SLP_REDUC using native reduction operations
6288 instead of scalar operations. */
6289 direct_slp_reduc = (reduc_fn != IFN_LAST
6290 && slp_reduc
6291 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6293 /* In case of reduction chain, e.g.,
6294 # a1 = phi <a3, a0>
6295 a2 = operation (a1)
6296 a3 = operation (a2),
6298 we may end up with more than one vector result. Here we reduce them
6299 to one vector.
6301 The same is true for a SLP reduction, e.g.,
6302 # a1 = phi <a2, a0>
6303 # b1 = phi <b2, b0>
6304 a2 = operation (a1)
6305 b2 = operation (b1),
6307 where we can end up with more than one vector as well. We can
6308 easily accumulate vectors when the number of vector elements is
6309 a multiple of the SLP group size.
6311 The same is true if we couldn't use a single defuse cycle. */
6312 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6313 || direct_slp_reduc
6314 || (slp_reduc
6315 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6316 || ncopies > 1)
6318 gimple_seq stmts = NULL;
6319 tree single_input = reduc_inputs[0];
6320 for (k = 1; k < reduc_inputs.length (); k++)
6321 single_input = gimple_build (&stmts, code, vectype,
6322 single_input, reduc_inputs[k]);
6323 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6325 reduc_inputs.truncate (0);
6326 reduc_inputs.safe_push (single_input);
6329 tree orig_reduc_input = reduc_inputs[0];
6331 /* If this loop is an epilogue loop that can be skipped after the
6332 main loop, we can only share a reduction operation between the
6333 main loop and the epilogue if we put it at the target of the
6334 skip edge.
6336 We can still reuse accumulators if this check fails. Doing so has
6337 the minor(?) benefit of making the epilogue loop's scalar result
6338 independent of the main loop's scalar result. */
6339 bool unify_with_main_loop_p = false;
6340 if (reduc_info->reused_accumulator
6341 && loop_vinfo->skip_this_loop_edge
6342 && single_succ_p (exit_bb)
6343 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6345 unify_with_main_loop_p = true;
6347 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6348 reduc_inputs[0] = make_ssa_name (vectype);
6349 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6350 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6351 UNKNOWN_LOCATION);
6352 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6353 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6354 exit_gsi = gsi_after_labels (reduc_block);
6357 /* Shouldn't be used beyond this point. */
6358 exit_bb = nullptr;
6360 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6361 && reduc_fn != IFN_LAST)
6363 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6364 various data values where the condition matched and another vector
6365 (INDUCTION_INDEX) containing all the indexes of those matches. We
6366 need to extract the last matching index (which will be the index with
6367 highest value) and use this to index into the data vector.
6368 For the case where there were no matches, the data vector will contain
6369 all default values and the index vector will be all zeros. */
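  /* Continuing the VF = 4 illustration above: with INDUCTION_INDEX
     = {0, 6, 3, 0} the IFN_REDUC_MAX below yields 6, the EQ compare
     gives the mask {0, 1, 0, 0}, the VEC_COND keeps only the data value
     of the last match, and a final unsigned MAX reduction extracts it
     as the scalar result.  */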
6371 /* Get various versions of the type of the vector of indexes. */
6372 tree index_vec_type = TREE_TYPE (induction_index);
6373 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6374 tree index_scalar_type = TREE_TYPE (index_vec_type);
6375 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6377 /* Get an unsigned integer version of the type of the data vector. */
6378 int scalar_precision
6379 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6380 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6381 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6382 vectype);
6384 /* First we need to create a vector (ZERO_VEC) of zeros and another
6385 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6386 can create using a MAX reduction and then expanding.
6387 In the case where the loop never made any matches, the max index will
6388 be zero. */
6390 /* Vector of {0, 0, 0,...}. */
6391 tree zero_vec = build_zero_cst (vectype);
6393 /* Find maximum value from the vector of found indexes. */
6394 tree max_index = make_ssa_name (index_scalar_type);
6395 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6396 1, induction_index);
6397 gimple_call_set_lhs (max_index_stmt, max_index);
6398 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6400 /* Vector of {max_index, max_index, max_index,...}. */
6401 tree max_index_vec = make_ssa_name (index_vec_type);
6402 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6403 max_index);
6404 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6405 max_index_vec_rhs);
6406 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6408 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6409 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6410 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6411 otherwise. Only one value should match, resulting in a vector
6412 (VEC_COND) with one data value and the rest zeros.
6413 In the case where the loop never made any matches, every index will
6414 match, resulting in a vector with all data values (which will all be
6415 the default value). */
6417 /* Compare the max index vector to the vector of found indexes to find
6418 the position of the max value. */
6419 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6420 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6421 induction_index,
6422 max_index_vec);
6423 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6425 /* Use the compare to choose either values from the data vector or
6426 zero. */
6427 tree vec_cond = make_ssa_name (vectype);
6428 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6429 vec_compare,
6430 reduc_inputs[0],
6431 zero_vec);
6432 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6434 /* Finally we need to extract the data value from the vector (VEC_COND)
6435 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6436 reduction, but because this doesn't exist, we can use a MAX reduction
6437 instead. The data value might be signed or a float so we need to cast
6438 it first.
6439 In the case where the loop never made any matches, the data values are
6440 all identical, and so will reduce down correctly. */
6442 /* Make the matched data values unsigned. */
6443 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6444 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6445 vec_cond);
6446 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6447 VIEW_CONVERT_EXPR,
6448 vec_cond_cast_rhs);
6449 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6451 /* Reduce down to a scalar value. */
6452 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6453 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6454 1, vec_cond_cast);
6455 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6456 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6458 /* Convert the reduced value back to the result type and set as the
6459 result. */
6460 gimple_seq stmts = NULL;
6461 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6462 data_reduc);
6463 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6464 scalar_results.safe_push (new_temp);
6466 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6467 && reduc_fn == IFN_LAST)
6469 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6470 idx = 0;
6471 idx_val = induction_index[0];
6472 val = data_reduc[0];
6473 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6474 if (induction_index[i] > idx_val)
6475 val = data_reduc[i], idx_val = induction_index[i];
6476 return val; */
6478 tree data_eltype = TREE_TYPE (vectype);
6479 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6480 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6481 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6482 /* Enforced by vectorizable_reduction, which ensures we have target
6483 support before allowing a conditional reduction on variable-length
6484 vectors. */
6485 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6486 tree idx_val = NULL_TREE, val = NULL_TREE;
6487 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6489 tree old_idx_val = idx_val;
6490 tree old_val = val;
6491 idx_val = make_ssa_name (idx_eltype);
6492 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6493 build3 (BIT_FIELD_REF, idx_eltype,
6494 induction_index,
6495 bitsize_int (el_size),
6496 bitsize_int (off)));
6497 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6498 val = make_ssa_name (data_eltype);
6499 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6500 build3 (BIT_FIELD_REF,
6501 data_eltype,
6502 reduc_inputs[0],
6503 bitsize_int (el_size),
6504 bitsize_int (off)));
6505 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6506 if (off != 0)
6508 tree new_idx_val = idx_val;
6509 if (off != v_size - el_size)
6511 new_idx_val = make_ssa_name (idx_eltype);
6512 epilog_stmt = gimple_build_assign (new_idx_val,
6513 MAX_EXPR, idx_val,
6514 old_idx_val);
6515 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6517 tree cond = make_ssa_name (boolean_type_node);
6518 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6519 idx_val, old_idx_val);
6520 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6521 tree new_val = make_ssa_name (data_eltype);
6522 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6523 cond, val, old_val);
6524 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6525 idx_val = new_idx_val;
6526 val = new_val;
6529 /* Convert the reduced value back to the result type and set as the
6530 result. */
6531 gimple_seq stmts = NULL;
6532 val = gimple_convert (&stmts, scalar_type, val);
6533 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6534 scalar_results.safe_push (val);
6537 /* 2.3 Create the reduction code, using one of the three schemes described
6538 above. In SLP we simply need to extract all the elements from the
6539 vector (without reducing them), so we use scalar shifts. */
6540 else if (reduc_fn != IFN_LAST && !slp_reduc)
6542 tree tmp;
6543 tree vec_elem_type;
6545 /* Case 1: Create:
6546 v_out2 = reduc_expr <v_out1> */
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_NOTE, vect_location,
6550 "Reduce using direct vector reduction.\n");
6552 gimple_seq stmts = NULL;
6553 vec_elem_type = TREE_TYPE (vectype);
6554 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6555 vec_elem_type, reduc_inputs[0]);
6556 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6557 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6559 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6560 && induc_val)
6562 /* Earlier we set the initial value to be a vector of induc_val
6563 values. Check the result and if it is induc_val then replace it
6564 with the original initial value, unless induc_val is
6565 the same as initial_def already. */
6566 tree zcompare = make_ssa_name (boolean_type_node);
6567 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6568 new_temp, induc_val);
6569 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6570 tree initial_def = reduc_info->reduc_initial_values[0];
6571 tmp = make_ssa_name (new_scalar_dest);
6572 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6573 initial_def, new_temp);
6574 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6575 new_temp = tmp;
6578 scalar_results.safe_push (new_temp);
6580 else if (direct_slp_reduc)
6582 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6583 with the elements for other SLP statements replaced with the
6584 neutral value. We can then do a normal reduction on each vector. */
6586 /* Enforced by vectorizable_reduction. */
6587 gcc_assert (reduc_inputs.length () == 1);
6588 gcc_assert (pow2p_hwi (group_size));
6590 gimple_seq seq = NULL;
6592 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6593 and the same element size as VECTYPE. */
6594 tree index = build_index_vector (vectype, 0, 1);
6595 tree index_type = TREE_TYPE (index);
6596 tree index_elt_type = TREE_TYPE (index_type);
6597 tree mask_type = truth_type_for (index_type);
6599 /* Create a vector that, for each element, identifies which of
6600 the REDUC_GROUP_SIZE results should use it. */
6601 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6602 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6603 build_vector_from_val (index_type, index_mask));
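  /* E.g. for GROUP_SIZE = 2 and an 8-element vector this yields
     INDEX = {0, 1, 0, 1, 0, 1, 0, 1}: result 0 reduces the even lanes
     and result 1 the odd lanes, with the other lanes replaced by the
     neutral value below.  */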
6605 /* Get a neutral vector value. This is simply a splat of the neutral
6606 scalar value if we have one, otherwise the initial scalar value
6607 is itself a neutral value. */
6608 tree vector_identity = NULL_TREE;
6609 tree neutral_op = NULL_TREE;
6610 if (slp_node)
6612 tree initial_value = NULL_TREE;
6613 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6614 initial_value = reduc_info->reduc_initial_values[0];
6615 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6616 initial_value, false);
6618 if (neutral_op)
6619 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6620 neutral_op);
6621 for (unsigned int i = 0; i < group_size; ++i)
6623 /* If there's no universal neutral value, we can use the
6624 initial scalar value from the original PHI. This is used
6625 for MIN and MAX reduction, for example. */
6626 if (!neutral_op)
6628 tree scalar_value = reduc_info->reduc_initial_values[i];
6629 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6630 scalar_value);
6631 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6632 scalar_value);
6635 /* Calculate the equivalent of:
6637 sel[j] = (index[j] == i);
6639 which selects the elements of REDUC_INPUTS[0] that should
6640 be included in the result. */
6641 tree compare_val = build_int_cst (index_elt_type, i);
6642 compare_val = build_vector_from_val (index_type, compare_val);
6643 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6644 index, compare_val);
6646 /* Calculate the equivalent of:
6648 vec = sel ? reduc_inputs[0] : vector_identity;
6650 VEC is now suitable for a full vector reduction. */
6651 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6652 sel, reduc_inputs[0], vector_identity);
6654 /* Do the reduction and convert it to the appropriate type. */
6655 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6656 TREE_TYPE (vectype), vec);
6657 scalar = gimple_convert (&seq, scalar_type, scalar);
6658 scalar_results.safe_push (scalar);
6660 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6662 else
6664 bool reduce_with_shift;
6665 tree vec_temp;
6667 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6669 /* See if the target wants to do the final (shift) reduction
6670 in a vector mode of smaller size and first reduce upper/lower
6671 halves against each other. */
6672 enum machine_mode mode1 = mode;
6673 tree stype = TREE_TYPE (vectype);
6674 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6675 unsigned nunits1 = nunits;
6676 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6677 && reduc_inputs.length () == 1)
6679 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6680 /* For SLP reductions we have to make sure lanes match up, but
6681 since we're doing individual-element final reduction, reducing
6682 the vector width here is even more important.
6683 ??? We can also separate lanes with permutes; for the common
6684 case of power-of-two group-size, odd/even extracts would work.  */
6685 if (slp_reduc && nunits != nunits1)
6687 nunits1 = least_common_multiple (nunits1, group_size);
6688 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6691 if (!slp_reduc
6692 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6693 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6695 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6696 stype, nunits1);
6697 reduce_with_shift = have_whole_vector_shift (mode1);
6698 if (!VECTOR_MODE_P (mode1)
6699 || !directly_supported_p (code, vectype1))
6700 reduce_with_shift = false;
6702 /* First reduce the vector to the desired vector size we should
6703 do shift reduction on by combining upper and lower halves. */
6704 gimple_seq stmts = NULL;
6705 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6706 code, &stmts);
6707 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6708 reduc_inputs[0] = new_temp;
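      /* As a hedged example: on targets implementing the split_reduction
	 hook (x86 with AVX is one such case) a V8SI input may first be
	 narrowed to V4SI here by combining its upper and lower halves with
	 CODE, so that the shift-based or scalar reduction below only has
	 to handle the narrower vector.  */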
6710 if (reduce_with_shift && !slp_reduc)
6712 int element_bitsize = tree_to_uhwi (bitsize);
6713 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6714 for variable-length vectors and also requires direct target support
6715 for loop reductions. */
6716 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6717 int nelements = vec_size_in_bits / element_bitsize;
6718 vec_perm_builder sel;
6719 vec_perm_indices indices;
6721 int elt_offset;
6723 tree zero_vec = build_zero_cst (vectype1);
6724 /* Case 2: Create:
6725 for (offset = nelements/2; offset >= 1; offset/=2)
6727 Create: va' = vec_shift <va, offset>
6728 Create: va = vop <va, va'>
6729 } */
6731 tree rhs;
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_NOTE, vect_location,
6735 "Reduce using vector shifts\n");
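	  /* Rough sketch of the sequence the loop below builds for
	     NELEMENTS == 4 (offsets 2 and 1):
	       va' = VEC_PERM <va, {0,...}, {2, 3, 4, 5}>;   shift down by 2
	       va  = va CODE va';
	       va' = VEC_PERM <va, {0,...}, {1, 2, 3, 4}>;   shift down by 1
	       va  = va CODE va';
	     after which element 0 of VA holds the reduced value that 2.4
	     extracts.  */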
6737 gimple_seq stmts = NULL;
6738 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6739 for (elt_offset = nelements / 2;
6740 elt_offset >= 1;
6741 elt_offset /= 2)
6743 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6744 indices.new_vector (sel, 2, nelements);
6745 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6746 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6747 new_temp, zero_vec, mask);
6748 new_temp = gimple_build (&stmts, code,
6749 vectype1, new_name, new_temp);
6751 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6753 /* 2.4 Extract the final scalar result. Create:
6754 s_out3 = extract_field <v_out2, bitpos> */
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_NOTE, vect_location,
6758 "extract scalar result\n");
6760 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6761 bitsize, bitsize_zero_node);
6762 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6763 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6764 gimple_assign_set_lhs (epilog_stmt, new_temp);
6765 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6766 scalar_results.safe_push (new_temp);
6768 else
6770 /* Case 3: Create:
6771 s = extract_field <v_out2, 0>
6772 for (offset = element_size;
6773 offset < vector_size;
6774 offset += element_size;)
6776 Create: s' = extract_field <v_out2, offset>
6777 Create: s = op <s, s'> // For non SLP cases
6778 } */
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_NOTE, vect_location,
6782 "Reduce using scalar code.\n");
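	  /* Rough sketch of the non-SLP scalar code built below for a
	     4-element vector of 32-bit ints:
	       s  = BIT_FIELD_REF <v_out2, 32, 0>;
	       s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s CODE s';
	       s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s CODE s';
	       s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s CODE s';  */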
6784 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6785 int element_bitsize = tree_to_uhwi (bitsize);
6786 tree compute_type = TREE_TYPE (vectype);
6787 gimple_seq stmts = NULL;
6788 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6790 int bit_offset;
6791 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6792 vec_temp, bitsize, bitsize_zero_node);
6794 /* In SLP we don't need to apply reduction operation, so we just
6795 collect s' values in SCALAR_RESULTS. */
6796 if (slp_reduc)
6797 scalar_results.safe_push (new_temp);
6799 for (bit_offset = element_bitsize;
6800 bit_offset < vec_size_in_bits;
6801 bit_offset += element_bitsize)
6803 tree bitpos = bitsize_int (bit_offset);
6804 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6805 compute_type, vec_temp,
6806 bitsize, bitpos);
6807 if (slp_reduc)
6809 /* In SLP we don't need to apply reduction operation, so
6810 we just collect s' values in SCALAR_RESULTS. */
6811 new_temp = new_name;
6812 scalar_results.safe_push (new_name);
6814 else
6815 new_temp = gimple_build (&stmts, code, compute_type,
6816 new_name, new_temp);
6820 /* The only case where we need to reduce scalar results in SLP is
6821 unrolling. If the size of SCALAR_RESULTS is greater than
6822 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6823 REDUC_GROUP_SIZE. */
6824 if (slp_reduc)
6826 tree res, first_res, new_res;
6828 /* Reduce multiple scalar results in case of SLP unrolling. */
6829 for (j = group_size; scalar_results.iterate (j, &res);
6830 j++)
6832 first_res = scalar_results[j % group_size];
6833 new_res = gimple_build (&stmts, code, compute_type,
6834 first_res, res);
6835 scalar_results[j % group_size] = new_res;
6837 scalar_results.truncate (group_size);
6838 for (k = 0; k < group_size; k++)
6839 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6840 scalar_results[k]);
6842 else
6844 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6845 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6846 scalar_results.safe_push (new_temp);
6849 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6852 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6853 && induc_val)
6855 /* Earlier we set the initial value to be a vector of induc_val
6856 values.  Check the result and if it is induc_val then replace
6857 with the original initial value, unless induc_val is
6858 the same as initial_def already.  */
6859 tree zcompare = make_ssa_name (boolean_type_node);
6860 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6861 induc_val);
6862 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6863 tree initial_def = reduc_info->reduc_initial_values[0];
6864 tree tmp = make_ssa_name (new_scalar_dest);
6865 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6866 initial_def, new_temp);
6867 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6868 scalar_results[0] = tmp;
6872 /* 2.5 Adjust the final result by the initial value of the reduction
6873 variable. (When such adjustment is not needed, then
6874 'adjustment_def' is zero). For example, if code is PLUS we create:
6875 new_temp = loop_exit_def + adjustment_def */
6877 if (adjustment_def)
6879 gcc_assert (!slp_reduc);
6880 gimple_seq stmts = NULL;
6881 if (double_reduc)
6883 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6884 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6885 new_temp = gimple_build (&stmts, code, vectype,
6886 reduc_inputs[0], adjustment_def);
6888 else
6890 new_temp = scalar_results[0];
6891 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6892 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6893 adjustment_def);
6894 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6895 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6896 new_temp, adjustment_def);
6897 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6900 epilog_stmt = gimple_seq_last_stmt (stmts);
6901 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6902 scalar_results[0] = new_temp;
6905 /* Record this operation if it could be reused by the epilogue loop. */
6906 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6907 && reduc_inputs.length () == 1)
6908 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6909 { orig_reduc_input, reduc_info });
6911 if (double_reduc)
6912 loop = outer_loop;
6914 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6915 phis with new adjusted scalar results, i.e., replace use <s_out0>
6916 with use <s_out4>.
6918 Transform:
6919 loop_exit:
6920 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6921 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6922 v_out2 = reduce <v_out1>
6923 s_out3 = extract_field <v_out2, 0>
6924 s_out4 = adjust_result <s_out3>
6925 use <s_out0>
6926 use <s_out0>
6928 into:
6930 loop_exit:
6931 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6932 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6933 v_out2 = reduce <v_out1>
6934 s_out3 = extract_field <v_out2, 0>
6935 s_out4 = adjust_result <s_out3>
6936 use <s_out4>
6937 use <s_out4> */
6939 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6940 for (k = 0; k < live_out_stmts.size (); k++)
6942 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6943 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6945 phis.create (3);
6946 /* Find the loop-closed-use at the loop exit of the original scalar
6947 result. (The reduction result is expected to have two immediate uses,
6948 one at the latch block, and one at the loop exit). For double
6949 reductions we are looking for exit phis of the outer loop. */
6950 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6952 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6954 if (!is_gimple_debug (USE_STMT (use_p)))
6955 phis.safe_push (USE_STMT (use_p));
6957 else
6959 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6961 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6963 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6965 if (!flow_bb_inside_loop_p (loop,
6966 gimple_bb (USE_STMT (phi_use_p)))
6967 && !is_gimple_debug (USE_STMT (phi_use_p)))
6968 phis.safe_push (USE_STMT (phi_use_p));
6974 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6976 /* Replace the uses: */
6977 orig_name = PHI_RESULT (exit_phi);
6979 /* Look for a single use at the target of the skip edge. */
6980 if (unify_with_main_loop_p)
6982 use_operand_p use_p;
6983 gimple *user;
6984 if (!single_imm_use (orig_name, &use_p, &user))
6985 gcc_unreachable ();
6986 orig_name = gimple_get_lhs (user);
6989 scalar_result = scalar_results[k];
6990 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6992 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6993 SET_USE (use_p, scalar_result);
6994 update_stmt (use_stmt);
6998 phis.release ();
7002 /* Return a vector of type VECTYPE that is equal to the vector select
7003 operation "MASK ? VEC : IDENTITY". Insert the select statements
7004 before GSI. */
7006 static tree
7007 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7008 tree vec, tree identity)
7010 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7011 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7012 mask, vec, identity);
7013 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7014 return cond;
7017 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7018 order, starting with LHS. Insert the extraction statements before GSI and
7019 associate the new scalar SSA names with variable SCALAR_DEST.
7020 If MASK is nonzero, mask the input and then operate on it unconditionally.
7021 Return the SSA name for the result. */
7023 static tree
7024 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7025 tree_code code, tree lhs, tree vector_rhs,
7026 tree mask)
7028 tree vectype = TREE_TYPE (vector_rhs);
7029 tree scalar_type = TREE_TYPE (vectype);
7030 tree bitsize = TYPE_SIZE (scalar_type);
7031 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7032 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7034 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7035 to perform an unconditional element-wise reduction of it. */
7036 if (mask)
7038 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7039 "masked_vector_rhs");
7040 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7041 false);
7042 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7043 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7044 mask, vector_rhs, vector_identity);
7045 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7046 vector_rhs = masked_vector_rhs;
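  /* Hedged sketch of the expansion performed by the loop below for a
     4-element vector (any masking has already been folded into VECTOR_RHS
     above):
       s0 = BIT_FIELD_REF <vector_rhs, eltsize, 0>;        t0 = lhs CODE s0;
       s1 = BIT_FIELD_REF <vector_rhs, eltsize, eltsize>;  t1 = t0 CODE s1;
       ...
     and the final accumulator is returned, preserving strict left-to-right
     order.  */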
7049 for (unsigned HOST_WIDE_INT bit_offset = 0;
7050 bit_offset < vec_size_in_bits;
7051 bit_offset += element_bitsize)
7053 tree bitpos = bitsize_int (bit_offset);
7054 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7055 bitsize, bitpos);
7057 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7058 rhs = make_ssa_name (scalar_dest, stmt);
7059 gimple_assign_set_lhs (stmt, rhs);
7060 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7062 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7063 tree new_name = make_ssa_name (scalar_dest, stmt);
7064 gimple_assign_set_lhs (stmt, new_name);
7065 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7066 lhs = new_name;
7068 return lhs;
7071 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7072 type of the vector input. */
7074 static internal_fn
7075 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7077 internal_fn mask_reduc_fn;
7078 internal_fn mask_len_reduc_fn;
7080 switch (reduc_fn)
7082 case IFN_FOLD_LEFT_PLUS:
7083 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7084 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7085 break;
7087 default:
7088 return IFN_LAST;
7091 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7092 OPTIMIZE_FOR_SPEED))
7093 return mask_reduc_fn;
7094 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7095 OPTIMIZE_FOR_SPEED))
7096 return mask_len_reduc_fn;
7097 return IFN_LAST;
7100 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7101 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7102 statement. CODE is the operation performed by STMT_INFO and OPS are
7103 its scalar operands. REDUC_INDEX is the index of the operand in
7104 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7105 implements in-order reduction, or IFN_LAST if we should open-code it.
7106 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7107 that should be used to control the operation in a fully-masked loop. */
7109 static bool
7110 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7111 stmt_vec_info stmt_info,
7112 gimple_stmt_iterator *gsi,
7113 gimple **vec_stmt, slp_tree slp_node,
7114 gimple *reduc_def_stmt,
7115 code_helper code, internal_fn reduc_fn,
7116 tree *ops, int num_ops, tree vectype_in,
7117 int reduc_index, vec_loop_masks *masks,
7118 vec_loop_lens *lens)
7120 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7121 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7122 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7124 int ncopies;
7125 if (slp_node)
7126 ncopies = 1;
7127 else
7128 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7130 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7131 gcc_assert (ncopies == 1);
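  /* Illustrative sketch (names are examples only) of what this function
     ultimately emits per vector definition when the target provides a
     fold-left internal function:
       res_1 = .FOLD_LEFT_PLUS (res_0, vect_def);
     or, in a fully-masked loop,
       res_1 = .MASK_FOLD_LEFT_PLUS (res_0, vect_def, loop_mask);
     otherwise the reduction is open-coded via vect_expand_fold_left.  */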
7133 bool is_cond_op = false;
7134 if (!code.is_tree_code ())
7136 code = conditional_internal_fn_code (internal_fn (code));
7137 gcc_assert (code != ERROR_MARK);
7138 is_cond_op = true;
7141 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7143 if (slp_node)
7145 if (is_cond_op)
7147 if (dump_enabled_p ())
7148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7149 "fold-left reduction on SLP not supported.\n");
7150 return false;
7153 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7154 TYPE_VECTOR_SUBPARTS (vectype_in)));
7157 /* The operands either come from a binary operation or an IFN_COND operation.
7158 The former is a gimple assign with binary rhs and the latter is a
7159 gimple call with four arguments. */
7160 gcc_assert (num_ops == 2 || num_ops == 4);
7161 tree op0, opmask;
7162 if (!is_cond_op)
7163 op0 = ops[1 - reduc_index];
7164 else
7166 op0 = ops[2 + (1 - reduc_index)];
7167 opmask = ops[0];
7168 gcc_assert (!slp_node);
7171 int group_size = 1;
7172 stmt_vec_info scalar_dest_def_info;
7173 auto_vec<tree> vec_oprnds0, vec_opmask;
7174 if (slp_node)
7176 auto_vec<vec<tree> > vec_defs (2);
7177 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7178 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7179 vec_defs[0].release ();
7180 vec_defs[1].release ();
7181 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7182 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7184 else
7186 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7187 op0, &vec_oprnds0);
7188 scalar_dest_def_info = stmt_info;
7190 /* For an IFN_COND_OP we also need the vector mask operand. */
7191 if (is_cond_op)
7192 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7193 opmask, &vec_opmask);
7196 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7197 tree scalar_dest = gimple_get_lhs (sdef);
7198 tree scalar_type = TREE_TYPE (scalar_dest);
7199 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7201 int vec_num = vec_oprnds0.length ();
7202 gcc_assert (vec_num == 1 || slp_node);
7203 tree vec_elem_type = TREE_TYPE (vectype_out);
7204 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7206 tree vector_identity = NULL_TREE;
7207 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7209 vector_identity = build_zero_cst (vectype_out);
7210 if (!HONOR_SIGNED_ZEROS (vectype_out))
7212 else
7214 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7215 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7216 vector_identity);
7220 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7221 int i;
7222 tree def0;
7223 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7225 gimple *new_stmt;
7226 tree mask = NULL_TREE;
7227 tree len = NULL_TREE;
7228 tree bias = NULL_TREE;
7229 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7230 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7231 else if (is_cond_op)
7232 mask = vec_opmask[0];
7233 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7235 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7236 i, 1);
7237 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7238 bias = build_int_cst (intQI_type_node, biasval);
7239 if (!is_cond_op)
7240 mask = build_minus_one_cst (truth_type_for (vectype_in));
7243 /* Handle MINUS by adding the negative. */
7244 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7246 tree negated = make_ssa_name (vectype_out);
7247 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7248 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7249 def0 = negated;
7252 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7253 && mask && mask_reduc_fn == IFN_LAST)
7254 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7255 vector_identity);
7257 /* On the first iteration the input is simply the scalar phi
7258 result, and for subsequent iterations it is the output of
7259 the preceding operation. */
7260 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7262 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7263 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7264 def0, mask, len, bias);
7265 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7266 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7267 def0, mask);
7268 else
7269 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7270 def0);
7271 /* For chained SLP reductions the output of the previous reduction
7272 operation serves as the input of the next. For the final statement
7273 the output cannot be a temporary - we reuse the original
7274 scalar destination of the last statement. */
7275 if (i != vec_num - 1)
7277 gimple_set_lhs (new_stmt, scalar_dest_var);
7278 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7279 gimple_set_lhs (new_stmt, reduc_var);
7282 else
7284 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7285 tree_code (code), reduc_var, def0,
7286 mask);
7287 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7288 /* Remove the statement, so that we can use the same code paths
7289 as for statements that we've just created. */
7290 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7291 gsi_remove (&tmp_gsi, true);
7294 if (i == vec_num - 1)
7296 gimple_set_lhs (new_stmt, scalar_dest);
7297 vect_finish_replace_stmt (loop_vinfo,
7298 scalar_dest_def_info,
7299 new_stmt);
7301 else
7302 vect_finish_stmt_generation (loop_vinfo,
7303 scalar_dest_def_info,
7304 new_stmt, gsi);
7306 if (slp_node)
7307 slp_node->push_vec_def (new_stmt);
7308 else
7310 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7311 *vec_stmt = new_stmt;
7315 return true;
7318 /* Function is_nonwrapping_integer_induction.
7320 Check if STMT_VINFO (which is part of loop LOOP) describes an induction
7321 that both increments and does not cause overflow.  */
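/* Illustrative example (not from the original sources): for a PHI with
   evolution {0, +, 4} in a loop that executes at most N times, this
   requires 0 + 4 * N to be representable in the precision of the PHI
   result type.  */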
7323 static bool
7324 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7326 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7327 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7328 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7329 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7330 widest_int ni, max_loop_value, lhs_max;
7331 wi::overflow_type overflow = wi::OVF_NONE;
7333 /* Make sure the loop is integer based. */
7334 if (TREE_CODE (base) != INTEGER_CST
7335 || TREE_CODE (step) != INTEGER_CST)
7336 return false;
7338 /* Check that the max size of the loop will not wrap. */
7340 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7341 return true;
7343 if (! max_stmt_executions (loop, &ni))
7344 return false;
7346 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7347 &overflow);
7348 if (overflow)
7349 return false;
7351 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7352 TYPE_SIGN (lhs_type), &overflow);
7353 if (overflow)
7354 return false;
7356 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7357 <= TYPE_PRECISION (lhs_type));
7360 /* Check if masking can be supported by inserting a conditional expression.
7361 CODE is the code for the operation. COND_FN is the conditional internal
7362 function, if it exists. VECTYPE_IN is the type of the vector input. */
7363 static bool
7364 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7365 tree vectype_in)
7367 if (cond_fn != IFN_LAST
7368 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7369 OPTIMIZE_FOR_SPEED))
7370 return false;
7372 if (code.is_tree_code ())
7373 switch (tree_code (code))
7375 case DOT_PROD_EXPR:
7376 case SAD_EXPR:
7377 return true;
7379 default:
7380 break;
7382 return false;
7385 /* Insert a conditional expression to enable masked vectorization. CODE is the
7386 code for the operation. VOP is the array of operands. MASK is the loop
7387 mask. GSI is a statement iterator used to place the new conditional
7388 expression. */
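/* Illustrative summary of the rewrite below, assuming MASK is the loop mask:
   for DOT_PROD_EXPR operand 1 becomes MASK ? vop[1] : 0, so inactive lanes
   contribute nothing to the dot product; for SAD_EXPR it becomes
   MASK ? vop[1] : vop[0], so inactive lanes yield a zero absolute
   difference.  */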
7389 static void
7390 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7391 gimple_stmt_iterator *gsi)
7393 switch (tree_code (code))
7395 case DOT_PROD_EXPR:
7397 tree vectype = TREE_TYPE (vop[1]);
7398 tree zero = build_zero_cst (vectype);
7399 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7400 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7401 mask, vop[1], zero);
7402 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7403 vop[1] = masked_op1;
7404 break;
7407 case SAD_EXPR:
7409 tree vectype = TREE_TYPE (vop[1]);
7410 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7411 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7412 mask, vop[1], vop[0]);
7413 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7414 vop[1] = masked_op1;
7415 break;
7418 default:
7419 gcc_unreachable ();
7423 /* Function vectorizable_reduction.
7425 Check if STMT_INFO performs a reduction operation that can be vectorized.
7426 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7427 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7428 Return true if STMT_INFO is vectorizable in this way.
7430 This function also handles reduction idioms (patterns) that have been
7431 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7432 may be of this form:
7433 X = pattern_expr (arg0, arg1, ..., X)
7434 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7435 sequence that had been detected and replaced by the pattern-stmt
7436 (STMT_INFO).
7438 This function also handles reduction of condition expressions, for example:
7439 for (int i = 0; i < N; i++)
7440 if (a[i] < value)
7441 last = a[i];
7442 This is handled by vectorising the loop and creating an additional vector
7443 containing the loop indexes for which "a[i] < value" was true. In the
7444 function epilogue this is reduced to a single max value and then used to
7445 index into the vector of results.
7447 In some cases of reduction patterns, the type of the reduction variable X is
7448 different than the type of the other arguments of STMT_INFO.
7449 In such cases, the vectype that is used when transforming STMT_INFO into
7450 a vector stmt is different than the vectype that is used to determine the
7451 vectorization factor, because it consists of a different number of elements
7452 than the actual number of elements that are being operated upon in parallel.
7454 For example, consider an accumulation of shorts into an int accumulator.
7455 On some targets it's possible to vectorize this pattern operating on 8
7456 shorts at a time (hence, the vectype for purposes of determining the
7457 vectorization factor should be V8HI); on the other hand, the vectype that
7458 is used to create the vector form is actually V4SI (the type of the result).
7460 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7461 indicates what is the actual level of parallelism (V8HI in the example), so
7462 that the right vectorization factor would be derived. This vectype
7463 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7464 be used to create the vectorized stmt. The right vectype for the vectorized
7465 stmt is obtained from the type of the result X:
7466 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7468 This means that, contrary to "regular" reductions (or "regular" stmts in
7469 general), the following equation:
7470 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7471 does *NOT* necessarily hold for reduction patterns. */
7473 bool
7474 vectorizable_reduction (loop_vec_info loop_vinfo,
7475 stmt_vec_info stmt_info, slp_tree slp_node,
7476 slp_instance slp_node_instance,
7477 stmt_vector_for_cost *cost_vec)
7479 tree vectype_in = NULL_TREE;
7480 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7481 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7482 stmt_vec_info cond_stmt_vinfo = NULL;
7483 int i;
7484 int ncopies;
7485 bool single_defuse_cycle = false;
7486 bool nested_cycle = false;
7487 bool double_reduc = false;
7488 int vec_num;
7489 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7490 tree cond_reduc_val = NULL_TREE;
7492 /* Make sure it was already recognized as a reduction computation. */
7493 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7494 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7495 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7496 return false;
7498 /* The stmt we store reduction analysis meta on. */
7499 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7500 reduc_info->is_reduc_info = true;
7502 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7504 if (is_a <gphi *> (stmt_info->stmt))
7506 if (slp_node)
7508 /* We eventually need to set a vector type on invariant
7509 arguments. */
7510 unsigned j;
7511 slp_tree child;
7512 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7513 if (!vect_maybe_update_slp_op_vectype
7514 (child, SLP_TREE_VECTYPE (slp_node)))
7516 if (dump_enabled_p ())
7517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7518 "incompatible vector types for "
7519 "invariants\n");
7520 return false;
7523 /* Analysis for double-reduction is done on the outer
7524 loop PHI, nested cycles have no further restrictions. */
7525 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7527 else
7528 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7529 return true;
7532 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7533 stmt_vec_info phi_info = stmt_info;
7534 if (!is_a <gphi *> (stmt_info->stmt))
7536 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7537 return true;
7539 if (slp_node)
7541 slp_node_instance->reduc_phis = slp_node;
7542 /* ??? We're leaving slp_node to point to the PHIs, we only
7543 need it to get at the number of vector stmts which wasn't
7544 yet initialized for the instance root. */
7546 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7548 use_operand_p use_p;
7549 gimple *use_stmt;
7550 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7551 &use_p, &use_stmt);
7552 gcc_assert (res);
7553 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7556 /* PHIs should not participate in patterns. */
7557 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7558 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7560 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7561 and compute the reduction chain length. Discover the real
7562 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7563 tree reduc_def
7564 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7565 loop_latch_edge
7566 (gimple_bb (reduc_def_phi)->loop_father));
7567 unsigned reduc_chain_length = 0;
7568 bool only_slp_reduc_chain = true;
7569 stmt_info = NULL;
7570 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7571 while (reduc_def != PHI_RESULT (reduc_def_phi))
7573 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7574 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7575 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7577 if (dump_enabled_p ())
7578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7579 "reduction chain broken by patterns.\n");
7580 return false;
7582 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7583 only_slp_reduc_chain = false;
7584 /* For epilogue generation live members of the chain need
7585 to point back to the PHI via their original stmt for
7586 info_for_reduction to work. For SLP we need to look at
7587 all lanes here - even though we only will vectorize from
7588 the SLP node with live lane zero the other live lanes also
7589 need to be identified as part of a reduction to be able
7590 to skip code generation for them. */
7591 if (slp_for_stmt_info)
7593 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7594 if (STMT_VINFO_LIVE_P (s))
7595 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7597 else if (STMT_VINFO_LIVE_P (vdef))
7598 STMT_VINFO_REDUC_DEF (def) = phi_info;
7599 gimple_match_op op;
7600 if (!gimple_extract_op (vdef->stmt, &op))
7602 if (dump_enabled_p ())
7603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 "reduction chain includes unsupported"
7605 " statement type.\n");
7606 return false;
7608 if (CONVERT_EXPR_CODE_P (op.code))
7610 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7612 if (dump_enabled_p ())
7613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7614 "conversion in the reduction chain.\n");
7615 return false;
7618 else if (!stmt_info)
7619 /* First non-conversion stmt. */
7620 stmt_info = vdef;
7621 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7622 reduc_chain_length++;
7623 if (!stmt_info && slp_node)
7624 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7626 /* PHIs should not participate in patterns. */
7627 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7629 if (nested_in_vect_loop_p (loop, stmt_info))
7631 loop = loop->inner;
7632 nested_cycle = true;
7635 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7636 element. */
7637 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7639 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7640 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7642 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7643 gcc_assert (slp_node
7644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7646 /* 1. Is vectorizable reduction? */
7647 /* Not supportable if the reduction variable is used in the loop, unless
7648 it's a reduction chain. */
7649 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7650 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7651 return false;
7653 /* Reductions that are not used even in an enclosing outer-loop,
7654 are expected to be "live" (used out of the loop). */
7655 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7656 && !STMT_VINFO_LIVE_P (stmt_info))
7657 return false;
7659 /* 2. Has this been recognized as a reduction pattern?
7661 Check if STMT represents a pattern that has been recognized
7662 in earlier analysis stages. For stmts that represent a pattern,
7663 the STMT_VINFO_RELATED_STMT field records the last stmt in
7664 the original sequence that constitutes the pattern. */
7666 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7667 if (orig_stmt_info)
7669 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7670 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7673 /* 3. Check the operands of the operation. The first operands are defined
7674 inside the loop body. The last operand is the reduction variable,
7675 which is defined by the loop-header-phi. */
7677 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7678 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7679 gimple_match_op op;
7680 if (!gimple_extract_op (stmt_info->stmt, &op))
7681 gcc_unreachable ();
7682 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7683 || op.code == WIDEN_SUM_EXPR
7684 || op.code == SAD_EXPR);
7686 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7687 && !SCALAR_FLOAT_TYPE_P (op.type))
7688 return false;
7690 /* Do not try to vectorize bit-precision reductions. */
7691 if (!type_has_mode_precision_p (op.type))
7692 return false;
7694 /* For lane-reducing ops we're reducing the number of reduction PHIs
7695 which means the only use of that may be in the lane-reducing operation. */
7696 if (lane_reduc_code_p
7697 && reduc_chain_length != 1
7698 && !only_slp_reduc_chain)
7700 if (dump_enabled_p ())
7701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7702 "lane-reducing reduction with extra stmts.\n");
7703 return false;
7706 /* All uses but the last are expected to be defined in the loop.
7707 The last use is the reduction variable. In case of nested cycle this
7708 assumption is not true: we use reduc_index to record the index of the
7709 reduction variable. */
7710 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7711 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7712 /* We need to skip an extra operand for COND_EXPRs with embedded
7713 comparison. */
7714 unsigned opno_adjust = 0;
7715 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7716 opno_adjust = 1;
7717 for (i = 0; i < (int) op.num_ops; i++)
7719 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7720 if (i == 0 && op.code == COND_EXPR)
7721 continue;
7723 stmt_vec_info def_stmt_info;
7724 enum vect_def_type dt;
7725 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7726 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7727 &vectype_op[i], &def_stmt_info))
7729 if (dump_enabled_p ())
7730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7731 "use not simple.\n");
7732 return false;
7734 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7735 continue;
7737 /* For an IFN_COND_OP we might hit the reduction definition operand
7738 twice (once as definition, once as else). */
7739 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7740 continue;
7742 /* There should be only one cycle def in the stmt, the one
7743 leading to reduc_def. */
7744 if (VECTORIZABLE_CYCLE_DEF (dt))
7745 return false;
7747 if (!vectype_op[i])
7748 vectype_op[i]
7749 = get_vectype_for_scalar_type (loop_vinfo,
7750 TREE_TYPE (op.ops[i]), slp_op[i]);
7752 /* To properly compute ncopies we are interested in the widest
7753 non-reduction input type in case we're looking at a widening
7754 accumulation that we later handle in vect_transform_reduction. */
7755 if (lane_reduc_code_p
7756 && vectype_op[i]
7757 && (!vectype_in
7758 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7759 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7760 vectype_in = vectype_op[i];
7762 if (op.code == COND_EXPR)
7764 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7765 if (dt == vect_constant_def)
7767 cond_reduc_dt = dt;
7768 cond_reduc_val = op.ops[i];
7770 if (dt == vect_induction_def
7771 && def_stmt_info
7772 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7774 cond_reduc_dt = dt;
7775 cond_stmt_vinfo = def_stmt_info;
7779 if (!vectype_in)
7780 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7781 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7783 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7784 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7785 /* If we have a condition reduction, see if we can simplify it further. */
7786 if (v_reduc_type == COND_REDUCTION)
7788 if (slp_node)
7789 return false;
7791 /* When the condition uses the reduction value in the condition, fail. */
7792 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7794 if (dump_enabled_p ())
7795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7796 "condition depends on previous iteration\n");
7797 return false;
7800 if (reduc_chain_length == 1
7801 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7802 OPTIMIZE_FOR_SPEED)
7803 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7804 vectype_in,
7805 OPTIMIZE_FOR_SPEED)))
7807 if (dump_enabled_p ())
7808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7809 "optimizing condition reduction with"
7810 " FOLD_EXTRACT_LAST.\n");
7811 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7813 else if (cond_reduc_dt == vect_induction_def)
7815 tree base
7816 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7817 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7819 gcc_assert (TREE_CODE (base) == INTEGER_CST
7820 && TREE_CODE (step) == INTEGER_CST);
7821 cond_reduc_val = NULL_TREE;
7822 enum tree_code cond_reduc_op_code = ERROR_MARK;
7823 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7824 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7826 /* Find a suitable value: below base for MAX_EXPR, above base for
7827 MIN_EXPR; punt for now if base is the minimum value of the type
7828 for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
7829 else if (tree_int_cst_sgn (step) == -1)
7831 cond_reduc_op_code = MIN_EXPR;
7832 if (tree_int_cst_sgn (base) == -1)
7833 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7834 else if (tree_int_cst_lt (base,
7835 TYPE_MAX_VALUE (TREE_TYPE (base))))
7836 cond_reduc_val
7837 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7839 else
7841 cond_reduc_op_code = MAX_EXPR;
7842 if (tree_int_cst_sgn (base) == 1)
7843 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7844 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7845 base))
7846 cond_reduc_val
7847 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7849 if (cond_reduc_val)
7851 if (dump_enabled_p ())
7852 dump_printf_loc (MSG_NOTE, vect_location,
7853 "condition expression based on "
7854 "integer induction.\n");
7855 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7856 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7857 = cond_reduc_val;
7858 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7861 else if (cond_reduc_dt == vect_constant_def)
7863 enum vect_def_type cond_initial_dt;
7864 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7865 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7866 if (cond_initial_dt == vect_constant_def
7867 && types_compatible_p (TREE_TYPE (cond_initial_val),
7868 TREE_TYPE (cond_reduc_val)))
7870 tree e = fold_binary (LE_EXPR, boolean_type_node,
7871 cond_initial_val, cond_reduc_val);
7872 if (e && (integer_onep (e) || integer_zerop (e)))
7874 if (dump_enabled_p ())
7875 dump_printf_loc (MSG_NOTE, vect_location,
7876 "condition expression based on "
7877 "compile time constant.\n");
7878 /* Record reduction code at analysis stage. */
7879 STMT_VINFO_REDUC_CODE (reduc_info)
7880 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7881 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7887 if (STMT_VINFO_LIVE_P (phi_info))
7888 return false;
7890 if (slp_node)
7891 ncopies = 1;
7892 else
7893 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7895 gcc_assert (ncopies >= 1);
7897 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7899 if (nested_cycle)
7901 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7902 == vect_double_reduction_def);
7903 double_reduc = true;
7906 /* 4.2. Check support for the epilog operation.
7908 If STMT represents a reduction pattern, then the type of the
7909 reduction variable may be different than the type of the rest
7910 of the arguments. For example, consider the case of accumulation
7911 of shorts into an int accumulator; The original code:
7912 S1: int_a = (int) short_a;
7913 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7915 was replaced with:
7916 STMT: int_acc = widen_sum <short_a, int_acc>
7918 This means that:
7919 1. The tree-code that is used to create the vector operation in the
7920 epilog code (that reduces the partial results) is not the
7921 tree-code of STMT, but is rather the tree-code of the original
7922 stmt from the pattern that STMT is replacing. I.e, in the example
7923 above we want to use 'widen_sum' in the loop, but 'plus' in the
7924 epilog.
7925 2. The type (mode) we use to check available target support
7926 for the vector operation to be created in the *epilog*, is
7927 determined by the type of the reduction variable (in the example
7928 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7929 However the type (mode) we use to check available target support
7930 for the vector operation to be created *inside the loop*, is
7931 determined by the type of the other arguments to STMT (in the
7932 example we'd check this: optab_handler (widen_sum_optab,
7933 vect_short_mode)).
7935 This is contrary to "regular" reductions, in which the types of all
7936 the arguments are the same as the type of the reduction variable.
7937 For "regular" reductions we can therefore use the same vector type
7938 (and also the same tree-code) when generating the epilog code and
7939 when generating the code inside the loop. */
7941 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7943 /* Conversion might already have created a conditional operation like
7944 IFN_COND_ADD.  Use the corresponding tree code for the following checks.  */
7945 if (orig_code.is_internal_fn ())
7947 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7948 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7951 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7953 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7954 if (reduction_type == TREE_CODE_REDUCTION)
7956 /* Check whether it's ok to change the order of the computation.
7957 Generally, when vectorizing a reduction we change the order of the
7958 computation. This may change the behavior of the program in some
7959 cases, so we need to check that this is ok. One exception is when
7960 vectorizing an outer-loop: the inner-loop is executed sequentially,
7961 and therefore vectorizing reductions in the inner-loop during
7962 outer-loop vectorization is safe. Likewise when we are vectorizing
7963 a series of reductions using SLP and the VF is one, the reductions
7964 are performed in scalar order. */
7965 if (slp_node
7966 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7967 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7969 else if (needs_fold_left_reduction_p (op.type, orig_code))
7971 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7972 is not directly used in stmt.  */
7973 if (!only_slp_reduc_chain
7974 && reduc_chain_length != 1)
7976 if (dump_enabled_p ())
7977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 "in-order reduction chain without SLP.\n");
7979 return false;
7981 STMT_VINFO_REDUC_TYPE (reduc_info)
7982 = reduction_type = FOLD_LEFT_REDUCTION;
7984 else if (!commutative_binary_op_p (orig_code, op.type)
7985 || !associative_binary_op_p (orig_code, op.type))
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989 "reduction: not commutative/associative\n");
7990 return false;
7994 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7995 && ncopies > 1)
7997 if (dump_enabled_p ())
7998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999 "multiple types in double reduction or condition "
8000 "reduction or fold-left reduction.\n");
8001 return false;
8004 internal_fn reduc_fn = IFN_LAST;
8005 if (reduction_type == TREE_CODE_REDUCTION
8006 || reduction_type == FOLD_LEFT_REDUCTION
8007 || reduction_type == INTEGER_INDUC_COND_REDUCTION
8008 || reduction_type == CONST_COND_REDUCTION)
8010 if (reduction_type == FOLD_LEFT_REDUCTION
8011 ? fold_left_reduction_fn (orig_code, &reduc_fn)
8012 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8014 if (reduc_fn != IFN_LAST
8015 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8016 OPTIMIZE_FOR_SPEED))
8018 if (dump_enabled_p ())
8019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8020 "reduc op not supported by target.\n");
8022 reduc_fn = IFN_LAST;
8025 else
8027 if (!nested_cycle || double_reduc)
8029 if (dump_enabled_p ())
8030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8031 "no reduc code for scalar code.\n");
8033 return false;
8037 else if (reduction_type == COND_REDUCTION)
8039 int scalar_precision
8040 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8041 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8042 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8043 vectype_out);
8045 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8046 OPTIMIZE_FOR_SPEED))
8047 reduc_fn = IFN_REDUC_MAX;
8049 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8051 if (reduction_type != EXTRACT_LAST_REDUCTION
8052 && (!nested_cycle || double_reduc)
8053 && reduc_fn == IFN_LAST
8054 && !nunits_out.is_constant ())
8056 if (dump_enabled_p ())
8057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8058 "missing target support for reduction on"
8059 " variable-length vectors.\n");
8060 return false;
8063 /* For SLP reductions, see if there is a neutral value we can use. */
8064 tree neutral_op = NULL_TREE;
8065 if (slp_node)
8067 tree initial_value = NULL_TREE;
8068 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8069 initial_value = vect_phi_initial_value (reduc_def_phi);
8070 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8071 orig_code, initial_value);
8074 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8076 /* We can't support in-order reductions of code such as this:
8078 for (int i = 0; i < n1; ++i)
8079 for (int j = 0; j < n2; ++j)
8080 l += a[j];
8082 since GCC effectively transforms the loop when vectorizing:
8084 for (int i = 0; i < n1 / VF; ++i)
8085 for (int j = 0; j < n2; ++j)
8086 for (int k = 0; k < VF; ++k)
8087 l += a[j];
8089 which is a reassociation of the original operation. */
8090 if (dump_enabled_p ())
8091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8092 "in-order double reduction not supported.\n");
8094 return false;
8097 if (reduction_type == FOLD_LEFT_REDUCTION
8098 && slp_node
8099 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8101 /* We cannot use in-order reductions in this case because there is
8102 an implicit reassociation of the operations involved. */
8103 if (dump_enabled_p ())
8104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8105 "in-order unchained SLP reductions not supported.\n");
8106 return false;
8109 /* For double reductions, and for SLP reductions with a neutral value,
8110 we construct a variable-length initial vector by loading a vector
8111 full of the neutral value and then shift-and-inserting the start
8112 values into the low-numbered elements. */
8113 if ((double_reduc || neutral_op)
8114 && !nunits_out.is_constant ()
8115 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8116 vectype_out, OPTIMIZE_FOR_SPEED))
8118 if (dump_enabled_p ())
8119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8120 "reduction on variable-length vectors requires"
8121 " target support for a vector-shift-and-insert"
8122 " operation.\n");
8123 return false;
8126 /* Check extra constraints for variable-length unchained SLP reductions. */
8127 if (slp_node
8128 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8129 && !nunits_out.is_constant ())
8131 /* We checked above that we could build the initial vector when
8132 there's a neutral element value. Check here for the case in
8133 which each SLP statement has its own initial value and in which
8134 that value needs to be repeated for every instance of the
8135 statement within the initial vector. */
8136 unsigned int group_size = SLP_TREE_LANES (slp_node);
8137 if (!neutral_op
8138 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8139 TREE_TYPE (vectype_out)))
8141 if (dump_enabled_p ())
8142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8143 "unsupported form of SLP reduction for"
8144 " variable-length vectors: cannot build"
8145 " initial vector.\n");
8146 return false;
8148 /* The epilogue code relies on the number of elements being a multiple
8149 of the group size. The duplicate-and-interleave approach to setting
8150 up the initial vector does too. */
8151 if (!multiple_p (nunits_out, group_size))
8153 if (dump_enabled_p ())
8154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8155 "unsupported form of SLP reduction for"
8156 " variable-length vectors: the vector size"
8157 " is not a multiple of the number of results.\n");
8158 return false;
8162 if (reduction_type == COND_REDUCTION)
8164 widest_int ni;
8166 if (! max_loop_iterations (loop, &ni))
8168 if (dump_enabled_p ())
8169 dump_printf_loc (MSG_NOTE, vect_location,
8170 "loop count not known, cannot create cond "
8171 "reduction.\n");
8172 return false;
8174 /* Convert backedges to iterations. */
8175 ni += 1;
8177 /* The additional index will be the same type as the condition. Check
8178 that the loop can fit into this less one (because we'll use up the
8179 zero slot for when there are no matches). */
8180 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
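      /* For example, with a 16-bit index type the loop may execute at most
	 65534 times: index 0 is reserved for "no match" and the iteration
	 count must stay strictly below the maximum index value 65535.  */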
8181 if (wi::geu_p (ni, wi::to_widest (max_index)))
8183 if (dump_enabled_p ())
8184 dump_printf_loc (MSG_NOTE, vect_location,
8185 "loop size is greater than data size.\n");
8186 return false;
8190 /* In case the vectorization factor (VF) is bigger than the number
8191 of elements that we can fit in a vectype (nunits), we have to generate
8192 more than one vector stmt - i.e - we need to "unroll" the
8193 vector stmt by a factor VF/nunits. For more details see documentation
8194 in vectorizable_operation. */
8196 /* If the reduction is used in an outer loop we need to generate
8197 VF intermediate results, like so (e.g. for ncopies=2):
8198 r0 = phi (init, r0)
8199 r1 = phi (init, r1)
8200 r0 = x0 + r0;
8201 r1 = x1 + r1;
8202 (i.e. we generate VF results in 2 registers).
8203 In this case we have a separate def-use cycle for each copy, and therefore
8204 for each copy we get the vector def for the reduction variable from the
8205 respective phi node created for this copy.
8207 Otherwise (the reduction is unused in the loop nest), we can combine
8208 together intermediate results, like so (e.g. for ncopies=2):
8209 r = phi (init, r)
8210 r = x0 + r;
8211 r = x1 + r;
8212 (i.e. we generate VF/2 results in a single register).
8213 In this case for each copy we get the vector def for the reduction variable
8214 from the vectorized reduction operation generated in the previous iteration.
8216 This only works when we see both the reduction PHI and its only consumer
8217 in vectorizable_reduction and there are no intermediate stmts
8218 participating. When unrolling we want each unrolled iteration to have its
8219 own reduction accumulator since one of the main goals of unrolling a
8220 reduction is to reduce the aggregate loop-carried latency. */
8221 if (ncopies > 1
8222 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8223 && reduc_chain_length == 1
8224 && loop_vinfo->suggested_unroll_factor == 1)
8225 single_defuse_cycle = true;
8227 if (single_defuse_cycle || lane_reduc_code_p)
8229 gcc_assert (op.code != COND_EXPR);
8231 /* 4. Supportable by target? */
8232 bool ok = true;
8234 /* 4.1. check support for the operation in the loop
8236 This isn't necessary for the lane reduction codes, since they
8237 can only be produced by pattern matching, and it's up to the
8238 pattern matcher to test for support. The main reason for
8239 specifically skipping this step is to avoid rechecking whether
8240 mixed-sign dot-products can be implemented using signed
8241 dot-products. */
8242 machine_mode vec_mode = TYPE_MODE (vectype_in);
8243 if (!lane_reduc_code_p
8244 && !directly_supported_p (op.code, vectype_in, optab_vector))
8246 if (dump_enabled_p ())
8247 dump_printf (MSG_NOTE, "op not supported by target.\n");
8248 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8249 || !vect_can_vectorize_without_simd_p (op.code))
8250 ok = false;
8251 else
8252 if (dump_enabled_p ())
8253 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8256 if (vect_emulated_vector_p (vectype_in)
8257 && !vect_can_vectorize_without_simd_p (op.code))
8259 if (dump_enabled_p ())
8260 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8261 return false;
8264 /* lane-reducing operations have to go through vect_transform_reduction.
8265 For the other cases try without the single cycle optimization. */
8266 if (!ok)
8268 if (lane_reduc_code_p)
8269 return false;
8270 else
8271 single_defuse_cycle = false;
8274 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8276 /* If the reduction stmt is one of the patterns that have lane
8277 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8278 if ((ncopies > 1 && ! single_defuse_cycle)
8279 && lane_reduc_code_p)
8281 if (dump_enabled_p ())
8282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8283 "multi def-use cycle not possible for lane-reducing "
8284 "reduction operation\n");
8285 return false;
8288 if (slp_node
8289 && !(!single_defuse_cycle
8290 && !lane_reduc_code_p
8291 && reduction_type != FOLD_LEFT_REDUCTION))
8292 for (i = 0; i < (int) op.num_ops; i++)
8293 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8295 if (dump_enabled_p ())
8296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8297 "incompatible vector types for invariants\n");
8298 return false;
8301 if (slp_node)
8302 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8303 else
8304 vec_num = 1;
8306 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8307 reduction_type, ncopies, cost_vec);
8308 /* Cost the reduction op inside the loop if transformed via
8309 vect_transform_reduction. Otherwise this is costed by the
8310 separate vectorizable_* routines. */
8311 if (single_defuse_cycle || lane_reduc_code_p)
8313 int factor = 1;
8314 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8315 /* Three dot-products and a subtraction. */
8316 factor = 4;
8317 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8318 stmt_info, 0, vect_body);
8321 if (dump_enabled_p ()
8322 && reduction_type == FOLD_LEFT_REDUCTION)
8323 dump_printf_loc (MSG_NOTE, vect_location,
8324 "using an in-order (fold-left) reduction.\n");
8325 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8326 /* All reductions except single def-use cycle optimized, lane-reducing and
8327 fold-left ones go through their own vectorizable_* routines. */
8328 if (!single_defuse_cycle
8329 && !lane_reduc_code_p
8330 && reduction_type != FOLD_LEFT_REDUCTION)
8332 stmt_vec_info tem
8333 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8334 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8336 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8337 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8339 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8340 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8342 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8344 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8345 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8346 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8348 if (reduction_type != FOLD_LEFT_REDUCTION
8349 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8350 && (cond_fn == IFN_LAST
8351 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8352 OPTIMIZE_FOR_SPEED)))
8354 if (dump_enabled_p ())
8355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8356 "can't operate on partial vectors because"
8357 " no conditional operation is available.\n");
8358 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8360 else if (reduction_type == FOLD_LEFT_REDUCTION
8361 && reduc_fn == IFN_LAST
8362 && !expand_vec_cond_expr_p (vectype_in,
8363 truth_type_for (vectype_in),
8364 SSA_NAME))
8366 if (dump_enabled_p ())
8367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8368 "can't operate on partial vectors because"
8369 " no conditional operation is available.\n");
8370 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8372 else if (reduction_type == FOLD_LEFT_REDUCTION
8373 && internal_fn_mask_index (reduc_fn) == -1
8374 && FLOAT_TYPE_P (vectype_in)
8375 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8377 if (dump_enabled_p ())
8378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8379 "can't operate on partial vectors because"
8380 " signed zeros cannot be preserved.\n");
8381 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8383 else
8385 internal_fn mask_reduc_fn
8386 = get_masked_reduction_fn (reduc_fn, vectype_in);
8388 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8389 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8390 vectype_in, 1);
8391 else
8392 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8393 vectype_in, NULL);
8396 return true;
8399 /* STMT_INFO is a dot-product reduction whose multiplication operands
8400 have different signs. Emit a sequence to emulate the operation
8401 using a series of signed DOT_PROD_EXPRs and return the last
8402 statement generated. VEC_DEST is the result of the vector operation
8403 and VOP lists its inputs. */
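/* A rough source-level illustration (example chosen here, not taken from the
   sources): a mixed-sign dot product typically originates from a loop like

     int32_t acc = 0;
     for (int i = 0; i < n; i++)
       acc += (int32_t) u8[i] * (int32_t) s8[i];

   with u8 unsigned char and s8 signed char.  A target that only provides a
   signed-by-signed dot-product instruction can still vectorize this via the
   emulation sequence generated below.  */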
8405 static gassign *
8406 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8407 gimple_stmt_iterator *gsi, tree vec_dest,
8408 tree vop[3])
8410 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8411 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8412 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8413 gimple *new_stmt;
8415 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8416 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8417 std::swap (vop[0], vop[1]);
8419 /* Convert all inputs to signed types. */
8420 for (int i = 0; i < 3; ++i)
8421 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8423 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8424 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8425 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8426 vop[i] = tmp;
8429 /* In the comments below we assume 8-bit inputs for simplicity,
8430 but the approach works for any full integer type. */
8432 /* Create a vector of -128. */
8433 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8434 tree min_narrow = build_vector_from_val (narrow_vectype,
8435 min_narrow_elttype);
8437 /* Create a vector of 64. */
8438 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8439 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8440 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8442 /* Emit: SUB_RES = VOP[0] - 128. */
8443 tree sub_res = make_ssa_name (narrow_vectype);
8444 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8445 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8447 /* Emit:
8449 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8450 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8451 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8453 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8454 Doing the two 64 * y steps first allows more time to compute x. */
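/* As a quick check of the identity with arbitrary values, take x = 200
   (the unsigned VOP[0] input) and y = -3 (the signed VOP[1] input):
     x * y                 = -600
     (x - 128) * y         =  72 * -3 = -216
     64 * y + 64 * y       = -192 + -192 = -384
   and -216 + -384 = -600.  Note (x - 128) always fits in the signed
   8-bit range [-128, 127].  */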
8455 tree stage1 = make_ssa_name (wide_vectype);
8456 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8457 vop[1], half_narrow, vop[2]);
8458 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8460 tree stage2 = make_ssa_name (wide_vectype);
8461 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8462 vop[1], half_narrow, stage1);
8463 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8465 tree stage3 = make_ssa_name (wide_vectype);
8466 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8467 sub_res, vop[1], stage2);
8468 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8470 /* Convert STAGE3 to the reduction type. */
8471 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8474 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8475 value. */
8477 bool
8478 vect_transform_reduction (loop_vec_info loop_vinfo,
8479 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8480 gimple **vec_stmt, slp_tree slp_node)
8482 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8483 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8484 int i;
8485 int ncopies;
8486 int vec_num;
8488 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8489 gcc_assert (reduc_info->is_reduc_info);
8491 if (nested_in_vect_loop_p (loop, stmt_info))
8493 loop = loop->inner;
8494 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8497 gimple_match_op op;
8498 if (!gimple_extract_op (stmt_info->stmt, &op))
8499 gcc_unreachable ();
8501 /* All uses but the last are expected to be defined in the loop.
8502 The last use is the reduction variable. In case of nested cycle this
8503 assumption is not true: we use reduc_index to record the index of the
8504 reduction variable. */
8505 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8506 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8507 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8508 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8510 if (slp_node)
8512 ncopies = 1;
8513 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8515 else
8517 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8518 vec_num = 1;
8521 code_helper code = canonicalize_code (op.code, op.type);
8522 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8524 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8525 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8526 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8528 /* Transform. */
8529 tree new_temp = NULL_TREE;
8530 auto_vec<tree> vec_oprnds0;
8531 auto_vec<tree> vec_oprnds1;
8532 auto_vec<tree> vec_oprnds2;
8533 tree def0;
8535 if (dump_enabled_p ())
8536 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8538 /* FORNOW: Multiple types are not supported for condition. */
8539 if (code == COND_EXPR)
8540 gcc_assert (ncopies == 1);
8542 /* A binary COND_OP reduction must have the same definition and else
8543 value. */
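/* For example (illustrative, simplified GIMPLE): an if-converted conditional
   sum reduction may reach this point as

     _r2 = .COND_ADD (_mask, _r1, _x, _r1);

   where the accumulator _r1 is both a data operand and the else value, so
   lanes with a false mask simply pass the accumulator through unchanged.  */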
8544 bool cond_fn_p = code.is_internal_fn ()
8545 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8546 if (cond_fn_p)
8548 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8549 || code == IFN_COND_MUL || code == IFN_COND_AND
8550 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8551 gcc_assert (op.num_ops == 4
8552 && (op.ops[reduc_index]
8553 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8556 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8558 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8559 if (reduction_type == FOLD_LEFT_REDUCTION)
8561 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8562 gcc_assert (code.is_tree_code () || cond_fn_p);
8563 return vectorize_fold_left_reduction
8564 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8565 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8566 reduc_index, masks, lens);
8569 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8570 gcc_assert (single_defuse_cycle
8571 || code == DOT_PROD_EXPR
8572 || code == WIDEN_SUM_EXPR
8573 || code == SAD_EXPR);
8575 /* Create the destination vector */
8576 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8577 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8579 /* Get NCOPIES vector definitions for all operands except the reduction
8580 definition. */
8581 if (!cond_fn_p)
8583 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8584 single_defuse_cycle && reduc_index == 0
8585 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8586 single_defuse_cycle && reduc_index == 1
8587 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8588 op.num_ops == 3
8589 && !(single_defuse_cycle && reduc_index == 2)
8590 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8592 else
8594 /* For a conditional operation pass the truth type as mask
8595 vectype. */
8596 gcc_assert (single_defuse_cycle
8597 && (reduc_index == 1 || reduc_index == 2));
8598 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8599 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8600 reduc_index == 1 ? NULL_TREE : op.ops[1],
8601 NULL_TREE, &vec_oprnds1,
8602 reduc_index == 2 ? NULL_TREE : op.ops[2],
8603 NULL_TREE, &vec_oprnds2);
8606 /* For single def-use cycles get one copy of the vectorized reduction
8607 definition. */
8608 if (single_defuse_cycle)
8610 gcc_assert (!slp_node);
8611 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8612 op.ops[reduc_index],
8613 reduc_index == 0 ? &vec_oprnds0
8614 : (reduc_index == 1 ? &vec_oprnds1
8615 : &vec_oprnds2));
8618 bool emulated_mixed_dot_prod
8619 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8620 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8622 gimple *new_stmt;
8623 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8624 if (masked_loop_p && !mask_by_cond_expr)
8626 /* No conditional ifns have been defined for dot-product yet. */
8627 gcc_assert (code != DOT_PROD_EXPR);
8629 /* Make sure that the reduction accumulator is vop[0]. */
8630 if (reduc_index == 1)
8632 gcc_assert (commutative_binary_op_p (code, op.type));
8633 std::swap (vop[0], vop[1]);
8635 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8636 vec_num * ncopies, vectype_in, i);
8637 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8638 vop[0], vop[1], vop[0]);
8639 new_temp = make_ssa_name (vec_dest, call);
8640 gimple_call_set_lhs (call, new_temp);
8641 gimple_call_set_nothrow (call, true);
8642 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8643 new_stmt = call;
8645 else
8647 if (op.num_ops >= 3)
8648 vop[2] = vec_oprnds2[i];
8650 if (masked_loop_p && mask_by_cond_expr)
8652 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8653 vec_num * ncopies, vectype_in, i);
8654 build_vect_cond_expr (code, vop, mask, gsi);
8657 if (emulated_mixed_dot_prod)
8658 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8659 vec_dest, vop);
8661 else if (code.is_internal_fn () && !cond_fn_p)
8662 new_stmt = gimple_build_call_internal (internal_fn (code),
8663 op.num_ops,
8664 vop[0], vop[1], vop[2]);
8665 else if (code.is_internal_fn () && cond_fn_p)
8666 new_stmt = gimple_build_call_internal (internal_fn (code),
8667 op.num_ops,
8668 vop[0], vop[1], vop[2],
8669 vop[1]);
8670 else
8671 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8672 vop[0], vop[1], vop[2]);
8673 new_temp = make_ssa_name (vec_dest, new_stmt);
8674 gimple_set_lhs (new_stmt, new_temp);
8675 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8678 if (slp_node)
8679 slp_node->push_vec_def (new_stmt);
8680 else if (single_defuse_cycle
8681 && i < ncopies - 1)
8683 if (reduc_index == 0)
8684 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8685 else if (reduc_index == 1)
8686 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8687 else if (reduc_index == 2)
8688 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8690 else
8691 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8694 if (!slp_node)
8695 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8697 return true;
8700 /* Transform phase of a cycle PHI. */
8702 bool
8703 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8704 stmt_vec_info stmt_info, gimple **vec_stmt,
8705 slp_tree slp_node, slp_instance slp_node_instance)
8707 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8708 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8709 int i;
8710 int ncopies;
8711 int j;
8712 bool nested_cycle = false;
8713 int vec_num;
8715 if (nested_in_vect_loop_p (loop, stmt_info))
8717 loop = loop->inner;
8718 nested_cycle = true;
8721 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8722 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8723 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8724 gcc_assert (reduc_info->is_reduc_info);
8726 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8727 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8728 /* Leave the scalar phi in place. */
8729 return true;
8731 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8732 /* For a nested cycle we do not fill the above. */
8733 if (!vectype_in)
8734 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8735 gcc_assert (vectype_in);
8737 if (slp_node)
8739 /* The size vect_schedule_slp_instance computes is off for us. */
8740 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8741 * SLP_TREE_LANES (slp_node), vectype_in);
8742 ncopies = 1;
8744 else
8746 vec_num = 1;
8747 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8750 /* Check whether we should use a single PHI node and accumulate
8751 vectors to one before the backedge. */
8752 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8753 ncopies = 1;
8755 /* Create the destination vector */
8756 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8757 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8758 vectype_out);
8760 /* Get the loop-entry arguments. */
8761 tree vec_initial_def = NULL_TREE;
8762 auto_vec<tree> vec_initial_defs;
8763 if (slp_node)
8765 vec_initial_defs.reserve (vec_num);
8766 if (nested_cycle)
8768 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8769 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8770 &vec_initial_defs);
8772 else
8774 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8775 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8776 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8778 unsigned int num_phis = stmts.length ();
8779 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8780 num_phis = 1;
8781 initial_values.reserve (num_phis);
8782 for (unsigned int i = 0; i < num_phis; ++i)
8784 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8785 initial_values.quick_push (vect_phi_initial_value (this_phi));
8787 if (vec_num == 1)
8788 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8789 if (!initial_values.is_empty ())
8791 tree initial_value
8792 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8793 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8794 tree neutral_op
8795 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8796 code, initial_value);
8797 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8798 &vec_initial_defs, vec_num,
8799 stmts.length (), neutral_op);
8803 else
8805 /* Get at the scalar def before the loop that defines the initial
8806 value of the reduction variable. */
8807 tree initial_def = vect_phi_initial_value (phi);
8808 reduc_info->reduc_initial_values.safe_push (initial_def);
8809 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8810 and we can't use zero for induc_val, use initial_def. Similarly
8811 for REDUC_MIN and initial_def larger than the base. */
8812 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8814 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8815 if (TREE_CODE (initial_def) == INTEGER_CST
8816 && !integer_zerop (induc_val)
8817 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8818 && tree_int_cst_lt (initial_def, induc_val))
8819 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8820 && tree_int_cst_lt (induc_val, initial_def))))
8822 induc_val = initial_def;
8823 /* Communicate that we used the initial_def to epilogue
8824 generation. */
8825 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8827 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8829 else if (nested_cycle)
8831 /* Do not use an adjustment def as that case is not supported
8832 correctly if ncopies is not one. */
8833 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8834 ncopies, initial_def,
8835 &vec_initial_defs);
8837 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8838 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8839 /* Fill the initial vector with the initial scalar value. */
8840 vec_initial_def
8841 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8842 initial_def, initial_def);
8843 else
8845 if (ncopies == 1)
8846 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8847 if (!reduc_info->reduc_initial_values.is_empty ())
8849 initial_def = reduc_info->reduc_initial_values[0];
8850 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8851 tree neutral_op
8852 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8853 code, initial_def);
8854 gcc_assert (neutral_op);
8855 /* Try to simplify the vector initialization by applying an
8856 adjustment after the reduction has been performed. */
8857 if (!reduc_info->reused_accumulator
8858 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8859 && !operand_equal_p (neutral_op, initial_def))
8861 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8862 = initial_def;
8863 initial_def = neutral_op;
8865 vec_initial_def
8866 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8867 initial_def, neutral_op);
8872 if (vec_initial_def)
8874 vec_initial_defs.create (ncopies);
8875 for (i = 0; i < ncopies; ++i)
8876 vec_initial_defs.quick_push (vec_initial_def);
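/* A sketch of the case handled next (assumed example): if the main
   vectorized loop accumulated into, say, a V8HI vector and this is the
   epilogue loop vectorized with V4HI, the reused accumulator must first be
   reduced from eight lanes to four (and possibly converted to the epilogue's
   vector mode and signedness) before it can seed the epilogue's reduction
   PHI below.  */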
8879 if (auto *accumulator = reduc_info->reused_accumulator)
8881 tree def = accumulator->reduc_input;
8882 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8884 unsigned int nreduc;
8885 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8886 (TREE_TYPE (def)),
8887 TYPE_VECTOR_SUBPARTS (vectype_out),
8888 &nreduc);
8889 gcc_assert (res);
8890 gimple_seq stmts = NULL;
8891 /* Reduce the single vector to a smaller one. */
8892 if (nreduc != 1)
8894 /* Perform the reduction in the appropriate type. */
8895 tree rvectype = vectype_out;
8896 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8897 TREE_TYPE (TREE_TYPE (def))))
8898 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8899 TYPE_VECTOR_SUBPARTS
8900 (vectype_out));
8901 def = vect_create_partial_epilog (def, rvectype,
8902 STMT_VINFO_REDUC_CODE
8903 (reduc_info),
8904 &stmts);
8906 /* The epilogue loop might use a different vector mode, like
8907 VNx2DI vs. V2DI. */
8908 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8910 tree reduc_type = build_vector_type_for_mode
8911 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8912 def = gimple_convert (&stmts, reduc_type, def);
8914 /* Adjust the input so we pick up the partially reduced value
8915 for the skip edge in vect_create_epilog_for_reduction. */
8916 accumulator->reduc_input = def;
8917 /* And the reduction could be carried out using a different sign. */
8918 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8919 def = gimple_convert (&stmts, vectype_out, def);
8920 if (loop_vinfo->main_loop_edge)
8922 /* While we'd like to insert on the edge, this would split
8923 blocks and disturb bookkeeping; we will also eventually
8924 need this on the skip edge.  Rely on sinking to
8925 fix up optimal placement and insert in the predecessor. */
8926 gimple_stmt_iterator gsi
8927 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8928 /* Insert before a cond that eventually skips the
8929 epilogue. */
8930 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8931 gsi_prev (&gsi);
8932 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8934 else
8935 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8936 stmts);
8938 if (loop_vinfo->main_loop_edge)
8939 vec_initial_defs[0]
8940 = vect_get_main_loop_result (loop_vinfo, def,
8941 vec_initial_defs[0]);
8942 else
8943 vec_initial_defs.safe_push (def);
8946 /* Generate the reduction PHIs upfront. */
8947 for (i = 0; i < vec_num; i++)
8949 tree vec_init_def = vec_initial_defs[i];
8950 for (j = 0; j < ncopies; j++)
8952 /* Create the reduction-phi that defines the reduction
8953 operand. */
8954 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8956 /* Set the loop-entry arg of the reduction-phi. */
8957 if (j != 0 && nested_cycle)
8958 vec_init_def = vec_initial_defs[j];
8959 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8960 UNKNOWN_LOCATION);
8962 /* The loop-latch arg is set in epilogue processing. */
8964 if (slp_node)
8965 slp_node->push_vec_def (new_phi);
8966 else
8968 if (j == 0)
8969 *vec_stmt = new_phi;
8970 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8975 return true;
8978 /* Vectorizes LC PHIs. */
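/* An illustrative example (simplified GIMPLE): a loop-closed PHI is a
   single-argument PHI in the block just after the loop, e.g.

     exit_bb:
       sum_4 = PHI <sum_3(loop_bb)>

   Vectorizing it amounts to creating the corresponding single-argument
   vector PHI fed by the vectorized definition of sum_3.  */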
8980 bool
8981 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8982 stmt_vec_info stmt_info, gimple **vec_stmt,
8983 slp_tree slp_node)
8985 if (!loop_vinfo
8986 || !is_a <gphi *> (stmt_info->stmt)
8987 || gimple_phi_num_args (stmt_info->stmt) != 1)
8988 return false;
8990 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8991 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8992 return false;
8994 if (!vec_stmt) /* transformation not required. */
8996 /* Deal with copies from externs or constants that masquerade as
8997 loop-closed PHI nodes (PR97886). */
8998 if (slp_node
8999 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
9000 SLP_TREE_VECTYPE (slp_node)))
9002 if (dump_enabled_p ())
9003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9004 "incompatible vector types for invariants\n");
9005 return false;
9007 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9008 return true;
9011 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9012 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9013 basic_block bb = gimple_bb (stmt_info->stmt);
9014 edge e = single_pred_edge (bb);
9015 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9016 auto_vec<tree> vec_oprnds;
9017 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9018 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9019 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9020 for (unsigned i = 0; i < vec_oprnds.length (); i++)
9022 /* Create the vectorized LC PHI node. */
9023 gphi *new_phi = create_phi_node (vec_dest, bb);
9024 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9025 if (slp_node)
9026 slp_node->push_vec_def (new_phi);
9027 else
9028 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9030 if (!slp_node)
9031 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9033 return true;
9036 /* Vectorizes PHIs. */
9038 bool
9039 vectorizable_phi (vec_info *,
9040 stmt_vec_info stmt_info, gimple **vec_stmt,
9041 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9043 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9044 return false;
9046 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9047 return false;
9049 tree vectype = SLP_TREE_VECTYPE (slp_node);
9051 if (!vec_stmt) /* transformation not required. */
9053 slp_tree child;
9054 unsigned i;
9055 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9056 if (!child)
9058 if (dump_enabled_p ())
9059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9060 "PHI node with unvectorized backedge def\n");
9061 return false;
9063 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9067 "incompatible vector types for invariants\n");
9068 return false;
9070 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9071 && !useless_type_conversion_p (vectype,
9072 SLP_TREE_VECTYPE (child)))
9074 /* With bools we can have mask and non-mask precision vectors
9075 or different non-mask precisions.  While pattern recog is
9076 supposed to guarantee consistency here, bugs in it can cause
9077 mismatches (PR103489 and PR103800 for example).
9078 Deal with them here instead of ICEing later. */
9079 if (dump_enabled_p ())
9080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9081 "incompatible vector type setup from "
9082 "bool pattern detection\n");
9083 return false;
9086 /* For single-argument PHIs assume coalescing which means zero cost
9087 for the scalar and the vector PHIs. This avoids artificially
9088 favoring the vector path (but may pessimize it in some cases). */
9089 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9090 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9091 vector_stmt, stmt_info, vectype, 0, vect_body);
9092 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9093 return true;
9096 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9097 basic_block bb = gimple_bb (stmt_info->stmt);
9098 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9099 auto_vec<gphi *> new_phis;
9100 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9102 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9104 /* Skip not yet vectorized defs. */
9105 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9106 && SLP_TREE_VEC_DEFS (child).is_empty ())
9107 continue;
9109 auto_vec<tree> vec_oprnds;
9110 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9111 if (!new_phis.exists ())
9113 new_phis.create (vec_oprnds.length ());
9114 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9116 /* Create the vectorized PHI node. */
9117 new_phis.quick_push (create_phi_node (vec_dest, bb));
9118 slp_node->push_vec_def (new_phis[j]);
9121 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9122 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9123 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9125 /* We should have at least one already vectorized child. */
9126 gcc_assert (new_phis.exists ());
9128 return true;
9131 /* Vectorizes first order recurrences. An overview of the transformation
9132 is described below. Suppose we have the following loop.
9134 int t = 0;
9135 for (int i = 0; i < n; ++i)
9137 b[i] = a[i] - t;
9138 t = a[i];
9141 There is a first-order recurrence on 't' (it carries a[i] from the previous iteration).  For this loop, the scalar IR
9142 looks (simplified) like:
9144 scalar.preheader:
9145 init = 0;
9147 scalar.body:
9148 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9149 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9150 _1 = a[i]
9151 b[i] = _1 - _2
9152 if (i < n) goto scalar.body
9154 In this example, _2 is a recurrence because its value depends on the
9155 previous iteration. We vectorize this as (VF = 4)
9157 vector.preheader:
9158 vect_init = vect_cst(..., ..., ..., 0)
9160 vector.body
9161 i = PHI <0(vector.preheader), i+4(vector.body)>
9162 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9163 vect_2 = a[i, i+1, i+2, i+3];
9164 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9165 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9166 if (..) goto vector.body
9168 In this function, vectorizable_recurr, we code generate both the
9169 vector PHI node and the permute since those together compute the
9170 vectorized value of the scalar PHI. We do not yet have the
9171 backedge value to fill in there nor into the vec_perm. Those
9172 are filled in maybe_set_vectorized_backedge_value and
9173 vect_schedule_scc.
9175 TODO: Since the scalar loop does not have a use of the recurrence
9176 outside of the loop the natural way to implement peeling via
9177 vectorizing the live value doesn't work. For now peeling of loops
9178 with a recurrence is not implemented. For SLP the supported cases
9179 are restricted to those requiring a single vector recurrence PHI. */
9181 bool
9182 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9183 gimple **vec_stmt, slp_tree slp_node,
9184 stmt_vector_for_cost *cost_vec)
9186 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9187 return false;
9189 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9191 /* So far we only support first-order recurrence auto-vectorization. */
9192 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9193 return false;
9195 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9196 unsigned ncopies;
9197 if (slp_node)
9198 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9199 else
9200 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9201 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9202 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9203 /* We need to be able to make progress with a single vector. */
9204 if (maybe_gt (dist * 2, nunits))
9206 if (dump_enabled_p ())
9207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9208 "first order recurrence exceeds half of "
9209 "a vector\n");
9210 return false;
9213 /* First-order recurrence autovectorization needs to handle permutation
9214 with indices = [nunits-1, nunits, nunits+1, ...]. */
9215 vec_perm_builder sel (nunits, 1, 3);
9216 for (int i = 0; i < 3; ++i)
9217 sel.quick_push (nunits - dist + i);
9218 vec_perm_indices indices (sel, 2, nunits);
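/* For instance, with nunits == 4 and dist == 1 the selected lanes are
   { 3, 4, 5, 6 }: the last lane of the previous vector followed by the
   first three lanes of the current one; with dist == 2 (two SLP lanes)
   they are { 2, 3, 4, 5 }.  */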
9220 if (!vec_stmt) /* transformation not required. */
9222 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9223 indices))
9224 return false;
9226 if (slp_node)
9228 /* We eventually need to set a vector type on invariant
9229 arguments. */
9230 unsigned j;
9231 slp_tree child;
9232 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9233 if (!vect_maybe_update_slp_op_vectype
9234 (child, SLP_TREE_VECTYPE (slp_node)))
9236 if (dump_enabled_p ())
9237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9238 "incompatible vector types for "
9239 "invariants\n");
9240 return false;
9243 /* The recurrence costs the initialization vector and one permute
9244 for each copy. */
9245 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9246 stmt_info, 0, vect_prologue);
9247 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9248 stmt_info, 0, vect_body);
9249 if (dump_enabled_p ())
9250 dump_printf_loc (MSG_NOTE, vect_location,
9251 "vectorizable_recurr: inside_cost = %d, "
9252 "prologue_cost = %d .\n", inside_cost,
9253 prologue_cost);
9255 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9256 return true;
9259 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9260 basic_block bb = gimple_bb (phi);
9261 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9262 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9264 gimple_seq stmts = NULL;
9265 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9266 gsi_insert_seq_on_edge_immediate (pe, stmts);
9268 tree vec_init = build_vector_from_val (vectype, preheader);
9269 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9271 /* Create the vectorized first-order PHI node. */
9272 tree vec_dest = vect_get_new_vect_var (vectype,
9273 vect_simple_var, "vec_recur_");
9274 gphi *new_phi = create_phi_node (vec_dest, bb);
9275 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9277 /* Insert the shuffles for the first-order recurrence autovectorization:
9278 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9279 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9281 /* Insert the required permute after the latch definition. The
9282 second and later operands are tentative and will be updated when we have
9283 vectorized the latch definition. */
9284 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9285 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9286 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9287 gsi_next (&gsi2);
9289 for (unsigned i = 0; i < ncopies; ++i)
9291 vec_dest = make_ssa_name (vectype);
9292 gassign *vperm
9293 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9294 i == 0 ? gimple_phi_result (new_phi) : NULL,
9295 NULL, perm);
9296 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9298 if (slp_node)
9299 slp_node->push_vec_def (vperm);
9300 else
9301 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9304 if (!slp_node)
9305 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9306 return true;
9309 /* Return true if VECTYPE represents a vector that requires lowering
9310 by the vector lowering pass. */
9312 bool
9313 vect_emulated_vector_p (tree vectype)
9315 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9316 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9317 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9320 /* Return true if we can emulate CODE on an integer mode representation
9321 of a vector. */
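/* For instance (illustrative): on a target without vector modes a V4QI
   vector is represented as a 32-bit integer.  The bitwise codes work
   directly on that representation and plus/minus/negate can be lowered
   piecewise by the vector lowering pass, so they are accepted below;
   MULT_EXPR is not in the list, so e.g. a V4QI multiply still requires
   real vector support.  */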
9323 bool
9324 vect_can_vectorize_without_simd_p (tree_code code)
9326 switch (code)
9328 case PLUS_EXPR:
9329 case MINUS_EXPR:
9330 case NEGATE_EXPR:
9331 case BIT_AND_EXPR:
9332 case BIT_IOR_EXPR:
9333 case BIT_XOR_EXPR:
9334 case BIT_NOT_EXPR:
9335 return true;
9337 default:
9338 return false;
9342 /* Likewise, but taking a code_helper. */
9344 bool
9345 vect_can_vectorize_without_simd_p (code_helper code)
9347 return (code.is_tree_code ()
9348 && vect_can_vectorize_without_simd_p (tree_code (code)));
9351 /* Create vector init for vectorized iv. */
9352 static tree
9353 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9354 tree step_expr, poly_uint64 nunits,
9355 tree vectype,
9356 enum vect_induction_op_type induction_type)
9358 unsigned HOST_WIDE_INT const_nunits;
9359 tree vec_shift, vec_init, new_name;
9360 unsigned i;
9361 tree itype = TREE_TYPE (vectype);
9363 /* iv_loop is the loop to be vectorized.  Create the first VF values of
9364 the IV according to INDUCTION_TYPE (S = step_expr, X = init_expr). */
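/* For illustration (arbitrary values), with X = 16, S = 1 and four lanes:
     vect_step_op_shr: [16 >> 0, 16 >> 1, 16 >> 2, 16 >> 3] = [16, 8, 4, 2]
     vect_step_op_neg: [16, -16, 16, -16]
     vect_step_op_mul (with S = 3): [16 * 1, 16 * 3, 16 * 9, 16 * 27].  */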
9365 new_name = gimple_convert (stmts, itype, init_expr);
9366 switch (induction_type)
9368 case vect_step_op_shr:
9369 case vect_step_op_shl:
9370 /* Build the initial value by shifting INIT by the series [0, S, 2*S, ...]. */
9371 vec_init = gimple_build_vector_from_val (stmts,
9372 vectype,
9373 new_name);
9374 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9375 build_zero_cst (itype), step_expr);
9376 vec_init = gimple_build (stmts,
9377 (induction_type == vect_step_op_shr
9378 ? RSHIFT_EXPR : LSHIFT_EXPR),
9379 vectype, vec_init, vec_shift);
9380 break;
9382 case vect_step_op_neg:
9384 vec_init = gimple_build_vector_from_val (stmts,
9385 vectype,
9386 new_name);
9387 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9388 vectype, vec_init);
9389 /* The encoding has 2 interleaved stepped patterns. */
9390 vec_perm_builder sel (nunits, 2, 3);
9391 sel.quick_grow (6);
9392 for (i = 0; i < 3; i++)
9394 sel[2 * i] = i;
9395 sel[2 * i + 1] = i + nunits;
9397 vec_perm_indices indices (sel, 2, nunits);
9398 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9399 fail when vec_init is a const vector.  In that situation the vec_perm is not
9400 really needed. */
9401 tree perm_mask_even
9402 = vect_gen_perm_mask_any (vectype, indices);
9403 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9404 vectype,
9405 vec_init, vec_neg,
9406 perm_mask_even);
9408 break;
9410 case vect_step_op_mul:
9412 /* Use an unsigned mult to avoid undefined (signed) integer overflow. */
9413 gcc_assert (nunits.is_constant (&const_nunits));
9414 tree utype = unsigned_type_for (itype);
9415 tree uvectype = build_vector_type (utype,
9416 TYPE_VECTOR_SUBPARTS (vectype));
9417 new_name = gimple_convert (stmts, utype, new_name);
9418 vec_init = gimple_build_vector_from_val (stmts,
9419 uvectype,
9420 new_name);
9421 tree_vector_builder elts (uvectype, const_nunits, 1);
9422 tree elt_step = build_one_cst (utype);
9424 elts.quick_push (elt_step);
9425 for (i = 1; i < const_nunits; i++)
9427 /* Create: new_name_i = new_name_{i-1} * step_expr, i.e. pow (step_expr, i). */
9428 elt_step = gimple_build (stmts, MULT_EXPR,
9429 utype, elt_step, step_expr);
9430 elts.quick_push (elt_step);
9432 /* Create a vector from [new_name_0, new_name_1, ...,
9433 new_name_nunits-1]. */
9434 tree vec_mul = gimple_build_vector (stmts, &elts);
9435 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9436 vec_init, vec_mul);
9437 vec_init = gimple_convert (stmts, vectype, vec_init);
9439 break;
9441 default:
9442 gcc_unreachable ();
9445 return vec_init;
9448 /* Peel INIT_EXPR by SKIP_NITERS iterations for INDUCTION_TYPE. */
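/* A sketch with assumed values: for SKIP_NITERS == 2,
     vect_step_op_neg: the initial value is unchanged (negated twice);
     vect_step_op_shr with step S: it becomes INIT >> (2 * S), or the
       well-defined fallback (0, or INIT >> (prec - 1) for arithmetic
       shifts) once the total shift amount reaches the precision;
     vect_step_op_mul with step S: it becomes INIT * S^2, computed modulo
       2^precision.  */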
9449 tree
9450 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9451 tree skip_niters, tree step_expr,
9452 enum vect_induction_op_type induction_type)
9454 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9455 tree type = TREE_TYPE (init_expr);
9456 unsigned prec = TYPE_PRECISION (type);
9457 switch (induction_type)
9459 case vect_step_op_neg:
9460 if (TREE_INT_CST_LOW (skip_niters) % 2)
9461 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9462 /* else no change. */
9463 break;
9465 case vect_step_op_shr:
9466 case vect_step_op_shl:
9467 skip_niters = gimple_convert (stmts, type, skip_niters);
9468 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9469 /* When the shift amount >= precision, we need to avoid undefined behavior.
9470 In the original loop there is no UB; according to the semantics,
9471 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9472 if (!tree_fits_uhwi_p (step_expr)
9473 || tree_to_uhwi (step_expr) >= prec)
9475 if (induction_type == vect_step_op_shl
9476 || TYPE_UNSIGNED (type))
9477 init_expr = build_zero_cst (type);
9478 else
9479 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9480 init_expr,
9481 wide_int_to_tree (type, prec - 1));
9483 else
9484 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9485 ? RSHIFT_EXPR : LSHIFT_EXPR),
9486 type, init_expr, step_expr);
9487 break;
9489 case vect_step_op_mul:
9491 tree utype = unsigned_type_for (type);
9492 init_expr = gimple_convert (stmts, utype, init_expr);
9493 wide_int skipn = wi::to_wide (skip_niters);
9494 wide_int begin = wi::to_wide (step_expr);
9495 auto_mpz base, exp, mod, res;
9496 wi::to_mpz (begin, base, TYPE_SIGN (type));
9497 wi::to_mpz (skipn, exp, UNSIGNED);
9498 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9499 mpz_powm (res, base, exp, mod);
9500 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9501 tree mult_expr = wide_int_to_tree (utype, begin);
9502 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9503 init_expr, mult_expr);
9504 init_expr = gimple_convert (stmts, type, init_expr);
9506 break;
9508 default:
9509 gcc_unreachable ();
9512 return init_expr;
9515 /* Create vector step for vectorized iv. */
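/* For example (values assumed), with VF == 4: a shift iv with scalar step S
   gets the vector step 4 * S (each vector iteration advances by four scalar
   iterations), a multiplicative iv with step S gets S^4, and a negated iv
   needs no step at all (NULL is returned) since an even number of negations
   leaves every lane unchanged.  */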
9516 static tree
9517 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9518 poly_uint64 vf,
9519 enum vect_induction_op_type induction_type)
9521 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9522 tree new_name = NULL;
9523 /* Step should be pow (step, vf) for mult induction. */
9524 if (induction_type == vect_step_op_mul)
9526 gcc_assert (vf.is_constant ());
9527 wide_int begin = wi::to_wide (step_expr);
9529 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9530 begin = wi::mul (begin, wi::to_wide (step_expr));
9532 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9534 else if (induction_type == vect_step_op_neg)
9535 /* Do nothing. */
9537 else
9538 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9539 expr, step_expr);
9540 return new_name;
9543 static tree
9544 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9545 stmt_vec_info stmt_info,
9546 tree new_name, tree vectype,
9547 enum vect_induction_op_type induction_type)
9549 /* No step is needed for neg induction. */
9550 if (induction_type == vect_step_op_neg)
9551 return NULL;
9553 tree t = unshare_expr (new_name);
9554 gcc_assert (CONSTANT_CLASS_P (new_name)
9555 || TREE_CODE (new_name) == SSA_NAME);
9556 tree new_vec = build_vector_from_val (vectype, t);
9557 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9558 new_vec, vectype, NULL);
9559 return vec_step;
9562 /* Update the vectorized iv with VEC_STEP; INDUC_DEF is the current value. */
9563 static tree
9564 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9565 tree induc_def, tree vec_step,
9566 enum vect_induction_op_type induction_type)
9568 tree vec_def = induc_def;
9569 switch (induction_type)
9571 case vect_step_op_mul:
9573 /* Use an unsigned mult to avoid undefined (signed) integer overflow. */
9574 tree uvectype
9575 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9576 TYPE_VECTOR_SUBPARTS (vectype));
9577 vec_def = gimple_convert (stmts, uvectype, vec_def);
9578 vec_step = gimple_convert (stmts, uvectype, vec_step);
9579 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9580 vec_def, vec_step);
9581 vec_def = gimple_convert (stmts, vectype, vec_def);
9583 break;
9585 case vect_step_op_shr:
9586 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9587 vec_def, vec_step);
9588 break;
9590 case vect_step_op_shl:
9591 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9592 vec_def, vec_step);
9593 break;
9594 case vect_step_op_neg:
9595 vec_def = induc_def;
9596 /* Do nothing. */
9597 break;
9598 default:
9599 gcc_unreachable ();
9602 return vec_def;
9606 /* Function vectorizable_nonlinear_induction
9608 Check if STMT_INFO performs a nonlinear induction computation that can be
9609 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9610 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9611 basic block.
9612 Return true if STMT_INFO is vectorizable in this way. */
9614 static bool
9615 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9616 stmt_vec_info stmt_info,
9617 gimple **vec_stmt, slp_tree slp_node,
9618 stmt_vector_for_cost *cost_vec)
9620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9621 unsigned ncopies;
9622 bool nested_in_vect_loop = false;
9623 class loop *iv_loop;
9624 tree vec_def;
9625 edge pe = loop_preheader_edge (loop);
9626 basic_block new_bb;
9627 tree vec_init, vec_step;
9628 tree new_name;
9629 gimple *new_stmt;
9630 gphi *induction_phi;
9631 tree induc_def, vec_dest;
9632 tree init_expr, step_expr;
9633 tree niters_skip;
9634 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9635 unsigned i;
9636 gimple_stmt_iterator si;
9638 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9640 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9641 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9642 enum vect_induction_op_type induction_type
9643 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9645 gcc_assert (induction_type > vect_step_op_add);
9647 if (slp_node)
9648 ncopies = 1;
9649 else
9650 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9651 gcc_assert (ncopies >= 1);
9653 /* FORNOW. Only handle nonlinear induction in the same loop. */
9654 if (nested_in_vect_loop_p (loop, stmt_info))
9656 if (dump_enabled_p ())
9657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9658 "nonlinear induction in nested loop.\n");
9659 return false;
9662 iv_loop = loop;
9663 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9665 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9666 update for each iv and a permutation to generate the wanted vector iv. */
9667 if (slp_node)
9669 if (dump_enabled_p ())
9670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9671 "SLP induction not supported for nonlinear"
9672 " induction.\n");
9673 return false;
9676 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9678 if (dump_enabled_p ())
9679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9680 "floating point nonlinear induction vectorization"
9681 " not supported.\n");
9682 return false;
9685 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9686 init_expr = vect_phi_initial_value (phi);
9687 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9688 && TREE_CODE (step_expr) == INTEGER_CST);
9689 /* step_expr needs to have the same type as init_expr,
9690 e.g. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9691 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9693 if (TREE_CODE (init_expr) == INTEGER_CST)
9694 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9695 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9697 /* INIT_EXPR could be a bit_field; bail out in that case. */
9698 if (dump_enabled_p ())
9699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9700 "nonlinear induction vectorization failed:"
9701 " component type of vectype is not a nop conversion"
9702 " from type of init_expr.\n");
9703 return false;
9706 switch (induction_type)
9708 case vect_step_op_neg:
9709 if (TREE_CODE (init_expr) != INTEGER_CST
9710 && TREE_CODE (init_expr) != REAL_CST)
9712 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9713 if (!directly_supported_p (NEGATE_EXPR, vectype))
9714 return false;
9716 /* The encoding has 2 interleaved stepped patterns. */
9717 vec_perm_builder sel (nunits, 2, 3);
9718 machine_mode mode = TYPE_MODE (vectype);
9719 sel.quick_grow (6);
9720 for (i = 0; i < 3; i++)
9722 sel[i * 2] = i;
9723 sel[i * 2 + 1] = i + nunits;
9725 vec_perm_indices indices (sel, 2, nunits);
9726 if (!can_vec_perm_const_p (mode, mode, indices))
9727 return false;
9729 break;
9731 case vect_step_op_mul:
9733 /* Check for backend support of MULT_EXPR. */
9734 if (!directly_supported_p (MULT_EXPR, vectype))
9735 return false;
9737 /* ?? How to construct the vector step for variable-length vectors:
9738 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9739 if (!vf.is_constant ())
9740 return false;
9742 break;
9744 case vect_step_op_shr:
9745 /* Check for backend support of RSHIFT_EXPR. */
9746 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9747 return false;
9749 /* Don't shift more than type precision to avoid UD. */
9750 if (!tree_fits_uhwi_p (step_expr)
9751 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9752 TYPE_PRECISION (TREE_TYPE (init_expr))))
9753 return false;
9754 break;
9756 case vect_step_op_shl:
9757 /* Check for backend support of LSHIFT_EXPR. */
9758 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9759 return false;
9761 /* Don't shift more than type precision to avoid UD. */
9762 if (!tree_fits_uhwi_p (step_expr)
9763 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9764 TYPE_PRECISION (TREE_TYPE (init_expr))))
9765 return false;
9767 break;
9769 default:
9770 gcc_unreachable ();
9773 if (!vec_stmt) /* transformation not required. */
9775 unsigned inside_cost = 0, prologue_cost = 0;
9776 /* Loop cost for vec_loop. */
9778 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9779 stmt_info, 0, vect_body);
9781 /* Neg induction doesn't have any inside_cost. */
9783 if (induction_type == vect_step_op_neg)
9784 inside_cost = 0;
9786 /* prologue cost for vec_init and vec_step. */
9787 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9788 stmt_info, 0, vect_prologue);
9790 if (dump_enabled_p ())
9791 dump_printf_loc (MSG_NOTE, vect_location,
9792 "vect_model_induction_cost: inside_cost = %d, "
9793 "prologue_cost = %d. \n", inside_cost,
9794 prologue_cost);
9796 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9797 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9798 return true;
9801 /* Transform. */
9803 /* Compute a vector variable, initialized with the first VF values of
9804 the induction variable.  E.g., for a multiplicative iv with IV_PHI='X'
9805 and step S, for a vector of 4 units, we want to compute:
9806 [X, X*S, X*S^2, X*S^3]. */
9808 if (dump_enabled_p ())
9809 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9811 pe = loop_preheader_edge (iv_loop);
9812 /* Find the first insertion point in the BB. */
9813 basic_block bb = gimple_bb (phi);
9814 si = gsi_after_labels (bb);
9816 gimple_seq stmts = NULL;
9818 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9819 /* If we are using the loop mask to "peel" for alignment then we need
9820 to adjust the start value here. */
9821 if (niters_skip != NULL_TREE)
9822 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9823 step_expr, induction_type);
9825 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9826 step_expr, nunits, vectype,
9827 induction_type);
9828 if (stmts)
9830 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9831 gcc_assert (!new_bb);
9834 stmts = NULL;
9835 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9836 vf, induction_type);
9837 if (stmts)
9839 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9840 gcc_assert (!new_bb);
9843 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9844 new_name, vectype,
9845 induction_type);
9846 /* Create the following def-use cycle:
9847 loop prolog:
9848 vec_init = ...
9849 vec_step = ...
9850 loop:
9851 vec_iv = PHI <vec_init, vec_loop>
9853 STMT
9855 vec_loop = vec_iv OP vec_step; (OP as given by INDUCTION_TYPE) */
9857 /* Create the induction-phi that defines the induction-operand. */
9858 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9859 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9860 induc_def = PHI_RESULT (induction_phi);
9862 /* Create the iv update inside the loop. */
9863 stmts = NULL;
9864 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9865 induc_def, vec_step,
9866 induction_type);
9868 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9869 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9871 /* Set the arguments of the phi node: */
9872 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9873 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9874 UNKNOWN_LOCATION);
9876 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9877 *vec_stmt = induction_phi;
9879 /* In case the vectorization factor (VF) is bigger than the number
9880 of elements that we can fit in a vectype (nunits), we have to generate
9881 more than one vector stmt, i.e. we need to "unroll" the
9882 vector stmt by a factor VF/nunits. For more details see documentation
9883 in vectorizable_operation. */
9885 if (ncopies > 1)
9887 stmts = NULL;
9888 /* FORNOW. This restriction should be relaxed. */
9889 gcc_assert (!nested_in_vect_loop);
9891 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9892 nunits, induction_type);
9894 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9895 new_name, vectype,
9896 induction_type);
9897 vec_def = induc_def;
9898 for (i = 1; i < ncopies; i++)
9900 /* vec_i = vec_prev OP vec_step. */
9901 stmts = NULL;
9902 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9903 vec_def, vec_step,
9904 induction_type);
9905 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9906 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9907 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9911 if (dump_enabled_p ())
9912 dump_printf_loc (MSG_NOTE, vect_location,
9913 "transform induction: created def-use cycle: %G%G",
9914 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9916 return true;
9919 /* Function vectorizable_induction
9921 Check if STMT_INFO performs an induction computation that can be vectorized.
9922 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9923 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9924 Return true if STMT_INFO is vectorizable in this way. */
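/* As a brief illustration (assumed numbers): for an iv i with start 0,
   step 1, VF == 4 and a single copy, the code below produces roughly

     vec_iv = PHI <{ 0, 1, 2, 3 }(preheader), vec_iv_next(latch)>
     ...
     vec_iv_next = vec_iv + { 4, 4, 4, 4 };

   so each vector lane tracks i for one of the four concurrent scalar
   iterations.  */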
9926 bool
9927 vectorizable_induction (loop_vec_info loop_vinfo,
9928 stmt_vec_info stmt_info,
9929 gimple **vec_stmt, slp_tree slp_node,
9930 stmt_vector_for_cost *cost_vec)
9932 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9933 unsigned ncopies;
9934 bool nested_in_vect_loop = false;
9935 class loop *iv_loop;
9936 tree vec_def;
9937 edge pe = loop_preheader_edge (loop);
9938 basic_block new_bb;
9939 tree new_vec, vec_init, vec_step, t;
9940 tree new_name;
9941 gimple *new_stmt;
9942 gphi *induction_phi;
9943 tree induc_def, vec_dest;
9944 tree init_expr, step_expr;
9945 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9946 unsigned i;
9947 tree expr;
9948 gimple_stmt_iterator si;
9949 enum vect_induction_op_type induction_type
9950 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9952 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9953 if (!phi)
9954 return false;
9956 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9957 return false;
9959 /* Make sure it was recognized as an induction computation. */
9960 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9961 return false;
9963 /* Handle nonlinear induction in a separate place. */
9964 if (induction_type != vect_step_op_add)
9965 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9966 vec_stmt, slp_node, cost_vec);
9968 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9969 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9971 if (slp_node)
9972 ncopies = 1;
9973 else
9974 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9975 gcc_assert (ncopies >= 1);
9977 /* FORNOW. These restrictions should be relaxed. */
9978 if (nested_in_vect_loop_p (loop, stmt_info))
9980 imm_use_iterator imm_iter;
9981 use_operand_p use_p;
9982 gimple *exit_phi;
9983 edge latch_e;
9984 tree loop_arg;
9986 if (ncopies > 1)
9988 if (dump_enabled_p ())
9989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9990 "multiple types in nested loop.\n");
9991 return false;
9994 exit_phi = NULL;
9995 latch_e = loop_latch_edge (loop->inner);
9996 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9997 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9999 gimple *use_stmt = USE_STMT (use_p);
10000 if (is_gimple_debug (use_stmt))
10001 continue;
10003 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10005 exit_phi = use_stmt;
10006 break;
10009 if (exit_phi)
10011 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10012 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10013 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10015 if (dump_enabled_p ())
10016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10017 "inner-loop induction only used outside "
10018 "of the outer vectorized loop.\n");
10019 return false;
10023 nested_in_vect_loop = true;
10024 iv_loop = loop->inner;
10026 else
10027 iv_loop = loop;
10028 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10030 if (slp_node && !nunits.is_constant ())
10032 /* The current SLP code creates the step value element-by-element. */
10033 if (dump_enabled_p ())
10034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10035 "SLP induction not supported for variable-length"
10036 " vectors.\n");
10037 return false;
10040 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10042 if (dump_enabled_p ())
10043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10044 "floating point induction vectorization disabled\n");
10045 return false;
10048 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10049 gcc_assert (step_expr != NULL_TREE);
10050 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10051 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10053 if (dump_enabled_p ())
10054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10055 "bit-precision induction vectorization not "
10056 "supported.\n");
10057 return false;
10059 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10061 /* Check for backend support of PLUS/MINUS_EXPR. */
10062 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10063 || !directly_supported_p (MINUS_EXPR, step_vectype))
10064 return false;
10066 if (!vec_stmt) /* transformation not required. */
10068 unsigned inside_cost = 0, prologue_cost = 0;
10069 if (slp_node)
10071 /* We eventually need to set a vector type on invariant
10072 arguments. */
10073 unsigned j;
10074 slp_tree child;
10075 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10076 if (!vect_maybe_update_slp_op_vectype
10077 (child, SLP_TREE_VECTYPE (slp_node)))
10079 if (dump_enabled_p ())
10080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10081 "incompatible vector types for "
10082 "invariants\n");
10083 return false;
10085 /* loop cost for vec_loop. */
10086 inside_cost
10087 = record_stmt_cost (cost_vec,
10088 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10089 vector_stmt, stmt_info, 0, vect_body);
10090 /* prologue cost for vec_init (if not nested) and step. */
10091 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10092 scalar_to_vec,
10093 stmt_info, 0, vect_prologue);
10095 else /* if (!slp_node) */
10097 /* loop cost for vec_loop. */
10098 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10099 stmt_info, 0, vect_body);
10100 /* prologue cost for vec_init and vec_step. */
10101 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10102 stmt_info, 0, vect_prologue);
10104 if (dump_enabled_p ())
10105 dump_printf_loc (MSG_NOTE, vect_location,
10106 "vect_model_induction_cost: inside_cost = %d, "
10107 "prologue_cost = %d .\n", inside_cost,
10108 prologue_cost);
10110 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10111 DUMP_VECT_SCOPE ("vectorizable_induction");
10112 return true;
10115 /* Transform. */
10117 /* Compute a vector variable, initialized with the first VF values of
10118 the induction variable. E.g., for an iv with IV_PHI='X' and
10119 evolution S, for a vector of 4 units, we want to compute:
10120 [X, X + S, X + 2*S, X + 3*S]. */
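/* As an illustration (hypothetical numbers, not from a dump): for an iv
   with init X = 0, step S = 3 and a 4-element vector we build in the
   preheader
     vec_init = { 0, 3, 6, 9 }
     vec_step = { 12, 12, 12, 12 }   (VF * S per update)
   and every vector iteration adds vec_step to the PHI result.  */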
10122 if (dump_enabled_p ())
10123 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10125 pe = loop_preheader_edge (iv_loop);
10126 /* Find the first insertion point in the BB. */
10127 basic_block bb = gimple_bb (phi);
10128 si = gsi_after_labels (bb);
10130 /* For SLP induction we have to generate several IVs as for example
10131 with group size 3 we need
10132 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10133 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
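/* As a smaller hypothetical instance: with group size 2, 4-element
   vectors and a single vector stmt per group we start from
     [i0, i1, i0 + S0, i1 + S1]
   and each vector iteration adds 2*S0 resp. 2*S1 to the lanes, since
   one vector covers the group (nvects * nunits) / group_size = 2 times.  */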
10134 if (slp_node)
10136 /* Enforced above. */
10137 unsigned int const_nunits = nunits.to_constant ();
10139 /* The initial values are vectorized, but any lanes > group_size
10140 need adjustment. */
10141 slp_tree init_node
10142 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10144 /* Gather steps. Since we do not vectorize inductions as
10145 cycles we have to reconstruct the step from SCEV data. */
10146 unsigned group_size = SLP_TREE_LANES (slp_node);
10147 tree *steps = XALLOCAVEC (tree, group_size);
10148 tree *inits = XALLOCAVEC (tree, group_size);
10149 stmt_vec_info phi_info;
10150 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10152 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10153 if (!init_node)
10154 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10155 pe->dest_idx);
10158 /* Now generate the IVs. */
10159 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10160 gcc_assert ((const_nunits * nvects) % group_size == 0);
10161 unsigned nivs;
10162 if (nested_in_vect_loop)
10163 nivs = nvects;
10164 else
10166 /* Compute the number of distinct IVs we need. First reduce
10167 group_size if it is a multiple of const_nunits so we get
10168 one IV for a group_size of 4 but const_nunits 2. */
10169 unsigned group_sizep = group_size;
10170 if (group_sizep % const_nunits == 0)
10171 group_sizep = group_sizep / const_nunits;
10172 nivs = least_common_multiple (group_sizep,
10173 const_nunits) / const_nunits;
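/* Worked numbers for the formula above (hypothetical): group_size = 4 and
   const_nunits = 2 give group_sizep = 2 and nivs = lcm (2, 2) / 2 = 1;
   group_size = 6 and const_nunits = 4 leave group_sizep = 6 and give
   nivs = lcm (6, 4) / 4 = 3.  */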
10175 tree stept = TREE_TYPE (step_vectype);
10176 tree lupdate_mul = NULL_TREE;
10177 if (!nested_in_vect_loop)
10179 /* The number of iterations covered in one vector iteration. */
10180 unsigned lup_mul = (nvects * const_nunits) / group_size;
10181 lupdate_mul
10182 = build_vector_from_val (step_vectype,
10183 SCALAR_FLOAT_TYPE_P (stept)
10184 ? build_real_from_wide (stept, lup_mul,
10185 UNSIGNED)
10186 : build_int_cstu (stept, lup_mul));
10188 tree peel_mul = NULL_TREE;
10189 gimple_seq init_stmts = NULL;
10190 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10192 if (SCALAR_FLOAT_TYPE_P (stept))
10193 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10194 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10195 else
10196 peel_mul = gimple_convert (&init_stmts, stept,
10197 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10198 peel_mul = gimple_build_vector_from_val (&init_stmts,
10199 step_vectype, peel_mul);
10201 unsigned ivn;
10202 auto_vec<tree> vec_steps;
10203 for (ivn = 0; ivn < nivs; ++ivn)
10205 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10206 tree_vector_builder init_elts (vectype, const_nunits, 1);
10207 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10208 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10210 /* The scalar steps of the IVs. */
10211 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10212 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10213 step_elts.quick_push (elt);
10214 if (!init_node)
10216 /* The scalar inits of the IVs if not vectorized. */
10217 elt = inits[(ivn*const_nunits + eltn) % group_size];
10218 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10219 TREE_TYPE (elt)))
10220 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10221 TREE_TYPE (vectype), elt);
10222 init_elts.quick_push (elt);
10224 /* The number of steps to add to the initial values. */
10225 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10226 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10227 ? build_real_from_wide (stept,
10228 mul_elt, UNSIGNED)
10229 : build_int_cstu (stept, mul_elt));
10231 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10232 vec_steps.safe_push (vec_step);
10233 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10234 if (peel_mul)
10235 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10236 step_mul, peel_mul);
10237 if (!init_node)
10238 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10240 /* Create the induction-phi that defines the induction-operand. */
10241 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10242 "vec_iv_");
10243 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10244 induc_def = PHI_RESULT (induction_phi);
10246 /* Create the iv update inside the loop */
10247 tree up = vec_step;
10248 if (lupdate_mul)
10249 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10250 vec_step, lupdate_mul);
10251 gimple_seq stmts = NULL;
10252 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10253 vec_def = gimple_build (&stmts,
10254 PLUS_EXPR, step_vectype, vec_def, up);
10255 vec_def = gimple_convert (&stmts, vectype, vec_def);
10256 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10257 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10258 UNKNOWN_LOCATION);
10260 if (init_node)
10261 vec_init = vect_get_slp_vect_def (init_node, ivn);
10262 if (!nested_in_vect_loop
10263 && !integer_zerop (step_mul))
10265 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10266 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10267 vec_step, step_mul);
10268 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10269 vec_def, up);
10270 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10273 /* Set the arguments of the phi node: */
10274 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10276 slp_node->push_vec_def (induction_phi);
10278 if (!nested_in_vect_loop)
10280 /* Fill up to the number of vectors we need for the whole group. */
10281 nivs = least_common_multiple (group_size,
10282 const_nunits) / const_nunits;
10283 vec_steps.reserve (nivs-ivn);
10284 for (; ivn < nivs; ++ivn)
10286 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10287 vec_steps.quick_push (vec_steps[0]);
10291 /* Re-use IVs when we can. We are generating further vector
10292 stmts by adding VF' * stride to the IVs generated above. */
10293 if (ivn < nvects)
10295 unsigned vfp
10296 = least_common_multiple (group_size, const_nunits) / group_size;
10297 tree lupdate_mul
10298 = build_vector_from_val (step_vectype,
10299 SCALAR_FLOAT_TYPE_P (stept)
10300 ? build_real_from_wide (stept,
10301 vfp, UNSIGNED)
10302 : build_int_cstu (stept, vfp));
10303 for (; ivn < nvects; ++ivn)
10305 gimple *iv
10306 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10307 tree def = gimple_get_lhs (iv);
10308 if (ivn < 2*nivs)
10309 vec_steps[ivn - nivs]
10310 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10311 vec_steps[ivn - nivs], lupdate_mul);
10312 gimple_seq stmts = NULL;
10313 def = gimple_convert (&stmts, step_vectype, def);
10314 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10315 def, vec_steps[ivn % nivs]);
10316 def = gimple_convert (&stmts, vectype, def);
10317 if (gimple_code (iv) == GIMPLE_PHI)
10318 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10319 else
10321 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10322 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10324 slp_node->push_vec_def (def);
10328 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10329 gcc_assert (!new_bb);
10331 return true;
10334 init_expr = vect_phi_initial_value (phi);
10336 gimple_seq stmts = NULL;
10337 if (!nested_in_vect_loop)
10339 /* Convert the initial value to the IV update type. */
10340 tree new_type = TREE_TYPE (step_expr);
10341 init_expr = gimple_convert (&stmts, new_type, init_expr);
10343 /* If we are using the loop mask to "peel" for alignment then we need
10344 to adjust the start value here. */
10345 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10346 if (skip_niters != NULL_TREE)
10348 if (FLOAT_TYPE_P (vectype))
10349 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10350 skip_niters);
10351 else
10352 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10353 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10354 skip_niters, step_expr);
10355 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10356 init_expr, skip_step);
10360 if (stmts)
10362 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10363 gcc_assert (!new_bb);
10366 /* Create the vector that holds the initial_value of the induction. */
10367 if (nested_in_vect_loop)
10369 /* iv_loop is nested in the loop to be vectorized. init_expr has already
10370 been created during vectorization of previous stmts. We obtain it
10371 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10372 auto_vec<tree> vec_inits;
10373 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10374 init_expr, &vec_inits);
10375 vec_init = vec_inits[0];
10376 /* If the initial value is not of proper type, convert it. */
10377 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10379 new_stmt
10380 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10381 vect_simple_var,
10382 "vec_iv_"),
10383 VIEW_CONVERT_EXPR,
10384 build1 (VIEW_CONVERT_EXPR, vectype,
10385 vec_init));
10386 vec_init = gimple_assign_lhs (new_stmt);
10387 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10388 new_stmt);
10389 gcc_assert (!new_bb);
10392 else
10394 /* iv_loop is the loop to be vectorized. Create:
10395 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10396 stmts = NULL;
10397 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10399 unsigned HOST_WIDE_INT const_nunits;
10400 if (nunits.is_constant (&const_nunits))
10402 tree_vector_builder elts (step_vectype, const_nunits, 1);
10403 elts.quick_push (new_name);
10404 for (i = 1; i < const_nunits; i++)
10406 /* Create: new_name_i = new_name + step_expr */
10407 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10408 new_name, step_expr);
10409 elts.quick_push (new_name);
10411 /* Create a vector from [new_name_0, new_name_1, ...,
10412 new_name_nunits-1] */
10413 vec_init = gimple_build_vector (&stmts, &elts);
10415 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10416 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10417 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10418 new_name, step_expr);
10419 else
10421 /* Build:
10422 [base, base, base, ...]
10423 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10424 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10425 gcc_assert (flag_associative_math);
10426 tree index = build_index_vector (step_vectype, 0, 1);
10427 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10428 new_name);
10429 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10430 step_expr);
10431 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10432 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10433 vec_init, step_vec);
10434 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10435 vec_init, base_vec);
10437 vec_init = gimple_convert (&stmts, vectype, vec_init);
10439 if (stmts)
10441 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10442 gcc_assert (!new_bb);
10447 /* Create the vector that holds the step of the induction. */
10448 gimple_stmt_iterator *step_iv_si = NULL;
10449 if (nested_in_vect_loop)
10450 /* iv_loop is nested in the loop to be vectorized. Generate:
10451 vec_step = [S, S, S, S] */
10452 new_name = step_expr;
10453 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10455 /* When we're using the loop_len produced by SELECT_VL, the non-final
10456 iterations do not always process VF elements. So instead of vectorizing
10457 the induction variable update as
10459 _21 = vect_vec_iv_.6_22 + { VF, ... };
10461 we should generate:
10463 _35 = .SELECT_VL (ivtmp_33, VF);
10464 vect_cst__22 = [vec_duplicate_expr] _35;
10465 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
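/* E.g. (hypothetical counts), vectorizing 10 iterations with VF = 4:
   .SELECT_VL might return 4, 4 and finally 2, so the vectorized IV must
   advance by len * S each iteration rather than by the constant VF * S.  */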
10466 gcc_assert (!slp_node);
10467 gimple_seq seq = NULL;
10468 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10469 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10470 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10471 unshare_expr (len)),
10472 &seq, true, NULL_TREE);
10473 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10474 step_expr);
10475 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10476 step_iv_si = &si;
10478 else
10480 /* iv_loop is the loop to be vectorized. Generate:
10481 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10482 gimple_seq seq = NULL;
10483 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10485 expr = build_int_cst (integer_type_node, vf);
10486 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10488 else
10489 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10490 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10491 expr, step_expr);
10492 if (seq)
10494 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10495 gcc_assert (!new_bb);
10499 t = unshare_expr (new_name);
10500 gcc_assert (CONSTANT_CLASS_P (new_name)
10501 || TREE_CODE (new_name) == SSA_NAME);
10502 new_vec = build_vector_from_val (step_vectype, t);
10503 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10504 new_vec, step_vectype, step_iv_si);
10507 /* Create the following def-use cycle:
10508 loop prolog:
10509 vec_init = ...
10510 vec_step = ...
10511 loop:
10512 vec_iv = PHI <vec_init, vec_loop>
10514 STMT
10516 vec_loop = vec_iv + vec_step; */
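/* A concrete (made-up) instance of the cycle above, for an integer IV
   starting at 0 with step 1 and a 4-lane vector:
   loop prolog:
     vec_init = { 0, 1, 2, 3 }
     vec_step = { 4, 4, 4, 4 }
   loop:
     vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
     ...
     vec_loop = vec_iv + vec_step;  */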
10518 /* Create the induction-phi that defines the induction-operand. */
10519 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10520 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10521 induc_def = PHI_RESULT (induction_phi);
10523 /* Create the iv update inside the loop */
10524 stmts = NULL;
10525 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10526 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10527 vec_def = gimple_convert (&stmts, vectype, vec_def);
10528 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10529 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10531 /* Set the arguments of the phi node: */
10532 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10533 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10534 UNKNOWN_LOCATION);
10536 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10537 *vec_stmt = induction_phi;
10539 /* In case the vectorization factor (VF) is bigger than the number
10540 of elements that we can fit in a vectype (nunits), we have to generate
10541 more than one vector stmt - i.e., we need to "unroll" the
10542 vector stmt by a factor VF/nunits. For more details see documentation
10543 in vectorizable_operation. */
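/* E.g. (hypothetical), VF = 8 with a 4-lane vectype gives ncopies = 2:
   two vector statements per iteration of the vectorized loop.  */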
10545 if (ncopies > 1)
10547 gimple_seq seq = NULL;
10548 /* FORNOW. This restriction should be relaxed. */
10549 gcc_assert (!nested_in_vect_loop);
10550 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10551 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10553 /* Create the vector that holds the step of the induction. */
10554 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10556 expr = build_int_cst (integer_type_node, nunits);
10557 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10559 else
10560 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10561 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10562 expr, step_expr);
10563 if (seq)
10565 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10566 gcc_assert (!new_bb);
10569 t = unshare_expr (new_name);
10570 gcc_assert (CONSTANT_CLASS_P (new_name)
10571 || TREE_CODE (new_name) == SSA_NAME);
10572 new_vec = build_vector_from_val (step_vectype, t);
10573 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10574 new_vec, step_vectype, NULL);
10576 vec_def = induc_def;
10577 for (i = 1; i < ncopies + 1; i++)
10579 /* vec_i = vec_prev + vec_step */
10580 gimple_seq stmts = NULL;
10581 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10582 vec_def = gimple_build (&stmts,
10583 PLUS_EXPR, step_vectype, vec_def, vec_step);
10584 vec_def = gimple_convert (&stmts, vectype, vec_def);
10586 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10587 if (i < ncopies)
10589 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10590 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10592 else
10594 /* vec_1 = vec_iv + (VF/n * S)
10595 vec_2 = vec_1 + (VF/n * S)
10597 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10599 vec_n is used as vec_loop to save the large step register and
10600 related operations. */
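/* With made-up numbers VF = 8, nunits = 4 (so ncopies = 2) and step S:
   vec_step = { 4*S, ... }, vec_1 = vec_iv + { 4*S, ... } is pushed as the
   second vector copy, and vec_2 = vec_1 + { 4*S, ... } = vec_iv + { 8*S, ... }
   becomes the latch value, so no separate { 8*S, ... } register is needed.  */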
10601 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10602 UNKNOWN_LOCATION);
10607 if (dump_enabled_p ())
10608 dump_printf_loc (MSG_NOTE, vect_location,
10609 "transform induction: created def-use cycle: %G%G",
10610 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10612 return true;
10615 /* Function vectorizable_live_operation_1.
10617 Helper function for vectorizable_live_operation. */
10619 static tree
10620 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10621 stmt_vec_info stmt_info, basic_block exit_bb,
10622 tree vectype, int ncopies, slp_tree slp_node,
10623 tree bitsize, tree bitstart, tree vec_lhs,
10624 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10626 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10628 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10629 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10630 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10631 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10633 gimple_seq stmts = NULL;
10634 tree new_tree;
10636 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10637 if (integer_zerop (bitstart))
10639 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10640 vec_lhs_phi, bitsize, bitstart);
10642 /* Convert the extracted vector element to the scalar type. */
10643 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10645 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10647 /* Emit:
10649 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10651 where VEC_LHS is the vectorized live-out result and LEN is
10652 the loop length for the final iteration. */
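/* The number of active lanes is LEN + BIAS, so the last active lane is
   LEN + BIAS - 1; e.g. (hypothetically) LEN = 3 with a zero bias
   extracts lane 2.  */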
10653 gcc_assert (ncopies == 1 && !slp_node);
10654 gimple_seq tem = NULL;
10655 gimple_stmt_iterator gsi = gsi_last (tem);
10656 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10657 &LOOP_VINFO_LENS (loop_vinfo),
10658 1, vectype, 0, 0);
10660 /* BIAS - 1. */
10661 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10662 tree bias_minus_one
10663 = int_const_binop (MINUS_EXPR,
10664 build_int_cst (TREE_TYPE (len), biasval),
10665 build_one_cst (TREE_TYPE (len)));
10667 /* LAST_INDEX = LEN + (BIAS - 1). */
10668 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10669 len, bias_minus_one);
10671 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10672 tree scalar_res
10673 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10674 vec_lhs_phi, last_index);
10676 /* Convert the extracted vector element to the scalar type. */
10677 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10679 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10681 /* Emit:
10683 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10685 where VEC_LHS is the vectorized live-out result and MASK is
10686 the loop mask for the final iteration. */
10687 gcc_assert (!slp_node);
10688 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10689 gimple_seq tem = NULL;
10690 gimple_stmt_iterator gsi = gsi_last (tem);
10691 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10692 &LOOP_VINFO_MASKS (loop_vinfo),
10693 1, vectype, 0);
10694 tree scalar_res;
10695 gimple_seq_add_seq (&stmts, tem);
10697 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10698 mask, vec_lhs_phi);
10700 /* Convert the extracted vector element to the scalar type. */
10701 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10703 else
10705 tree bftype = TREE_TYPE (vectype);
10706 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10707 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10708 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10709 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10710 &stmts, true, NULL_TREE);
10713 *exit_gsi = gsi_after_labels (exit_bb);
10714 if (stmts)
10715 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10717 return new_tree;
10720 /* Find the edge that's the final one in the path from SRC to DEST and
10721 return it. There must be at most one forwarder block between SRC and DEST. */
10723 static edge
10724 find_connected_edge (edge src, basic_block dest)
10726 if (src->dest == dest)
10727 return src;
10729 return find_edge (src->dest, dest);
10732 /* Function vectorizable_live_operation.
10734 STMT_INFO computes a value that is used outside the loop. Check if
10735 it can be supported. */
10737 bool
10738 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10739 slp_tree slp_node, slp_instance slp_node_instance,
10740 int slp_index, bool vec_stmt_p,
10741 stmt_vector_for_cost *cost_vec)
10743 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10744 imm_use_iterator imm_iter;
10745 tree lhs, lhs_type, bitsize;
10746 tree vectype = (slp_node
10747 ? SLP_TREE_VECTYPE (slp_node)
10748 : STMT_VINFO_VECTYPE (stmt_info));
10749 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10750 int ncopies;
10751 gimple *use_stmt;
10752 use_operand_p use_p;
10753 auto_vec<tree> vec_oprnds;
10754 int vec_entry = 0;
10755 poly_uint64 vec_index = 0;
10757 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10758 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10760 /* If a stmt of a reduction is live, vectorize it via
10761 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10762 validity so just trigger the transform here. */
10763 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10765 if (!vec_stmt_p)
10766 return true;
10767 if (slp_node)
10769 /* For reduction chains the meta-info is attached to
10770 the group leader. */
10771 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10772 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10773 /* For SLP reductions we vectorize the epilogue for
10774 all involved stmts together. */
10775 else if (slp_index != 0)
10776 return true;
10778 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10779 gcc_assert (reduc_info->is_reduc_info);
10780 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10781 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10782 return true;
10784 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10785 slp_node_instance,
10786 LOOP_VINFO_IV_EXIT (loop_vinfo));
10788 /* For an early-break loop we only have to materialize the reduction on the
10789 merge block, but we have to find the alternate exit first. */
10790 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10792 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10793 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10795 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10796 slp_node, slp_node_instance,
10797 exit);
10798 break;
10802 return true;
10805 /* If STMT is not relevant and it is a simple assignment and its inputs are
10806 invariant then it can remain in place, unvectorized. The original last
10807 scalar value that it computes will be used. */
10808 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10810 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10811 if (dump_enabled_p ())
10812 dump_printf_loc (MSG_NOTE, vect_location,
10813 "statement is simple and uses invariant. Leaving in "
10814 "place.\n");
10815 return true;
10818 if (slp_node)
10819 ncopies = 1;
10820 else
10821 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10823 if (slp_node)
10825 gcc_assert (slp_index >= 0);
10827 /* Get the last occurrence of the scalar index from the concatenation of
10828 all the slp vectors. Calculate which slp vector it is and the index
10829 within. */
10830 int num_scalar = SLP_TREE_LANES (slp_node);
10831 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10832 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10834 /* Calculate which vector contains the result, and which lane of
10835 that vector we need. */
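/* For example (made-up sizes): num_vec = 2, nunits = 4, num_scalar = 3
   and slp_index = 1 give pos = 8 - 3 + 1 = 6, i.e. vec_entry = 1 and
   vec_index = 2 below.  */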
10836 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10838 if (dump_enabled_p ())
10839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10840 "Cannot determine which vector holds the"
10841 " final result.\n");
10842 return false;
10846 if (!vec_stmt_p)
10848 /* No transformation required. */
10849 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10851 if (slp_node)
10853 if (dump_enabled_p ())
10854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10855 "can't operate on partial vectors "
10856 "because an SLP statement is live after "
10857 "the loop.\n");
10858 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10860 else if (ncopies > 1)
10862 if (dump_enabled_p ())
10863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10864 "can't operate on partial vectors "
10865 "because ncopies is greater than 1.\n");
10866 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10868 else
10870 gcc_assert (ncopies == 1 && !slp_node);
10871 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10872 OPTIMIZE_FOR_SPEED))
10873 vect_record_loop_mask (loop_vinfo,
10874 &LOOP_VINFO_MASKS (loop_vinfo),
10875 1, vectype, NULL);
10876 else if (can_vec_extract_var_idx_p (
10877 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10878 vect_record_loop_len (loop_vinfo,
10879 &LOOP_VINFO_LENS (loop_vinfo),
10880 1, vectype, 1);
10881 else
10883 if (dump_enabled_p ())
10884 dump_printf_loc (
10885 MSG_MISSED_OPTIMIZATION, vect_location,
10886 "can't operate on partial vectors "
10887 "because the target doesn't support extract "
10888 "last reduction.\n");
10889 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10893 /* ??? Enable for loop costing as well. */
10894 if (!loop_vinfo)
10895 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10896 0, vect_epilogue);
10897 return true;
10900 /* Use the lhs of the original scalar statement. */
10901 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10902 if (dump_enabled_p ())
10903 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10904 "stmt %G", stmt);
10906 lhs = gimple_get_lhs (stmt);
10907 lhs_type = TREE_TYPE (lhs);
10909 bitsize = vector_element_bits_tree (vectype);
10911 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10912 tree vec_lhs, vec_lhs0, bitstart;
10913 gimple *vec_stmt, *vec_stmt0;
10914 if (slp_node)
10916 gcc_assert (!loop_vinfo
10917 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10918 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10920 /* Get the correct slp vectorized stmt. */
10921 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10922 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10924 /* In case we need to early break vectorize also get the first stmt. */
10925 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10926 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10928 /* Get entry to use. */
10929 bitstart = bitsize_int (vec_index);
10930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10932 else
10934 /* For multiple copies, get the last copy. */
10935 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10936 vec_lhs = gimple_get_lhs (vec_stmt);
10938 /* In case we need to early break vectorize also get the first stmt. */
10939 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10940 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10942 /* Get the last lane in the vector. */
10943 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
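/* E.g. for a (hypothetical) 4 x 32-bit vector: bitsize = 32 and
   bitstart = 32 * 3 = 96, the bit offset of the last lane.  */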
10946 if (loop_vinfo)
10948 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10949 requirement, insert one phi node for it. It looks like:
10950 loop;
10952 # lhs' = PHI <lhs>
10954 loop;
10956 # vec_lhs' = PHI <vec_lhs>
10957 new_tree = lane_extract <vec_lhs', ...>;
10958 lhs' = new_tree; */
10960 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10961 /* Check if we have a loop where the chosen exit is not the main exit;
10962 in these cases, for an early break, the scalar loop restarts the iteration
10963 the vector code was executing. For the live values we therefore want the
10964 value at the start of that iteration rather than at the end. */
10965 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10966 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10967 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10968 if (!is_gimple_debug (use_stmt)
10969 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10970 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10972 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10973 phi_arg_index_from_use (use_p));
10974 bool main_exit_edge = e == main_e
10975 || find_connected_edge (main_e, e->src);
10977 /* Early exits have a merge block; we want the merge block itself,
10978 so use ->src. For the main exit the merge block is the
10979 destination. */
10980 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10981 tree tmp_vec_lhs = vec_lhs;
10982 tree tmp_bitstart = bitstart;
10984 /* For an early exit where the exit is not in the BB that leads
10985 to the latch, we restart the iteration in the
10986 scalar loop. So get the first live value. */
10987 restart_loop = restart_loop || !main_exit_edge;
10988 if (restart_loop
10989 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10991 tmp_vec_lhs = vec_lhs0;
10992 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10995 gimple_stmt_iterator exit_gsi;
10996 tree new_tree
10997 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10998 dest, vectype, ncopies,
10999 slp_node, bitsize,
11000 tmp_bitstart, tmp_vec_lhs,
11001 lhs_type, &exit_gsi);
11003 if (gimple_phi_num_args (use_stmt) == 1)
11005 auto gsi = gsi_for_stmt (use_stmt);
11006 remove_phi_node (&gsi, false);
11007 tree lhs_phi = gimple_phi_result (use_stmt);
11008 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11009 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11011 else
11012 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
11015 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11016 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11017 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11019 else
11021 /* For basic-block vectorization simply insert the lane-extraction. */
11022 tree bftype = TREE_TYPE (vectype);
11023 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11024 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11025 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11026 vec_lhs, bitsize, bitstart);
11027 gimple_seq stmts = NULL;
11028 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11029 &stmts, true, NULL_TREE);
11030 if (TREE_CODE (new_tree) == SSA_NAME
11031 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11032 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11033 if (is_a <gphi *> (vec_stmt))
11035 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11036 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11038 else
11040 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11041 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11044 /* Replace the use of lhs with the newly computed result. If the use stmt is a
11045 single-arg PHI, just replace all uses of the PHI result. This is necessary
11046 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
11047 use_operand_p use_p;
11048 stmt_vec_info use_stmt_info;
11049 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11050 if (!is_gimple_debug (use_stmt)
11051 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11052 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11054 /* ??? This can happen when the live lane ends up being
11055 rooted in a vector construction code-generated by an
11056 external SLP node (and code-generation for that already
11057 happened). See gcc.dg/vect/bb-slp-47.c.
11058 Doing this is what would happen if that vector CTOR
11059 were not code-generated yet so it is not too bad.
11060 ??? In fact we'd likely want to avoid this situation
11061 in the first place. */
11062 if (TREE_CODE (new_tree) == SSA_NAME
11063 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11064 && gimple_code (use_stmt) != GIMPLE_PHI
11065 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11066 use_stmt))
11068 if (dump_enabled_p ())
11069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11070 "Using original scalar computation for "
11071 "live lane because use preceeds vector "
11072 "def\n");
11073 continue;
11075 /* ??? It can also happen that we end up pulling a def into
11076 a loop where replacing out-of-loop uses would require
11077 a new LC SSA PHI node. Retain the original scalar in
11078 those cases as well. PR98064. */
11079 if (TREE_CODE (new_tree) == SSA_NAME
11080 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11081 && (gimple_bb (use_stmt)->loop_father
11082 != gimple_bb (vec_stmt)->loop_father)
11083 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11084 gimple_bb (use_stmt)->loop_father))
11086 if (dump_enabled_p ())
11087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11088 "Using original scalar computation for "
11089 "live lane because there is an out-of-loop "
11090 "definition for it\n");
11091 continue;
11093 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11094 SET_USE (use_p, new_tree);
11095 update_stmt (use_stmt);
11099 return true;
11102 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11104 static void
11105 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11107 ssa_op_iter op_iter;
11108 imm_use_iterator imm_iter;
11109 def_operand_p def_p;
11110 gimple *ustmt;
11112 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11114 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11116 basic_block bb;
11118 if (!is_gimple_debug (ustmt))
11119 continue;
11121 bb = gimple_bb (ustmt);
11123 if (!flow_bb_inside_loop_p (loop, bb))
11125 if (gimple_debug_bind_p (ustmt))
11127 if (dump_enabled_p ())
11128 dump_printf_loc (MSG_NOTE, vect_location,
11129 "killing debug use\n");
11131 gimple_debug_bind_reset_value (ustmt);
11132 update_stmt (ustmt);
11134 else
11135 gcc_unreachable ();
11141 /* Given loop represented by LOOP_VINFO, return true if computation of
11142 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11143 otherwise. */
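/* As a hypothetical example: with a 32-bit IV and NITERSM1 == 0xffffffff,
   NITERS wraps to 0; neither the constant check nor the maximum-iteration
   bound check below then succeeds, so the function returns false.  */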
11145 static bool
11146 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11148 /* Constant case. */
11149 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11151 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11152 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11154 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11155 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11156 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11157 return true;
11160 widest_int max;
11161 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11162 /* Check the upper bound of loop niters. */
11163 if (get_max_loop_iterations (loop, &max))
11165 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11166 signop sgn = TYPE_SIGN (type);
11167 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11168 if (max < type_max)
11169 return true;
11171 return false;
11174 /* Return a mask type with half the number of elements as OLD_TYPE,
11175 given that it should have mode NEW_MODE. */
11177 tree
11178 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11180 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11181 return build_truth_vector_type_for_mode (nunits, new_mode);
11184 /* Return a mask type with twice as many elements as OLD_TYPE,
11185 given that it should have mode NEW_MODE. */
11187 tree
11188 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11190 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11191 return build_truth_vector_type_for_mode (nunits, new_mode);
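/* For the two helpers above, e.g. (hypothetical modes): halving an
   8-element mask type yields a 4-element truth type in NEW_MODE, and
   doubling a 4-element mask type yields an 8-element one; only the
   element count changes, the mode is taken from NEW_MODE.  */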
11194 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11195 contain a sequence of NVECTORS masks that each control a vector of type
11196 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11197 these vector masks with the vector version of SCALAR_MASK. */
11199 void
11200 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11201 unsigned int nvectors, tree vectype, tree scalar_mask)
11203 gcc_assert (nvectors != 0);
11205 if (scalar_mask)
11207 scalar_cond_masked_key cond (scalar_mask, nvectors);
11208 loop_vinfo->scalar_cond_masked_set.add (cond);
11211 masks->mask_set.add (std::make_pair (vectype, nvectors));
11214 /* Given a complete set of masks MASKS, extract mask number INDEX
11215 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11216 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11218 See the comment above vec_loop_masks for more details about the mask
11219 arrangement. */
11221 tree
11222 vect_get_loop_mask (loop_vec_info loop_vinfo,
11223 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11224 unsigned int nvectors, tree vectype, unsigned int index)
11226 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11227 == vect_partial_vectors_while_ult)
11229 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11230 tree mask_type = rgm->type;
11232 /* Populate the rgroup's mask array, if this is the first time we've
11233 used it. */
11234 if (rgm->controls.is_empty ())
11236 rgm->controls.safe_grow_cleared (nvectors, true);
11237 for (unsigned int i = 0; i < nvectors; ++i)
11239 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11240 /* Provide a dummy definition until the real one is available. */
11241 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11242 rgm->controls[i] = mask;
11246 tree mask = rgm->controls[index];
11247 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11248 TYPE_VECTOR_SUBPARTS (vectype)))
11250 /* A loop mask for data type X can be reused for data type Y
11251 if X has N times more elements than Y and if Y's elements
11252 are N times bigger than X's. In this case each sequence
11253 of N elements in the loop mask will be all-zero or all-one.
11254 We can then view-convert the mask so that each sequence of
11255 N elements is replaced by a single element. */
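/* For instance (made-up types): a mask created for 8 x 16-bit data can
   control 4 x 32-bit data; each pair of mask elements is known to be
   all-zero or all-one, so the VIEW_CONVERT below folds every pair into
   one wider mask element.  */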
11256 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11257 TYPE_VECTOR_SUBPARTS (vectype)));
11258 gimple_seq seq = NULL;
11259 mask_type = truth_type_for (vectype);
11260 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11261 if (seq)
11262 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11264 return mask;
11266 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11267 == vect_partial_vectors_avx512)
11269 /* The number of scalars per iteration and the number of vectors are
11270 both compile-time constants. */
11271 unsigned int nscalars_per_iter
11272 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11273 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11275 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11277 /* The stored nV is dependent on the mask type produced. */
11278 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11279 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11280 == rgm->factor);
11281 nvectors = rgm->factor;
11283 /* Populate the rgroup's mask array, if this is the first time we've
11284 used it. */
11285 if (rgm->controls.is_empty ())
11287 rgm->controls.safe_grow_cleared (nvectors, true);
11288 for (unsigned int i = 0; i < nvectors; ++i)
11290 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11291 /* Provide a dummy definition until the real one is available. */
11292 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11293 rgm->controls[i] = mask;
11296 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11297 TYPE_VECTOR_SUBPARTS (vectype)))
11298 return rgm->controls[index];
11300 /* Split the vector if needed. Since we are dealing with integer mode
11301 masks with AVX512 we can operate on the integer representation
11302 performing the whole vector shifting. */
11303 unsigned HOST_WIDE_INT factor;
11304 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11305 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11306 gcc_assert (ok);
11307 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11308 tree mask_type = truth_type_for (vectype);
11309 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11310 unsigned vi = index / factor;
11311 unsigned vpart = index % factor;
11312 tree vec = rgm->controls[vi];
11313 gimple_seq seq = NULL;
11314 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11315 lang_hooks.types.type_for_mode
11316 (TYPE_MODE (rgm->type), 1), vec);
11317 /* For integer mode masks simply shift the right bits into position. */
11318 if (vpart != 0)
11319 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11320 build_int_cst (integer_type_node,
11321 (TYPE_VECTOR_SUBPARTS (vectype)
11322 * vpart)));
11323 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11324 (TYPE_MODE (mask_type), 1), vec);
11325 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11326 if (seq)
11327 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11328 return vec;
11330 else
11331 gcc_unreachable ();
11334 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11335 lengths for controlling an operation on VECTYPE. The operation splits
11336 each element of VECTYPE into FACTOR separate subelements, measuring the
11337 length as a number of these subelements. */
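/* E.g. (hypothetical): if 4 x 32-bit accesses are emulated with a byte
   vector, FACTOR is 4 and the recorded length counts bytes rather than
   32-bit elements.  */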
11339 void
11340 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11341 unsigned int nvectors, tree vectype, unsigned int factor)
11343 gcc_assert (nvectors != 0);
11344 if (lens->length () < nvectors)
11345 lens->safe_grow_cleared (nvectors, true);
11346 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11348 /* The number of scalars per iteration, the scalar occupied bytes and
11349 the number of vectors are all compile-time constants. */
11350 unsigned int nscalars_per_iter
11351 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11352 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11354 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11356 /* For now, we only support cases in which all loads and stores fall back
11357 to VnQI or none do. */
11358 gcc_assert (!rgl->max_nscalars_per_iter
11359 || (rgl->factor == 1 && factor == 1)
11360 || (rgl->max_nscalars_per_iter * rgl->factor
11361 == nscalars_per_iter * factor));
11362 rgl->max_nscalars_per_iter = nscalars_per_iter;
11363 rgl->type = vectype;
11364 rgl->factor = factor;
11368 /* Given a complete set of lengths LENS, extract length number INDEX
11369 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11370 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11371 multiplied by the number of elements that should be processed.
11372 Insert any set-up statements before GSI. */
11374 tree
11375 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11376 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11377 unsigned int index, unsigned int factor)
11379 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11380 bool use_bias_adjusted_len =
11381 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11383 /* Populate the rgroup's len array, if this is the first time we've
11384 used it. */
11385 if (rgl->controls.is_empty ())
11387 rgl->controls.safe_grow_cleared (nvectors, true);
11388 for (unsigned int i = 0; i < nvectors; ++i)
11390 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11391 gcc_assert (len_type != NULL_TREE);
11393 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11395 /* Provide a dummy definition until the real one is available. */
11396 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11397 rgl->controls[i] = len;
11399 if (use_bias_adjusted_len)
11401 gcc_assert (i == 0);
11402 tree adjusted_len =
11403 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11404 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11405 rgl->bias_adjusted_ctrl = adjusted_len;
11410 if (use_bias_adjusted_len)
11411 return rgl->bias_adjusted_ctrl;
11413 tree loop_len = rgl->controls[index];
11414 if (rgl->factor == 1 && factor == 1)
11416 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11417 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11418 if (maybe_ne (nunits1, nunits2))
11420 /* A loop len for data type X can be reused for data type Y
11421 if X has N times more elements than Y and if Y's elements
11422 are N times bigger than X's. */
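/* For example (made-up types): a length computed for 16 x 8-bit data and
   reused for 4 x 32-bit data is divided by factor 4 below, since each
   32-bit element spans four of the original lanes.  */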
11423 gcc_assert (multiple_p (nunits1, nunits2));
11424 factor = exact_div (nunits1, nunits2).to_constant ();
11425 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11426 gimple_seq seq = NULL;
11427 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11428 build_int_cst (iv_type, factor));
11429 if (seq)
11430 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11433 return loop_len;
11436 /* Scale profiling counters by estimation for LOOP which is vectorized
11437 by factor VF.
11438 If FLAT is true, the loop we started with had an unrealistically flat
11439 profile. */
11441 static void
11442 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11444 /* For flat profiles do not scale down proportionally by VF and only
11445 cap by known iteration count bounds. */
11446 if (flat)
11448 if (dump_file && (dump_flags & TDF_DETAILS))
11449 fprintf (dump_file,
11450 "Vectorized loop profile seems flat; not scaling iteration "
11451 "count down by the vectorization factor %i\n", vf);
11452 scale_loop_profile (loop, profile_probability::always (),
11453 get_likely_max_loop_iterations_int (loop));
11454 return;
11456 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
11457 profile_count entry_count = loop_preheader_edge (loop)->count ();
11459 /* If we have an unreliable loop profile, avoid dropping the entry
11460 count below the header count. This can happen since loops
11461 have unrealistically low trip counts. */
11462 while (vf > 1
11463 && loop->header->count > entry_count
11464 && loop->header->count < entry_count * vf)
11466 if (dump_file && (dump_flags & TDF_DETAILS))
11467 fprintf (dump_file,
11468 "Vectorization factor %i seems too large for profile "
11469 "prevoiusly believed to be consistent; reducing.\n", vf);
11470 vf /= 2;
11473 if (entry_count.nonzero_p ())
11474 set_edge_probability_and_rescale_others
11475 (exit_e,
11476 entry_count.probability_in (loop->header->count / vf));
11477 /* Avoid producing a very large exit probability when we do not have a
11478 sensible profile. */
11479 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11480 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11481 loop->latch->count = single_pred_edge (loop->latch)->count ();
11483 scale_loop_profile (loop, profile_probability::always () / vf,
11484 get_likely_max_loop_iterations_int (loop));
11487 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11488 latch edge values originally defined by it. */
11490 static void
11491 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11492 stmt_vec_info def_stmt_info)
11494 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11495 if (!def || TREE_CODE (def) != SSA_NAME)
11496 return;
11497 stmt_vec_info phi_info;
11498 imm_use_iterator iter;
11499 use_operand_p use_p;
11500 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11502 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11503 if (!phi)
11504 continue;
11505 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11506 && (phi_info = loop_vinfo->lookup_stmt (phi))
11507 && STMT_VINFO_RELEVANT_P (phi_info)))
11508 continue;
11509 loop_p loop = gimple_bb (phi)->loop_father;
11510 edge e = loop_latch_edge (loop);
11511 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11512 continue;
11514 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11515 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11516 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11518 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11519 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11520 gcc_assert (phi_defs.length () == latch_defs.length ());
11521 for (unsigned i = 0; i < phi_defs.length (); ++i)
11522 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11523 gimple_get_lhs (latch_defs[i]), e,
11524 gimple_phi_arg_location (phi, e->dest_idx));
11526 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11528 /* For first order recurrences we have to update both uses of
11529 the latch definition, the one in the PHI node and the one
11530 in the generated VEC_PERM_EXPR. */
11531 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11532 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11533 gcc_assert (phi_defs.length () == latch_defs.length ());
11534 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11535 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11536 for (unsigned i = 0; i < phi_defs.length (); ++i)
11538 gassign *perm = as_a <gassign *> (phi_defs[i]);
11539 if (i > 0)
11540 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11541 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11542 update_stmt (perm);
11544 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11545 gimple_phi_arg_location (phi, e->dest_idx));
11550 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11551 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11552 stmt_vec_info. */
11554 static bool
11555 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11556 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11558 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11559 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11561 if (dump_enabled_p ())
11562 dump_printf_loc (MSG_NOTE, vect_location,
11563 "------>vectorizing statement: %G", stmt_info->stmt);
11565 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11566 vect_loop_kill_debug_uses (loop, stmt_info);
11568 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11569 && !STMT_VINFO_LIVE_P (stmt_info))
11571 if (is_gimple_call (stmt_info->stmt)
11572 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11574 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11575 *seen_store = stmt_info;
11576 return false;
11578 return false;
11581 if (STMT_VINFO_VECTYPE (stmt_info))
11583 poly_uint64 nunits
11584 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11585 if (!STMT_SLP_TYPE (stmt_info)
11586 && maybe_ne (nunits, vf)
11587 && dump_enabled_p ())
11588 /* For SLP, VF is set according to the unrolling factor and not
11589 to the vector size, hence for SLP this print is not valid. */
11590 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11593 /* Pure SLP statements have already been vectorized. We still need
11594 to apply loop vectorization to hybrid SLP statements. */
11595 if (PURE_SLP_STMT (stmt_info))
11596 return false;
11598 if (dump_enabled_p ())
11599 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11601 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11602 *seen_store = stmt_info;
11604 return true;
11607 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11608 in the hash_map with their corresponding values. */
11610 static tree
11611 find_in_mapping (tree t, void *context)
11613 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11615 tree *value = mapping->get (t);
11616 return value ? *value : t;
11619 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11620 original loop that has now been vectorized.
11622 The inits of the data_references need to be advanced with the number of
11623 iterations of the main loop. This has been computed in vect_do_peeling and
11624 is stored in parameter ADVANCE. We first restore the data_references
11625 initial offset with the values recorded in ORIG_DRS_INIT.
11627 Since the loop_vec_info of this EPILOGUE was constructed for the original
11628 loop, its stmt_vec_infos all point to the original statements. These need
11629 to be updated to point to their corresponding copies as well as the SSA_NAMES
11630 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11632 The data_reference's connections also need to be updated. Their
11633 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11634 stmt_vec_infos, their statements need to point to their corresponding copy,
11635 if they are gather loads or scatter stores then their reference needs to be
11636 updated to point to its corresponding copy and finally we set
11637 'base_misaligned' to false as we have already peeled for alignment in the
11638 prologue of the main loop. */
11640 static void
11641 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11643 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11644 auto_vec<gimple *> stmt_worklist;
11645 hash_map<tree,tree> mapping;
11646 gimple *orig_stmt, *new_stmt;
11647 gimple_stmt_iterator epilogue_gsi;
11648 gphi_iterator epilogue_phi_gsi;
11649 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11650 basic_block *epilogue_bbs = get_loop_body (epilogue);
11651 unsigned i;
11653 free (LOOP_VINFO_BBS (epilogue_vinfo));
11654 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11656 /* Advance data_reference's with the number of iterations of the previous
11657 loop and its prologue. */
11658 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11661 /* The EPILOGUE loop is a copy of the original loop so they share the same
11662 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11663 point to the copied statements. We also create a mapping from each LHS in
11664 the original loop to the corresponding LHS in the EPILOGUE and create worklists
11665 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
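/* Hypothetical illustration (the SSA names below are made up, not from the
   original source): if the main loop contains _5 = a[i_3] and the epilogue
   copy of that statement is _25 = a[i_23], the mapping records _5 -> _25 so
   that the worklist processing further down can rewrite pattern statements
   that still refer to _5.  */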
11666 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11668 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11669 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11671 new_stmt = epilogue_phi_gsi.phi ();
11673 gcc_assert (gimple_uid (new_stmt) > 0);
11674 stmt_vinfo
11675 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11677 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11678 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11680 mapping.put (gimple_phi_result (orig_stmt),
11681 gimple_phi_result (new_stmt));
11682 /* PHI nodes cannot have patterns or related statements. */
11683 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11684 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11687 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11688 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11690 new_stmt = gsi_stmt (epilogue_gsi);
11691 if (is_gimple_debug (new_stmt))
11692 continue;
11694 gcc_assert (gimple_uid (new_stmt) > 0);
11695 stmt_vinfo
11696 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11698 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11699 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11701 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11702 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11704 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11706 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11707 for (gimple_stmt_iterator gsi = gsi_start (seq);
11708 !gsi_end_p (gsi); gsi_next (&gsi))
11709 stmt_worklist.safe_push (gsi_stmt (gsi));
11712 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11713 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11715 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11716 stmt_worklist.safe_push (stmt);
11717 /* Set BB such that the assert in
11718 'get_initial_def_for_reduction' is able to determine that
11719 the BB of the related stmt is inside this loop. */
11720 gimple_set_bb (stmt,
11721 gimple_bb (new_stmt));
11722 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11723 gcc_assert (related_vinfo == NULL
11724 || related_vinfo == stmt_vinfo);
11729 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11730 using the original main loop and thus need to be updated to refer to the
11731 cloned variables used in the epilogue. */
11732 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11734 gimple *stmt = stmt_worklist[i];
11735 tree *new_op;
11737 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11739 tree op = gimple_op (stmt, j);
11740 if ((new_op = mapping.get(op)))
11741 gimple_set_op (stmt, j, *new_op);
11742 else
11744 /* PR92429: The last argument of simplify_replace_tree disables
11745 folding when replacing arguments. This is required as
11746 otherwise we might end up with different statements than the
11747 ones analyzed in vect_analyze_loop, leading to different
11748 vectorization. */
11749 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11750 &find_in_mapping, &mapping, false);
11751 gimple_set_op (stmt, j, op);
11756 struct data_reference *dr;
11757 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11758 FOR_EACH_VEC_ELT (datarefs, i, dr)
11760 orig_stmt = DR_STMT (dr);
11761 gcc_assert (gimple_uid (orig_stmt) > 0);
11762 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11763 /* Data references for gather loads and scatter stores do not use the
11764 updated offset we set using ADVANCE. Instead we have to make sure the
11765 reference in each data reference points to the corresponding copy of
11766 the original in the epilogue. Make sure to update both
11767 gather/scatters recognized by dataref analysis and also other
11768 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11769 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11770 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11771 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11773 DR_REF (dr)
11774 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11775 &find_in_mapping, &mapping);
11776 DR_BASE_ADDRESS (dr)
11777 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11778 &find_in_mapping, &mapping);
11780 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11781 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11782 /* The vector size of the epilogue is smaller than that of the main loop,
11783 so the alignment is either the same or lower. This means the dr will
11784 by definition be aligned. */
11785 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11788 epilogue_vinfo->shared->datarefs_copy.release ();
11789 epilogue_vinfo->shared->save_datarefs ();
11792 /* When vectorizing early break statements, instructions that happen before
11793 the early break in the current BB need to be moved to after the early
11794 break. This function deals with that and assumes that any validity
11795 checks have already been performed.
11797 While moving the statements this function also corrects their virtual
11798 operands: the VUSEs of the affected loads are updated as the stores are
11799 moved to the start of LOOP_VINFO_EARLY_BRK_DEST_BB. */
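/* Hedged sketch of the effect (the example statements are made up, not from
   the original source): given a loop body such as

     a[i_3] = x_4;              <- store that originally precedes the break
     if (b[i_3] != 42) goto exit;
     ...

   the store is moved to the start of LOOP_VINFO_EARLY_BRK_DEST_BB, after the
   break, and each load recorded in LOOP_VINFO_EARLY_BRK_VUSES gets its VUSE
   updated to the VUSE of the last recorded store.  */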
11801 static void
11802 move_early_exit_stmts (loop_vec_info loop_vinfo)
11804 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11806 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11807 return;
11809 /* Move all stmts that need moving. */
11810 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11811 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11813 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11815 /* Check to see if the statement is still required for vectorization or
11816 has been elided. */
11817 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11818 if (!stmt_info)
11819 continue;
11821 if (dump_enabled_p ())
11822 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11824 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11825 gsi_move_before (&stmt_gsi, &dest_gsi);
11826 gsi_prev (&dest_gsi);
11829 /* Update all the stmts with their new reaching VUSES. */
11830 tree vuse
11831 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11832 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11834 if (dump_enabled_p ())
11835 dump_printf_loc (MSG_NOTE, vect_location,
11836 "updating vuse to %T for load %G", vuse, p);
11837 gimple_set_vuse (p, vuse);
11838 update_stmt (p);
11842 /* Function vect_transform_loop.
11844 The analysis phase has determined that the loop is vectorizable.
11845 Vectorize the loop - create vectorized stmts to replace the scalar
11846 stmts in the loop, and update the loop exit condition.
11847 Returns scalar epilogue loop if any. */
11849 class loop *
11850 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11852 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11853 class loop *epilogue = NULL;
11854 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11855 int nbbs = loop->num_nodes;
11856 int i;
11857 tree niters_vector = NULL_TREE;
11858 tree step_vector = NULL_TREE;
11859 tree niters_vector_mult_vf = NULL_TREE;
11860 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11861 unsigned int lowest_vf = constant_lower_bound (vf);
11862 gimple *stmt;
11863 bool check_profitability = false;
11864 unsigned int th;
11865 bool flat = maybe_flat_loop_profile (loop);
11867 DUMP_VECT_SCOPE ("vec_transform_loop");
11869 loop_vinfo->shared->check_datarefs ();
11871 /* Use the more conservative vectorization threshold. If the number
11872 of iterations is constant, assume the cost check has been performed
11873 by our caller. If the threshold makes all loops profitable that
11874 run at least the (estimated) vectorization factor number of times,
11875 checking is pointless, too. */
11876 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11877 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11879 if (dump_enabled_p ())
11880 dump_printf_loc (MSG_NOTE, vect_location,
11881 "Profitability threshold is %d loop iterations.\n",
11882 th);
11883 check_profitability = true;
11886 /* Make sure there exists a single-predecessor exit bb. Do this before
11887 versioning. */
11888 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11889 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11891 split_loop_exit_edge (e, true);
11892 if (dump_enabled_p ())
11893 dump_printf (MSG_NOTE, "split exit edge\n");
11896 /* Version the loop first, if required, so the profitability check
11897 comes first. */
11899 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11901 class loop *sloop
11902 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11903 sloop->force_vectorize = false;
11904 check_profitability = false;
11907 /* Make sure there exists a single-predecessor exit bb also on the
11908 scalar loop copy. Do this after versioning but before peeling
11909 so CFG structure is fine for both scalar and if-converted loop
11910 to make slpeel_duplicate_current_defs_from_edges face matched
11911 loop closed PHI nodes on the exit. */
11912 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11914 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11915 if (! single_pred_p (e->dest))
11917 split_loop_exit_edge (e, true);
11918 if (dump_enabled_p ())
11919 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11923 tree niters = vect_build_loop_niters (loop_vinfo);
11924 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11925 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11926 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11927 tree advance;
11928 drs_init_vec orig_drs_init;
11930 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11931 &step_vector, &niters_vector_mult_vf, th,
11932 check_profitability, niters_no_overflow,
11933 &advance);
11934 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11935 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11937 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11938 block after loop exit. We need to scale all that. */
11939 basic_block preheader
11940 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11941 preheader->count
11942 = preheader->count.apply_probability
11943 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11944 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11945 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11946 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11949 if (niters_vector == NULL_TREE)
11951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11952 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11953 && known_eq (lowest_vf, vf))
11955 niters_vector
11956 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11957 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11958 step_vector = build_one_cst (TREE_TYPE (niters));
11960 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11961 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11962 &step_vector, niters_no_overflow);
11963 else
11964 /* vect_do_peeling subtracted the number of peeled prologue
11965 iterations from LOOP_VINFO_NITERS. */
11966 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11967 &niters_vector, &step_vector,
11968 niters_no_overflow);
11971 /* 1) Make sure the loop header has exactly two entries
11972 2) Make sure we have a preheader basic block. */
11974 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11976 split_edge (loop_preheader_edge (loop));
11978 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11979 /* This will deal with any possible peeling. */
11980 vect_prepare_for_masked_peels (loop_vinfo);
11982 /* Handle any code motion that we need to for early-break vectorization after
11983 we've done peeling but just before we start vectorizing. */
11984 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11985 move_early_exit_stmts (loop_vinfo);
11987 /* Schedule the SLP instances first, then handle loop vectorization
11988 below. */
11989 if (!loop_vinfo->slp_instances.is_empty ())
11991 DUMP_VECT_SCOPE ("scheduling SLP instances");
11992 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11995 /* FORNOW: the vectorizer supports only loops whose body consists
11996 of one basic block (header + empty latch). When the vectorizer
11997 supports more involved loop forms, the order in which the BBs are
11998 traversed needs to be reconsidered. */
12000 for (i = 0; i < nbbs; i++)
12002 basic_block bb = bbs[i];
12003 stmt_vec_info stmt_info;
12005 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12006 gsi_next (&si))
12008 gphi *phi = si.phi ();
12009 if (dump_enabled_p ())
12010 dump_printf_loc (MSG_NOTE, vect_location,
12011 "------>vectorizing phi: %G", (gimple *) phi);
12012 stmt_info = loop_vinfo->lookup_stmt (phi);
12013 if (!stmt_info)
12014 continue;
12016 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12017 vect_loop_kill_debug_uses (loop, stmt_info);
12019 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12020 && !STMT_VINFO_LIVE_P (stmt_info))
12021 continue;
12023 if (STMT_VINFO_VECTYPE (stmt_info)
12024 && (maybe_ne
12025 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12026 && dump_enabled_p ())
12027 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12029 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12030 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12031 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12032 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12033 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12034 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12035 && ! PURE_SLP_STMT (stmt_info))
12037 if (dump_enabled_p ())
12038 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12039 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12043 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12044 gsi_next (&si))
12046 gphi *phi = si.phi ();
12047 stmt_info = loop_vinfo->lookup_stmt (phi);
12048 if (!stmt_info)
12049 continue;
12051 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12052 && !STMT_VINFO_LIVE_P (stmt_info))
12053 continue;
12055 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12056 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12057 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12058 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12059 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12060 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12061 && ! PURE_SLP_STMT (stmt_info))
12062 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12065 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12066 !gsi_end_p (si);)
12068 stmt = gsi_stmt (si);
12069 /* During vectorization remove existing clobber stmts. */
12070 if (gimple_clobber_p (stmt))
12072 unlink_stmt_vdef (stmt);
12073 gsi_remove (&si, true);
12074 release_defs (stmt);
12076 else
12078 /* Ignore vector stmts created in the outer loop. */
12079 stmt_info = loop_vinfo->lookup_stmt (stmt);
12081 /* vector stmts created in the outer-loop during vectorization of
12082 stmts in an inner-loop may not have a stmt_info, and do not
12083 need to be vectorized. */
12084 stmt_vec_info seen_store = NULL;
12085 if (stmt_info)
12087 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12089 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12090 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12091 !gsi_end_p (subsi); gsi_next (&subsi))
12093 stmt_vec_info pat_stmt_info
12094 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12095 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12096 &si, &seen_store);
12098 stmt_vec_info pat_stmt_info
12099 = STMT_VINFO_RELATED_STMT (stmt_info);
12100 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12101 &si, &seen_store))
12102 maybe_set_vectorized_backedge_value (loop_vinfo,
12103 pat_stmt_info);
12105 else
12107 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12108 &seen_store))
12109 maybe_set_vectorized_backedge_value (loop_vinfo,
12110 stmt_info);
12113 gsi_next (&si);
12114 if (seen_store)
12116 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12117 /* Interleaving. The vectorization of the
12118 interleaving chain was completed - free all
12119 the stores in the chain. */
12120 vect_remove_stores (loop_vinfo,
12121 DR_GROUP_FIRST_ELEMENT (seen_store));
12122 else
12123 /* Free the attached stmt_vec_info and remove the stmt. */
12124 loop_vinfo->remove_stmt (stmt_info);
12129 /* Stub out scalar statements that must not survive vectorization.
12130 Doing this here helps with grouped statements, or statements that
12131 are involved in patterns. */
12132 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12133 !gsi_end_p (gsi); gsi_next (&gsi))
12135 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12136 if (!call || !gimple_call_internal_p (call))
12137 continue;
12138 internal_fn ifn = gimple_call_internal_fn (call);
12139 if (ifn == IFN_MASK_LOAD)
12141 tree lhs = gimple_get_lhs (call);
12142 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12144 tree zero = build_zero_cst (TREE_TYPE (lhs));
12145 gimple *new_stmt = gimple_build_assign (lhs, zero);
12146 gsi_replace (&gsi, new_stmt, true);
12149 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12151 tree lhs = gimple_get_lhs (call);
12152 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12154 tree else_arg
12155 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12156 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12157 gsi_replace (&gsi, new_stmt, true);
12161 } /* BBs in loop */
12163 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
12164 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12165 if (integer_onep (step_vector))
12166 niters_no_overflow = true;
12167 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12168 niters_vector, step_vector, niters_vector_mult_vf,
12169 !niters_no_overflow);
12171 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12173 /* True if the final iteration might not handle a full vector's
12174 worth of scalar iterations. */
12175 bool final_iter_may_be_partial
12176 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12177 /* The minimum number of iterations performed by the epilogue. This
12178 is 1 when peeling for gaps because we always need a final scalar
12179 iteration. */
12180 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12181 /* +1 to convert latch counts to loop iteration counts,
12182 -min_epilogue_iters to remove iterations that cannot be performed
12183 by the vector code. */
12184 int bias_for_lowest = 1 - min_epilogue_iters;
12185 int bias_for_assumed = bias_for_lowest;
12186 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12187 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12189 /* When the amount of peeling is known at compile time, the first
12190 iteration will have exactly alignment_npeels active elements.
12191 In the worst case it will have at least one. */
12192 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12193 bias_for_lowest += lowest_vf - min_first_active;
12194 bias_for_assumed += assumed_vf - min_first_active;
12196 /* In these calculations the "- 1" converts loop iteration counts
12197 back to latch counts. */
12198 if (loop->any_upper_bound)
12200 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12201 loop->nb_iterations_upper_bound
12202 = (final_iter_may_be_partial
12203 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12204 lowest_vf) - 1
12205 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12206 lowest_vf) - 1);
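/* Hypothetical numeric illustration (the values are made up): with
   nb_iterations_upper_bound == 7 (at most 8 scalar iterations),
   lowest_vf == 4, no peeling for gaps (so bias_for_lowest == 1) and a
   non-partial final iteration, the new bound is
   udiv_floor (7 + 1, 4) - 1 == 1, i.e. the vector loop's latch runs at
   most once.  */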
12207 if (main_vinfo
12208 /* Both peeling for alignment and peeling for gaps can end up
12209 with the scalar epilogue running for more than VF-1 iterations. */
12210 && !main_vinfo->peeling_for_alignment
12211 && !main_vinfo->peeling_for_gaps)
12213 unsigned int bound;
12214 poly_uint64 main_iters
12215 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12216 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12217 main_iters
12218 = upper_bound (main_iters,
12219 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12220 if (can_div_away_from_zero_p (main_iters,
12221 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12222 &bound))
12223 loop->nb_iterations_upper_bound
12224 = wi::umin ((bound_wide_int) (bound - 1),
12225 loop->nb_iterations_upper_bound);
12228 if (loop->any_likely_upper_bound)
12229 loop->nb_iterations_likely_upper_bound
12230 = (final_iter_may_be_partial
12231 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12232 + bias_for_lowest, lowest_vf) - 1
12233 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12234 + bias_for_lowest, lowest_vf) - 1);
12235 if (loop->any_estimate)
12236 loop->nb_iterations_estimate
12237 = (final_iter_may_be_partial
12238 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12239 assumed_vf) - 1
12240 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12241 assumed_vf) - 1);
12242 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12243 assumed_vf, flat);
12245 if (dump_enabled_p ())
12247 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12249 dump_printf_loc (MSG_NOTE, vect_location,
12250 "LOOP VECTORIZED\n");
12251 if (loop->inner)
12252 dump_printf_loc (MSG_NOTE, vect_location,
12253 "OUTER LOOP VECTORIZED\n");
12254 dump_printf (MSG_NOTE, "\n");
12256 else
12257 dump_printf_loc (MSG_NOTE, vect_location,
12258 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12259 GET_MODE_NAME (loop_vinfo->vector_mode));
12262 /* Loops vectorized with a variable factor won't benefit from
12263 unrolling/peeling. */
12264 if (!vf.is_constant ())
12266 loop->unroll = 1;
12267 if (dump_enabled_p ())
12268 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12269 " variable-length vectorization factor\n");
12271 /* Free SLP instances here because otherwise stmt reference counting
12272 won't work. */
12273 slp_instance instance;
12274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12275 vect_free_slp_instance (instance);
12276 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12277 /* Clear the safelen field since its value is invalid after vectorization:
12278 the vectorized loop can have loop-carried dependencies. */
12279 loop->safelen = 0;
12281 if (epilogue)
12283 update_epilogue_loop_vinfo (epilogue, advance);
12285 epilogue->simduid = loop->simduid;
12286 epilogue->force_vectorize = loop->force_vectorize;
12287 epilogue->dont_vectorize = false;
12290 return epilogue;
12293 /* The code below tries to perform a simple optimization - revert
12294 if-conversion for masked stores, i.e. if the mask of a store is zero,
12295 do not perform the store, and if possible also skip the producers of the stored values.
12296 For example,
12297 for (i=0; i<n; i++)
12298 if (c[i])
12300 p1[i] += 1;
12301 p2[i] = p3[i] +2;
12303 this transformation will produce the following semi-hammock:
12305 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12307 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12308 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12309 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12310 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12311 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12312 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12316 void
12317 optimize_mask_stores (class loop *loop)
12319 basic_block *bbs = get_loop_body (loop);
12320 unsigned nbbs = loop->num_nodes;
12321 unsigned i;
12322 basic_block bb;
12323 class loop *bb_loop;
12324 gimple_stmt_iterator gsi;
12325 gimple *stmt;
12326 auto_vec<gimple *> worklist;
12327 auto_purge_vect_location sentinel;
12329 vect_location = find_loop_location (loop);
12330 /* Pick up all masked stores in loop if any. */
12331 for (i = 0; i < nbbs; i++)
12333 bb = bbs[i];
12334 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12335 gsi_next (&gsi))
12337 stmt = gsi_stmt (gsi);
12338 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12339 worklist.safe_push (stmt);
12343 free (bbs);
12344 if (worklist.is_empty ())
12345 return;
12347 /* Loop has masked stores. */
12348 while (!worklist.is_empty ())
12350 gimple *last, *last_store;
12351 edge e, efalse;
12352 tree mask;
12353 basic_block store_bb, join_bb;
12354 gimple_stmt_iterator gsi_to;
12355 tree vdef, new_vdef;
12356 gphi *phi;
12357 tree vectype;
12358 tree zero;
12360 last = worklist.pop ();
12361 mask = gimple_call_arg (last, 2);
12362 bb = gimple_bb (last);
12363 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
12364 the same loop as if_bb. It could be different from LOOP when a
12365 two-level loop nest is vectorized and the mask_store belongs to the
12366 inner one. */
12367 e = split_block (bb, last);
12368 bb_loop = bb->loop_father;
12369 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12370 join_bb = e->dest;
12371 store_bb = create_empty_bb (bb);
12372 add_bb_to_loop (store_bb, bb_loop);
12373 e->flags = EDGE_TRUE_VALUE;
12374 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12375 /* Put STORE_BB to likely part. */
12376 efalse->probability = profile_probability::likely ();
12377 e->probability = efalse->probability.invert ();
12378 store_bb->count = efalse->count ();
12379 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12380 if (dom_info_available_p (CDI_DOMINATORS))
12381 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12382 if (dump_enabled_p ())
12383 dump_printf_loc (MSG_NOTE, vect_location,
12384 "Create new block %d to sink mask stores.",
12385 store_bb->index);
12386 /* Create vector comparison with boolean result. */
12387 vectype = TREE_TYPE (mask);
12388 zero = build_zero_cst (vectype);
12389 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12390 gsi = gsi_last_bb (bb);
12391 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12392 /* Create new PHI node for vdef of the last masked store:
12393 .MEM_2 = VDEF <.MEM_1>
12394 will be converted to
12395 .MEM.3 = VDEF <.MEM_1>
12396 and new PHI node will be created in join bb
12397 .MEM_2 = PHI <.MEM_1, .MEM_3>
12399 vdef = gimple_vdef (last);
12400 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12401 gimple_set_vdef (last, new_vdef);
12402 phi = create_phi_node (vdef, join_bb);
12403 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12405 /* Put all masked stores with the same mask to STORE_BB if possible. */
12406 while (true)
12408 gimple_stmt_iterator gsi_from;
12409 gimple *stmt1 = NULL;
12411 /* Move masked store to STORE_BB. */
12412 last_store = last;
12413 gsi = gsi_for_stmt (last);
12414 gsi_from = gsi;
12415 /* Shift GSI to the previous stmt for further traversal. */
12416 gsi_prev (&gsi);
12417 gsi_to = gsi_start_bb (store_bb);
12418 gsi_move_before (&gsi_from, &gsi_to);
12419 /* Setup GSI_TO to the non-empty block start. */
12420 gsi_to = gsi_start_bb (store_bb);
12421 if (dump_enabled_p ())
12422 dump_printf_loc (MSG_NOTE, vect_location,
12423 "Move stmt to created bb\n%G", last);
12424 /* Move all stored value producers if possible. */
12425 while (!gsi_end_p (gsi))
12427 tree lhs;
12428 imm_use_iterator imm_iter;
12429 use_operand_p use_p;
12430 bool res;
12432 /* Skip debug statements. */
12433 if (is_gimple_debug (gsi_stmt (gsi)))
12435 gsi_prev (&gsi);
12436 continue;
12438 stmt1 = gsi_stmt (gsi);
12439 /* Do not consider statements writing to memory or having a
12440 volatile operand. */
12441 if (gimple_vdef (stmt1)
12442 || gimple_has_volatile_ops (stmt1))
12443 break;
12444 gsi_from = gsi;
12445 gsi_prev (&gsi);
12446 lhs = gimple_get_lhs (stmt1);
12447 if (!lhs)
12448 break;
12450 /* LHS of vectorized stmt must be SSA_NAME. */
12451 if (TREE_CODE (lhs) != SSA_NAME)
12452 break;
12454 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12456 /* Remove dead scalar statement. */
12457 if (has_zero_uses (lhs))
12459 gsi_remove (&gsi_from, true);
12460 continue;
12464 /* Check that LHS does not have uses outside of STORE_BB. */
12465 res = true;
12466 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12468 gimple *use_stmt;
12469 use_stmt = USE_STMT (use_p);
12470 if (is_gimple_debug (use_stmt))
12471 continue;
12472 if (gimple_bb (use_stmt) != store_bb)
12474 res = false;
12475 break;
12478 if (!res)
12479 break;
12481 if (gimple_vuse (stmt1)
12482 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12483 break;
12485 /* Can move STMT1 to STORE_BB. */
12486 if (dump_enabled_p ())
12487 dump_printf_loc (MSG_NOTE, vect_location,
12488 "Move stmt to created bb\n%G", stmt1);
12489 gsi_move_before (&gsi_from, &gsi_to);
12490 /* Shift GSI_TO for further insertion. */
12491 gsi_prev (&gsi_to);
12493 /* Put other masked stores with the same mask to STORE_BB. */
12494 if (worklist.is_empty ()
12495 || gimple_call_arg (worklist.last (), 2) != mask
12496 || worklist.last () != stmt1)
12497 break;
12498 last = worklist.pop ();
12500 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12504 /* Decide whether it is possible to use a zero-based induction variable
12505 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12506 the value that the induction variable must be able to hold in order
12507 to ensure that the rgroups eventually have no active vector elements.
12508 Return -1 otherwise. */
12510 widest_int
12511 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12513 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12514 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12515 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12517 /* Calculate the value that the induction variable must be able
12518 to hit in order to ensure that we end the loop with an all-false mask.
12519 This involves adding the maximum number of inactive trailing scalar
12520 iterations. */
12521 widest_int iv_limit = -1;
12522 if (max_loop_iterations (loop, &iv_limit))
12524 if (niters_skip)
12526 /* Add the maximum number of skipped iterations to the
12527 maximum iteration count. */
12528 if (TREE_CODE (niters_skip) == INTEGER_CST)
12529 iv_limit += wi::to_widest (niters_skip);
12530 else
12531 iv_limit += max_vf - 1;
12533 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12534 /* Make a conservatively-correct assumption. */
12535 iv_limit += max_vf - 1;
12537 /* IV_LIMIT is the maximum number of latch iterations, which is also
12538 the maximum in-range IV value. Round this value down to the previous
12539 vector alignment boundary and then add an extra full iteration. */
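/* Hypothetical numeric illustration (the values are made up): with VF == 4
   (so known_alignment (vf) == 4), max_vf == 4 and iv_limit == 10, the
   computation below gives (10 & -4) + 4 == 12.  */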
12540 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12541 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12543 return iv_limit;
12546 /* For the given rgroup_controls RGC, check whether an induction variable
12547 would ever hit a value that produces a set of all-false masks or zero
12548 lengths before wrapping around. Return true if it's possible to wrap
12549 around before hitting the desirable value, otherwise return false. */
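/* Hedged illustration (the numbers are made up): if the compare type has
   16 bits of precision, iv_limit == 40000 and nitems == 2, then
   iv_limit * nitems == 80000 needs 17 bits, which exceeds 16, so the IV
   might wrap and the function returns true.  */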
12551 bool
12552 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12554 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12556 if (iv_limit == -1)
12557 return true;
12559 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12560 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12561 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12563 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12564 return true;
12566 return false;