hppa: Always enable PIE on 64-bit target
[official-gcc.git] / gcc / tree-vect-loop.cc
blob4769d6f53e49ab92d71ba04e64b41bc8448b2709
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS which base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations which vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4byte elements,
264 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variables in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Only support nonlinear induction for integer type
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates there will be some subsequent
585 slp analyses or not. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also to its
732 inner-loop, if exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmt in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions, this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
945 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
946 unshare_expr (niter),
947 build_int_cst (TREE_TYPE (niter), 1));
948 if (TREE_CODE (niter) == INTEGER_CST
949 && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
951 /* If we manage to fold niter + 1 into INTEGER_CST even when
952 niter is some complex expression, ensure back
953 *number_of_iterationsm1 is an INTEGER_CST as well. See
954 PR113210. */
955 *number_of_iterationsm1
956 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
957 build_minus_one_cst (TREE_TYPE (niter)));
960 *number_of_iterations = niter;
963 if (dump_enabled_p ())
964 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
966 return conds;
969 /* Determine the main loop exit for the vectorizer. */
971 edge
972 vec_init_loop_exit_info (class loop *loop)
974 /* Before we begin we must first determine which exit is the main one and
975 which are auxilary exits. */
976 auto_vec<edge> exits = get_loop_exit_edges (loop);
977 if (exits.length () == 1)
978 return exits[0];
980 /* If we have multiple exits we only support counting IV at the moment.
981 Analyze all exits and return the last one we can analyze. */
982 class tree_niter_desc niter_desc;
983 edge candidate = NULL;
984 for (edge exit : exits)
986 if (!get_loop_exit_condition (exit))
987 continue;
989 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
990 && !chrec_contains_undetermined (niter_desc.niter))
992 tree may_be_zero = niter_desc.may_be_zero;
993 if ((integer_zerop (may_be_zero)
994 || integer_nonzerop (may_be_zero)
995 || COMPARISON_CLASS_P (may_be_zero))
996 && (!candidate
997 || dominated_by_p (CDI_DOMINATORS, exit->src,
998 candidate->src)))
999 candidate = exit;
1003 return candidate;
1006 /* Function bb_in_loop_p
1008 Used as predicate for dfs order traversal of the loop bbs. */
1010 static bool
1011 bb_in_loop_p (const_basic_block bb, const void *data)
1013 const class loop *const loop = (const class loop *)data;
1014 if (flow_bb_inside_loop_p (loop, bb))
1015 return true;
1016 return false;
1020 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1021 stmt_vec_info structs for all the stmts in LOOP_IN. */
1023 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1024 : vec_info (vec_info::loop, shared),
1025 loop (loop_in),
1026 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1027 num_itersm1 (NULL_TREE),
1028 num_iters (NULL_TREE),
1029 num_iters_unchanged (NULL_TREE),
1030 num_iters_assumptions (NULL_TREE),
1031 vector_costs (nullptr),
1032 scalar_costs (nullptr),
1033 th (0),
1034 versioning_threshold (0),
1035 vectorization_factor (0),
1036 main_loop_edge (nullptr),
1037 skip_main_loop_edge (nullptr),
1038 skip_this_loop_edge (nullptr),
1039 reusable_accumulators (),
1040 suggested_unroll_factor (1),
1041 max_vectorization_factor (0),
1042 mask_skip_niters (NULL_TREE),
1043 rgroup_compare_type (NULL_TREE),
1044 simd_if_cond (NULL_TREE),
1045 partial_vector_style (vect_partial_vectors_none),
1046 unaligned_dr (NULL),
1047 peeling_for_alignment (0),
1048 ptr_mask (0),
1049 ivexpr_map (NULL),
1050 scan_map (NULL),
1051 slp_unrolling_factor (1),
1052 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1053 vectorizable (false),
1054 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1055 using_partial_vectors_p (false),
1056 using_decrementing_iv_p (false),
1057 using_select_vl_p (false),
1058 epil_using_partial_vectors_p (false),
1059 partial_load_store_bias (0),
1060 peeling_for_gaps (false),
1061 peeling_for_niter (false),
1062 early_breaks (false),
1063 no_data_dependencies (false),
1064 has_mask_store (false),
1065 scalar_loop_scaling (profile_probability::uninitialized ()),
1066 scalar_loop (NULL),
1067 orig_loop_info (NULL),
1068 vec_loop_iv_exit (NULL),
1069 vec_epilogue_loop_iv_exit (NULL),
1070 scalar_loop_iv_exit (NULL)
1072 /* CHECKME: We want to visit all BBs before their successors (except for
1073 latch blocks, for which this assertion wouldn't hold). In the simple
1074 case of the loop forms we allow, a dfs order of the BBs would the same
1075 as reversed postorder traversal, so we are safe. */
1077 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1078 bbs, loop->num_nodes, loop);
1079 gcc_assert (nbbs == loop->num_nodes);
1081 for (unsigned int i = 0; i < nbbs; i++)
1083 basic_block bb = bbs[i];
1084 gimple_stmt_iterator si;
1086 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1088 gimple *phi = gsi_stmt (si);
1089 gimple_set_uid (phi, 0);
1090 add_stmt (phi);
1093 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 gimple *stmt = gsi_stmt (si);
1096 gimple_set_uid (stmt, 0);
1097 if (is_gimple_debug (stmt))
1098 continue;
1099 add_stmt (stmt);
1100 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1101 third argument is the #pragma omp simd if (x) condition, when 0,
1102 loop shouldn't be vectorized, when non-zero constant, it should
1103 be vectorized normally, otherwise versioned with vectorized loop
1104 done if the condition is non-zero at runtime. */
1105 if (loop_in->simduid
1106 && is_gimple_call (stmt)
1107 && gimple_call_internal_p (stmt)
1108 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1109 && gimple_call_num_args (stmt) >= 3
1110 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1111 && (loop_in->simduid
1112 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1114 tree arg = gimple_call_arg (stmt, 2);
1115 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1116 simd_if_cond = arg;
1117 else
1118 gcc_assert (integer_nonzerop (arg));
1123 epilogue_vinfos.create (6);
1126 /* Free all levels of rgroup CONTROLS. */
1128 void
1129 release_vec_loop_controls (vec<rgroup_controls> *controls)
1131 rgroup_controls *rgc;
1132 unsigned int i;
1133 FOR_EACH_VEC_ELT (*controls, i, rgc)
1134 rgc->controls.release ();
1135 controls->release ();
1138 /* Free all memory used by the _loop_vec_info, as well as all the
1139 stmt_vec_info structs of all the stmts in the loop. */
1141 _loop_vec_info::~_loop_vec_info ()
1143 free (bbs);
1145 release_vec_loop_controls (&masks.rgc_vec);
1146 release_vec_loop_controls (&lens);
1147 delete ivexpr_map;
1148 delete scan_map;
1149 epilogue_vinfos.release ();
1150 delete scalar_costs;
1151 delete vector_costs;
1153 /* When we release an epiloge vinfo that we do not intend to use
1154 avoid clearing AUX of the main loop which should continue to
1155 point to the main loop vinfo since otherwise we'll leak that. */
1156 if (loop->aux == this)
1157 loop->aux = NULL;
1160 /* Return an invariant or register for EXPR and emit necessary
1161 computations in the LOOP_VINFO loop preheader. */
1163 tree
1164 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1166 if (is_gimple_reg (expr)
1167 || is_gimple_min_invariant (expr))
1168 return expr;
1170 if (! loop_vinfo->ivexpr_map)
1171 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1172 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1173 if (! cached)
1175 gimple_seq stmts = NULL;
1176 cached = force_gimple_operand (unshare_expr (expr),
1177 &stmts, true, NULL_TREE);
1178 if (stmts)
1180 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1181 gsi_insert_seq_on_edge_immediate (e, stmts);
1184 return cached;
1187 /* Return true if we can use CMP_TYPE as the comparison type to produce
1188 all masks required to mask LOOP_VINFO. */
1190 static bool
1191 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1193 rgroup_controls *rgm;
1194 unsigned int i;
1195 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1196 if (rgm->type != NULL_TREE
1197 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1198 cmp_type, rgm->type,
1199 OPTIMIZE_FOR_SPEED))
1200 return false;
1201 return true;
1204 /* Calculate the maximum number of scalars per iteration for every
1205 rgroup in LOOP_VINFO. */
1207 static unsigned int
1208 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1210 unsigned int res = 1;
1211 unsigned int i;
1212 rgroup_controls *rgm;
1213 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1214 res = MAX (res, rgm->max_nscalars_per_iter);
1215 return res;
1218 /* Calculate the minimum precision necessary to represent:
1220 MAX_NITERS * FACTOR
1222 as an unsigned integer, where MAX_NITERS is the maximum number of
1223 loop header iterations for the original scalar form of LOOP_VINFO. */
1225 static unsigned
1226 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1228 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1230 /* Get the maximum number of iterations that is representable
1231 in the counter type. */
1232 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1233 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1235 /* Get a more refined estimate for the number of iterations. */
1236 widest_int max_back_edges;
1237 if (max_loop_iterations (loop, &max_back_edges))
1238 max_ni = wi::smin (max_ni, max_back_edges + 1);
1240 /* Work out how many bits we need to represent the limit. */
1241 return wi::min_precision (max_ni * factor, UNSIGNED);
1244 /* True if the loop needs peeling or partial vectors when vectorized. */
1246 static bool
1247 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1249 unsigned HOST_WIDE_INT const_vf;
1250 HOST_WIDE_INT max_niter
1251 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1253 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1254 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1255 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1256 (loop_vinfo));
1258 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1259 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1261 /* Work out the (constant) number of iterations that need to be
1262 peeled for reasons other than niters. */
1263 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1264 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1265 peel_niter += 1;
1266 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1267 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1268 return true;
1270 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1271 /* ??? When peeling for gaps but not alignment, we could
1272 try to check whether the (variable) niters is known to be
1273 VF * N + 1. That's something of a niche case though. */
1274 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1275 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1276 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1277 < (unsigned) exact_log2 (const_vf))
1278 /* In case of versioning, check if the maximum number of
1279 iterations is greater than th. If they are identical,
1280 the epilogue is unnecessary. */
1281 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1282 || ((unsigned HOST_WIDE_INT) max_niter
1283 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1284 but that's only computed later based on our result.
1285 The following is the most conservative approximation. */
1286 > (std::max ((unsigned HOST_WIDE_INT) th,
1287 const_vf) / const_vf) * const_vf))))
1288 return true;
1290 return false;
1293 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1294 whether we can actually generate the masks required. Return true if so,
1295 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1297 static bool
1298 vect_verify_full_masking (loop_vec_info loop_vinfo)
1300 unsigned int min_ni_width;
1302 /* Use a normal loop if there are no statements that need masking.
1303 This only happens in rare degenerate cases: it means that the loop
1304 has no loads, no stores, and no live-out values. */
1305 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1306 return false;
1308 /* Produce the rgroup controls. */
1309 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1311 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1312 tree vectype = mask.first;
1313 unsigned nvectors = mask.second;
1315 if (masks->rgc_vec.length () < nvectors)
1316 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1317 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1318 /* The number of scalars per iteration and the number of vectors are
1319 both compile-time constants. */
1320 unsigned int nscalars_per_iter
1321 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1322 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1324 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1326 rgm->max_nscalars_per_iter = nscalars_per_iter;
1327 rgm->type = truth_type_for (vectype);
1328 rgm->factor = 1;
1332 unsigned int max_nscalars_per_iter
1333 = vect_get_max_nscalars_per_iter (loop_vinfo);
1335 /* Work out how many bits we need to represent the limit. */
1336 min_ni_width
1337 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1339 /* Find a scalar mode for which WHILE_ULT is supported. */
1340 opt_scalar_int_mode cmp_mode_iter;
1341 tree cmp_type = NULL_TREE;
1342 tree iv_type = NULL_TREE;
1343 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1344 unsigned int iv_precision = UINT_MAX;
1346 if (iv_limit != -1)
1347 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1348 UNSIGNED);
1350 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1352 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1353 if (cmp_bits >= min_ni_width
1354 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1356 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1357 if (this_type
1358 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1360 /* Although we could stop as soon as we find a valid mode,
1361 there are at least two reasons why that's not always the
1362 best choice:
1364 - An IV that's Pmode or wider is more likely to be reusable
1365 in address calculations than an IV that's narrower than
1366 Pmode.
1368 - Doing the comparison in IV_PRECISION or wider allows
1369 a natural 0-based IV, whereas using a narrower comparison
1370 type requires mitigations against wrap-around.
1372 Conversely, if the IV limit is variable, doing the comparison
1373 in a wider type than the original type can introduce
1374 unnecessary extensions, so picking the widest valid mode
1375 is not always a good choice either.
1377 Here we prefer the first IV type that's Pmode or wider,
1378 and the first comparison type that's IV_PRECISION or wider.
1379 (The comparison type must be no wider than the IV type,
1380 to avoid extensions in the vector loop.)
1382 ??? We might want to try continuing beyond Pmode for ILP32
1383 targets if CMP_BITS < IV_PRECISION. */
1384 iv_type = this_type;
1385 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1386 cmp_type = this_type;
1387 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1388 break;
1393 if (!cmp_type)
1395 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1396 return false;
1399 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1400 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1401 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1402 return true;
1405 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1406 whether we can actually generate AVX512 style masks. Return true if so,
1407 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1409 static bool
1410 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1412 /* Produce differently organized rgc_vec and differently check
1413 we can produce masks. */
1415 /* Use a normal loop if there are no statements that need masking.
1416 This only happens in rare degenerate cases: it means that the loop
1417 has no loads, no stores, and no live-out values. */
1418 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1419 return false;
1421 /* For the decrementing IV we need to represent all values in
1422 [0, niter + niter_skip] where niter_skip is the elements we
1423 skip in the first iteration for prologue peeling. */
1424 tree iv_type = NULL_TREE;
1425 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1426 unsigned int iv_precision = UINT_MAX;
1427 if (iv_limit != -1)
1428 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1430 /* First compute the type for the IV we use to track the remaining
1431 scalar iterations. */
1432 opt_scalar_int_mode cmp_mode_iter;
1433 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1435 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1436 if (cmp_bits >= iv_precision
1437 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1439 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1440 if (iv_type)
1441 break;
1444 if (!iv_type)
1445 return false;
1447 /* Produce the rgroup controls. */
1448 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1450 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1451 tree vectype = mask.first;
1452 unsigned nvectors = mask.second;
1454 /* The number of scalars per iteration and the number of vectors are
1455 both compile-time constants. */
1456 unsigned int nscalars_per_iter
1457 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1458 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1460 /* We index the rgroup_controls vector with nscalars_per_iter
1461 which we keep constant and instead have a varying nvectors,
1462 remembering the vector mask with the fewest nV. */
1463 if (masks->rgc_vec.length () < nscalars_per_iter)
1464 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1465 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1467 if (!rgm->type || rgm->factor > nvectors)
1469 rgm->type = truth_type_for (vectype);
1470 rgm->compare_type = NULL_TREE;
1471 rgm->max_nscalars_per_iter = nscalars_per_iter;
1472 rgm->factor = nvectors;
1473 rgm->bias_adjusted_ctrl = NULL_TREE;
1477 /* There is no fixed compare type we are going to use but we have to
1478 be able to get at one for each mask group. */
1479 unsigned int min_ni_width
1480 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1482 bool ok = true;
1483 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1485 tree mask_type = rgc.type;
1486 if (!mask_type)
1487 continue;
1489 /* For now vect_get_loop_mask only supports integer mode masks
1490 when we need to split it. */
1491 if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1492 || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1494 ok = false;
1495 break;
1498 /* If iv_type is usable as compare type use that - we can elide the
1499 saturation in that case. */
1500 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1502 tree cmp_vectype
1503 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1504 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1505 rgc.compare_type = cmp_vectype;
1507 if (!rgc.compare_type)
1508 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1510 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1511 if (cmp_bits >= min_ni_width
1512 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1514 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1515 if (!cmp_type)
1516 continue;
1518 /* Check whether we can produce the mask with cmp_type. */
1519 tree cmp_vectype
1520 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1521 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1523 rgc.compare_type = cmp_vectype;
1524 break;
1528 if (!rgc.compare_type)
1530 ok = false;
1531 break;
1534 if (!ok)
1536 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1537 return false;
1540 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1541 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1542 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1543 return true;
1546 /* Check whether we can use vector access with length based on precison
1547 comparison. So far, to keep it simple, we only allow the case that the
1548 precision of the target supported length is larger than the precision
1549 required by loop niters. */
1551 static bool
1552 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1554 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1555 return false;
1557 machine_mode len_load_mode, len_store_mode;
1558 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1559 .exists (&len_load_mode))
1560 return false;
1561 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1562 .exists (&len_store_mode))
1563 return false;
1565 signed char partial_load_bias = internal_len_load_store_bias
1566 (IFN_LEN_LOAD, len_load_mode);
1568 signed char partial_store_bias = internal_len_load_store_bias
1569 (IFN_LEN_STORE, len_store_mode);
1571 gcc_assert (partial_load_bias == partial_store_bias);
1573 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1574 return false;
1576 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1577 len_loads with a length of zero. In order to avoid that we prohibit
1578 more than one loop length here. */
1579 if (partial_load_bias == -1
1580 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1581 return false;
1583 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1585 unsigned int max_nitems_per_iter = 1;
1586 unsigned int i;
1587 rgroup_controls *rgl;
1588 /* Find the maximum number of items per iteration for every rgroup. */
1589 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1591 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1592 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1595 /* Work out how many bits we need to represent the length limit. */
1596 unsigned int min_ni_prec
1597 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1599 /* Now use the maximum of below precisions for one suitable IV type:
1600 - the IV's natural precision
1601 - the precision needed to hold: the maximum number of scalar
1602 iterations multiplied by the scale factor (min_ni_prec above)
1603 - the Pmode precision
1605 If min_ni_prec is less than the precision of the current niters,
1606 we perfer to still use the niters type. Prefer to use Pmode and
1607 wider IV to avoid narrow conversions. */
1609 unsigned int ni_prec
1610 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1611 min_ni_prec = MAX (min_ni_prec, ni_prec);
1612 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1614 tree iv_type = NULL_TREE;
1615 opt_scalar_int_mode tmode_iter;
1616 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1618 scalar_mode tmode = tmode_iter.require ();
1619 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1621 /* ??? Do we really want to construct one IV whose precision exceeds
1622 BITS_PER_WORD? */
1623 if (tbits > BITS_PER_WORD)
1624 break;
1626 /* Find the first available standard integral type. */
1627 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1629 iv_type = build_nonstandard_integer_type (tbits, true);
1630 break;
1634 if (!iv_type)
1636 if (dump_enabled_p ())
1637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1638 "can't vectorize with length-based partial vectors"
1639 " because there is no suitable iv type.\n");
1640 return false;
1643 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1644 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1645 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1647 return true;
1650 /* Calculate the cost of one scalar iteration of the loop. */
1651 static void
1652 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1654 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1655 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1656 int nbbs = loop->num_nodes, factor;
1657 int innerloop_iters, i;
1659 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1661 /* Gather costs for statements in the scalar loop. */
1663 /* FORNOW. */
1664 innerloop_iters = 1;
1665 if (loop->inner)
1666 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1668 for (i = 0; i < nbbs; i++)
1670 gimple_stmt_iterator si;
1671 basic_block bb = bbs[i];
1673 if (bb->loop_father == loop->inner)
1674 factor = innerloop_iters;
1675 else
1676 factor = 1;
1678 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1680 gimple *stmt = gsi_stmt (si);
1681 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1683 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1684 continue;
1686 /* Skip stmts that are not vectorized inside the loop. */
1687 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1688 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1689 && (!STMT_VINFO_LIVE_P (vstmt_info)
1690 || !VECTORIZABLE_CYCLE_DEF
1691 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1692 continue;
1694 vect_cost_for_stmt kind;
1695 if (STMT_VINFO_DATA_REF (stmt_info))
1697 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1698 kind = scalar_load;
1699 else
1700 kind = scalar_store;
1702 else if (vect_nop_conversion_p (stmt_info))
1703 continue;
1704 else
1705 kind = scalar_stmt;
1707 /* We are using vect_prologue here to avoid scaling twice
1708 by the inner loop factor. */
1709 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1710 factor, kind, stmt_info, 0, vect_prologue);
1714 /* Now accumulate cost. */
1715 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1716 add_stmt_costs (loop_vinfo->scalar_costs,
1717 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1718 loop_vinfo->scalar_costs->finish_cost (nullptr);
1721 /* Function vect_analyze_loop_form.
1723 Verify that certain CFG restrictions hold, including:
1724 - the loop has a pre-header
1725 - the loop has a single entry
1726 - nested loops can have only a single exit.
1727 - the loop exit condition is simple enough
1728 - the number of iterations can be analyzed, i.e, a countable loop. The
1729 niter could be analyzed under some assumptions. */
1731 opt_result
1732 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1734 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1736 edge exit_e = vec_init_loop_exit_info (loop);
1737 if (!exit_e)
1738 return opt_result::failure_at (vect_location,
1739 "not vectorized:"
1740 " could not determine main exit from"
1741 " loop with multiple exits.\n");
1742 info->loop_exit = exit_e;
1743 if (dump_enabled_p ())
1744 dump_printf_loc (MSG_NOTE, vect_location,
1745 "using as main loop exit: %d -> %d [AUX: %p]\n",
1746 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1748 /* Check if we have any control flow that doesn't leave the loop. */
1749 class loop *v_loop = loop->inner ? loop->inner : loop;
1750 basic_block *bbs = get_loop_body (v_loop);
1751 for (unsigned i = 0; i < v_loop->num_nodes; i++)
1752 if (EDGE_COUNT (bbs[i]->succs) != 1
1753 && (EDGE_COUNT (bbs[i]->succs) != 2
1754 || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1756 free (bbs);
1757 return opt_result::failure_at (vect_location,
1758 "not vectorized:"
1759 " unsupported control flow in loop.\n");
1761 free (bbs);
1763 /* Different restrictions apply when we are considering an inner-most loop,
1764 vs. an outer (nested) loop.
1765 (FORNOW. May want to relax some of these restrictions in the future). */
1767 info->inner_loop_cond = NULL;
1768 if (!loop->inner)
1770 /* Inner-most loop. */
1772 if (empty_block_p (loop->header))
1773 return opt_result::failure_at (vect_location,
1774 "not vectorized: empty loop.\n");
1776 else
1778 class loop *innerloop = loop->inner;
1779 edge entryedge;
1781 /* Nested loop. We currently require that the loop is doubly-nested,
1782 contains a single inner loop with a single exit to the block
1783 with the single exit condition in the outer loop.
1784 Vectorizable outer-loops look like this:
1786 (pre-header)
1788 header <---+
1790 inner-loop |
1792 tail ------+
1794 (exit-bb)
1796 The inner-loop also has the properties expected of inner-most loops
1797 as described above. */
1799 if ((loop->inner)->inner || (loop->inner)->next)
1800 return opt_result::failure_at (vect_location,
1801 "not vectorized:"
1802 " multiple nested loops.\n");
1804 entryedge = loop_preheader_edge (innerloop);
1805 if (entryedge->src != loop->header
1806 || !single_exit (innerloop)
1807 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1808 return opt_result::failure_at (vect_location,
1809 "not vectorized:"
1810 " unsupported outerloop form.\n");
1812 /* Analyze the inner-loop. */
1813 vect_loop_form_info inner;
1814 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1815 if (!res)
1817 if (dump_enabled_p ())
1818 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1819 "not vectorized: Bad inner loop.\n");
1820 return res;
1823 /* Don't support analyzing niter under assumptions for inner
1824 loop. */
1825 if (!integer_onep (inner.assumptions))
1826 return opt_result::failure_at (vect_location,
1827 "not vectorized: Bad inner loop.\n");
1829 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1830 return opt_result::failure_at (vect_location,
1831 "not vectorized: inner-loop count not"
1832 " invariant.\n");
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_NOTE, vect_location,
1836 "Considering outer-loop vectorization.\n");
1837 info->inner_loop_cond = inner.conds[0];
1840 if (EDGE_COUNT (loop->header->preds) != 2)
1841 return opt_result::failure_at (vect_location,
1842 "not vectorized:"
1843 " too many incoming edges.\n");
1845 /* We assume that the latch is empty. */
1846 if (!empty_block_p (loop->latch)
1847 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1848 return opt_result::failure_at (vect_location,
1849 "not vectorized: latch block not empty.\n");
1851 /* Make sure there is no abnormal exit. */
1852 auto_vec<edge> exits = get_loop_exit_edges (loop);
1853 for (edge e : exits)
1855 if (e->flags & EDGE_ABNORMAL)
1856 return opt_result::failure_at (vect_location,
1857 "not vectorized:"
1858 " abnormal loop exit edge.\n");
1861 info->conds
1862 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1863 &info->number_of_iterations,
1864 &info->number_of_iterationsm1);
1865 if (info->conds.is_empty ())
1866 return opt_result::failure_at
1867 (vect_location,
1868 "not vectorized: complicated exit condition.\n");
1870 /* Determine what the primary and alternate exit conds are. */
1871 for (unsigned i = 0; i < info->conds.length (); i++)
1873 gcond *cond = info->conds[i];
1874 if (exit_e->src == gimple_bb (cond))
1875 std::swap (info->conds[0], info->conds[i]);
1878 if (integer_zerop (info->assumptions)
1879 || !info->number_of_iterations
1880 || chrec_contains_undetermined (info->number_of_iterations))
1881 return opt_result::failure_at
1882 (info->conds[0],
1883 "not vectorized: number of iterations cannot be computed.\n");
1885 if (integer_zerop (info->number_of_iterations))
1886 return opt_result::failure_at
1887 (info->conds[0],
1888 "not vectorized: number of iterations = 0.\n");
1890 if (!(tree_fits_shwi_p (info->number_of_iterations)
1891 && tree_to_shwi (info->number_of_iterations) > 0))
1893 if (dump_enabled_p ())
1895 dump_printf_loc (MSG_NOTE, vect_location,
1896 "Symbolic number of iterations is ");
1897 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1898 dump_printf (MSG_NOTE, "\n");
1902 return opt_result::success ();
1905 /* Create a loop_vec_info for LOOP with SHARED and the
1906 vect_analyze_loop_form result. */
1908 loop_vec_info
1909 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1910 const vect_loop_form_info *info,
1911 loop_vec_info main_loop_info)
1913 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1914 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1915 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1916 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1917 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1918 /* Also record the assumptions for versioning. */
1919 if (!integer_onep (info->assumptions) && !main_loop_info)
1920 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1922 for (gcond *cond : info->conds)
1924 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1925 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1926 /* Mark the statement as a condition. */
1927 STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1930 for (unsigned i = 1; i < info->conds.length (); i ++)
1931 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1932 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1934 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1936 /* Check to see if we're vectorizing multiple exits. */
1937 LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1938 = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1940 if (info->inner_loop_cond)
1942 stmt_vec_info inner_loop_cond_info
1943 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1944 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1945 /* If we have an estimate on the number of iterations of the inner
1946 loop use that to limit the scale for costing, otherwise use
1947 --param vect-inner-loop-cost-factor literally. */
1948 widest_int nit;
1949 if (estimated_stmt_executions (loop->inner, &nit))
1950 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1951 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1954 return loop_vinfo;
1959 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1960 statements update the vectorization factor. */
1962 static void
1963 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1965 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1966 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1967 int nbbs = loop->num_nodes;
1968 poly_uint64 vectorization_factor;
1969 int i;
1971 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1973 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1974 gcc_assert (known_ne (vectorization_factor, 0U));
1976 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1977 vectorization factor of the loop is the unrolling factor required by
1978 the SLP instances. If that unrolling factor is 1, we say, that we
1979 perform pure SLP on loop - cross iteration parallelism is not
1980 exploited. */
1981 bool only_slp_in_loop = true;
1982 for (i = 0; i < nbbs; i++)
1984 basic_block bb = bbs[i];
1985 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1986 gsi_next (&si))
1988 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1989 if (!stmt_info)
1990 continue;
1991 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1992 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1993 && !PURE_SLP_STMT (stmt_info))
1994 /* STMT needs both SLP and loop-based vectorization. */
1995 only_slp_in_loop = false;
1997 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1998 gsi_next (&si))
2000 if (is_gimple_debug (gsi_stmt (si)))
2001 continue;
2002 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2003 stmt_info = vect_stmt_to_vectorize (stmt_info);
2004 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2005 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2006 && !PURE_SLP_STMT (stmt_info))
2007 /* STMT needs both SLP and loop-based vectorization. */
2008 only_slp_in_loop = false;
2012 if (only_slp_in_loop)
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "Loop contains only SLP stmts\n");
2017 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2019 else
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_NOTE, vect_location,
2023 "Loop contains SLP and non-SLP stmts\n");
2024 /* Both the vectorization factor and unroll factor have the form
2025 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2026 so they must have a common multiple. */
2027 vectorization_factor
2028 = force_common_multiple (vectorization_factor,
2029 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2032 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2033 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_NOTE, vect_location,
2036 "Updating vectorization factor to ");
2037 dump_dec (MSG_NOTE, vectorization_factor);
2038 dump_printf (MSG_NOTE, ".\n");
2042 /* Return true if STMT_INFO describes a double reduction phi and if
2043 the other phi in the reduction is also relevant for vectorization.
2044 This rejects cases such as:
2046 outer1:
2047 x_1 = PHI <x_3(outer2), ...>;
2050 inner:
2051 x_2 = ...;
2054 outer2:
2055 x_3 = PHI <x_2(inner)>;
2057 if nothing in x_2 or elsewhere makes x_1 relevant. */
2059 static bool
2060 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2062 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2063 return false;
2065 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2068 /* Function vect_analyze_loop_operations.
2070 Scan the loop stmts and make sure they are all vectorizable. */
2072 static opt_result
2073 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2075 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2076 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2077 int nbbs = loop->num_nodes;
2078 int i;
2079 stmt_vec_info stmt_info;
2080 bool need_to_vectorize = false;
2081 bool ok;
2083 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2085 auto_vec<stmt_info_for_cost> cost_vec;
2087 for (i = 0; i < nbbs; i++)
2089 basic_block bb = bbs[i];
2091 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2092 gsi_next (&si))
2094 gphi *phi = si.phi ();
2095 ok = true;
2097 stmt_info = loop_vinfo->lookup_stmt (phi);
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2100 (gimple *) phi);
2101 if (virtual_operand_p (gimple_phi_result (phi)))
2102 continue;
2104 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2105 (i.e., a phi in the tail of the outer-loop). */
2106 if (! is_loop_header_bb_p (bb))
2108 /* FORNOW: we currently don't support the case that these phis
2109 are not used in the outerloop (unless it is double reduction,
2110 i.e., this phi is vect_reduction_def), cause this case
2111 requires to actually do something here. */
2112 if (STMT_VINFO_LIVE_P (stmt_info)
2113 && !vect_active_double_reduction_p (stmt_info))
2114 return opt_result::failure_at (phi,
2115 "Unsupported loop-closed phi"
2116 " in outer-loop.\n");
2118 /* If PHI is used in the outer loop, we check that its operand
2119 is defined in the inner loop. */
2120 if (STMT_VINFO_RELEVANT_P (stmt_info))
2122 tree phi_op;
2124 if (gimple_phi_num_args (phi) != 1)
2125 return opt_result::failure_at (phi, "unsupported phi");
2127 phi_op = PHI_ARG_DEF (phi, 0);
2128 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2129 if (!op_def_info)
2130 return opt_result::failure_at (phi, "unsupported phi\n");
2132 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2133 && (STMT_VINFO_RELEVANT (op_def_info)
2134 != vect_used_in_outer_by_reduction))
2135 return opt_result::failure_at (phi, "unsupported phi\n");
2137 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2138 || (STMT_VINFO_DEF_TYPE (stmt_info)
2139 == vect_double_reduction_def))
2140 && !vectorizable_lc_phi (loop_vinfo,
2141 stmt_info, NULL, NULL))
2142 return opt_result::failure_at (phi, "unsupported phi\n");
2145 continue;
2148 gcc_assert (stmt_info);
2150 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2151 || STMT_VINFO_LIVE_P (stmt_info))
2152 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2153 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2154 /* A scalar-dependence cycle that we don't support. */
2155 return opt_result::failure_at (phi,
2156 "not vectorized:"
2157 " scalar dependence cycle.\n");
2159 if (STMT_VINFO_RELEVANT_P (stmt_info))
2161 need_to_vectorize = true;
2162 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2163 && ! PURE_SLP_STMT (stmt_info))
2164 ok = vectorizable_induction (loop_vinfo,
2165 stmt_info, NULL, NULL,
2166 &cost_vec);
2167 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2168 || (STMT_VINFO_DEF_TYPE (stmt_info)
2169 == vect_double_reduction_def)
2170 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2171 && ! PURE_SLP_STMT (stmt_info))
2172 ok = vectorizable_reduction (loop_vinfo,
2173 stmt_info, NULL, NULL, &cost_vec);
2174 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2175 == vect_first_order_recurrence)
2176 && ! PURE_SLP_STMT (stmt_info))
2177 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2178 &cost_vec);
2181 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2182 if (ok
2183 && STMT_VINFO_LIVE_P (stmt_info)
2184 && !PURE_SLP_STMT (stmt_info))
2185 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2186 -1, false, &cost_vec);
2188 if (!ok)
2189 return opt_result::failure_at (phi,
2190 "not vectorized: relevant phi not "
2191 "supported: %G",
2192 static_cast <gimple *> (phi));
2195 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2196 gsi_next (&si))
2198 gimple *stmt = gsi_stmt (si);
2199 if (!gimple_clobber_p (stmt)
2200 && !is_gimple_debug (stmt))
2202 opt_result res
2203 = vect_analyze_stmt (loop_vinfo,
2204 loop_vinfo->lookup_stmt (stmt),
2205 &need_to_vectorize,
2206 NULL, NULL, &cost_vec);
2207 if (!res)
2208 return res;
2211 } /* bbs */
2213 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2215 /* All operations in the loop are either irrelevant (deal with loop
2216 control, or dead), or only used outside the loop and can be moved
2217 out of the loop (e.g. invariants, inductions). The loop can be
2218 optimized away by scalar optimizations. We're better off not
2219 touching this loop. */
2220 if (!need_to_vectorize)
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "All the computation can be taken out of the loop.\n");
2225 return opt_result::failure_at
2226 (vect_location,
2227 "not vectorized: redundant loop. no profit to vectorize.\n");
2230 return opt_result::success ();
2233 /* Return true if we know that the iteration count is smaller than the
2234 vectorization factor. Return false if it isn't, or if we can't be sure
2235 either way. */
2237 static bool
2238 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2240 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2242 HOST_WIDE_INT max_niter;
2243 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2244 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2245 else
2246 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2248 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2249 return true;
2251 return false;
2254 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2255 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2256 definitely no, or -1 if it's worth retrying. */
2258 static int
2259 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2260 unsigned *suggested_unroll_factor)
2262 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2263 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2265 /* Only loops that can handle partially-populated vectors can have iteration
2266 counts less than the vectorization factor. */
2267 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2268 && vect_known_niters_smaller_than_vf (loop_vinfo))
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "not vectorized: iteration count smaller than "
2273 "vectorization factor.\n");
2274 return 0;
2277 /* If we know the number of iterations we can do better, for the
2278 epilogue we can also decide whether the main loop leaves us
2279 with enough iterations, prefering a smaller vector epilog then
2280 also possibly used for the case we skip the vector loop. */
2281 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2283 widest_int scalar_niters
2284 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2285 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2287 loop_vec_info orig_loop_vinfo
2288 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2289 unsigned lowest_vf
2290 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2291 int prolog_peeling = 0;
2292 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2293 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2294 if (prolog_peeling >= 0
2295 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2296 lowest_vf))
2298 unsigned gap
2299 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2300 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2301 % lowest_vf + gap);
2304 /* Reject vectorizing for a single scalar iteration, even if
2305 we could in principle implement that using partial vectors. */
2306 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2307 if (scalar_niters <= peeling_gap + 1)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "not vectorized: loop only has a single "
2312 "scalar iteration.\n");
2313 return 0;
2316 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2318 /* Check that the loop processes at least one full vector. */
2319 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2320 if (known_lt (scalar_niters, vf))
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2324 "loop does not have enough iterations "
2325 "to support vectorization.\n");
2326 return 0;
2329 /* If we need to peel an extra epilogue iteration to handle data
2330 accesses with gaps, check that there are enough scalar iterations
2331 available.
2333 The check above is redundant with this one when peeling for gaps,
2334 but the distinction is useful for diagnostics. */
2335 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2336 && known_le (scalar_niters, vf))
2338 if (dump_enabled_p ())
2339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2340 "loop does not have enough iterations "
2341 "to support peeling for gaps.\n");
2342 return 0;
2347 /* If using the "very cheap" model. reject cases in which we'd keep
2348 a copy of the scalar code (even if we might be able to vectorize it). */
2349 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2350 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2351 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2352 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2354 if (dump_enabled_p ())
2355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2356 "some scalar iterations would need to be peeled\n");
2357 return 0;
2360 int min_profitable_iters, min_profitable_estimate;
2361 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2362 &min_profitable_estimate,
2363 suggested_unroll_factor);
2365 if (min_profitable_iters < 0)
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2369 "not vectorized: vectorization not profitable.\n");
2370 if (dump_enabled_p ())
2371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2372 "not vectorized: vector version will never be "
2373 "profitable.\n");
2374 return -1;
2377 int min_scalar_loop_bound = (param_min_vect_loop_bound
2378 * assumed_vf);
2380 /* Use the cost model only if it is more conservative than user specified
2381 threshold. */
2382 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2383 min_profitable_iters);
2385 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2387 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2388 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392 "not vectorized: vectorization not profitable.\n");
2393 if (dump_enabled_p ())
2394 dump_printf_loc (MSG_NOTE, vect_location,
2395 "not vectorized: iteration count smaller than user "
2396 "specified loop bound parameter or minimum profitable "
2397 "iterations (whichever is more conservative).\n");
2398 return 0;
2401 /* The static profitablity threshold min_profitable_estimate includes
2402 the cost of having to check at runtime whether the scalar loop
2403 should be used instead. If it turns out that we don't need or want
2404 such a check, the threshold we should use for the static estimate
2405 is simply the point at which the vector loop becomes more profitable
2406 than the scalar loop. */
2407 if (min_profitable_estimate > min_profitable_iters
2408 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2410 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2411 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2413 if (dump_enabled_p ())
2414 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2415 " choice between the scalar and vector loops\n");
2416 min_profitable_estimate = min_profitable_iters;
2419 /* If the vector loop needs multiple iterations to be beneficial then
2420 things are probably too close to call, and the conservative thing
2421 would be to stick with the scalar code. */
2422 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2423 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "one iteration of the vector loop would be"
2428 " more expensive than the equivalent number of"
2429 " iterations of the scalar loop\n");
2430 return 0;
2433 HOST_WIDE_INT estimated_niter;
2435 /* If we are vectorizing an epilogue then we know the maximum number of
2436 scalar iterations it will cover is at least one lower than the
2437 vectorization factor of the main loop. */
2438 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2439 estimated_niter
2440 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2441 else
2443 estimated_niter = estimated_stmt_executions_int (loop);
2444 if (estimated_niter == -1)
2445 estimated_niter = likely_max_stmt_executions_int (loop);
2447 if (estimated_niter != -1
2448 && ((unsigned HOST_WIDE_INT) estimated_niter
2449 < MAX (th, (unsigned) min_profitable_estimate)))
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "not vectorized: estimated iteration count too "
2454 "small.\n");
2455 if (dump_enabled_p ())
2456 dump_printf_loc (MSG_NOTE, vect_location,
2457 "not vectorized: estimated iteration count smaller "
2458 "than specified loop bound parameter or minimum "
2459 "profitable iterations (whichever is more "
2460 "conservative).\n");
2461 return -1;
2464 return 1;
2467 static opt_result
2468 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2469 vec<data_reference_p> *datarefs,
2470 unsigned int *n_stmts)
2472 *n_stmts = 0;
2473 for (unsigned i = 0; i < loop->num_nodes; i++)
2474 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2475 !gsi_end_p (gsi); gsi_next (&gsi))
2477 gimple *stmt = gsi_stmt (gsi);
2478 if (is_gimple_debug (stmt))
2479 continue;
2480 ++(*n_stmts);
2481 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2482 NULL, 0);
2483 if (!res)
2485 if (is_gimple_call (stmt) && loop->safelen)
2487 tree fndecl = gimple_call_fndecl (stmt), op;
2488 if (fndecl == NULL_TREE
2489 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2491 fndecl = gimple_call_arg (stmt, 0);
2492 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2493 fndecl = TREE_OPERAND (fndecl, 0);
2494 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2496 if (fndecl != NULL_TREE)
2498 cgraph_node *node = cgraph_node::get (fndecl);
2499 if (node != NULL && node->simd_clones != NULL)
2501 unsigned int j, n = gimple_call_num_args (stmt);
2502 for (j = 0; j < n; j++)
2504 op = gimple_call_arg (stmt, j);
2505 if (DECL_P (op)
2506 || (REFERENCE_CLASS_P (op)
2507 && get_base_address (op)))
2508 break;
2510 op = gimple_call_lhs (stmt);
2511 /* Ignore #pragma omp declare simd functions
2512 if they don't have data references in the
2513 call stmt itself. */
2514 if (j == n
2515 && !(op
2516 && (DECL_P (op)
2517 || (REFERENCE_CLASS_P (op)
2518 && get_base_address (op)))))
2519 continue;
2523 return res;
2525 /* If dependence analysis will give up due to the limit on the
2526 number of datarefs stop here and fail fatally. */
2527 if (datarefs->length ()
2528 > (unsigned)param_loop_max_datarefs_for_datadeps)
2529 return opt_result::failure_at (stmt, "exceeded param "
2530 "loop-max-datarefs-for-datadeps\n");
2532 return opt_result::success ();
2535 /* Look for SLP-only access groups and turn each individual access into its own
2536 group. */
2537 static void
2538 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2540 unsigned int i;
2541 struct data_reference *dr;
2543 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2545 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2546 FOR_EACH_VEC_ELT (datarefs, i, dr)
2548 gcc_assert (DR_REF (dr));
2549 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2551 /* Check if the load is a part of an interleaving chain. */
2552 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2554 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2555 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2556 unsigned int group_size = DR_GROUP_SIZE (first_element);
2558 /* Check if SLP-only groups. */
2559 if (!STMT_SLP_TYPE (stmt_info)
2560 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2562 /* Dissolve the group. */
2563 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2565 stmt_vec_info vinfo = first_element;
2566 while (vinfo)
2568 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2569 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2570 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2571 DR_GROUP_SIZE (vinfo) = 1;
2572 if (STMT_VINFO_STRIDED_P (first_element)
2573 /* We cannot handle stores with gaps. */
2574 || DR_IS_WRITE (dr_info->dr))
2576 STMT_VINFO_STRIDED_P (vinfo) = true;
2577 DR_GROUP_GAP (vinfo) = 0;
2579 else
2580 DR_GROUP_GAP (vinfo) = group_size - 1;
2581 /* Duplicate and adjust alignment info, it needs to
2582 be present on each group leader, see dr_misalignment. */
2583 if (vinfo != first_element)
2585 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2586 dr_info2->target_alignment = dr_info->target_alignment;
2587 int misalignment = dr_info->misalignment;
2588 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2590 HOST_WIDE_INT diff
2591 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2592 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2593 unsigned HOST_WIDE_INT align_c
2594 = dr_info->target_alignment.to_constant ();
2595 misalignment = (misalignment + diff) % align_c;
2597 dr_info2->misalignment = misalignment;
2599 vinfo = next;
2606 /* Determine if operating on full vectors for LOOP_VINFO might leave
2607 some scalar iterations still to do. If so, decide how we should
2608 handle those scalar iterations. The possibilities are:
2610 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2611 In this case:
2613 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2614 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2615 LOOP_VINFO_PEELING_FOR_NITER == false
2617 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2618 to handle the remaining scalar iterations. In this case:
2620 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2621 LOOP_VINFO_PEELING_FOR_NITER == true
2623 There are two choices:
2625 (2a) Consider vectorizing the epilogue loop at the same VF as the
2626 main loop, but using partial vectors instead of full vectors.
2627 In this case:
2629 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2631 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2632 In this case:
2634 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2637 opt_result
2638 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2640 /* Determine whether there would be any scalar iterations left over. */
2641 bool need_peeling_or_partial_vectors_p
2642 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2644 /* Decide whether to vectorize the loop with partial vectors. */
2645 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2646 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2647 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2648 && need_peeling_or_partial_vectors_p)
2650 /* For partial-vector-usage=1, try to push the handling of partial
2651 vectors to the epilogue, with the main loop continuing to operate
2652 on full vectors.
2654 If we are unrolling we also do not want to use partial vectors. This
2655 is to avoid the overhead of generating multiple masks and also to
2656 avoid having to execute entire iterations of FALSE masked instructions
2657 when dealing with one or less full iterations.
2659 ??? We could then end up failing to use partial vectors if we
2660 decide to peel iterations into a prologue, and if the main loop
2661 then ends up processing fewer than VF iterations. */
2662 if ((param_vect_partial_vector_usage == 1
2663 || loop_vinfo->suggested_unroll_factor > 1)
2664 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2665 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2666 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2667 else
2668 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2671 if (dump_enabled_p ())
2672 dump_printf_loc (MSG_NOTE, vect_location,
2673 "operating on %s vectors%s.\n",
2674 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2675 ? "partial" : "full",
2676 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2677 ? " for epilogue loop" : "");
2679 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2680 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681 && need_peeling_or_partial_vectors_p);
2683 /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2684 analysis that we don't know whether the loop is vectorized by partial
2685 vectors (More details see tree-vect-loop-manip.cc).
2687 However, SELECT_VL vectorizaton style should only applied on partial
2688 vectorization since SELECT_VL is the GIMPLE IR that calculates the
2689 number of elements to be process for each iteration.
2691 After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2692 if it is not partial vectorized loop. */
2693 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2694 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2696 return opt_result::success ();
2699 /* Function vect_analyze_loop_2.
2701 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2702 analyses will record information in some members of LOOP_VINFO. FATAL
2703 indicates if some analysis meets fatal error. If one non-NULL pointer
2704 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2705 worked out suggested unroll factor, while one NULL pointer shows it's
2706 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2707 is to hold the slp decision when the suggested unroll factor is worked
2708 out. */
2709 static opt_result
2710 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2711 unsigned *suggested_unroll_factor,
2712 bool& slp_done_for_suggested_uf)
2714 opt_result ok = opt_result::success ();
2715 int res;
2716 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2717 poly_uint64 min_vf = 2;
2718 loop_vec_info orig_loop_vinfo = NULL;
2720 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2721 loop_vec_info of the first vectorized loop. */
2722 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2723 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2724 else
2725 orig_loop_vinfo = loop_vinfo;
2726 gcc_assert (orig_loop_vinfo);
2728 /* The first group of checks is independent of the vector size. */
2729 fatal = true;
2731 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2732 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2733 return opt_result::failure_at (vect_location,
2734 "not vectorized: simd if(0)\n");
2736 /* Find all data references in the loop (which correspond to vdefs/vuses)
2737 and analyze their evolution in the loop. */
2739 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2741 /* Gather the data references and count stmts in the loop. */
2742 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2744 opt_result res
2745 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2746 &LOOP_VINFO_DATAREFS (loop_vinfo),
2747 &LOOP_VINFO_N_STMTS (loop_vinfo));
2748 if (!res)
2750 if (dump_enabled_p ())
2751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2752 "not vectorized: loop contains function "
2753 "calls or data references that cannot "
2754 "be analyzed\n");
2755 return res;
2757 loop_vinfo->shared->save_datarefs ();
2759 else
2760 loop_vinfo->shared->check_datarefs ();
2762 /* Analyze the data references and also adjust the minimal
2763 vectorization factor according to the loads and stores. */
2765 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2766 if (!ok)
2768 if (dump_enabled_p ())
2769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2770 "bad data references.\n");
2771 return ok;
2774 /* Check if we are applying unroll factor now. */
2775 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2776 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2778 /* If the slp decision is false when suggested unroll factor is worked
2779 out, and we are applying suggested unroll factor, we can simply skip
2780 all slp related analyses this time. */
2781 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2783 /* Classify all cross-iteration scalar data-flow cycles.
2784 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2785 vect_analyze_scalar_cycles (loop_vinfo, slp);
2787 vect_pattern_recog (loop_vinfo);
2789 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2791 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2792 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2794 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2795 if (!ok)
2797 if (dump_enabled_p ())
2798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2799 "bad data access.\n");
2800 return ok;
2803 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2805 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2806 if (!ok)
2808 if (dump_enabled_p ())
2809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2810 "unexpected pattern.\n");
2811 return ok;
2814 /* While the rest of the analysis below depends on it in some way. */
2815 fatal = false;
2817 /* Analyze data dependences between the data-refs in the loop
2818 and adjust the maximum vectorization factor according to
2819 the dependences.
2820 FORNOW: fail at the first data dependence that we encounter. */
2822 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2823 if (!ok)
2825 if (dump_enabled_p ())
2826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2827 "bad data dependence.\n");
2828 return ok;
2830 if (max_vf != MAX_VECTORIZATION_FACTOR
2831 && maybe_lt (max_vf, min_vf))
2832 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2833 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2835 ok = vect_determine_vectorization_factor (loop_vinfo);
2836 if (!ok)
2838 if (dump_enabled_p ())
2839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2840 "can't determine vectorization factor.\n");
2841 return ok;
2844 /* Compute the scalar iteration cost. */
2845 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2847 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2849 if (slp)
2851 /* Check the SLP opportunities in the loop, analyze and build
2852 SLP trees. */
2853 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2854 if (!ok)
2855 return ok;
2857 /* If there are any SLP instances mark them as pure_slp. */
2858 slp = vect_make_slp_decision (loop_vinfo);
2859 if (slp)
2861 /* Find stmts that need to be both vectorized and SLPed. */
2862 vect_detect_hybrid_slp (loop_vinfo);
2864 /* Update the vectorization factor based on the SLP decision. */
2865 vect_update_vf_for_slp (loop_vinfo);
2867 /* Optimize the SLP graph with the vectorization factor fixed. */
2868 vect_optimize_slp (loop_vinfo);
2870 /* Gather the loads reachable from the SLP graph entries. */
2871 vect_gather_slp_loads (loop_vinfo);
2875 bool saved_can_use_partial_vectors_p
2876 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2878 /* We don't expect to have to roll back to anything other than an empty
2879 set of rgroups. */
2880 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2882 /* This is the point where we can re-start analysis with SLP forced off. */
2883 start_over:
2885 /* Apply the suggested unrolling factor, this was determined by the backend
2886 during finish_cost the first time we ran the analyzis for this
2887 vector mode. */
2888 if (applying_suggested_uf)
2889 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2891 /* Now the vectorization factor is final. */
2892 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2893 gcc_assert (known_ne (vectorization_factor, 0U));
2895 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location,
2898 "vectorization_factor = ");
2899 dump_dec (MSG_NOTE, vectorization_factor);
2900 dump_printf (MSG_NOTE, ", niters = %wd\n",
2901 LOOP_VINFO_INT_NITERS (loop_vinfo));
2904 if (max_vf != MAX_VECTORIZATION_FACTOR
2905 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2906 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2908 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2910 /* Analyze the alignment of the data-refs in the loop.
2911 Fail if a data reference is found that cannot be vectorized. */
2913 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2914 if (!ok)
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2918 "bad data alignment.\n");
2919 return ok;
2922 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2923 It is important to call pruning after vect_analyze_data_ref_accesses,
2924 since we use grouping information gathered by interleaving analysis. */
2925 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2926 if (!ok)
2927 return ok;
2929 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2930 vectorization, since we do not want to add extra peeling or
2931 add versioning for alignment. */
2932 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2933 /* This pass will decide on using loop versioning and/or loop peeling in
2934 order to enhance the alignment of data references in the loop. */
2935 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2936 if (!ok)
2937 return ok;
2939 if (slp)
2941 /* Analyze operations in the SLP instances. Note this may
2942 remove unsupported SLP instances which makes the above
2943 SLP kind detection invalid. */
2944 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2945 vect_slp_analyze_operations (loop_vinfo);
2946 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2948 ok = opt_result::failure_at (vect_location,
2949 "unsupported SLP instances\n");
2950 goto again;
2953 /* Check whether any load in ALL SLP instances is possibly permuted. */
2954 slp_tree load_node, slp_root;
2955 unsigned i, x;
2956 slp_instance instance;
2957 bool can_use_lanes = true;
2958 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2960 slp_root = SLP_INSTANCE_TREE (instance);
2961 int group_size = SLP_TREE_LANES (slp_root);
2962 tree vectype = SLP_TREE_VECTYPE (slp_root);
2963 bool loads_permuted = false;
2964 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2966 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2967 continue;
2968 unsigned j;
2969 stmt_vec_info load_info;
2970 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2971 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2973 loads_permuted = true;
2974 break;
2978 /* If the loads and stores can be handled with load/store-lane
2979 instructions record it and move on to the next instance. */
2980 if (loads_permuted
2981 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2982 && vect_store_lanes_supported (vectype, group_size, false)
2983 != IFN_LAST)
2985 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2986 if (STMT_VINFO_GROUPED_ACCESS
2987 (SLP_TREE_REPRESENTATIVE (load_node)))
2989 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2990 (SLP_TREE_REPRESENTATIVE (load_node));
2991 /* Use SLP for strided accesses (or if we can't
2992 load-lanes). */
2993 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2994 || vect_load_lanes_supported
2995 (STMT_VINFO_VECTYPE (stmt_vinfo),
2996 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2997 break;
3000 can_use_lanes
3001 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3003 if (can_use_lanes && dump_enabled_p ())
3004 dump_printf_loc (MSG_NOTE, vect_location,
3005 "SLP instance %p can use load/store-lanes\n",
3006 (void *) instance);
3008 else
3010 can_use_lanes = false;
3011 break;
3015 /* If all SLP instances can use load/store-lanes abort SLP and try again
3016 with SLP disabled. */
3017 if (can_use_lanes)
3019 ok = opt_result::failure_at (vect_location,
3020 "Built SLP cancelled: can use "
3021 "load/store-lanes\n");
3022 if (dump_enabled_p ())
3023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3024 "Built SLP cancelled: all SLP instances support "
3025 "load/store-lanes\n");
3026 goto again;
3030 /* Dissolve SLP-only groups. */
3031 vect_dissolve_slp_only_groups (loop_vinfo);
3033 /* Scan all the remaining operations in the loop that are not subject
3034 to SLP and make sure they are vectorizable. */
3035 ok = vect_analyze_loop_operations (loop_vinfo);
3036 if (!ok)
3038 if (dump_enabled_p ())
3039 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3040 "bad operation or unsupported loop bound.\n");
3041 return ok;
3044 /* For now, we don't expect to mix both masking and length approaches for one
3045 loop, disable it if both are recorded. */
3046 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3047 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3048 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3050 if (dump_enabled_p ())
3051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3052 "can't vectorize a loop with partial vectors"
3053 " because we don't expect to mix different"
3054 " approaches with partial vectors for the"
3055 " same loop.\n");
3056 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3059 /* If we still have the option of using partial vectors,
3060 check whether we can generate the necessary loop controls. */
3061 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3063 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3065 if (!vect_verify_full_masking (loop_vinfo)
3066 && !vect_verify_full_masking_avx512 (loop_vinfo))
3067 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3069 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3070 if (!vect_verify_loop_lens (loop_vinfo))
3071 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074 /* If we're vectorizing a loop that uses length "controls" and
3075 can iterate more than once, we apply decrementing IV approach
3076 in loop control. */
3077 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3078 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3079 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3080 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3081 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3082 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3083 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3085 /* If a loop uses length controls and has a decrementing loop control IV,
3086 we will normally pass that IV through a MIN_EXPR to calcaluate the
3087 basis for the length controls. E.g. in a loop that processes one
3088 element per scalar iteration, the number of elements would be
3089 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3091 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3092 step, since only the final iteration of the vector loop can have
3093 inactive lanes.
3095 However, some targets have a dedicated instruction for calculating the
3096 preferred length, given the total number of elements that still need to
3097 be processed. This is encapsulated in the SELECT_VL internal function.
3099 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3100 to determine the basis for the length controls. However, unlike the
3101 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3102 lanes inactive in any iteration of the vector loop, not just the last
3103 iteration. This SELECT_VL approach therefore requires us to use pointer
3104 IVs with variable steps.
3106 Once we've decided how many elements should be processed by one
3107 iteration of the vector loop, we need to populate the rgroup controls.
3108 If a loop has multiple rgroups, we need to make sure that those rgroups
3109 "line up" (that is, they must be consistent about which elements are
3110 active and which aren't). This is done by vect_adjust_loop_lens_control.
3112 In principle, it would be possible to use vect_adjust_loop_lens_control
3113 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3114 However:
3116 (1) In practice, it only makes sense to use SELECT_VL when a vector
3117 operation will be controlled directly by the result. It is not
3118 worth using SELECT_VL if it would only be the input to other
3119 calculations.
3121 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3122 pointer IV will need N updates by a variable amount (N-1 updates
3123 within the iteration and 1 update to move to the next iteration).
3125 Because of this, we prefer to use the MIN_EXPR approach whenever there
3126 is more than one length control.
3128 In addition, SELECT_VL always operates to a granularity of 1 unit.
3129 If we wanted to use it to control an SLP operation on N consecutive
3130 elements, we would need to make the SELECT_VL inputs measure scalar
3131 iterations (rather than elements) and then multiply the SELECT_VL
3132 result by N. But using SELECT_VL this way is inefficient because
3133 of (1) above.
3135 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3136 satisfied:
3138 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3139 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3141 Since SELECT_VL (variable step) will make SCEV analysis failed and then
3142 we will fail to gain benefits of following unroll optimizations. We prefer
3143 using the MIN_EXPR approach in this situation. */
3144 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3146 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3147 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3148 OPTIMIZE_FOR_SPEED)
3149 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3150 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3151 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3152 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3153 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3156 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3157 assuming that the loop will be used as a main loop. We will redo
3158 this analysis later if we instead decide to use the loop as an
3159 epilogue loop. */
3160 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3161 if (!ok)
3162 return ok;
3164 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3165 to be able to handle fewer than VF scalars, or needs to have a lower VF
3166 than the main loop. */
3167 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3168 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3170 poly_uint64 unscaled_vf
3171 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3172 orig_loop_vinfo->suggested_unroll_factor);
3173 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3174 return opt_result::failure_at (vect_location,
3175 "Vectorization factor too high for"
3176 " epilogue loop.\n");
3179 /* Check the costings of the loop make vectorizing worthwhile. */
3180 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3181 if (res < 0)
3183 ok = opt_result::failure_at (vect_location,
3184 "Loop costings may not be worthwhile.\n");
3185 goto again;
3187 if (!res)
3188 return opt_result::failure_at (vect_location,
3189 "Loop costings not worthwhile.\n");
3191 /* If an epilogue loop is required make sure we can create one. */
3192 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3193 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3194 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3196 if (dump_enabled_p ())
3197 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3198 if (!vect_can_advance_ivs_p (loop_vinfo)
3199 || !slpeel_can_duplicate_loop_p (loop,
3200 LOOP_VINFO_IV_EXIT (loop_vinfo),
3201 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3203 ok = opt_result::failure_at (vect_location,
3204 "not vectorized: can't create required "
3205 "epilog loop\n");
3206 goto again;
3210 /* During peeling, we need to check if number of loop iterations is
3211 enough for both peeled prolog loop and vector loop. This check
3212 can be merged along with threshold check of loop versioning, so
3213 increase threshold for this case if necessary.
3215 If we are analyzing an epilogue we still want to check what its
3216 versioning threshold would be. If we decide to vectorize the epilogues we
3217 will want to use the lowest versioning threshold of all epilogues and main
3218 loop. This will enable us to enter a vectorized epilogue even when
3219 versioning the loop. We can't simply check whether the epilogue requires
3220 versioning though since we may have skipped some versioning checks when
3221 analyzing the epilogue. For instance, checks for alias versioning will be
3222 skipped when dealing with epilogues as we assume we already checked them
3223 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3224 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3226 poly_uint64 niters_th = 0;
3227 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3229 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3231 /* Niters for peeled prolog loop. */
3232 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3234 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3235 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3236 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3238 else
3239 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3242 /* Niters for at least one iteration of vectorized loop. */
3243 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3244 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3245 /* One additional iteration because of peeling for gap. */
3246 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3247 niters_th += 1;
3249 /* Use the same condition as vect_transform_loop to decide when to use
3250 the cost to determine a versioning threshold. */
3251 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3252 && ordered_p (th, niters_th))
3253 niters_th = ordered_max (poly_uint64 (th), niters_th);
3255 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3258 gcc_assert (known_eq (vectorization_factor,
3259 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3261 slp_done_for_suggested_uf = slp;
3263 /* Ok to vectorize! */
3264 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3265 return opt_result::success ();
3267 again:
3268 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3269 gcc_assert (!ok);
3271 /* Try again with SLP forced off but if we didn't do any SLP there is
3272 no point in re-trying. */
3273 if (!slp)
3274 return ok;
3276 /* If the slp decision is true when suggested unroll factor is worked
3277 out, and we are applying suggested unroll factor, we don't need to
3278 re-try any more. */
3279 if (applying_suggested_uf && slp_done_for_suggested_uf)
3280 return ok;
3282 /* If there are reduction chains re-trying will fail anyway. */
3283 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3284 return ok;
3286 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3287 via interleaving or lane instructions. */
3288 slp_instance instance;
3289 slp_tree node;
3290 unsigned i, j;
3291 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3293 stmt_vec_info vinfo;
3294 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3295 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3296 continue;
3297 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3298 unsigned int size = DR_GROUP_SIZE (vinfo);
3299 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3300 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3301 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3302 && ! vect_grouped_store_supported (vectype, size))
3303 return opt_result::failure_at (vinfo->stmt,
3304 "unsupported grouped store\n");
3305 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3307 vinfo = SLP_TREE_REPRESENTATIVE (node);
3308 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3310 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3311 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3312 size = DR_GROUP_SIZE (vinfo);
3313 vectype = STMT_VINFO_VECTYPE (vinfo);
3314 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3315 && ! vect_grouped_load_supported (vectype, single_element_p,
3316 size))
3317 return opt_result::failure_at (vinfo->stmt,
3318 "unsupported grouped load\n");
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_NOTE, vect_location,
3325 "re-trying with SLP disabled\n");
3327 /* Roll back state appropriately. No SLP this time. */
3328 slp = false;
3329 /* Restore vectorization factor as it were without SLP. */
3330 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3331 /* Free the SLP instances. */
3332 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3333 vect_free_slp_instance (instance);
3334 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3335 /* Reset SLP type to loop_vect on all stmts. */
3336 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3338 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3339 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3340 !gsi_end_p (si); gsi_next (&si))
3342 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3343 STMT_SLP_TYPE (stmt_info) = loop_vect;
3344 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3345 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3347 /* vectorizable_reduction adjusts reduction stmt def-types,
3348 restore them to that of the PHI. */
3349 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3350 = STMT_VINFO_DEF_TYPE (stmt_info);
3351 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3352 (STMT_VINFO_REDUC_DEF (stmt_info)))
3353 = STMT_VINFO_DEF_TYPE (stmt_info);
3356 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3357 !gsi_end_p (si); gsi_next (&si))
3359 if (is_gimple_debug (gsi_stmt (si)))
3360 continue;
3361 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3362 STMT_SLP_TYPE (stmt_info) = loop_vect;
3363 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3365 stmt_vec_info pattern_stmt_info
3366 = STMT_VINFO_RELATED_STMT (stmt_info);
3367 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3368 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3370 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3371 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3372 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3373 !gsi_end_p (pi); gsi_next (&pi))
3374 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3375 = loop_vect;
3379 /* Free optimized alias test DDRS. */
3380 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3381 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3382 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3383 /* Reset target cost data. */
3384 delete loop_vinfo->vector_costs;
3385 loop_vinfo->vector_costs = nullptr;
3386 /* Reset accumulated rgroup information. */
3387 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3388 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3389 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3390 /* Reset assorted flags. */
3391 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3392 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3393 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3394 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3395 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3396 = saved_can_use_partial_vectors_p;
3398 goto start_over;
3401 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3402 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3403 OLD_LOOP_VINFO is better unless something specifically indicates
3404 otherwise.
3406 Note that this deliberately isn't a partial order. */
3408 static bool
3409 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3410 loop_vec_info old_loop_vinfo)
3412 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3413 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3415 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3416 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3418 /* Always prefer a VF of loop->simdlen over any other VF. */
3419 if (loop->simdlen)
3421 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3422 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3423 if (new_simdlen_p != old_simdlen_p)
3424 return new_simdlen_p;
3427 const auto *old_costs = old_loop_vinfo->vector_costs;
3428 const auto *new_costs = new_loop_vinfo->vector_costs;
3429 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3430 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3432 return new_costs->better_main_loop_than_p (old_costs);
3435 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3436 true if we should. */
3438 static bool
3439 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3440 loop_vec_info old_loop_vinfo)
3442 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3443 return false;
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_NOTE, vect_location,
3447 "***** Preferring vector mode %s to vector mode %s\n",
3448 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3449 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3450 return true;
3453 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3454 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3455 MODE_I to the next mode useful to analyze.
3456 Return the loop_vinfo on success and wrapped null on failure. */
3458 static opt_loop_vec_info
3459 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3460 const vect_loop_form_info *loop_form_info,
3461 loop_vec_info main_loop_vinfo,
3462 const vector_modes &vector_modes, unsigned &mode_i,
3463 machine_mode &autodetected_vector_mode,
3464 bool &fatal)
3466 loop_vec_info loop_vinfo
3467 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3469 machine_mode vector_mode = vector_modes[mode_i];
3470 loop_vinfo->vector_mode = vector_mode;
3471 unsigned int suggested_unroll_factor = 1;
3472 bool slp_done_for_suggested_uf = false;
3474 /* Run the main analysis. */
3475 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3476 &suggested_unroll_factor,
3477 slp_done_for_suggested_uf);
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_NOTE, vect_location,
3480 "***** Analysis %s with vector mode %s\n",
3481 res ? "succeeded" : " failed",
3482 GET_MODE_NAME (loop_vinfo->vector_mode));
3484 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3486 if (dump_enabled_p ())
3487 dump_printf_loc (MSG_NOTE, vect_location,
3488 "***** Re-trying analysis for unrolling"
3489 " with unroll factor %d and slp %s.\n",
3490 suggested_unroll_factor,
3491 slp_done_for_suggested_uf ? "on" : "off");
3492 loop_vec_info unroll_vinfo
3493 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3494 unroll_vinfo->vector_mode = vector_mode;
3495 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3496 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3497 slp_done_for_suggested_uf);
3498 if (new_res)
3500 delete loop_vinfo;
3501 loop_vinfo = unroll_vinfo;
3503 else
3504 delete unroll_vinfo;
3507 /* Remember the autodetected vector mode. */
3508 if (vector_mode == VOIDmode)
3509 autodetected_vector_mode = loop_vinfo->vector_mode;
3511 /* Advance mode_i, first skipping modes that would result in the
3512 same analysis result. */
3513 while (mode_i + 1 < vector_modes.length ()
3514 && vect_chooses_same_modes_p (loop_vinfo,
3515 vector_modes[mode_i + 1]))
3517 if (dump_enabled_p ())
3518 dump_printf_loc (MSG_NOTE, vect_location,
3519 "***** The result for vector mode %s would"
3520 " be the same\n",
3521 GET_MODE_NAME (vector_modes[mode_i + 1]));
3522 mode_i += 1;
3524 if (mode_i + 1 < vector_modes.length ()
3525 && VECTOR_MODE_P (autodetected_vector_mode)
3526 && (related_vector_mode (vector_modes[mode_i + 1],
3527 GET_MODE_INNER (autodetected_vector_mode))
3528 == autodetected_vector_mode)
3529 && (related_vector_mode (autodetected_vector_mode,
3530 GET_MODE_INNER (vector_modes[mode_i + 1]))
3531 == vector_modes[mode_i + 1]))
3533 if (dump_enabled_p ())
3534 dump_printf_loc (MSG_NOTE, vect_location,
3535 "***** Skipping vector mode %s, which would"
3536 " repeat the analysis for %s\n",
3537 GET_MODE_NAME (vector_modes[mode_i + 1]),
3538 GET_MODE_NAME (autodetected_vector_mode));
3539 mode_i += 1;
3541 mode_i++;
3543 if (!res)
3545 delete loop_vinfo;
3546 if (fatal)
3547 gcc_checking_assert (main_loop_vinfo == NULL);
3548 return opt_loop_vec_info::propagate_failure (res);
3551 return opt_loop_vec_info::success (loop_vinfo);
3554 /* Function vect_analyze_loop.
3556 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3557 for it. The different analyses will record information in the
3558 loop_vec_info struct. */
3559 opt_loop_vec_info
3560 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3562 DUMP_VECT_SCOPE ("analyze_loop_nest");
3564 if (loop_outer (loop)
3565 && loop_vec_info_for_loop (loop_outer (loop))
3566 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3567 return opt_loop_vec_info::failure_at (vect_location,
3568 "outer-loop already vectorized.\n");
3570 if (!find_loop_nest (loop, &shared->loop_nest))
3571 return opt_loop_vec_info::failure_at
3572 (vect_location,
3573 "not vectorized: loop nest containing two or more consecutive inner"
3574 " loops cannot be vectorized\n");
3576 /* Analyze the loop form. */
3577 vect_loop_form_info loop_form_info;
3578 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3579 if (!res)
3581 if (dump_enabled_p ())
3582 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3583 "bad loop form.\n");
3584 return opt_loop_vec_info::propagate_failure (res);
3586 if (!integer_onep (loop_form_info.assumptions))
3588 /* We consider to vectorize this loop by versioning it under
3589 some assumptions. In order to do this, we need to clear
3590 existing information computed by scev and niter analyzer. */
3591 scev_reset_htab ();
3592 free_numbers_of_iterations_estimates (loop);
3593 /* Also set flag for this loop so that following scev and niter
3594 analysis are done under the assumptions. */
3595 loop_constraint_set (loop, LOOP_C_FINITE);
3597 else
3598 /* Clear the existing niter information to make sure the nonwrapping flag
3599 will be calculated and set propriately. */
3600 free_numbers_of_iterations_estimates (loop);
3602 auto_vector_modes vector_modes;
3603 /* Autodetect first vector size we try. */
3604 vector_modes.safe_push (VOIDmode);
3605 unsigned int autovec_flags
3606 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3607 loop->simdlen != 0);
3608 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3609 && !unlimited_cost_model (loop));
3610 machine_mode autodetected_vector_mode = VOIDmode;
3611 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3612 unsigned int mode_i = 0;
3613 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3615 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3616 a mode has not been analyzed. */
3617 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3618 for (unsigned i = 0; i < vector_modes.length (); ++i)
3619 cached_vf_per_mode.safe_push (0);
3621 /* First determine the main loop vectorization mode, either the first
3622 one that works, starting with auto-detecting the vector mode and then
3623 following the targets order of preference, or the one with the
3624 lowest cost if pick_lowest_cost_p. */
3625 while (1)
3627 bool fatal;
3628 unsigned int last_mode_i = mode_i;
3629 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3630 failed. */
3631 cached_vf_per_mode[last_mode_i] = -1;
3632 opt_loop_vec_info loop_vinfo
3633 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3634 NULL, vector_modes, mode_i,
3635 autodetected_vector_mode, fatal);
3636 if (fatal)
3637 break;
3639 if (loop_vinfo)
3641 /* Analyzis has been successful so update the VF value. The
3642 VF should always be a multiple of unroll_factor and we want to
3643 capture the original VF here. */
3644 cached_vf_per_mode[last_mode_i]
3645 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3646 loop_vinfo->suggested_unroll_factor);
3647 /* Once we hit the desired simdlen for the first time,
3648 discard any previous attempts. */
3649 if (simdlen
3650 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3652 delete first_loop_vinfo;
3653 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3654 simdlen = 0;
3656 else if (pick_lowest_cost_p
3657 && first_loop_vinfo
3658 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3660 /* Pick loop_vinfo over first_loop_vinfo. */
3661 delete first_loop_vinfo;
3662 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3664 if (first_loop_vinfo == NULL)
3665 first_loop_vinfo = loop_vinfo;
3666 else
3668 delete loop_vinfo;
3669 loop_vinfo = opt_loop_vec_info::success (NULL);
3672 /* Commit to first_loop_vinfo if we have no reason to try
3673 alternatives. */
3674 if (!simdlen && !pick_lowest_cost_p)
3675 break;
3677 if (mode_i == vector_modes.length ()
3678 || autodetected_vector_mode == VOIDmode)
3679 break;
3681 /* Try the next biggest vector size. */
3682 if (dump_enabled_p ())
3683 dump_printf_loc (MSG_NOTE, vect_location,
3684 "***** Re-trying analysis with vector mode %s\n",
3685 GET_MODE_NAME (vector_modes[mode_i]));
3687 if (!first_loop_vinfo)
3688 return opt_loop_vec_info::propagate_failure (res);
3690 if (dump_enabled_p ())
3691 dump_printf_loc (MSG_NOTE, vect_location,
3692 "***** Choosing vector mode %s\n",
3693 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3695 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3696 enabled, SIMDUID is not set, it is the innermost loop and we have
3697 either already found the loop's SIMDLEN or there was no SIMDLEN to
3698 begin with.
3699 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3700 bool vect_epilogues = (!simdlen
3701 && loop->inner == NULL
3702 && param_vect_epilogues_nomask
3703 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3704 /* No code motion support for multiple epilogues so for now
3705 not supported when multiple exits. */
3706 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3707 && !loop->simduid);
3708 if (!vect_epilogues)
3709 return first_loop_vinfo;
3711 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3712 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3714 /* For epilogues start the analysis from the first mode. The motivation
3715 behind starting from the beginning comes from cases where the VECTOR_MODES
3716 array may contain length-agnostic and length-specific modes. Their
3717 ordering is not guaranteed, so we could end up picking a mode for the main
3718 loop that is after the epilogue's optimal mode. */
3719 vector_modes[0] = autodetected_vector_mode;
3720 mode_i = 0;
3722 bool supports_partial_vectors =
3723 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3724 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3726 while (1)
3728 /* If the target does not support partial vectors we can shorten the
3729 number of modes to analyze for the epilogue as we know we can't pick a
3730 mode that would lead to a VF at least as big as the
3731 FIRST_VINFO_VF. */
3732 if (!supports_partial_vectors
3733 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3735 mode_i++;
3736 if (mode_i == vector_modes.length ())
3737 break;
3738 continue;
3741 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_NOTE, vect_location,
3743 "***** Re-trying epilogue analysis with vector "
3744 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3746 bool fatal;
3747 opt_loop_vec_info loop_vinfo
3748 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3749 first_loop_vinfo,
3750 vector_modes, mode_i,
3751 autodetected_vector_mode, fatal);
3752 if (fatal)
3753 break;
3755 if (loop_vinfo)
3757 if (pick_lowest_cost_p)
3759 /* Keep trying to roll back vectorization attempts while the
3760 loop_vec_infos they produced were worse than this one. */
3761 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3762 while (!vinfos.is_empty ()
3763 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3765 gcc_assert (vect_epilogues);
3766 delete vinfos.pop ();
3769 /* For now only allow one epilogue loop. */
3770 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3772 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3773 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3774 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3775 || maybe_ne (lowest_th, 0U));
3776 /* Keep track of the known smallest versioning
3777 threshold. */
3778 if (ordered_p (lowest_th, th))
3779 lowest_th = ordered_min (lowest_th, th);
3781 else
3783 delete loop_vinfo;
3784 loop_vinfo = opt_loop_vec_info::success (NULL);
3787 /* For now only allow one epilogue loop, but allow
3788 pick_lowest_cost_p to replace it, so commit to the
3789 first epilogue if we have no reason to try alternatives. */
3790 if (!pick_lowest_cost_p)
3791 break;
3794 if (mode_i == vector_modes.length ())
3795 break;
3799 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3801 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3802 if (dump_enabled_p ())
3803 dump_printf_loc (MSG_NOTE, vect_location,
3804 "***** Choosing epilogue vector mode %s\n",
3805 GET_MODE_NAME
3806 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3809 return first_loop_vinfo;
3812 /* Return true if there is an in-order reduction function for CODE, storing
3813 it in *REDUC_FN if so. */
3815 static bool
3816 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3818 /* We support MINUS_EXPR by negating the operand. This also preserves an
3819 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3820 (-0.0) = -0.0. */
3821 if (code == PLUS_EXPR || code == MINUS_EXPR)
3823 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3824 return true;
3826 return false;
3829 /* Function reduction_fn_for_scalar_code
3831 Input:
3832 CODE - tree_code of a reduction operations.
3834 Output:
3835 REDUC_FN - the corresponding internal function to be used to reduce the
3836 vector of partial results into a single scalar result, or IFN_LAST
3837 if the operation is a supported reduction operation, but does not have
3838 such an internal function.
3840 Return FALSE if CODE currently cannot be vectorized as reduction. */
3842 bool
3843 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3845 if (code.is_tree_code ())
3846 switch (tree_code (code))
3848 case MAX_EXPR:
3849 *reduc_fn = IFN_REDUC_MAX;
3850 return true;
3852 case MIN_EXPR:
3853 *reduc_fn = IFN_REDUC_MIN;
3854 return true;
3856 case PLUS_EXPR:
3857 *reduc_fn = IFN_REDUC_PLUS;
3858 return true;
3860 case BIT_AND_EXPR:
3861 *reduc_fn = IFN_REDUC_AND;
3862 return true;
3864 case BIT_IOR_EXPR:
3865 *reduc_fn = IFN_REDUC_IOR;
3866 return true;
3868 case BIT_XOR_EXPR:
3869 *reduc_fn = IFN_REDUC_XOR;
3870 return true;
3872 case MULT_EXPR:
3873 case MINUS_EXPR:
3874 *reduc_fn = IFN_LAST;
3875 return true;
3877 default:
3878 return false;
3880 else
3881 switch (combined_fn (code))
3883 CASE_CFN_FMAX:
3884 *reduc_fn = IFN_REDUC_FMAX;
3885 return true;
3887 CASE_CFN_FMIN:
3888 *reduc_fn = IFN_REDUC_FMIN;
3889 return true;
3891 default:
3892 return false;
3896 /* If there is a neutral value X such that a reduction would not be affected
3897 by the introduction of additional X elements, return that X, otherwise
3898 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3899 of the scalar elements. If the reduction has just a single initial value
3900 then INITIAL_VALUE is that value, otherwise it is null.
3901 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3902 In that case no signed zero is returned. */
3904 tree
3905 neutral_op_for_reduction (tree scalar_type, code_helper code,
3906 tree initial_value, bool as_initial)
3908 if (code.is_tree_code ())
3909 switch (tree_code (code))
3911 case DOT_PROD_EXPR:
3912 case SAD_EXPR:
3913 case MINUS_EXPR:
3914 case BIT_IOR_EXPR:
3915 case BIT_XOR_EXPR:
3916 return build_zero_cst (scalar_type);
3917 case WIDEN_SUM_EXPR:
3918 case PLUS_EXPR:
3919 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3920 return build_real (scalar_type, dconstm0);
3921 else
3922 return build_zero_cst (scalar_type);
3924 case MULT_EXPR:
3925 return build_one_cst (scalar_type);
3927 case BIT_AND_EXPR:
3928 return build_all_ones_cst (scalar_type);
3930 case MAX_EXPR:
3931 case MIN_EXPR:
3932 return initial_value;
3934 default:
3935 return NULL_TREE;
3937 else
3938 switch (combined_fn (code))
3940 CASE_CFN_FMIN:
3941 CASE_CFN_FMAX:
3942 return initial_value;
3944 default:
3945 return NULL_TREE;
3949 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3950 STMT is printed with a message MSG. */
3952 static void
3953 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3955 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3958 /* Return true if we need an in-order reduction for operation CODE
3959 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3960 overflow must wrap. */
3962 bool
3963 needs_fold_left_reduction_p (tree type, code_helper code)
3965 /* CHECKME: check for !flag_finite_math_only too? */
3966 if (SCALAR_FLOAT_TYPE_P (type))
3968 if (code.is_tree_code ())
3969 switch (tree_code (code))
3971 case MIN_EXPR:
3972 case MAX_EXPR:
3973 return false;
3975 default:
3976 return !flag_associative_math;
3978 else
3979 switch (combined_fn (code))
3981 CASE_CFN_FMIN:
3982 CASE_CFN_FMAX:
3983 return false;
3985 default:
3986 return !flag_associative_math;
3990 if (INTEGRAL_TYPE_P (type))
3991 return (!code.is_tree_code ()
3992 || !operation_no_trapping_overflow (type, tree_code (code)));
3994 if (SAT_FIXED_POINT_TYPE_P (type))
3995 return true;
3997 return false;
4000 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4001 has a handled computation expression. Store the main reduction
4002 operation in *CODE. */
4004 static bool
4005 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4006 tree loop_arg, code_helper *code,
4007 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4009 auto_bitmap visited;
4010 tree lookfor = PHI_RESULT (phi);
4011 ssa_op_iter curri;
4012 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4013 while (USE_FROM_PTR (curr) != loop_arg)
4014 curr = op_iter_next_use (&curri);
4015 curri.i = curri.numops;
4018 path.safe_push (std::make_pair (curri, curr));
4019 tree use = USE_FROM_PTR (curr);
4020 if (use == lookfor)
4021 break;
4022 gimple *def = SSA_NAME_DEF_STMT (use);
4023 if (gimple_nop_p (def)
4024 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4026 pop:
4029 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4030 curri = x.first;
4031 curr = x.second;
4033 curr = op_iter_next_use (&curri);
4034 /* Skip already visited or non-SSA operands (from iterating
4035 over PHI args). */
4036 while (curr != NULL_USE_OPERAND_P
4037 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4038 || ! bitmap_set_bit (visited,
4039 SSA_NAME_VERSION
4040 (USE_FROM_PTR (curr)))));
4042 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4043 if (curr == NULL_USE_OPERAND_P)
4044 break;
4046 else
4048 if (gimple_code (def) == GIMPLE_PHI)
4049 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4050 else
4051 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4052 while (curr != NULL_USE_OPERAND_P
4053 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4054 || ! bitmap_set_bit (visited,
4055 SSA_NAME_VERSION
4056 (USE_FROM_PTR (curr)))))
4057 curr = op_iter_next_use (&curri);
4058 if (curr == NULL_USE_OPERAND_P)
4059 goto pop;
4062 while (1);
4063 if (dump_file && (dump_flags & TDF_DETAILS))
4065 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4066 unsigned i;
4067 std::pair<ssa_op_iter, use_operand_p> *x;
4068 FOR_EACH_VEC_ELT (path, i, x)
4069 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4070 dump_printf (MSG_NOTE, "\n");
4073 /* Check whether the reduction path detected is valid. */
4074 bool fail = path.length () == 0;
4075 bool neg = false;
4076 int sign = -1;
4077 *code = ERROR_MARK;
4078 for (unsigned i = 1; i < path.length (); ++i)
4080 gimple *use_stmt = USE_STMT (path[i].second);
4081 gimple_match_op op;
4082 if (!gimple_extract_op (use_stmt, &op))
4084 fail = true;
4085 break;
4087 unsigned int opi = op.num_ops;
4088 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4090 /* The following make sure we can compute the operand index
4091 easily plus it mostly disallows chaining via COND_EXPR condition
4092 operands. */
4093 for (opi = 0; opi < op.num_ops; ++opi)
4094 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4095 break;
4097 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4099 for (opi = 0; opi < op.num_ops; ++opi)
4100 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4101 break;
4103 if (opi == op.num_ops)
4105 fail = true;
4106 break;
4108 op.code = canonicalize_code (op.code, op.type);
4109 if (op.code == MINUS_EXPR)
4111 op.code = PLUS_EXPR;
4112 /* Track whether we negate the reduction value each iteration. */
4113 if (op.ops[1] == op.ops[opi])
4114 neg = ! neg;
4116 else if (op.code == IFN_COND_SUB)
4118 op.code = IFN_COND_ADD;
4119 /* Track whether we negate the reduction value each iteration. */
4120 if (op.ops[2] == op.ops[opi])
4121 neg = ! neg;
4123 if (CONVERT_EXPR_CODE_P (op.code)
4124 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4126 else if (*code == ERROR_MARK)
4128 *code = op.code;
4129 sign = TYPE_SIGN (op.type);
4131 else if (op.code != *code)
4133 fail = true;
4134 break;
4136 else if ((op.code == MIN_EXPR
4137 || op.code == MAX_EXPR)
4138 && sign != TYPE_SIGN (op.type))
4140 fail = true;
4141 break;
4143 /* Check there's only a single stmt the op is used on. For the
4144 not value-changing tail and the last stmt allow out-of-loop uses.
4145 ??? We could relax this and handle arbitrary live stmts by
4146 forcing a scalar epilogue for example. */
4147 imm_use_iterator imm_iter;
4148 use_operand_p use_p;
4149 gimple *op_use_stmt;
4150 unsigned cnt = 0;
4151 bool cond_fn_p = op.code.is_internal_fn ()
4152 && (conditional_internal_fn_code (internal_fn (op.code))
4153 != ERROR_MARK);
4155 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4157 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4158 op1 twice (once as definition, once as else) in the same operation.
4159 Allow this. */
4160 if (cond_fn_p && op_use_stmt == use_stmt)
4162 gcall *call = as_a<gcall *> (use_stmt);
4163 unsigned else_pos
4164 = internal_fn_else_index (internal_fn (op.code));
4166 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4168 if (j == else_pos)
4169 continue;
4170 if (gimple_call_arg (call, j) == op.ops[opi])
4171 cnt++;
4174 else if (!is_gimple_debug (op_use_stmt)
4175 && (*code != ERROR_MARK
4176 || flow_bb_inside_loop_p (loop,
4177 gimple_bb (op_use_stmt))))
4178 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4179 cnt++;
4182 if (cnt != 1)
4184 fail = true;
4185 break;
4188 return ! fail && ! neg && *code != ERROR_MARK;
4191 bool
4192 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4193 tree loop_arg, enum tree_code code)
4195 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4196 code_helper code_;
4197 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4198 && code_ == code);
4203 /* Function vect_is_simple_reduction
4205 (1) Detect a cross-iteration def-use cycle that represents a simple
4206 reduction computation. We look for the following pattern:
4208 loop_header:
4209 a1 = phi < a0, a2 >
4210 a3 = ...
4211 a2 = operation (a3, a1)
4215 a3 = ...
4216 loop_header:
4217 a1 = phi < a0, a2 >
4218 a2 = operation (a3, a1)
4220 such that:
4221 1. operation is commutative and associative and it is safe to
4222 change the order of the computation
4223 2. no uses for a2 in the loop (a2 is used out of the loop)
4224 3. no uses of a1 in the loop besides the reduction operation
4225 4. no uses of a1 outside the loop.
4227 Conditions 1,4 are tested here.
4228 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4230 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4231 nested cycles.
4233 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4234 reductions:
4236 a1 = phi < a0, a2 >
4237 inner loop (def of a3)
4238 a2 = phi < a3 >
4240 (4) Detect condition expressions, ie:
4241 for (int i = 0; i < N; i++)
4242 if (a[i] < val)
4243 ret_val = a[i];
4247 static stmt_vec_info
4248 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4249 bool *double_reduc, bool *reduc_chain_p, bool slp)
4251 gphi *phi = as_a <gphi *> (phi_info->stmt);
4252 gimple *phi_use_stmt = NULL;
4253 imm_use_iterator imm_iter;
4254 use_operand_p use_p;
4256 *double_reduc = false;
4257 *reduc_chain_p = false;
4258 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4260 tree phi_name = PHI_RESULT (phi);
4261 /* ??? If there are no uses of the PHI result the inner loop reduction
4262 won't be detected as possibly double-reduction by vectorizable_reduction
4263 because that tries to walk the PHI arg from the preheader edge which
4264 can be constant. See PR60382. */
4265 if (has_zero_uses (phi_name))
4266 return NULL;
4267 class loop *loop = (gimple_bb (phi))->loop_father;
4268 unsigned nphi_def_loop_uses = 0;
4269 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4271 gimple *use_stmt = USE_STMT (use_p);
4272 if (is_gimple_debug (use_stmt))
4273 continue;
4275 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4277 if (dump_enabled_p ())
4278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4279 "intermediate value used outside loop.\n");
4281 return NULL;
4284 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4285 op1 twice (once as definition, once as else) in the same operation.
4286 Only count it as one. */
4287 if (use_stmt != phi_use_stmt)
4289 nphi_def_loop_uses++;
4290 phi_use_stmt = use_stmt;
4294 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4295 if (TREE_CODE (latch_def) != SSA_NAME)
4297 if (dump_enabled_p ())
4298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4299 "reduction: not ssa_name: %T\n", latch_def);
4300 return NULL;
4303 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4304 if (!def_stmt_info
4305 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4306 return NULL;
4308 bool nested_in_vect_loop
4309 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4310 unsigned nlatch_def_loop_uses = 0;
4311 auto_vec<gphi *, 3> lcphis;
4312 bool inner_loop_of_double_reduc = false;
4313 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4315 gimple *use_stmt = USE_STMT (use_p);
4316 if (is_gimple_debug (use_stmt))
4317 continue;
4318 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4319 nlatch_def_loop_uses++;
4320 else
4322 /* We can have more than one loop-closed PHI. */
4323 lcphis.safe_push (as_a <gphi *> (use_stmt));
4324 if (nested_in_vect_loop
4325 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4326 == vect_double_reduction_def))
4327 inner_loop_of_double_reduc = true;
4331 /* If we are vectorizing an inner reduction we are executing that
4332 in the original order only in case we are not dealing with a
4333 double reduction. */
4334 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4336 if (dump_enabled_p ())
4337 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4338 "detected nested cycle: ");
4339 return def_stmt_info;
4342 /* When the inner loop of a double reduction ends up with more than
4343 one loop-closed PHI we have failed to classify alternate such
4344 PHIs as double reduction, leading to wrong code. See PR103237. */
4345 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4347 if (dump_enabled_p ())
4348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4349 "unhandle double reduction\n");
4350 return NULL;
4353 /* If this isn't a nested cycle or if the nested cycle reduction value
4354 is used ouside of the inner loop we cannot handle uses of the reduction
4355 value. */
4356 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4358 if (dump_enabled_p ())
4359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4360 "reduction used in loop.\n");
4361 return NULL;
4364 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4365 defined in the inner loop. */
4366 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4368 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4369 if (gimple_phi_num_args (def_stmt) != 1
4370 || TREE_CODE (op1) != SSA_NAME)
4372 if (dump_enabled_p ())
4373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4374 "unsupported phi node definition.\n");
4376 return NULL;
4379 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4380 and the latch definition op1. */
4381 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4382 if (gimple_bb (def1)
4383 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4384 && loop->inner
4385 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4386 && (is_gimple_assign (def1) || is_gimple_call (def1))
4387 && is_a <gphi *> (phi_use_stmt)
4388 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4389 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4390 loop_latch_edge (loop->inner))))
4392 if (dump_enabled_p ())
4393 report_vect_op (MSG_NOTE, def_stmt,
4394 "detected double reduction: ");
4396 *double_reduc = true;
4397 return def_stmt_info;
4400 return NULL;
4403 /* Look for the expression computing latch_def from then loop PHI result. */
4404 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4405 code_helper code;
4406 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4407 path))
4409 STMT_VINFO_REDUC_CODE (phi_info) = code;
4410 if (code == COND_EXPR && !nested_in_vect_loop)
4411 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4413 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4414 reduction chain for which the additional restriction is that
4415 all operations in the chain are the same. */
4416 auto_vec<stmt_vec_info, 8> reduc_chain;
4417 unsigned i;
4418 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4419 for (i = path.length () - 1; i >= 1; --i)
4421 gimple *stmt = USE_STMT (path[i].second);
4422 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4423 gimple_match_op op;
4424 if (!gimple_extract_op (stmt, &op))
4425 gcc_unreachable ();
4426 if (gassign *assign = dyn_cast<gassign *> (stmt))
4427 STMT_VINFO_REDUC_IDX (stmt_info)
4428 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4429 else
4431 gcall *call = as_a<gcall *> (stmt);
4432 STMT_VINFO_REDUC_IDX (stmt_info)
4433 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4435 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4436 && (i == 1 || i == path.length () - 1));
4437 if ((op.code != code && !leading_conversion)
4438 /* We can only handle the final value in epilogue
4439 generation for reduction chains. */
4440 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4441 is_slp_reduc = false;
4442 /* For reduction chains we support a trailing/leading
4443 conversions. We do not store those in the actual chain. */
4444 if (leading_conversion)
4445 continue;
4446 reduc_chain.safe_push (stmt_info);
4448 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4450 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4452 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4453 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4455 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4456 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4458 /* Save the chain for further analysis in SLP detection. */
4459 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4460 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4462 *reduc_chain_p = true;
4463 if (dump_enabled_p ())
4464 dump_printf_loc (MSG_NOTE, vect_location,
4465 "reduction: detected reduction chain\n");
4467 else if (dump_enabled_p ())
4468 dump_printf_loc (MSG_NOTE, vect_location,
4469 "reduction: detected reduction\n");
4471 return def_stmt_info;
4474 if (dump_enabled_p ())
4475 dump_printf_loc (MSG_NOTE, vect_location,
4476 "reduction: unknown pattern\n");
4478 return NULL;
4481 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4482 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4483 or -1 if not known. */
4485 static int
4486 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4488 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4489 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4491 if (dump_enabled_p ())
4492 dump_printf_loc (MSG_NOTE, vect_location,
4493 "cost model: epilogue peel iters set to vf/2 "
4494 "because loop iterations are unknown .\n");
4495 return assumed_vf / 2;
4497 else
4499 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4500 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4501 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4502 /* If we need to peel for gaps, but no peeling is required, we have to
4503 peel VF iterations. */
4504 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4505 peel_iters_epilogue = assumed_vf;
4506 return peel_iters_epilogue;
4510 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4512 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4513 int *peel_iters_epilogue,
4514 stmt_vector_for_cost *scalar_cost_vec,
4515 stmt_vector_for_cost *prologue_cost_vec,
4516 stmt_vector_for_cost *epilogue_cost_vec)
4518 int retval = 0;
4520 *peel_iters_epilogue
4521 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4523 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4525 /* If peeled iterations are known but number of scalar loop
4526 iterations are unknown, count a taken branch per peeled loop. */
4527 if (peel_iters_prologue > 0)
4528 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4529 vect_prologue);
4530 if (*peel_iters_epilogue > 0)
4531 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4532 vect_epilogue);
4535 stmt_info_for_cost *si;
4536 int j;
4537 if (peel_iters_prologue)
4538 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4539 retval += record_stmt_cost (prologue_cost_vec,
4540 si->count * peel_iters_prologue,
4541 si->kind, si->stmt_info, si->misalign,
4542 vect_prologue);
4543 if (*peel_iters_epilogue)
4544 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4545 retval += record_stmt_cost (epilogue_cost_vec,
4546 si->count * *peel_iters_epilogue,
4547 si->kind, si->stmt_info, si->misalign,
4548 vect_epilogue);
4550 return retval;
4553 /* Function vect_estimate_min_profitable_iters
4555 Return the number of iterations required for the vector version of the
4556 loop to be profitable relative to the cost of the scalar version of the
4557 loop.
4559 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4560 of iterations for vectorization. -1 value means loop vectorization
4561 is not profitable. This returned value may be used for dynamic
4562 profitability check.
4564 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4565 for static check against estimated number of iterations. */
4567 static void
4568 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4569 int *ret_min_profitable_niters,
4570 int *ret_min_profitable_estimate,
4571 unsigned *suggested_unroll_factor)
4573 int min_profitable_iters;
4574 int min_profitable_estimate;
4575 int peel_iters_prologue;
4576 int peel_iters_epilogue;
4577 unsigned vec_inside_cost = 0;
4578 int vec_outside_cost = 0;
4579 unsigned vec_prologue_cost = 0;
4580 unsigned vec_epilogue_cost = 0;
4581 int scalar_single_iter_cost = 0;
4582 int scalar_outside_cost = 0;
4583 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4584 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4585 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4587 /* Cost model disabled. */
4588 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4590 if (dump_enabled_p ())
4591 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4592 *ret_min_profitable_niters = 0;
4593 *ret_min_profitable_estimate = 0;
4594 return;
4597 /* Requires loop versioning tests to handle misalignment. */
4598 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4600 /* FIXME: Make cost depend on complexity of individual check. */
4601 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4602 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4603 if (dump_enabled_p ())
4604 dump_printf (MSG_NOTE,
4605 "cost model: Adding cost of checks for loop "
4606 "versioning to treat misalignment.\n");
4609 /* Requires loop versioning with alias checks. */
4610 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4612 /* FIXME: Make cost depend on complexity of individual check. */
4613 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4614 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4615 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4616 if (len)
4617 /* Count LEN - 1 ANDs and LEN comparisons. */
4618 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4619 scalar_stmt, vect_prologue);
4620 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4621 if (len)
4623 /* Count LEN - 1 ANDs and LEN comparisons. */
4624 unsigned int nstmts = len * 2 - 1;
4625 /* +1 for each bias that needs adding. */
4626 for (unsigned int i = 0; i < len; ++i)
4627 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4628 nstmts += 1;
4629 (void) add_stmt_cost (target_cost_data, nstmts,
4630 scalar_stmt, vect_prologue);
4632 if (dump_enabled_p ())
4633 dump_printf (MSG_NOTE,
4634 "cost model: Adding cost of checks for loop "
4635 "versioning aliasing.\n");
4638 /* Requires loop versioning with niter checks. */
4639 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4641 /* FIXME: Make cost depend on complexity of individual check. */
4642 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4643 NULL, NULL, NULL_TREE, 0, vect_prologue);
4644 if (dump_enabled_p ())
4645 dump_printf (MSG_NOTE,
4646 "cost model: Adding cost of checks for loop "
4647 "versioning niters.\n");
4650 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4651 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4652 vect_prologue);
4654 /* Count statements in scalar loop. Using this as scalar cost for a single
4655 iteration for now.
4657 TODO: Add outer loop support.
4659 TODO: Consider assigning different costs to different scalar
4660 statements. */
4662 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4664 /* Add additional cost for the peeled instructions in prologue and epilogue
4665 loop. (For fully-masked loops there will be no peeling.)
4667 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4668 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4670 TODO: Build an expression that represents peel_iters for prologue and
4671 epilogue to be used in a run-time test. */
4673 bool prologue_need_br_taken_cost = false;
4674 bool prologue_need_br_not_taken_cost = false;
4676 /* Calculate peel_iters_prologue. */
4677 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4678 peel_iters_prologue = 0;
4679 else if (npeel < 0)
4681 peel_iters_prologue = assumed_vf / 2;
4682 if (dump_enabled_p ())
4683 dump_printf (MSG_NOTE, "cost model: "
4684 "prologue peel iters set to vf/2.\n");
4686 /* If peeled iterations are unknown, count a taken branch and a not taken
4687 branch per peeled loop. Even if scalar loop iterations are known,
4688 vector iterations are not known since peeled prologue iterations are
4689 not known. Hence guards remain the same. */
4690 prologue_need_br_taken_cost = true;
4691 prologue_need_br_not_taken_cost = true;
4693 else
4695 peel_iters_prologue = npeel;
4696 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4697 /* If peeled iterations are known but number of scalar loop
4698 iterations are unknown, count a taken branch per peeled loop. */
4699 prologue_need_br_taken_cost = true;
4702 bool epilogue_need_br_taken_cost = false;
4703 bool epilogue_need_br_not_taken_cost = false;
4705 /* Calculate peel_iters_epilogue. */
4706 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4707 /* We need to peel exactly one iteration for gaps. */
4708 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4709 else if (npeel < 0)
4711 /* If peeling for alignment is unknown, loop bound of main loop
4712 becomes unknown. */
4713 peel_iters_epilogue = assumed_vf / 2;
4714 if (dump_enabled_p ())
4715 dump_printf (MSG_NOTE, "cost model: "
4716 "epilogue peel iters set to vf/2 because "
4717 "peeling for alignment is unknown.\n");
4719 /* See the same reason above in peel_iters_prologue calculation. */
4720 epilogue_need_br_taken_cost = true;
4721 epilogue_need_br_not_taken_cost = true;
4723 else
4725 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4726 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4727 /* If peeled iterations are known but number of scalar loop
4728 iterations are unknown, count a taken branch per peeled loop. */
4729 epilogue_need_br_taken_cost = true;
4732 stmt_info_for_cost *si;
4733 int j;
4734 /* Add costs associated with peel_iters_prologue. */
4735 if (peel_iters_prologue)
4736 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4738 (void) add_stmt_cost (target_cost_data,
4739 si->count * peel_iters_prologue, si->kind,
4740 si->stmt_info, si->node, si->vectype,
4741 si->misalign, vect_prologue);
4744 /* Add costs associated with peel_iters_epilogue. */
4745 if (peel_iters_epilogue)
4746 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4748 (void) add_stmt_cost (target_cost_data,
4749 si->count * peel_iters_epilogue, si->kind,
4750 si->stmt_info, si->node, si->vectype,
4751 si->misalign, vect_epilogue);
4754 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4756 if (prologue_need_br_taken_cost)
4757 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4758 vect_prologue);
4760 if (prologue_need_br_not_taken_cost)
4761 (void) add_stmt_cost (target_cost_data, 1,
4762 cond_branch_not_taken, vect_prologue);
4764 if (epilogue_need_br_taken_cost)
4765 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4766 vect_epilogue);
4768 if (epilogue_need_br_not_taken_cost)
4769 (void) add_stmt_cost (target_cost_data, 1,
4770 cond_branch_not_taken, vect_epilogue);
4772 /* Take care of special costs for rgroup controls of partial vectors. */
4773 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4774 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4775 == vect_partial_vectors_avx512))
4777 /* Calculate how many masks we need to generate. */
4778 unsigned int num_masks = 0;
4779 bool need_saturation = false;
4780 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4781 if (rgm.type)
4783 unsigned nvectors = rgm.factor;
4784 num_masks += nvectors;
4785 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4786 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4787 need_saturation = true;
4790 /* ??? The target isn't able to identify the costs below as
4791 producing masks so it cannot penaltize cases where we'd run
4792 out of mask registers for example. */
4794 /* ??? We are also failing to account for smaller vector masks
4795 we generate by splitting larger masks in vect_get_loop_mask. */
4797 /* In the worst case, we need to generate each mask in the prologue
4798 and in the loop body. We need one splat per group and one
4799 compare per mask.
4801 Sometimes the prologue mask will fold to a constant,
4802 so the actual prologue cost might be smaller. However, it's
4803 simpler and safer to use the worst-case cost; if this ends up
4804 being the tie-breaker between vectorizing or not, then it's
4805 probably better not to vectorize. */
4806 (void) add_stmt_cost (target_cost_data,
4807 num_masks
4808 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4809 vector_stmt, NULL, NULL, NULL_TREE, 0,
4810 vect_prologue);
4811 (void) add_stmt_cost (target_cost_data,
4812 num_masks
4813 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4814 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4816 /* When we need saturation we need it both in the prologue and
4817 the epilogue. */
4818 if (need_saturation)
4820 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4821 NULL, NULL, NULL_TREE, 0, vect_prologue);
4822 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4823 NULL, NULL, NULL_TREE, 0, vect_body);
4826 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4827 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4828 == vect_partial_vectors_while_ult))
4830 /* Calculate how many masks we need to generate. */
4831 unsigned int num_masks = 0;
4832 rgroup_controls *rgm;
4833 unsigned int num_vectors_m1;
4834 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4835 num_vectors_m1, rgm)
4836 if (rgm->type)
4837 num_masks += num_vectors_m1 + 1;
4838 gcc_assert (num_masks > 0);
4840 /* In the worst case, we need to generate each mask in the prologue
4841 and in the loop body. One of the loop body mask instructions
4842 replaces the comparison in the scalar loop, and since we don't
4843 count the scalar comparison against the scalar body, we shouldn't
4844 count that vector instruction against the vector body either.
4846 Sometimes we can use unpacks instead of generating prologue
4847 masks and sometimes the prologue mask will fold to a constant,
4848 so the actual prologue cost might be smaller. However, it's
4849 simpler and safer to use the worst-case cost; if this ends up
4850 being the tie-breaker between vectorizing or not, then it's
4851 probably better not to vectorize. */
4852 (void) add_stmt_cost (target_cost_data, num_masks,
4853 vector_stmt, NULL, NULL, NULL_TREE, 0,
4854 vect_prologue);
4855 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4856 vector_stmt, NULL, NULL, NULL_TREE, 0,
4857 vect_body);
4859 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4861 /* Referring to the functions vect_set_loop_condition_partial_vectors
4862 and vect_set_loop_controls_directly, we need to generate each
4863 length in the prologue and in the loop body if required. Although
4864 there are some possible optimizations, we consider the worst case
4865 here. */
4867 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4868 signed char partial_load_store_bias
4869 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4870 bool need_iterate_p
4871 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4872 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4874 /* Calculate how many statements to be added. */
4875 unsigned int prologue_stmts = 0;
4876 unsigned int body_stmts = 0;
4878 rgroup_controls *rgc;
4879 unsigned int num_vectors_m1;
4880 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4881 if (rgc->type)
4883 /* May need one SHIFT for nitems_total computation. */
4884 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4885 if (nitems != 1 && !niters_known_p)
4886 prologue_stmts += 1;
4888 /* May need one MAX and one MINUS for wrap around. */
4889 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4890 prologue_stmts += 2;
4892 /* Need one MAX and one MINUS for each batch limit excepting for
4893 the 1st one. */
4894 prologue_stmts += num_vectors_m1 * 2;
4896 unsigned int num_vectors = num_vectors_m1 + 1;
4898 /* Need to set up lengths in prologue, only one MIN required
4899 for each since start index is zero. */
4900 prologue_stmts += num_vectors;
4902 /* If we have a non-zero partial load bias, we need one PLUS
4903 to adjust the load length. */
4904 if (partial_load_store_bias != 0)
4905 body_stmts += 1;
4907 unsigned int length_update_cost = 0;
4908 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4909 /* For decrement IV style, Each only need a single SELECT_VL
4910 or MIN since beginning to calculate the number of elements
4911 need to be processed in current iteration. */
4912 length_update_cost = 1;
4913 else
4914 /* For increment IV stype, Each may need two MINs and one MINUS to
4915 update lengths in body for next iteration. */
4916 length_update_cost = 3;
4918 if (need_iterate_p)
4919 body_stmts += length_update_cost * num_vectors;
4922 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4923 scalar_stmt, vect_prologue);
4924 (void) add_stmt_cost (target_cost_data, body_stmts,
4925 scalar_stmt, vect_body);
4928 /* FORNOW: The scalar outside cost is incremented in one of the
4929 following ways:
4931 1. The vectorizer checks for alignment and aliasing and generates
4932 a condition that allows dynamic vectorization. A cost model
4933 check is ANDED with the versioning condition. Hence scalar code
4934 path now has the added cost of the versioning check.
4936 if (cost > th & versioning_check)
4937 jmp to vector code
4939 Hence run-time scalar is incremented by not-taken branch cost.
4941 2. The vectorizer then checks if a prologue is required. If the
4942 cost model check was not done before during versioning, it has to
4943 be done before the prologue check.
4945 if (cost <= th)
4946 prologue = scalar_iters
4947 if (prologue == 0)
4948 jmp to vector code
4949 else
4950 execute prologue
4951 if (prologue == num_iters)
4952 go to exit
4954 Hence the run-time scalar cost is incremented by a taken branch,
4955 plus a not-taken branch, plus a taken branch cost.
4957 3. The vectorizer then checks if an epilogue is required. If the
4958 cost model check was not done before during prologue check, it
4959 has to be done with the epilogue check.
4961 if (prologue == 0)
4962 jmp to vector code
4963 else
4964 execute prologue
4965 if (prologue == num_iters)
4966 go to exit
4967 vector code:
4968 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4969 jmp to epilogue
4971 Hence the run-time scalar cost should be incremented by 2 taken
4972 branches.
4974 TODO: The back end may reorder the BBS's differently and reverse
4975 conditions/branch directions. Change the estimates below to
4976 something more reasonable. */
4978 /* If the number of iterations is known and we do not do versioning, we can
4979 decide whether to vectorize at compile time. Hence the scalar version
4980 do not carry cost model guard costs. */
4981 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4982 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4984 /* Cost model check occurs at versioning. */
4985 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4986 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4987 else
4989 /* Cost model check occurs at prologue generation. */
4990 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4991 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4992 + vect_get_stmt_cost (cond_branch_not_taken);
4993 /* Cost model check occurs at epilogue generation. */
4994 else
4995 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4999 /* Complete the target-specific cost calculations. */
5000 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
5001 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
5002 suggested_unroll_factor);
5004 if (suggested_unroll_factor && *suggested_unroll_factor > 1
5005 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
5006 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
5007 *suggested_unroll_factor,
5008 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
5010 if (dump_enabled_p ())
5011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5012 "can't unroll as unrolled vectorization factor larger"
5013 " than maximum vectorization factor: "
5014 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
5015 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
5016 *suggested_unroll_factor = 1;
5019 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
5021 if (dump_enabled_p ())
5023 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
5024 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
5025 vec_inside_cost);
5026 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
5027 vec_prologue_cost);
5028 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
5029 vec_epilogue_cost);
5030 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
5031 scalar_single_iter_cost);
5032 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
5033 scalar_outside_cost);
5034 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
5035 vec_outside_cost);
5036 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
5037 peel_iters_prologue);
5038 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
5039 peel_iters_epilogue);
5042 /* Calculate number of iterations required to make the vector version
5043 profitable, relative to the loop bodies only. The following condition
5044 must hold true:
5045 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
5046 where
5047 SIC = scalar iteration cost, VIC = vector iteration cost,
5048 VOC = vector outside cost, VF = vectorization factor,
5049 NPEEL = prologue iterations + epilogue iterations,
5050 SOC = scalar outside cost for run time cost model check. */
5052 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
5053 - vec_inside_cost);
5054 if (saving_per_viter <= 0)
5056 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
5057 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
5058 "vectorization did not happen for a simd loop");
5060 if (dump_enabled_p ())
5061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062 "cost model: the vector iteration cost = %d "
5063 "divided by the scalar iteration cost = %d "
5064 "is greater or equal to the vectorization factor = %d"
5065 ".\n",
5066 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5067 *ret_min_profitable_niters = -1;
5068 *ret_min_profitable_estimate = -1;
5069 return;
5072 /* ??? The "if" arm is written to handle all cases; see below for what
5073 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5074 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5076 /* Rewriting the condition above in terms of the number of
5077 vector iterations (vniters) rather than the number of
5078 scalar iterations (niters) gives:
5080 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5082 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5084 For integer N, X and Y when X > 0:
5086 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5087 int outside_overhead = (vec_outside_cost
5088 - scalar_single_iter_cost * peel_iters_prologue
5089 - scalar_single_iter_cost * peel_iters_epilogue
5090 - scalar_outside_cost);
5091 /* We're only interested in cases that require at least one
5092 vector iteration. */
5093 int min_vec_niters = 1;
5094 if (outside_overhead > 0)
5095 min_vec_niters = outside_overhead / saving_per_viter + 1;
5097 if (dump_enabled_p ())
5098 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5099 min_vec_niters);
5101 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5103 /* Now that we know the minimum number of vector iterations,
5104 find the minimum niters for which the scalar cost is larger:
5106 SIC * niters > VIC * vniters + VOC - SOC
5108 We know that the minimum niters is no more than
5109 vniters * VF + NPEEL, but it might be (and often is) less
5110 than that if a partial vector iteration is cheaper than the
5111 equivalent scalar code. */
5112 int threshold = (vec_inside_cost * min_vec_niters
5113 + vec_outside_cost
5114 - scalar_outside_cost);
5115 if (threshold <= 0)
5116 min_profitable_iters = 1;
5117 else
5118 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5120 else
5121 /* Convert the number of vector iterations into a number of
5122 scalar iterations. */
5123 min_profitable_iters = (min_vec_niters * assumed_vf
5124 + peel_iters_prologue
5125 + peel_iters_epilogue);
5127 else
5129 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5130 * assumed_vf
5131 - vec_inside_cost * peel_iters_prologue
5132 - vec_inside_cost * peel_iters_epilogue);
5133 if (min_profitable_iters <= 0)
5134 min_profitable_iters = 0;
5135 else
5137 min_profitable_iters /= saving_per_viter;
5139 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5140 <= (((int) vec_inside_cost * min_profitable_iters)
5141 + (((int) vec_outside_cost - scalar_outside_cost)
5142 * assumed_vf)))
5143 min_profitable_iters++;
5147 if (dump_enabled_p ())
5148 dump_printf (MSG_NOTE,
5149 " Calculated minimum iters for profitability: %d\n",
5150 min_profitable_iters);
5152 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5153 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5154 /* We want the vectorized loop to execute at least once. */
5155 min_profitable_iters = assumed_vf + peel_iters_prologue;
5156 else if (min_profitable_iters < peel_iters_prologue)
5157 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5158 vectorized loop executes at least once. */
5159 min_profitable_iters = peel_iters_prologue;
5161 if (dump_enabled_p ())
5162 dump_printf_loc (MSG_NOTE, vect_location,
5163 " Runtime profitability threshold = %d\n",
5164 min_profitable_iters);
5166 *ret_min_profitable_niters = min_profitable_iters;
5168 /* Calculate number of iterations required to make the vector version
5169 profitable, relative to the loop bodies only.
5171 Non-vectorized variant is SIC * niters and it must win over vector
5172 variant on the expected loop trip count. The following condition must hold true:
5173 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5175 if (vec_outside_cost <= 0)
5176 min_profitable_estimate = 0;
5177 /* ??? This "else if" arm is written to handle all cases; see below for
5178 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5179 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5181 /* This is a repeat of the code above, but with + SOC rather
5182 than - SOC. */
5183 int outside_overhead = (vec_outside_cost
5184 - scalar_single_iter_cost * peel_iters_prologue
5185 - scalar_single_iter_cost * peel_iters_epilogue
5186 + scalar_outside_cost);
5187 int min_vec_niters = 1;
5188 if (outside_overhead > 0)
5189 min_vec_niters = outside_overhead / saving_per_viter + 1;
5191 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5193 int threshold = (vec_inside_cost * min_vec_niters
5194 + vec_outside_cost
5195 + scalar_outside_cost);
5196 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5198 else
5199 min_profitable_estimate = (min_vec_niters * assumed_vf
5200 + peel_iters_prologue
5201 + peel_iters_epilogue);
5203 else
5205 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5206 * assumed_vf
5207 - vec_inside_cost * peel_iters_prologue
5208 - vec_inside_cost * peel_iters_epilogue)
5209 / ((scalar_single_iter_cost * assumed_vf)
5210 - vec_inside_cost);
5212 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5213 if (dump_enabled_p ())
5214 dump_printf_loc (MSG_NOTE, vect_location,
5215 " Static estimate profitability threshold = %d\n",
5216 min_profitable_estimate);
5218 *ret_min_profitable_estimate = min_profitable_estimate;
5221 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5222 vector elements (not bits) for a vector with NELT elements. */
5223 static void
5224 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5225 vec_perm_builder *sel)
5227 /* The encoding is a single stepped pattern. Any wrap-around is handled
5228 by vec_perm_indices. */
5229 sel->new_vector (nelt, 1, 3);
5230 for (unsigned int i = 0; i < 3; i++)
5231 sel->quick_push (i + offset);
5234 /* Checks whether the target supports whole-vector shifts for vectors of mode
5235 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5236 it supports vec_perm_const with masks for all necessary shift amounts. */
5237 static bool
5238 have_whole_vector_shift (machine_mode mode)
5240 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5241 return true;
5243 /* Variable-length vectors should be handled via the optab. */
5244 unsigned int nelt;
5245 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5246 return false;
5248 vec_perm_builder sel;
5249 vec_perm_indices indices;
5250 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5252 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5253 indices.new_vector (sel, 2, nelt);
5254 if (!can_vec_perm_const_p (mode, mode, indices, false))
5255 return false;
5257 return true;
5260 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5261 multiplication operands have differing signs and (b) we intend
5262 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5263 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5265 static bool
5266 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5267 stmt_vec_info stmt_info)
5269 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5270 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5271 return false;
5273 tree rhs1 = gimple_assign_rhs1 (assign);
5274 tree rhs2 = gimple_assign_rhs2 (assign);
5275 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5276 return false;
5278 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5279 gcc_assert (reduc_info->is_reduc_info);
5280 return !directly_supported_p (DOT_PROD_EXPR,
5281 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5282 optab_vector_mixed_sign);
5285 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5286 functions. Design better to avoid maintenance issues. */
5288 /* Function vect_model_reduction_cost.
5290 Models cost for a reduction operation, including the vector ops
5291 generated within the strip-mine loop in some cases, the initial
5292 definition before the loop, and the epilogue code that must be generated. */
5294 static void
5295 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5296 stmt_vec_info stmt_info, internal_fn reduc_fn,
5297 vect_reduction_type reduction_type,
5298 int ncopies, stmt_vector_for_cost *cost_vec)
5300 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5301 tree vectype;
5302 machine_mode mode;
5303 class loop *loop = NULL;
5305 if (loop_vinfo)
5306 loop = LOOP_VINFO_LOOP (loop_vinfo);
5308 /* Condition reductions generate two reductions in the loop. */
5309 if (reduction_type == COND_REDUCTION)
5310 ncopies *= 2;
5312 vectype = STMT_VINFO_VECTYPE (stmt_info);
5313 mode = TYPE_MODE (vectype);
5314 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5316 gimple_match_op op;
5317 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5318 gcc_unreachable ();
5320 bool emulated_mixed_dot_prod
5321 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5322 if (reduction_type == EXTRACT_LAST_REDUCTION)
5323 /* No extra instructions are needed in the prologue. The loop body
5324 operations are costed in vectorizable_condition. */
5325 inside_cost = 0;
5326 else if (reduction_type == FOLD_LEFT_REDUCTION)
5328 /* No extra instructions needed in the prologue. */
5329 prologue_cost = 0;
5331 if (reduc_fn != IFN_LAST)
5332 /* Count one reduction-like operation per vector. */
5333 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5334 stmt_info, 0, vect_body);
5335 else
5337 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5338 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5339 inside_cost = record_stmt_cost (cost_vec, nelements,
5340 vec_to_scalar, stmt_info, 0,
5341 vect_body);
5342 inside_cost += record_stmt_cost (cost_vec, nelements,
5343 scalar_stmt, stmt_info, 0,
5344 vect_body);
5347 else
5349 /* Add in the cost of the initial definitions. */
5350 int prologue_stmts;
5351 if (reduction_type == COND_REDUCTION)
5352 /* For cond reductions we have four vectors: initial index, step,
5353 initial result of the data reduction, initial value of the index
5354 reduction. */
5355 prologue_stmts = 4;
5356 else if (emulated_mixed_dot_prod)
5357 /* We need the initial reduction value and two invariants:
5358 one that contains the minimum signed value and one that
5359 contains half of its negative. */
5360 prologue_stmts = 3;
5361 else
5362 prologue_stmts = 1;
5363 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5364 scalar_to_vec, stmt_info, 0,
5365 vect_prologue);
5368 /* Determine cost of epilogue code.
5370 We have a reduction operator that will reduce the vector in one statement.
5371 Also requires scalar extract. */
5373 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5375 if (reduc_fn != IFN_LAST)
5377 if (reduction_type == COND_REDUCTION)
5379 /* An EQ stmt and an COND_EXPR stmt. */
5380 epilogue_cost += record_stmt_cost (cost_vec, 2,
5381 vector_stmt, stmt_info, 0,
5382 vect_epilogue);
5383 /* Reduction of the max index and a reduction of the found
5384 values. */
5385 epilogue_cost += record_stmt_cost (cost_vec, 2,
5386 vec_to_scalar, stmt_info, 0,
5387 vect_epilogue);
5388 /* A broadcast of the max value. */
5389 epilogue_cost += record_stmt_cost (cost_vec, 1,
5390 scalar_to_vec, stmt_info, 0,
5391 vect_epilogue);
5393 else
5395 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5396 stmt_info, 0, vect_epilogue);
5397 epilogue_cost += record_stmt_cost (cost_vec, 1,
5398 vec_to_scalar, stmt_info, 0,
5399 vect_epilogue);
5402 else if (reduction_type == COND_REDUCTION)
5404 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5405 /* Extraction of scalar elements. */
5406 epilogue_cost += record_stmt_cost (cost_vec,
5407 2 * estimated_nunits,
5408 vec_to_scalar, stmt_info, 0,
5409 vect_epilogue);
5410 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5411 epilogue_cost += record_stmt_cost (cost_vec,
5412 2 * estimated_nunits - 3,
5413 scalar_stmt, stmt_info, 0,
5414 vect_epilogue);
5416 else if (reduction_type == EXTRACT_LAST_REDUCTION
5417 || reduction_type == FOLD_LEFT_REDUCTION)
5418 /* No extra instructions need in the epilogue. */
5420 else
5422 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5423 tree bitsize = TYPE_SIZE (op.type);
5424 int element_bitsize = tree_to_uhwi (bitsize);
5425 int nelements = vec_size_in_bits / element_bitsize;
5427 if (op.code == COND_EXPR)
5428 op.code = MAX_EXPR;
5430 /* We have a whole vector shift available. */
5431 if (VECTOR_MODE_P (mode)
5432 && directly_supported_p (op.code, vectype)
5433 && have_whole_vector_shift (mode))
5435 /* Final reduction via vector shifts and the reduction operator.
5436 Also requires scalar extract. */
5437 epilogue_cost += record_stmt_cost (cost_vec,
5438 exact_log2 (nelements) * 2,
5439 vector_stmt, stmt_info, 0,
5440 vect_epilogue);
5441 epilogue_cost += record_stmt_cost (cost_vec, 1,
5442 vec_to_scalar, stmt_info, 0,
5443 vect_epilogue);
5445 else
5446 /* Use extracts and reduction op for final reduction. For N
5447 elements, we have N extracts and N-1 reduction ops. */
5448 epilogue_cost += record_stmt_cost (cost_vec,
5449 nelements + nelements - 1,
5450 vector_stmt, stmt_info, 0,
5451 vect_epilogue);
5455 if (dump_enabled_p ())
5456 dump_printf (MSG_NOTE,
5457 "vect_model_reduction_cost: inside_cost = %d, "
5458 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5459 prologue_cost, epilogue_cost);
5462 /* SEQ is a sequence of instructions that initialize the reduction
5463 described by REDUC_INFO. Emit them in the appropriate place. */
5465 static void
5466 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5467 stmt_vec_info reduc_info, gimple *seq)
5469 if (reduc_info->reused_accumulator)
5471 /* When reusing an accumulator from the main loop, we only need
5472 initialization instructions if the main loop can be skipped.
5473 In that case, emit the initialization instructions at the end
5474 of the guard block that does the skip. */
5475 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5476 gcc_assert (skip_edge);
5477 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5478 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5480 else
5482 /* The normal case: emit the initialization instructions on the
5483 preheader edge. */
5484 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5485 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5489 /* Function get_initial_def_for_reduction
5491 Input:
5492 REDUC_INFO - the info_for_reduction
5493 INIT_VAL - the initial value of the reduction variable
5494 NEUTRAL_OP - a value that has no effect on the reduction, as per
5495 neutral_op_for_reduction
5497 Output:
5498 Return a vector variable, initialized according to the operation that
5499 STMT_VINFO performs. This vector will be used as the initial value
5500 of the vector of partial results.
5502 The value we need is a vector in which element 0 has value INIT_VAL
5503 and every other element has value NEUTRAL_OP. */
5505 static tree
5506 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5507 stmt_vec_info reduc_info,
5508 tree init_val, tree neutral_op)
5510 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5511 tree scalar_type = TREE_TYPE (init_val);
5512 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5513 tree init_def;
5514 gimple_seq stmts = NULL;
5516 gcc_assert (vectype);
5518 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5519 || SCALAR_FLOAT_TYPE_P (scalar_type));
5521 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5522 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5524 if (operand_equal_p (init_val, neutral_op))
5526 /* If both elements are equal then the vector described above is
5527 just a splat. */
5528 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5529 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5531 else
5533 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5534 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5535 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5537 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5538 element 0. */
5539 init_def = gimple_build_vector_from_val (&stmts, vectype,
5540 neutral_op);
5541 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5542 vectype, init_def, init_val);
5544 else
5546 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5547 tree_vector_builder elts (vectype, 1, 2);
5548 elts.quick_push (init_val);
5549 elts.quick_push (neutral_op);
5550 init_def = gimple_build_vector (&stmts, &elts);
5554 if (stmts)
5555 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5556 return init_def;
5559 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5560 which performs a reduction involving GROUP_SIZE scalar statements.
5561 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5562 is nonnull, introducing extra elements of that value will not change the
5563 result. */
5565 static void
5566 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5567 stmt_vec_info reduc_info,
5568 vec<tree> *vec_oprnds,
5569 unsigned int number_of_vectors,
5570 unsigned int group_size, tree neutral_op)
5572 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5573 unsigned HOST_WIDE_INT nunits;
5574 unsigned j, number_of_places_left_in_vector;
5575 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5576 unsigned int i;
5578 gcc_assert (group_size == initial_values.length () || neutral_op);
5580 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5581 created vectors. It is greater than 1 if unrolling is performed.
5583 For example, we have two scalar operands, s1 and s2 (e.g., group of
5584 strided accesses of size two), while NUNITS is four (i.e., four scalars
5585 of this type can be packed in a vector). The output vector will contain
5586 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5587 will be 2).
5589 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5590 vectors containing the operands.
5592 For example, NUNITS is four as before, and the group size is 8
5593 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5594 {s5, s6, s7, s8}. */
5596 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5597 nunits = group_size;
5599 number_of_places_left_in_vector = nunits;
5600 bool constant_p = true;
5601 tree_vector_builder elts (vector_type, nunits, 1);
5602 elts.quick_grow (nunits);
5603 gimple_seq ctor_seq = NULL;
5604 for (j = 0; j < nunits * number_of_vectors; ++j)
5606 tree op;
5607 i = j % group_size;
5609 /* Get the def before the loop. In reduction chain we have only
5610 one initial value. Else we have as many as PHIs in the group. */
5611 if (i >= initial_values.length () || (j > i && neutral_op))
5612 op = neutral_op;
5613 else
5614 op = initial_values[i];
5616 /* Create 'vect_ = {op0,op1,...,opn}'. */
5617 number_of_places_left_in_vector--;
5618 elts[nunits - number_of_places_left_in_vector - 1] = op;
5619 if (!CONSTANT_CLASS_P (op))
5620 constant_p = false;
5622 if (number_of_places_left_in_vector == 0)
5624 tree init;
5625 if (constant_p && !neutral_op
5626 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5627 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5628 /* Build the vector directly from ELTS. */
5629 init = gimple_build_vector (&ctor_seq, &elts);
5630 else if (neutral_op)
5632 /* Build a vector of the neutral value and shift the
5633 other elements into place. */
5634 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5635 neutral_op);
5636 int k = nunits;
5637 while (k > 0 && elts[k - 1] == neutral_op)
5638 k -= 1;
5639 while (k > 0)
5641 k -= 1;
5642 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5643 vector_type, init, elts[k]);
5646 else
5648 /* First time round, duplicate ELTS to fill the
5649 required number of vectors. */
5650 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5651 elts, number_of_vectors, *vec_oprnds);
5652 break;
5654 vec_oprnds->quick_push (init);
5656 number_of_places_left_in_vector = nunits;
5657 elts.new_vector (vector_type, nunits, 1);
5658 elts.quick_grow (nunits);
5659 constant_p = true;
5662 if (ctor_seq != NULL)
5663 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5666 /* For a statement STMT_INFO taking part in a reduction operation return
5667 the stmt_vec_info the meta information is stored on. */
5669 stmt_vec_info
5670 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5672 stmt_info = vect_orig_stmt (stmt_info);
5673 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5674 if (!is_a <gphi *> (stmt_info->stmt)
5675 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5676 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5677 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5678 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5680 if (gimple_phi_num_args (phi) == 1)
5681 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5683 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5685 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5686 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5687 stmt_info = info;
5689 return stmt_info;
5692 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5693 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5694 return false. */
5696 static bool
5697 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5698 stmt_vec_info reduc_info)
5700 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5701 if (!main_loop_vinfo)
5702 return false;
5704 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5705 return false;
5707 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5708 auto_vec<tree, 16> main_loop_results (num_phis);
5709 auto_vec<tree, 16> initial_values (num_phis);
5710 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5712 /* The epilogue loop can be entered either from the main loop or
5713 from an earlier guard block. */
5714 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5715 for (tree incoming_value : reduc_info->reduc_initial_values)
5717 /* Look for:
5719 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5720 INITIAL_VALUE(guard block)>. */
5721 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5723 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5724 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5726 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5727 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5729 main_loop_results.quick_push (from_main_loop);
5730 initial_values.quick_push (from_skip);
5733 else
5734 /* The main loop dominates the epilogue loop. */
5735 main_loop_results.splice (reduc_info->reduc_initial_values);
5737 /* See if the main loop has the kind of accumulator we need. */
5738 vect_reusable_accumulator *accumulator
5739 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5740 if (!accumulator
5741 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5742 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5743 accumulator->reduc_info->reduc_scalar_results.begin ()))
5744 return false;
5746 /* Handle the case where we can reduce wider vectors to narrower ones. */
5747 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5748 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5749 unsigned HOST_WIDE_INT m;
5750 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5751 TYPE_VECTOR_SUBPARTS (vectype), &m))
5752 return false;
5753 /* Check the intermediate vector types and operations are available. */
5754 tree prev_vectype = old_vectype;
5755 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5756 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5758 intermediate_nunits = exact_div (intermediate_nunits, 2);
5759 tree intermediate_vectype = get_related_vectype_for_scalar_type
5760 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5761 if (!intermediate_vectype
5762 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5763 intermediate_vectype)
5764 || !can_vec_extract (TYPE_MODE (prev_vectype),
5765 TYPE_MODE (intermediate_vectype)))
5766 return false;
5767 prev_vectype = intermediate_vectype;
5770 /* Non-SLP reductions might apply an adjustment after the reduction
5771 operation, in order to simplify the initialization of the accumulator.
5772 If the epilogue loop carries on from where the main loop left off,
5773 it should apply the same adjustment to the final reduction result.
5775 If the epilogue loop can also be entered directly (rather than via
5776 the main loop), we need to be able to handle that case in the same way,
5777 with the same adjustment. (In principle we could add a PHI node
5778 to select the correct adjustment, but in practice that shouldn't be
5779 necessary.) */
5780 tree main_adjustment
5781 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5782 if (loop_vinfo->main_loop_edge && main_adjustment)
5784 gcc_assert (num_phis == 1);
5785 tree initial_value = initial_values[0];
5786 /* Check that we can use INITIAL_VALUE as the adjustment and
5787 initialize the accumulator with a neutral value instead. */
5788 if (!operand_equal_p (initial_value, main_adjustment))
5789 return false;
5790 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5791 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5792 code, initial_value);
5794 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5795 reduc_info->reduc_initial_values.truncate (0);
5796 reduc_info->reduc_initial_values.splice (initial_values);
5797 reduc_info->reused_accumulator = accumulator;
5798 return true;
5801 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5802 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5804 static tree
5805 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5806 gimple_seq *seq)
5808 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5809 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5810 tree stype = TREE_TYPE (vectype);
5811 tree new_temp = vec_def;
5812 while (nunits > nunits1)
5814 nunits /= 2;
5815 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5816 stype, nunits);
5817 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5819 /* The target has to make sure we support lowpart/highpart
5820 extraction, either via direct vector extract or through
5821 an integer mode punning. */
5822 tree dst1, dst2;
5823 gimple *epilog_stmt;
5824 if (convert_optab_handler (vec_extract_optab,
5825 TYPE_MODE (TREE_TYPE (new_temp)),
5826 TYPE_MODE (vectype1))
5827 != CODE_FOR_nothing)
5829 /* Extract sub-vectors directly once vec_extract becomes
5830 a conversion optab. */
5831 dst1 = make_ssa_name (vectype1);
5832 epilog_stmt
5833 = gimple_build_assign (dst1, BIT_FIELD_REF,
5834 build3 (BIT_FIELD_REF, vectype1,
5835 new_temp, TYPE_SIZE (vectype1),
5836 bitsize_int (0)));
5837 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5838 dst2 = make_ssa_name (vectype1);
5839 epilog_stmt
5840 = gimple_build_assign (dst2, BIT_FIELD_REF,
5841 build3 (BIT_FIELD_REF, vectype1,
5842 new_temp, TYPE_SIZE (vectype1),
5843 bitsize_int (bitsize)));
5844 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5846 else
5848 /* Extract via punning to appropriately sized integer mode
5849 vector. */
5850 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5851 tree etype = build_vector_type (eltype, 2);
5852 gcc_assert (convert_optab_handler (vec_extract_optab,
5853 TYPE_MODE (etype),
5854 TYPE_MODE (eltype))
5855 != CODE_FOR_nothing);
5856 tree tem = make_ssa_name (etype);
5857 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5858 build1 (VIEW_CONVERT_EXPR,
5859 etype, new_temp));
5860 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5861 new_temp = tem;
5862 tem = make_ssa_name (eltype);
5863 epilog_stmt
5864 = gimple_build_assign (tem, BIT_FIELD_REF,
5865 build3 (BIT_FIELD_REF, eltype,
5866 new_temp, TYPE_SIZE (eltype),
5867 bitsize_int (0)));
5868 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5869 dst1 = make_ssa_name (vectype1);
5870 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5871 build1 (VIEW_CONVERT_EXPR,
5872 vectype1, tem));
5873 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5874 tem = make_ssa_name (eltype);
5875 epilog_stmt
5876 = gimple_build_assign (tem, BIT_FIELD_REF,
5877 build3 (BIT_FIELD_REF, eltype,
5878 new_temp, TYPE_SIZE (eltype),
5879 bitsize_int (bitsize)));
5880 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5881 dst2 = make_ssa_name (vectype1);
5882 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5883 build1 (VIEW_CONVERT_EXPR,
5884 vectype1, tem));
5885 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5888 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5891 return new_temp;
5894 /* Retrieves the definining statement to be used for a reduction.
5895 For MAIN_EXIT_P we use the current VEC_STMTs and otherwise we look at
5896 the reduction definitions. */
5898 tree
5899 vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node,
5900 slp_instance slp_node_instance, bool main_exit_p, unsigned i,
5901 vec <gimple *> &vec_stmts)
5903 tree def;
5905 if (slp_node)
5907 if (!main_exit_p)
5908 slp_node = slp_node_instance->reduc_phis;
5909 def = vect_get_slp_vect_def (slp_node, i);
5911 else
5913 if (!main_exit_p)
5914 reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (reduc_info));
5915 vec_stmts = STMT_VINFO_VEC_STMTS (reduc_info);
5916 def = gimple_get_lhs (vec_stmts[0]);
5919 return def;
5922 /* Function vect_create_epilog_for_reduction
5924 Create code at the loop-epilog to finalize the result of a reduction
5925 computation.
5927 STMT_INFO is the scalar reduction stmt that is being vectorized.
5928 SLP_NODE is an SLP node containing a group of reduction statements. The
5929 first one in this group is STMT_INFO.
5930 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5931 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5932 (counting from 0)
5933 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5934 exit this edge is always the main loop exit.
5936 This function:
5937 1. Completes the reduction def-use cycles.
5938 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5939 by calling the function specified by REDUC_FN if available, or by
5940 other means (whole-vector shifts or a scalar loop).
5941 The function also creates a new phi node at the loop exit to preserve
5942 loop-closed form, as illustrated below.
5944 The flow at the entry to this function:
5946 loop:
5947 vec_def = phi <vec_init, null> # REDUCTION_PHI
5948 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5949 s_loop = scalar_stmt # (scalar) STMT_INFO
5950 loop_exit:
5951 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5952 use <s_out0>
5953 use <s_out0>
5955 The above is transformed by this function into:
5957 loop:
5958 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5959 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5960 s_loop = scalar_stmt # (scalar) STMT_INFO
5961 loop_exit:
5962 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5963 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5964 v_out2 = reduce <v_out1>
5965 s_out3 = extract_field <v_out2, 0>
5966 s_out4 = adjust_result <s_out3>
5967 use <s_out4>
5968 use <s_out4>
5971 static void
5972 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5973 stmt_vec_info stmt_info,
5974 slp_tree slp_node,
5975 slp_instance slp_node_instance,
5976 edge loop_exit)
5978 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5979 gcc_assert (reduc_info->is_reduc_info);
5980 /* For double reductions we need to get at the inner loop reduction
5981 stmt which has the meta info attached. Our stmt_info is that of the
5982 loop-closed PHI of the inner loop which we remember as
5983 def for the reduction PHI generation. */
5984 bool double_reduc = false;
5985 bool main_exit_p = LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit;
5986 stmt_vec_info rdef_info = stmt_info;
5987 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5989 gcc_assert (!slp_node);
5990 double_reduc = true;
5991 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5992 (stmt_info->stmt, 0));
5993 stmt_info = vect_stmt_to_vectorize (stmt_info);
5995 gphi *reduc_def_stmt
5996 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5997 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5998 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5999 tree vectype;
6000 machine_mode mode;
6001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
6002 basic_block exit_bb;
6003 tree scalar_dest;
6004 tree scalar_type;
6005 gimple *new_phi = NULL, *phi = NULL;
6006 gimple_stmt_iterator exit_gsi;
6007 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
6008 gimple *epilog_stmt = NULL;
6009 gimple *exit_phi;
6010 tree bitsize;
6011 tree def;
6012 tree orig_name, scalar_result;
6013 imm_use_iterator imm_iter, phi_imm_iter;
6014 use_operand_p use_p, phi_use_p;
6015 gimple *use_stmt;
6016 auto_vec<tree> reduc_inputs;
6017 int j, i;
6018 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6019 unsigned int group_size = 1, k;
6020 auto_vec<gimple *> phis;
6021 /* SLP reduction without reduction chain, e.g.,
6022 # a1 = phi <a2, a0>
6023 # b1 = phi <b2, b0>
6024 a2 = operation (a1)
6025 b2 = operation (b1) */
6026 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6027 bool direct_slp_reduc;
6028 tree induction_index = NULL_TREE;
6030 if (slp_node)
6031 group_size = SLP_TREE_LANES (slp_node);
6033 if (nested_in_vect_loop_p (loop, stmt_info))
6035 outer_loop = loop;
6036 loop = loop->inner;
6037 gcc_assert (!slp_node && double_reduc);
6040 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6041 gcc_assert (vectype);
6042 mode = TYPE_MODE (vectype);
6044 tree induc_val = NULL_TREE;
6045 tree adjustment_def = NULL;
6046 if (slp_node)
6048 else
6050 /* Optimize: for induction condition reduction, if we can't use zero
6051 for induc_val, use initial_def. */
6052 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6053 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6054 else if (double_reduc)
6056 else
6057 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6060 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6061 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6062 if (slp_reduc)
6063 /* All statements produce live-out values. */
6064 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6065 else if (slp_node)
6067 /* The last statement in the reduction chain produces the live-out
6068 value. Note SLP optimization can shuffle scalar stmts to
6069 optimize permutations so we have to search for the last stmt. */
6070 for (k = 0; k < group_size; ++k)
6071 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
6073 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6074 break;
6078 unsigned vec_num;
6079 int ncopies;
6080 if (slp_node)
6082 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6083 ncopies = 1;
6085 else
6087 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
6088 vec_num = 1;
6089 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6092 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6093 which is updated with the current index of the loop for every match of
6094 the original loop's cond_expr (VEC_STMT). This results in a vector
6095 containing the last time the condition passed for that vector lane.
6096 The first match will be a 1 to allow 0 to be used for non-matching
6097 indexes. If there are no matches at all then the vector will be all
6098 zeroes.
6100 PR92772: This algorithm is broken for architectures that support
6101 masked vectors, but do not provide fold_extract_last. */
6102 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6104 auto_vec<std::pair<tree, bool>, 2> ccompares;
6105 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6106 cond_info = vect_stmt_to_vectorize (cond_info);
6107 while (cond_info != reduc_info)
6109 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6111 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6112 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6113 ccompares.safe_push
6114 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6115 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6117 cond_info
6118 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6119 1 + STMT_VINFO_REDUC_IDX
6120 (cond_info)));
6121 cond_info = vect_stmt_to_vectorize (cond_info);
6123 gcc_assert (ccompares.length () != 0);
6125 tree indx_before_incr, indx_after_incr;
6126 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6127 int scalar_precision
6128 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6129 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6130 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6131 (TYPE_MODE (vectype), cr_index_scalar_type,
6132 TYPE_VECTOR_SUBPARTS (vectype));
6134 /* First we create a simple vector induction variable which starts
6135 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6136 vector size (STEP). */
6138 /* Create a {1,2,3,...} vector. */
6139 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6141 /* Create a vector of the step value. */
6142 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6143 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6145 /* Create an induction variable. */
6146 gimple_stmt_iterator incr_gsi;
6147 bool insert_after;
6148 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6149 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6150 insert_after, &indx_before_incr, &indx_after_incr);
6152 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6153 filled with zeros (VEC_ZERO). */
6155 /* Create a vector of 0s. */
6156 tree zero = build_zero_cst (cr_index_scalar_type);
6157 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6159 /* Create a vector phi node. */
6160 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6161 new_phi = create_phi_node (new_phi_tree, loop->header);
6162 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6163 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6165 /* Now take the condition from the loops original cond_exprs
6166 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6167 every match uses values from the induction variable
6168 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6169 (NEW_PHI_TREE).
6170 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6171 the new cond_expr (INDEX_COND_EXPR). */
6172 gimple_seq stmts = NULL;
6173 for (int i = ccompares.length () - 1; i != -1; --i)
6175 tree ccompare = ccompares[i].first;
6176 if (ccompares[i].second)
6177 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6178 cr_index_vector_type,
6179 ccompare,
6180 indx_before_incr, new_phi_tree);
6181 else
6182 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6183 cr_index_vector_type,
6184 ccompare,
6185 new_phi_tree, indx_before_incr);
6187 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6189 /* Update the phi with the vec cond. */
6190 induction_index = new_phi_tree;
6191 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6192 loop_latch_edge (loop), UNKNOWN_LOCATION);
6195 /* 2. Create epilog code.
6196 The reduction epilog code operates across the elements of the vector
6197 of partial results computed by the vectorized loop.
6198 The reduction epilog code consists of:
6200 step 1: compute the scalar result in a vector (v_out2)
6201 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6202 step 3: adjust the scalar result (s_out3) if needed.
6204 Step 1 can be accomplished using one the following three schemes:
6205 (scheme 1) using reduc_fn, if available.
6206 (scheme 2) using whole-vector shifts, if available.
6207 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6208 combined.
6210 The overall epilog code looks like this:
6212 s_out0 = phi <s_loop> # original EXIT_PHI
6213 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6214 v_out2 = reduce <v_out1> # step 1
6215 s_out3 = extract_field <v_out2, 0> # step 2
6216 s_out4 = adjust_result <s_out3> # step 3
6218 (step 3 is optional, and steps 1 and 2 may be combined).
6219 Lastly, the uses of s_out0 are replaced by s_out4. */
6222 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6223 v_out1 = phi <VECT_DEF>
6224 Store them in NEW_PHIS. */
6225 if (double_reduc)
6226 loop = outer_loop;
6227 /* We need to reduce values in all exits. */
6228 exit_bb = loop_exit->dest;
6229 exit_gsi = gsi_after_labels (exit_bb);
6230 reduc_inputs.create (slp_node ? vec_num : ncopies);
6231 vec <gimple *> vec_stmts = vNULL;
6232 for (unsigned i = 0; i < vec_num; i++)
6234 gimple_seq stmts = NULL;
6235 def = vect_get_vect_def (rdef_info, slp_node, slp_node_instance,
6236 main_exit_p, i, vec_stmts);
6237 for (j = 0; j < ncopies; j++)
6239 tree new_def = copy_ssa_name (def);
6240 phi = create_phi_node (new_def, exit_bb);
6241 if (j)
6242 def = gimple_get_lhs (vec_stmts[j]);
6243 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6244 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6245 else
6247 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6248 SET_PHI_ARG_DEF (phi, k, def);
6250 new_def = gimple_convert (&stmts, vectype, new_def);
6251 reduc_inputs.quick_push (new_def);
6253 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6256 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6257 (i.e. when reduc_fn is not available) and in the final adjustment
6258 code (if needed). Also get the original scalar reduction variable as
6259 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6260 represents a reduction pattern), the tree-code and scalar-def are
6261 taken from the original stmt that the pattern-stmt (STMT) replaces.
6262 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6263 are taken from STMT. */
6265 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6266 if (orig_stmt_info != stmt_info)
6268 /* Reduction pattern */
6269 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6270 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6273 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6274 scalar_type = TREE_TYPE (scalar_dest);
6275 scalar_results.truncate (0);
6276 scalar_results.reserve_exact (group_size);
6277 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6278 bitsize = TYPE_SIZE (scalar_type);
6280 /* True if we should implement SLP_REDUC using native reduction operations
6281 instead of scalar operations. */
6282 direct_slp_reduc = (reduc_fn != IFN_LAST
6283 && slp_reduc
6284 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6286 /* In case of reduction chain, e.g.,
6287 # a1 = phi <a3, a0>
6288 a2 = operation (a1)
6289 a3 = operation (a2),
6291 we may end up with more than one vector result. Here we reduce them
6292 to one vector.
6294 The same is true for a SLP reduction, e.g.,
6295 # a1 = phi <a2, a0>
6296 # b1 = phi <b2, b0>
6297 a2 = operation (a1)
6298 b2 = operation (a2),
6300 where we can end up with more than one vector as well. We can
6301 easily accumulate vectors when the number of vector elements is
6302 a multiple of the SLP group size.
6304 The same is true if we couldn't use a single defuse cycle. */
6305 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6306 || direct_slp_reduc
6307 || (slp_reduc
6308 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6309 || ncopies > 1)
6311 gimple_seq stmts = NULL;
6312 tree single_input = reduc_inputs[0];
6313 for (k = 1; k < reduc_inputs.length (); k++)
6314 single_input = gimple_build (&stmts, code, vectype,
6315 single_input, reduc_inputs[k]);
6316 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6318 reduc_inputs.truncate (0);
6319 reduc_inputs.safe_push (single_input);
6322 tree orig_reduc_input = reduc_inputs[0];
6324 /* If this loop is an epilogue loop that can be skipped after the
6325 main loop, we can only share a reduction operation between the
6326 main loop and the epilogue if we put it at the target of the
6327 skip edge.
6329 We can still reuse accumulators if this check fails. Doing so has
6330 the minor(?) benefit of making the epilogue loop's scalar result
6331 independent of the main loop's scalar result. */
6332 bool unify_with_main_loop_p = false;
6333 if (reduc_info->reused_accumulator
6334 && loop_vinfo->skip_this_loop_edge
6335 && single_succ_p (exit_bb)
6336 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6338 unify_with_main_loop_p = true;
6340 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6341 reduc_inputs[0] = make_ssa_name (vectype);
6342 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6343 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6344 UNKNOWN_LOCATION);
6345 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6346 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6347 exit_gsi = gsi_after_labels (reduc_block);
6350 /* Shouldn't be used beyond this point. */
6351 exit_bb = nullptr;
6353 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6354 && reduc_fn != IFN_LAST)
6356 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6357 various data values where the condition matched and another vector
6358 (INDUCTION_INDEX) containing all the indexes of those matches. We
6359 need to extract the last matching index (which will be the index with
6360 highest value) and use this to index into the data vector.
6361 For the case where there were no matches, the data vector will contain
6362 all default values and the index vector will be all zeros. */
6364 /* Get various versions of the type of the vector of indexes. */
6365 tree index_vec_type = TREE_TYPE (induction_index);
6366 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6367 tree index_scalar_type = TREE_TYPE (index_vec_type);
6368 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6370 /* Get an unsigned integer version of the type of the data vector. */
6371 int scalar_precision
6372 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6373 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6374 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6375 vectype);
6377 /* First we need to create a vector (ZERO_VEC) of zeros and another
6378 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6379 can create using a MAX reduction and then expanding.
6380 In the case where the loop never made any matches, the max index will
6381 be zero. */
6383 /* Vector of {0, 0, 0,...}. */
6384 tree zero_vec = build_zero_cst (vectype);
6386 /* Find maximum value from the vector of found indexes. */
6387 tree max_index = make_ssa_name (index_scalar_type);
6388 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6389 1, induction_index);
6390 gimple_call_set_lhs (max_index_stmt, max_index);
6391 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6393 /* Vector of {max_index, max_index, max_index,...}. */
6394 tree max_index_vec = make_ssa_name (index_vec_type);
6395 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6396 max_index);
6397 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6398 max_index_vec_rhs);
6399 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6401 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6402 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6403 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6404 otherwise. Only one value should match, resulting in a vector
6405 (VEC_COND) with one data value and the rest zeros.
6406 In the case where the loop never made any matches, every index will
6407 match, resulting in a vector with all data values (which will all be
6408 the default value). */
6410 /* Compare the max index vector to the vector of found indexes to find
6411 the position of the max value. */
6412 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6413 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6414 induction_index,
6415 max_index_vec);
6416 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6418 /* Use the compare to choose either values from the data vector or
6419 zero. */
6420 tree vec_cond = make_ssa_name (vectype);
6421 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6422 vec_compare,
6423 reduc_inputs[0],
6424 zero_vec);
6425 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6427 /* Finally we need to extract the data value from the vector (VEC_COND)
6428 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6429 reduction, but because this doesn't exist, we can use a MAX reduction
6430 instead. The data value might be signed or a float so we need to cast
6431 it first.
6432 In the case where the loop never made any matches, the data values are
6433 all identical, and so will reduce down correctly. */
6435 /* Make the matched data values unsigned. */
6436 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6437 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6438 vec_cond);
6439 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6440 VIEW_CONVERT_EXPR,
6441 vec_cond_cast_rhs);
6442 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6444 /* Reduce down to a scalar value. */
6445 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6446 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6447 1, vec_cond_cast);
6448 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6449 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6451 /* Convert the reduced value back to the result type and set as the
6452 result. */
6453 gimple_seq stmts = NULL;
6454 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6455 data_reduc);
6456 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6457 scalar_results.safe_push (new_temp);
6459 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6460 && reduc_fn == IFN_LAST)
6462 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6463 idx = 0;
6464 idx_val = induction_index[0];
6465 val = data_reduc[0];
6466 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6467 if (induction_index[i] > idx_val)
6468 val = data_reduc[i], idx_val = induction_index[i];
6469 return val; */
6471 tree data_eltype = TREE_TYPE (vectype);
6472 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6473 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6474 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6475 /* Enforced by vectorizable_reduction, which ensures we have target
6476 support before allowing a conditional reduction on variable-length
6477 vectors. */
6478 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6479 tree idx_val = NULL_TREE, val = NULL_TREE;
6480 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6482 tree old_idx_val = idx_val;
6483 tree old_val = val;
6484 idx_val = make_ssa_name (idx_eltype);
6485 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6486 build3 (BIT_FIELD_REF, idx_eltype,
6487 induction_index,
6488 bitsize_int (el_size),
6489 bitsize_int (off)));
6490 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6491 val = make_ssa_name (data_eltype);
6492 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6493 build3 (BIT_FIELD_REF,
6494 data_eltype,
6495 reduc_inputs[0],
6496 bitsize_int (el_size),
6497 bitsize_int (off)));
6498 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6499 if (off != 0)
6501 tree new_idx_val = idx_val;
6502 if (off != v_size - el_size)
6504 new_idx_val = make_ssa_name (idx_eltype);
6505 epilog_stmt = gimple_build_assign (new_idx_val,
6506 MAX_EXPR, idx_val,
6507 old_idx_val);
6508 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6510 tree cond = make_ssa_name (boolean_type_node);
6511 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6512 idx_val, old_idx_val);
6513 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6514 tree new_val = make_ssa_name (data_eltype);
6515 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6516 cond, val, old_val);
6517 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6518 idx_val = new_idx_val;
6519 val = new_val;
6522 /* Convert the reduced value back to the result type and set as the
6523 result. */
6524 gimple_seq stmts = NULL;
6525 val = gimple_convert (&stmts, scalar_type, val);
6526 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6527 scalar_results.safe_push (val);
6530 /* 2.3 Create the reduction code, using one of the three schemes described
6531 above. In SLP we simply need to extract all the elements from the
6532 vector (without reducing them), so we use scalar shifts. */
6533 else if (reduc_fn != IFN_LAST && !slp_reduc)
6535 tree tmp;
6536 tree vec_elem_type;
6538 /* Case 1: Create:
6539 v_out2 = reduc_expr <v_out1> */
6541 if (dump_enabled_p ())
6542 dump_printf_loc (MSG_NOTE, vect_location,
6543 "Reduce using direct vector reduction.\n");
6545 gimple_seq stmts = NULL;
6546 vec_elem_type = TREE_TYPE (vectype);
6547 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6548 vec_elem_type, reduc_inputs[0]);
6549 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6550 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6552 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6553 && induc_val)
6555 /* Earlier we set the initial value to be a vector if induc_val
6556 values. Check the result and if it is induc_val then replace
6557 with the original initial value, unless induc_val is
6558 the same as initial_def already. */
6559 tree zcompare = make_ssa_name (boolean_type_node);
6560 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6561 new_temp, induc_val);
6562 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6563 tree initial_def = reduc_info->reduc_initial_values[0];
6564 tmp = make_ssa_name (new_scalar_dest);
6565 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6566 initial_def, new_temp);
6567 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6568 new_temp = tmp;
6571 scalar_results.safe_push (new_temp);
6573 else if (direct_slp_reduc)
6575 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6576 with the elements for other SLP statements replaced with the
6577 neutral value. We can then do a normal reduction on each vector. */
6579 /* Enforced by vectorizable_reduction. */
6580 gcc_assert (reduc_inputs.length () == 1);
6581 gcc_assert (pow2p_hwi (group_size));
6583 gimple_seq seq = NULL;
6585 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6586 and the same element size as VECTYPE. */
6587 tree index = build_index_vector (vectype, 0, 1);
6588 tree index_type = TREE_TYPE (index);
6589 tree index_elt_type = TREE_TYPE (index_type);
6590 tree mask_type = truth_type_for (index_type);
6592 /* Create a vector that, for each element, identifies which of
6593 the REDUC_GROUP_SIZE results should use it. */
6594 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6595 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6596 build_vector_from_val (index_type, index_mask));
6598 /* Get a neutral vector value. This is simply a splat of the neutral
6599 scalar value if we have one, otherwise the initial scalar value
6600 is itself a neutral value. */
6601 tree vector_identity = NULL_TREE;
6602 tree neutral_op = NULL_TREE;
6603 if (slp_node)
6605 tree initial_value = NULL_TREE;
6606 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6607 initial_value = reduc_info->reduc_initial_values[0];
6608 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6609 initial_value, false);
6611 if (neutral_op)
6612 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6613 neutral_op);
6614 for (unsigned int i = 0; i < group_size; ++i)
6616 /* If there's no univeral neutral value, we can use the
6617 initial scalar value from the original PHI. This is used
6618 for MIN and MAX reduction, for example. */
6619 if (!neutral_op)
6621 tree scalar_value = reduc_info->reduc_initial_values[i];
6622 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6623 scalar_value);
6624 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6625 scalar_value);
6628 /* Calculate the equivalent of:
6630 sel[j] = (index[j] == i);
6632 which selects the elements of REDUC_INPUTS[0] that should
6633 be included in the result. */
6634 tree compare_val = build_int_cst (index_elt_type, i);
6635 compare_val = build_vector_from_val (index_type, compare_val);
6636 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6637 index, compare_val);
6639 /* Calculate the equivalent of:
6641 vec = seq ? reduc_inputs[0] : vector_identity;
6643 VEC is now suitable for a full vector reduction. */
6644 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6645 sel, reduc_inputs[0], vector_identity);
6647 /* Do the reduction and convert it to the appropriate type. */
6648 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6649 TREE_TYPE (vectype), vec);
6650 scalar = gimple_convert (&seq, scalar_type, scalar);
6651 scalar_results.safe_push (scalar);
6653 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6655 else
6657 bool reduce_with_shift;
6658 tree vec_temp;
6660 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6662 /* See if the target wants to do the final (shift) reduction
6663 in a vector mode of smaller size and first reduce upper/lower
6664 halves against each other. */
6665 enum machine_mode mode1 = mode;
6666 tree stype = TREE_TYPE (vectype);
6667 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6668 unsigned nunits1 = nunits;
6669 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6670 && reduc_inputs.length () == 1)
6672 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6673 /* For SLP reductions we have to make sure lanes match up, but
6674 since we're doing individual element final reduction reducing
6675 vector width here is even more important.
6676 ??? We can also separate lanes with permutes, for the common
6677 case of power-of-two group-size odd/even extracts would work. */
6678 if (slp_reduc && nunits != nunits1)
6680 nunits1 = least_common_multiple (nunits1, group_size);
6681 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6684 if (!slp_reduc
6685 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6686 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6688 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6689 stype, nunits1);
6690 reduce_with_shift = have_whole_vector_shift (mode1);
6691 if (!VECTOR_MODE_P (mode1)
6692 || !directly_supported_p (code, vectype1))
6693 reduce_with_shift = false;
6695 /* First reduce the vector to the desired vector size we should
6696 do shift reduction on by combining upper and lower halves. */
6697 gimple_seq stmts = NULL;
6698 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6699 code, &stmts);
6700 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6701 reduc_inputs[0] = new_temp;
6703 if (reduce_with_shift && !slp_reduc)
6705 int element_bitsize = tree_to_uhwi (bitsize);
6706 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6707 for variable-length vectors and also requires direct target support
6708 for loop reductions. */
6709 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6710 int nelements = vec_size_in_bits / element_bitsize;
6711 vec_perm_builder sel;
6712 vec_perm_indices indices;
6714 int elt_offset;
6716 tree zero_vec = build_zero_cst (vectype1);
6717 /* Case 2: Create:
6718 for (offset = nelements/2; offset >= 1; offset/=2)
6720 Create: va' = vec_shift <va, offset>
6721 Create: va = vop <va, va'>
6722 } */
6724 tree rhs;
6726 if (dump_enabled_p ())
6727 dump_printf_loc (MSG_NOTE, vect_location,
6728 "Reduce using vector shifts\n");
6730 gimple_seq stmts = NULL;
6731 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6732 for (elt_offset = nelements / 2;
6733 elt_offset >= 1;
6734 elt_offset /= 2)
6736 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6737 indices.new_vector (sel, 2, nelements);
6738 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6739 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6740 new_temp, zero_vec, mask);
6741 new_temp = gimple_build (&stmts, code,
6742 vectype1, new_name, new_temp);
6744 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6746 /* 2.4 Extract the final scalar result. Create:
6747 s_out3 = extract_field <v_out2, bitpos> */
6749 if (dump_enabled_p ())
6750 dump_printf_loc (MSG_NOTE, vect_location,
6751 "extract scalar result\n");
6753 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6754 bitsize, bitsize_zero_node);
6755 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6756 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6757 gimple_assign_set_lhs (epilog_stmt, new_temp);
6758 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6759 scalar_results.safe_push (new_temp);
6761 else
6763 /* Case 3: Create:
6764 s = extract_field <v_out2, 0>
6765 for (offset = element_size;
6766 offset < vector_size;
6767 offset += element_size;)
6769 Create: s' = extract_field <v_out2, offset>
6770 Create: s = op <s, s'> // For non SLP cases
6771 } */
6773 if (dump_enabled_p ())
6774 dump_printf_loc (MSG_NOTE, vect_location,
6775 "Reduce using scalar code.\n");
6777 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6778 int element_bitsize = tree_to_uhwi (bitsize);
6779 tree compute_type = TREE_TYPE (vectype);
6780 gimple_seq stmts = NULL;
6781 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6783 int bit_offset;
6784 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6785 vec_temp, bitsize, bitsize_zero_node);
6787 /* In SLP we don't need to apply reduction operation, so we just
6788 collect s' values in SCALAR_RESULTS. */
6789 if (slp_reduc)
6790 scalar_results.safe_push (new_temp);
6792 for (bit_offset = element_bitsize;
6793 bit_offset < vec_size_in_bits;
6794 bit_offset += element_bitsize)
6796 tree bitpos = bitsize_int (bit_offset);
6797 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6798 compute_type, vec_temp,
6799 bitsize, bitpos);
6800 if (slp_reduc)
6802 /* In SLP we don't need to apply reduction operation, so
6803 we just collect s' values in SCALAR_RESULTS. */
6804 new_temp = new_name;
6805 scalar_results.safe_push (new_name);
6807 else
6808 new_temp = gimple_build (&stmts, code, compute_type,
6809 new_name, new_temp);
6813 /* The only case where we need to reduce scalar results in SLP, is
6814 unrolling. If the size of SCALAR_RESULTS is greater than
6815 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6816 REDUC_GROUP_SIZE. */
6817 if (slp_reduc)
6819 tree res, first_res, new_res;
6821 /* Reduce multiple scalar results in case of SLP unrolling. */
6822 for (j = group_size; scalar_results.iterate (j, &res);
6823 j++)
6825 first_res = scalar_results[j % group_size];
6826 new_res = gimple_build (&stmts, code, compute_type,
6827 first_res, res);
6828 scalar_results[j % group_size] = new_res;
6830 scalar_results.truncate (group_size);
6831 for (k = 0; k < group_size; k++)
6832 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6833 scalar_results[k]);
6835 else
6837 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6838 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6839 scalar_results.safe_push (new_temp);
6842 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6845 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6846 && induc_val)
6848 /* Earlier we set the initial value to be a vector if induc_val
6849 values. Check the result and if it is induc_val then replace
6850 with the original initial value, unless induc_val is
6851 the same as initial_def already. */
6852 tree zcompare = make_ssa_name (boolean_type_node);
6853 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6854 induc_val);
6855 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6856 tree initial_def = reduc_info->reduc_initial_values[0];
6857 tree tmp = make_ssa_name (new_scalar_dest);
6858 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6859 initial_def, new_temp);
6860 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6861 scalar_results[0] = tmp;
6865 /* 2.5 Adjust the final result by the initial value of the reduction
6866 variable. (When such adjustment is not needed, then
6867 'adjustment_def' is zero). For example, if code is PLUS we create:
6868 new_temp = loop_exit_def + adjustment_def */
6870 if (adjustment_def)
6872 gcc_assert (!slp_reduc);
6873 gimple_seq stmts = NULL;
6874 if (double_reduc)
6876 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6877 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6878 new_temp = gimple_build (&stmts, code, vectype,
6879 reduc_inputs[0], adjustment_def);
6881 else
6883 new_temp = scalar_results[0];
6884 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6885 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6886 adjustment_def);
6887 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6888 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6889 new_temp, adjustment_def);
6890 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6893 epilog_stmt = gimple_seq_last_stmt (stmts);
6894 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6895 scalar_results[0] = new_temp;
6898 /* Record this operation if it could be reused by the epilogue loop. */
6899 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6900 && reduc_inputs.length () == 1)
6901 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6902 { orig_reduc_input, reduc_info });
6904 if (double_reduc)
6905 loop = outer_loop;
6907 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6908 phis with new adjusted scalar results, i.e., replace use <s_out0>
6909 with use <s_out4>.
6911 Transform:
6912 loop_exit:
6913 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6914 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6915 v_out2 = reduce <v_out1>
6916 s_out3 = extract_field <v_out2, 0>
6917 s_out4 = adjust_result <s_out3>
6918 use <s_out0>
6919 use <s_out0>
6921 into:
6923 loop_exit:
6924 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6925 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6926 v_out2 = reduce <v_out1>
6927 s_out3 = extract_field <v_out2, 0>
6928 s_out4 = adjust_result <s_out3>
6929 use <s_out4>
6930 use <s_out4> */
6932 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6933 for (k = 0; k < live_out_stmts.size (); k++)
6935 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6936 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6938 phis.create (3);
6939 /* Find the loop-closed-use at the loop exit of the original scalar
6940 result. (The reduction result is expected to have two immediate uses,
6941 one at the latch block, and one at the loop exit). For double
6942 reductions we are looking for exit phis of the outer loop. */
6943 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6945 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6947 if (!is_gimple_debug (USE_STMT (use_p)))
6948 phis.safe_push (USE_STMT (use_p));
6950 else
6952 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6954 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6956 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6958 if (!flow_bb_inside_loop_p (loop,
6959 gimple_bb (USE_STMT (phi_use_p)))
6960 && !is_gimple_debug (USE_STMT (phi_use_p)))
6961 phis.safe_push (USE_STMT (phi_use_p));
6967 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6969 /* Replace the uses: */
6970 orig_name = PHI_RESULT (exit_phi);
6972 /* Look for a single use at the target of the skip edge. */
6973 if (unify_with_main_loop_p)
6975 use_operand_p use_p;
6976 gimple *user;
6977 if (!single_imm_use (orig_name, &use_p, &user))
6978 gcc_unreachable ();
6979 orig_name = gimple_get_lhs (user);
6982 scalar_result = scalar_results[k];
6983 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6985 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6986 SET_USE (use_p, scalar_result);
6987 update_stmt (use_stmt);
6991 phis.release ();
6995 /* Return a vector of type VECTYPE that is equal to the vector select
6996 operation "MASK ? VEC : IDENTITY". Insert the select statements
6997 before GSI. */
6999 static tree
7000 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
7001 tree vec, tree identity)
7003 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
7004 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
7005 mask, vec, identity);
7006 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7007 return cond;
7010 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
7011 order, starting with LHS. Insert the extraction statements before GSI and
7012 associate the new scalar SSA names with variable SCALAR_DEST.
7013 If MASK is nonzero mask the input and then operate on it unconditionally.
7014 Return the SSA name for the result. */
7016 static tree
7017 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
7018 tree_code code, tree lhs, tree vector_rhs,
7019 tree mask)
7021 tree vectype = TREE_TYPE (vector_rhs);
7022 tree scalar_type = TREE_TYPE (vectype);
7023 tree bitsize = TYPE_SIZE (scalar_type);
7024 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
7025 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
7027 /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
7028 to perform an unconditional element-wise reduction of it. */
7029 if (mask)
7031 tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
7032 "masked_vector_rhs");
7033 tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
7034 false);
7035 tree vector_identity = build_vector_from_val (vectype, neutral_op);
7036 gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
7037 mask, vector_rhs, vector_identity);
7038 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7039 vector_rhs = masked_vector_rhs;
7042 for (unsigned HOST_WIDE_INT bit_offset = 0;
7043 bit_offset < vec_size_in_bits;
7044 bit_offset += element_bitsize)
7046 tree bitpos = bitsize_int (bit_offset);
7047 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
7048 bitsize, bitpos);
7050 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
7051 rhs = make_ssa_name (scalar_dest, stmt);
7052 gimple_assign_set_lhs (stmt, rhs);
7053 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7055 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
7056 tree new_name = make_ssa_name (scalar_dest, stmt);
7057 gimple_assign_set_lhs (stmt, new_name);
7058 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
7059 lhs = new_name;
7061 return lhs;
7064 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7065 type of the vector input. */
7067 static internal_fn
7068 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7070 internal_fn mask_reduc_fn;
7071 internal_fn mask_len_reduc_fn;
7073 switch (reduc_fn)
7075 case IFN_FOLD_LEFT_PLUS:
7076 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7077 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7078 break;
7080 default:
7081 return IFN_LAST;
7084 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7085 OPTIMIZE_FOR_SPEED))
7086 return mask_reduc_fn;
7087 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7088 OPTIMIZE_FOR_SPEED))
7089 return mask_len_reduc_fn;
7090 return IFN_LAST;
7093 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7094 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7095 statement. CODE is the operation performed by STMT_INFO and OPS are
7096 its scalar operands. REDUC_INDEX is the index of the operand in
7097 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7098 implements in-order reduction, or IFN_LAST if we should open-code it.
7099 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7100 that should be used to control the operation in a fully-masked loop. */
7102 static bool
7103 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7104 stmt_vec_info stmt_info,
7105 gimple_stmt_iterator *gsi,
7106 gimple **vec_stmt, slp_tree slp_node,
7107 gimple *reduc_def_stmt,
7108 code_helper code, internal_fn reduc_fn,
7109 tree *ops, int num_ops, tree vectype_in,
7110 int reduc_index, vec_loop_masks *masks,
7111 vec_loop_lens *lens)
7113 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7114 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7115 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7117 int ncopies;
7118 if (slp_node)
7119 ncopies = 1;
7120 else
7121 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7123 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7124 gcc_assert (ncopies == 1);
7126 bool is_cond_op = false;
7127 if (!code.is_tree_code ())
7129 code = conditional_internal_fn_code (internal_fn (code));
7130 gcc_assert (code != ERROR_MARK);
7131 is_cond_op = true;
7134 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7136 if (slp_node)
7138 if (is_cond_op)
7140 if (dump_enabled_p ())
7141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7142 "fold-left reduction on SLP not supported.\n");
7143 return false;
7146 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7147 TYPE_VECTOR_SUBPARTS (vectype_in)));
7150 /* The operands either come from a binary operation or an IFN_COND operation.
7151 The former is a gimple assign with binary rhs and the latter is a
7152 gimple call with four arguments. */
7153 gcc_assert (num_ops == 2 || num_ops == 4);
7154 tree op0, opmask;
7155 if (!is_cond_op)
7156 op0 = ops[1 - reduc_index];
7157 else
7159 op0 = ops[2 + (1 - reduc_index)];
7160 opmask = ops[0];
7161 gcc_assert (!slp_node);
7164 int group_size = 1;
7165 stmt_vec_info scalar_dest_def_info;
7166 auto_vec<tree> vec_oprnds0, vec_opmask;
7167 if (slp_node)
7169 auto_vec<vec<tree> > vec_defs (2);
7170 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7171 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7172 vec_defs[0].release ();
7173 vec_defs[1].release ();
7174 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7175 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7177 else
7179 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7180 op0, &vec_oprnds0);
7181 scalar_dest_def_info = stmt_info;
7183 /* For an IFN_COND_OP we also need the vector mask operand. */
7184 if (is_cond_op)
7185 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7186 opmask, &vec_opmask);
7189 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7190 tree scalar_dest = gimple_get_lhs (sdef);
7191 tree scalar_type = TREE_TYPE (scalar_dest);
7192 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7194 int vec_num = vec_oprnds0.length ();
7195 gcc_assert (vec_num == 1 || slp_node);
7196 tree vec_elem_type = TREE_TYPE (vectype_out);
7197 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7199 tree vector_identity = NULL_TREE;
7200 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7202 vector_identity = build_zero_cst (vectype_out);
7203 if (!HONOR_SIGNED_ZEROS (vectype_out))
7205 else
7207 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7208 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7209 vector_identity);
7213 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7214 int i;
7215 tree def0;
7216 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7218 gimple *new_stmt;
7219 tree mask = NULL_TREE;
7220 tree len = NULL_TREE;
7221 tree bias = NULL_TREE;
7222 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7223 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7224 else if (is_cond_op)
7225 mask = vec_opmask[0];
7226 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7228 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7229 i, 1);
7230 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7231 bias = build_int_cst (intQI_type_node, biasval);
7232 if (!is_cond_op)
7233 mask = build_minus_one_cst (truth_type_for (vectype_in));
7236 /* Handle MINUS by adding the negative. */
7237 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7239 tree negated = make_ssa_name (vectype_out);
7240 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7241 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7242 def0 = negated;
7245 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7246 && mask && mask_reduc_fn == IFN_LAST)
7247 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7248 vector_identity);
7250 /* On the first iteration the input is simply the scalar phi
7251 result, and for subsequent iterations it is the output of
7252 the preceding operation. */
7253 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7255 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7256 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7257 def0, mask, len, bias);
7258 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7259 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7260 def0, mask);
7261 else
7262 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7263 def0);
7264 /* For chained SLP reductions the output of the previous reduction
7265 operation serves as the input of the next. For the final statement
7266 the output cannot be a temporary - we reuse the original
7267 scalar destination of the last statement. */
7268 if (i != vec_num - 1)
7270 gimple_set_lhs (new_stmt, scalar_dest_var);
7271 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7272 gimple_set_lhs (new_stmt, reduc_var);
7275 else
7277 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7278 tree_code (code), reduc_var, def0,
7279 mask);
7280 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7281 /* Remove the statement, so that we can use the same code paths
7282 as for statements that we've just created. */
7283 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7284 gsi_remove (&tmp_gsi, true);
7287 if (i == vec_num - 1)
7289 gimple_set_lhs (new_stmt, scalar_dest);
7290 vect_finish_replace_stmt (loop_vinfo,
7291 scalar_dest_def_info,
7292 new_stmt);
7294 else
7295 vect_finish_stmt_generation (loop_vinfo,
7296 scalar_dest_def_info,
7297 new_stmt, gsi);
7299 if (slp_node)
7300 slp_node->push_vec_def (new_stmt);
7301 else
7303 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7304 *vec_stmt = new_stmt;
7308 return true;
7311 /* Function is_nonwrapping_integer_induction.
7313 Check if STMT_VINO (which is part of loop LOOP) both increments and
7314 does not cause overflow. */
7316 static bool
7317 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7319 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7320 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7321 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7322 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7323 widest_int ni, max_loop_value, lhs_max;
7324 wi::overflow_type overflow = wi::OVF_NONE;
7326 /* Make sure the loop is integer based. */
7327 if (TREE_CODE (base) != INTEGER_CST
7328 || TREE_CODE (step) != INTEGER_CST)
7329 return false;
7331 /* Check that the max size of the loop will not wrap. */
7333 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7334 return true;
7336 if (! max_stmt_executions (loop, &ni))
7337 return false;
7339 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7340 &overflow);
7341 if (overflow)
7342 return false;
7344 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7345 TYPE_SIGN (lhs_type), &overflow);
7346 if (overflow)
7347 return false;
7349 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7350 <= TYPE_PRECISION (lhs_type));
7353 /* Check if masking can be supported by inserting a conditional expression.
7354 CODE is the code for the operation. COND_FN is the conditional internal
7355 function, if it exists. VECTYPE_IN is the type of the vector input. */
7356 static bool
7357 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7358 tree vectype_in)
7360 if (cond_fn != IFN_LAST
7361 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7362 OPTIMIZE_FOR_SPEED))
7363 return false;
7365 if (code.is_tree_code ())
7366 switch (tree_code (code))
7368 case DOT_PROD_EXPR:
7369 case SAD_EXPR:
7370 return true;
7372 default:
7373 break;
7375 return false;
7378 /* Insert a conditional expression to enable masked vectorization. CODE is the
7379 code for the operation. VOP is the array of operands. MASK is the loop
7380 mask. GSI is a statement iterator used to place the new conditional
7381 expression. */
7382 static void
7383 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7384 gimple_stmt_iterator *gsi)
7386 switch (tree_code (code))
7388 case DOT_PROD_EXPR:
7390 tree vectype = TREE_TYPE (vop[1]);
7391 tree zero = build_zero_cst (vectype);
7392 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7393 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7394 mask, vop[1], zero);
7395 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7396 vop[1] = masked_op1;
7397 break;
7400 case SAD_EXPR:
7402 tree vectype = TREE_TYPE (vop[1]);
7403 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7404 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7405 mask, vop[1], vop[0]);
7406 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7407 vop[1] = masked_op1;
7408 break;
7411 default:
7412 gcc_unreachable ();
7416 /* Function vectorizable_reduction.
7418 Check if STMT_INFO performs a reduction operation that can be vectorized.
7419 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7420 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7421 Return true if STMT_INFO is vectorizable in this way.
7423 This function also handles reduction idioms (patterns) that have been
7424 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7425 may be of this form:
7426 X = pattern_expr (arg0, arg1, ..., X)
7427 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7428 sequence that had been detected and replaced by the pattern-stmt
7429 (STMT_INFO).
7431 This function also handles reduction of condition expressions, for example:
7432 for (int i = 0; i < N; i++)
7433 if (a[i] < value)
7434 last = a[i];
7435 This is handled by vectorising the loop and creating an additional vector
7436 containing the loop indexes for which "a[i] < value" was true. In the
7437 function epilogue this is reduced to a single max value and then used to
7438 index into the vector of results.
7440 In some cases of reduction patterns, the type of the reduction variable X is
7441 different than the type of the other arguments of STMT_INFO.
7442 In such cases, the vectype that is used when transforming STMT_INFO into
7443 a vector stmt is different than the vectype that is used to determine the
7444 vectorization factor, because it consists of a different number of elements
7445 than the actual number of elements that are being operated upon in parallel.
7447 For example, consider an accumulation of shorts into an int accumulator.
7448 On some targets it's possible to vectorize this pattern operating on 8
7449 shorts at a time (hence, the vectype for purposes of determining the
7450 vectorization factor should be V8HI); on the other hand, the vectype that
7451 is used to create the vector form is actually V4SI (the type of the result).
7453 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7454 indicates what is the actual level of parallelism (V8HI in the example), so
7455 that the right vectorization factor would be derived. This vectype
7456 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7457 be used to create the vectorized stmt. The right vectype for the vectorized
7458 stmt is obtained from the type of the result X:
7459 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7461 This means that, contrary to "regular" reductions (or "regular" stmts in
7462 general), the following equation:
7463 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7464 does *NOT* necessarily hold for reduction patterns. */
7466 bool
7467 vectorizable_reduction (loop_vec_info loop_vinfo,
7468 stmt_vec_info stmt_info, slp_tree slp_node,
7469 slp_instance slp_node_instance,
7470 stmt_vector_for_cost *cost_vec)
7472 tree vectype_in = NULL_TREE;
7473 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7474 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7475 stmt_vec_info cond_stmt_vinfo = NULL;
7476 int i;
7477 int ncopies;
7478 bool single_defuse_cycle = false;
7479 bool nested_cycle = false;
7480 bool double_reduc = false;
7481 int vec_num;
7482 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7483 tree cond_reduc_val = NULL_TREE;
7485 /* Make sure it was already recognized as a reduction computation. */
7486 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7487 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7488 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7489 return false;
7491 /* The stmt we store reduction analysis meta on. */
7492 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7493 reduc_info->is_reduc_info = true;
7495 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7497 if (is_a <gphi *> (stmt_info->stmt))
7499 if (slp_node)
7501 /* We eventually need to set a vector type on invariant
7502 arguments. */
7503 unsigned j;
7504 slp_tree child;
7505 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7506 if (!vect_maybe_update_slp_op_vectype
7507 (child, SLP_TREE_VECTYPE (slp_node)))
7509 if (dump_enabled_p ())
7510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7511 "incompatible vector types for "
7512 "invariants\n");
7513 return false;
7516 /* Analysis for double-reduction is done on the outer
7517 loop PHI, nested cycles have no further restrictions. */
7518 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7520 else
7521 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7522 return true;
7525 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7526 stmt_vec_info phi_info = stmt_info;
7527 if (!is_a <gphi *> (stmt_info->stmt))
7529 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7530 return true;
7532 if (slp_node)
7534 slp_node_instance->reduc_phis = slp_node;
7535 /* ??? We're leaving slp_node to point to the PHIs, we only
7536 need it to get at the number of vector stmts which wasn't
7537 yet initialized for the instance root. */
7539 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7541 use_operand_p use_p;
7542 gimple *use_stmt;
7543 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7544 &use_p, &use_stmt);
7545 gcc_assert (res);
7546 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7549 /* PHIs should not participate in patterns. */
7550 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7551 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7553 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7554 and compute the reduction chain length. Discover the real
7555 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7556 tree reduc_def
7557 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7558 loop_latch_edge
7559 (gimple_bb (reduc_def_phi)->loop_father));
7560 unsigned reduc_chain_length = 0;
7561 bool only_slp_reduc_chain = true;
7562 stmt_info = NULL;
7563 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7564 while (reduc_def != PHI_RESULT (reduc_def_phi))
7566 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7567 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7568 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7570 if (dump_enabled_p ())
7571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7572 "reduction chain broken by patterns.\n");
7573 return false;
7575 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7576 only_slp_reduc_chain = false;
7577 /* For epilogue generation live members of the chain need
7578 to point back to the PHI via their original stmt for
7579 info_for_reduction to work. For SLP we need to look at
7580 all lanes here - even though we only will vectorize from
7581 the SLP node with live lane zero the other live lanes also
7582 need to be identified as part of a reduction to be able
7583 to skip code generation for them. */
7584 if (slp_for_stmt_info)
7586 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7587 if (STMT_VINFO_LIVE_P (s))
7588 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7590 else if (STMT_VINFO_LIVE_P (vdef))
7591 STMT_VINFO_REDUC_DEF (def) = phi_info;
7592 gimple_match_op op;
7593 if (!gimple_extract_op (vdef->stmt, &op))
7595 if (dump_enabled_p ())
7596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7597 "reduction chain includes unsupported"
7598 " statement type.\n");
7599 return false;
7601 if (CONVERT_EXPR_CODE_P (op.code))
7603 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7605 if (dump_enabled_p ())
7606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7607 "conversion in the reduction chain.\n");
7608 return false;
7611 else if (!stmt_info)
7612 /* First non-conversion stmt. */
7613 stmt_info = vdef;
7614 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7615 reduc_chain_length++;
7616 if (!stmt_info && slp_node)
7617 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7619 /* PHIs should not participate in patterns. */
7620 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7622 if (nested_in_vect_loop_p (loop, stmt_info))
7624 loop = loop->inner;
7625 nested_cycle = true;
7628 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7629 element. */
7630 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7632 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7633 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7635 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7636 gcc_assert (slp_node
7637 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7639 /* 1. Is vectorizable reduction? */
7640 /* Not supportable if the reduction variable is used in the loop, unless
7641 it's a reduction chain. */
7642 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7643 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7644 return false;
7646 /* Reductions that are not used even in an enclosing outer-loop,
7647 are expected to be "live" (used out of the loop). */
7648 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7649 && !STMT_VINFO_LIVE_P (stmt_info))
7650 return false;
7652 /* 2. Has this been recognized as a reduction pattern?
7654 Check if STMT represents a pattern that has been recognized
7655 in earlier analysis stages. For stmts that represent a pattern,
7656 the STMT_VINFO_RELATED_STMT field records the last stmt in
7657 the original sequence that constitutes the pattern. */
7659 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7660 if (orig_stmt_info)
7662 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7663 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7666 /* 3. Check the operands of the operation. The first operands are defined
7667 inside the loop body. The last operand is the reduction variable,
7668 which is defined by the loop-header-phi. */
7670 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7671 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7672 gimple_match_op op;
7673 if (!gimple_extract_op (stmt_info->stmt, &op))
7674 gcc_unreachable ();
7675 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7676 || op.code == WIDEN_SUM_EXPR
7677 || op.code == SAD_EXPR);
7679 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7680 && !SCALAR_FLOAT_TYPE_P (op.type))
7681 return false;
7683 /* Do not try to vectorize bit-precision reductions. */
7684 if (!type_has_mode_precision_p (op.type))
7685 return false;
7687 /* For lane-reducing ops we're reducing the number of reduction PHIs
7688 which means the only use of that may be in the lane-reducing operation. */
7689 if (lane_reduc_code_p
7690 && reduc_chain_length != 1
7691 && !only_slp_reduc_chain)
7693 if (dump_enabled_p ())
7694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7695 "lane-reducing reduction with extra stmts.\n");
7696 return false;
7699 /* All uses but the last are expected to be defined in the loop.
7700 The last use is the reduction variable. In case of nested cycle this
7701 assumption is not true: we use reduc_index to record the index of the
7702 reduction variable. */
7703 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7704 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7705 /* We need to skip an extra operand for COND_EXPRs with embedded
7706 comparison. */
7707 unsigned opno_adjust = 0;
7708 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7709 opno_adjust = 1;
7710 for (i = 0; i < (int) op.num_ops; i++)
7712 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7713 if (i == 0 && op.code == COND_EXPR)
7714 continue;
7716 stmt_vec_info def_stmt_info;
7717 enum vect_def_type dt;
7718 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7719 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7720 &vectype_op[i], &def_stmt_info))
7722 if (dump_enabled_p ())
7723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7724 "use not simple.\n");
7725 return false;
7727 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7728 continue;
7730 /* For an IFN_COND_OP we might hit the reduction definition operand
7731 twice (once as definition, once as else). */
7732 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7733 continue;
7735 /* There should be only one cycle def in the stmt, the one
7736 leading to reduc_def. */
7737 if (VECTORIZABLE_CYCLE_DEF (dt))
7738 return false;
7740 if (!vectype_op[i])
7741 vectype_op[i]
7742 = get_vectype_for_scalar_type (loop_vinfo,
7743 TREE_TYPE (op.ops[i]), slp_op[i]);
7745 /* To properly compute ncopies we are interested in the widest
7746 non-reduction input type in case we're looking at a widening
7747 accumulation that we later handle in vect_transform_reduction. */
7748 if (lane_reduc_code_p
7749 && vectype_op[i]
7750 && (!vectype_in
7751 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7752 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7753 vectype_in = vectype_op[i];
7755 if (op.code == COND_EXPR)
7757 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7758 if (dt == vect_constant_def)
7760 cond_reduc_dt = dt;
7761 cond_reduc_val = op.ops[i];
7763 if (dt == vect_induction_def
7764 && def_stmt_info
7765 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7767 cond_reduc_dt = dt;
7768 cond_stmt_vinfo = def_stmt_info;
7772 if (!vectype_in)
7773 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7774 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7776 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7777 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7778 /* If we have a condition reduction, see if we can simplify it further. */
7779 if (v_reduc_type == COND_REDUCTION)
7781 if (slp_node)
7782 return false;
7784 /* When the condition uses the reduction value in the condition, fail. */
7785 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7787 if (dump_enabled_p ())
7788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7789 "condition depends on previous iteration\n");
7790 return false;
7793 if (reduc_chain_length == 1
7794 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7795 OPTIMIZE_FOR_SPEED)
7796 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7797 vectype_in,
7798 OPTIMIZE_FOR_SPEED)))
7800 if (dump_enabled_p ())
7801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7802 "optimizing condition reduction with"
7803 " FOLD_EXTRACT_LAST.\n");
7804 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7806 else if (cond_reduc_dt == vect_induction_def)
7808 tree base
7809 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7810 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7812 gcc_assert (TREE_CODE (base) == INTEGER_CST
7813 && TREE_CODE (step) == INTEGER_CST);
7814 cond_reduc_val = NULL_TREE;
7815 enum tree_code cond_reduc_op_code = ERROR_MARK;
7816 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7817 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7819 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7820 above base; punt if base is the minimum value of the type for
7821 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7822 else if (tree_int_cst_sgn (step) == -1)
7824 cond_reduc_op_code = MIN_EXPR;
7825 if (tree_int_cst_sgn (base) == -1)
7826 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7827 else if (tree_int_cst_lt (base,
7828 TYPE_MAX_VALUE (TREE_TYPE (base))))
7829 cond_reduc_val
7830 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7832 else
7834 cond_reduc_op_code = MAX_EXPR;
7835 if (tree_int_cst_sgn (base) == 1)
7836 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7837 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7838 base))
7839 cond_reduc_val
7840 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7842 if (cond_reduc_val)
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_NOTE, vect_location,
7846 "condition expression based on "
7847 "integer induction.\n");
7848 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7849 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7850 = cond_reduc_val;
7851 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7854 else if (cond_reduc_dt == vect_constant_def)
7856 enum vect_def_type cond_initial_dt;
7857 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7858 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7859 if (cond_initial_dt == vect_constant_def
7860 && types_compatible_p (TREE_TYPE (cond_initial_val),
7861 TREE_TYPE (cond_reduc_val)))
7863 tree e = fold_binary (LE_EXPR, boolean_type_node,
7864 cond_initial_val, cond_reduc_val);
7865 if (e && (integer_onep (e) || integer_zerop (e)))
7867 if (dump_enabled_p ())
7868 dump_printf_loc (MSG_NOTE, vect_location,
7869 "condition expression based on "
7870 "compile time constant.\n");
7871 /* Record reduction code at analysis stage. */
7872 STMT_VINFO_REDUC_CODE (reduc_info)
7873 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7874 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7880 if (STMT_VINFO_LIVE_P (phi_info))
7881 return false;
7883 if (slp_node)
7884 ncopies = 1;
7885 else
7886 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7888 gcc_assert (ncopies >= 1);
7890 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7892 if (nested_cycle)
7894 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7895 == vect_double_reduction_def);
7896 double_reduc = true;
7899 /* 4.2. Check support for the epilog operation.
7901 If STMT represents a reduction pattern, then the type of the
7902 reduction variable may be different than the type of the rest
7903 of the arguments. For example, consider the case of accumulation
7904 of shorts into an int accumulator; The original code:
7905 S1: int_a = (int) short_a;
7906 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7908 was replaced with:
7909 STMT: int_acc = widen_sum <short_a, int_acc>
7911 This means that:
7912 1. The tree-code that is used to create the vector operation in the
7913 epilog code (that reduces the partial results) is not the
7914 tree-code of STMT, but is rather the tree-code of the original
7915 stmt from the pattern that STMT is replacing. I.e, in the example
7916 above we want to use 'widen_sum' in the loop, but 'plus' in the
7917 epilog.
7918 2. The type (mode) we use to check available target support
7919 for the vector operation to be created in the *epilog*, is
7920 determined by the type of the reduction variable (in the example
7921 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7922 However the type (mode) we use to check available target support
7923 for the vector operation to be created *inside the loop*, is
7924 determined by the type of the other arguments to STMT (in the
7925 example we'd check this: optab_handler (widen_sum_optab,
7926 vect_short_mode)).
7928 This is contrary to "regular" reductions, in which the types of all
7929 the arguments are the same as the type of the reduction variable.
7930 For "regular" reductions we can therefore use the same vector type
7931 (and also the same tree-code) when generating the epilog code and
7932 when generating the code inside the loop. */
7934 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7936 /* If conversion might have created a conditional operation like
7937 IFN_COND_ADD already. Use the internal code for the following checks. */
7938 if (orig_code.is_internal_fn ())
7940 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7941 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7944 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7946 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7947 if (reduction_type == TREE_CODE_REDUCTION)
7949 /* Check whether it's ok to change the order of the computation.
7950 Generally, when vectorizing a reduction we change the order of the
7951 computation. This may change the behavior of the program in some
7952 cases, so we need to check that this is ok. One exception is when
7953 vectorizing an outer-loop: the inner-loop is executed sequentially,
7954 and therefore vectorizing reductions in the inner-loop during
7955 outer-loop vectorization is safe. Likewise when we are vectorizing
7956 a series of reductions using SLP and the VF is one the reductions
7957 are performed in scalar order. */
7958 if (slp_node
7959 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7960 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7962 else if (needs_fold_left_reduction_p (op.type, orig_code))
7964 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7965 is not directy used in stmt. */
7966 if (!only_slp_reduc_chain
7967 && reduc_chain_length != 1)
7969 if (dump_enabled_p ())
7970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7971 "in-order reduction chain without SLP.\n");
7972 return false;
7974 STMT_VINFO_REDUC_TYPE (reduc_info)
7975 = reduction_type = FOLD_LEFT_REDUCTION;
7977 else if (!commutative_binary_op_p (orig_code, op.type)
7978 || !associative_binary_op_p (orig_code, op.type))
7980 if (dump_enabled_p ())
7981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7982 "reduction: not commutative/associative\n");
7983 return false;
7987 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7988 && ncopies > 1)
7990 if (dump_enabled_p ())
7991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7992 "multiple types in double reduction or condition "
7993 "reduction or fold-left reduction.\n");
7994 return false;
7997 internal_fn reduc_fn = IFN_LAST;
7998 if (reduction_type == TREE_CODE_REDUCTION
7999 || reduction_type == FOLD_LEFT_REDUCTION
8000 || reduction_type == INTEGER_INDUC_COND_REDUCTION
8001 || reduction_type == CONST_COND_REDUCTION)
8003 if (reduction_type == FOLD_LEFT_REDUCTION
8004 ? fold_left_reduction_fn (orig_code, &reduc_fn)
8005 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8007 if (reduc_fn != IFN_LAST
8008 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8009 OPTIMIZE_FOR_SPEED))
8011 if (dump_enabled_p ())
8012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8013 "reduc op not supported by target.\n");
8015 reduc_fn = IFN_LAST;
8018 else
8020 if (!nested_cycle || double_reduc)
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8024 "no reduc code for scalar code.\n");
8026 return false;
8030 else if (reduction_type == COND_REDUCTION)
8032 int scalar_precision
8033 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8034 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8035 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8036 vectype_out);
8038 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8039 OPTIMIZE_FOR_SPEED))
8040 reduc_fn = IFN_REDUC_MAX;
8042 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8044 if (reduction_type != EXTRACT_LAST_REDUCTION
8045 && (!nested_cycle || double_reduc)
8046 && reduc_fn == IFN_LAST
8047 && !nunits_out.is_constant ())
8049 if (dump_enabled_p ())
8050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051 "missing target support for reduction on"
8052 " variable-length vectors.\n");
8053 return false;
8056 /* For SLP reductions, see if there is a neutral value we can use. */
8057 tree neutral_op = NULL_TREE;
8058 if (slp_node)
8060 tree initial_value = NULL_TREE;
8061 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8062 initial_value = vect_phi_initial_value (reduc_def_phi);
8063 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8064 orig_code, initial_value);
8067 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8069 /* We can't support in-order reductions of code such as this:
8071 for (int i = 0; i < n1; ++i)
8072 for (int j = 0; j < n2; ++j)
8073 l += a[j];
8075 since GCC effectively transforms the loop when vectorizing:
8077 for (int i = 0; i < n1 / VF; ++i)
8078 for (int j = 0; j < n2; ++j)
8079 for (int k = 0; k < VF; ++k)
8080 l += a[j];
8082 which is a reassociation of the original operation. */
8083 if (dump_enabled_p ())
8084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8085 "in-order double reduction not supported.\n");
8087 return false;
8090 if (reduction_type == FOLD_LEFT_REDUCTION
8091 && slp_node
8092 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8094 /* We cannot use in-order reductions in this case because there is
8095 an implicit reassociation of the operations involved. */
8096 if (dump_enabled_p ())
8097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8098 "in-order unchained SLP reductions not supported.\n");
8099 return false;
8102 /* For double reductions, and for SLP reductions with a neutral value,
8103 we construct a variable-length initial vector by loading a vector
8104 full of the neutral value and then shift-and-inserting the start
8105 values into the low-numbered elements. */
8106 if ((double_reduc || neutral_op)
8107 && !nunits_out.is_constant ()
8108 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8109 vectype_out, OPTIMIZE_FOR_SPEED))
8111 if (dump_enabled_p ())
8112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8113 "reduction on variable-length vectors requires"
8114 " target support for a vector-shift-and-insert"
8115 " operation.\n");
8116 return false;
8119 /* Check extra constraints for variable-length unchained SLP reductions. */
8120 if (slp_node
8121 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8122 && !nunits_out.is_constant ())
8124 /* We checked above that we could build the initial vector when
8125 there's a neutral element value. Check here for the case in
8126 which each SLP statement has its own initial value and in which
8127 that value needs to be repeated for every instance of the
8128 statement within the initial vector. */
8129 unsigned int group_size = SLP_TREE_LANES (slp_node);
8130 if (!neutral_op
8131 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8132 TREE_TYPE (vectype_out)))
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8136 "unsupported form of SLP reduction for"
8137 " variable-length vectors: cannot build"
8138 " initial vector.\n");
8139 return false;
8141 /* The epilogue code relies on the number of elements being a multiple
8142 of the group size. The duplicate-and-interleave approach to setting
8143 up the initial vector does too. */
8144 if (!multiple_p (nunits_out, group_size))
8146 if (dump_enabled_p ())
8147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8148 "unsupported form of SLP reduction for"
8149 " variable-length vectors: the vector size"
8150 " is not a multiple of the number of results.\n");
8151 return false;
8155 if (reduction_type == COND_REDUCTION)
8157 widest_int ni;
8159 if (! max_loop_iterations (loop, &ni))
8161 if (dump_enabled_p ())
8162 dump_printf_loc (MSG_NOTE, vect_location,
8163 "loop count not known, cannot create cond "
8164 "reduction.\n");
8165 return false;
8167 /* Convert backedges to iterations. */
8168 ni += 1;
8170 /* The additional index will be the same type as the condition. Check
8171 that the loop can fit into this less one (because we'll use up the
8172 zero slot for when there are no matches). */
8173 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8174 if (wi::geu_p (ni, wi::to_widest (max_index)))
8176 if (dump_enabled_p ())
8177 dump_printf_loc (MSG_NOTE, vect_location,
8178 "loop size is greater than data size.\n");
8179 return false;
8183 /* In case the vectorization factor (VF) is bigger than the number
8184 of elements that we can fit in a vectype (nunits), we have to generate
8185 more than one vector stmt - i.e - we need to "unroll" the
8186 vector stmt by a factor VF/nunits. For more details see documentation
8187 in vectorizable_operation. */
8189 /* If the reduction is used in an outer loop we need to generate
8190 VF intermediate results, like so (e.g. for ncopies=2):
8191 r0 = phi (init, r0)
8192 r1 = phi (init, r1)
8193 r0 = x0 + r0;
8194 r1 = x1 + r1;
8195 (i.e. we generate VF results in 2 registers).
8196 In this case we have a separate def-use cycle for each copy, and therefore
8197 for each copy we get the vector def for the reduction variable from the
8198 respective phi node created for this copy.
8200 Otherwise (the reduction is unused in the loop nest), we can combine
8201 together intermediate results, like so (e.g. for ncopies=2):
8202 r = phi (init, r)
8203 r = x0 + r;
8204 r = x1 + r;
8205 (i.e. we generate VF/2 results in a single register).
8206 In this case for each copy we get the vector def for the reduction variable
8207 from the vectorized reduction operation generated in the previous iteration.
8209 This only works when we see both the reduction PHI and its only consumer
8210 in vectorizable_reduction and there are no intermediate stmts
8211 participating. When unrolling we want each unrolled iteration to have its
8212 own reduction accumulator since one of the main goals of unrolling a
8213 reduction is to reduce the aggregate loop-carried latency. */
8214 if (ncopies > 1
8215 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8216 && reduc_chain_length == 1
8217 && loop_vinfo->suggested_unroll_factor == 1)
8218 single_defuse_cycle = true;
8220 if (single_defuse_cycle || lane_reduc_code_p)
8222 gcc_assert (op.code != COND_EXPR);
8224 /* 4. Supportable by target? */
8225 bool ok = true;
8227 /* 4.1. check support for the operation in the loop
8229 This isn't necessary for the lane reduction codes, since they
8230 can only be produced by pattern matching, and it's up to the
8231 pattern matcher to test for support. The main reason for
8232 specifically skipping this step is to avoid rechecking whether
8233 mixed-sign dot-products can be implemented using signed
8234 dot-products. */
8235 machine_mode vec_mode = TYPE_MODE (vectype_in);
8236 if (!lane_reduc_code_p
8237 && !directly_supported_p (op.code, vectype_in, optab_vector))
8239 if (dump_enabled_p ())
8240 dump_printf (MSG_NOTE, "op not supported by target.\n");
8241 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8242 || !vect_can_vectorize_without_simd_p (op.code))
8243 ok = false;
8244 else
8245 if (dump_enabled_p ())
8246 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8249 if (vect_emulated_vector_p (vectype_in)
8250 && !vect_can_vectorize_without_simd_p (op.code))
8252 if (dump_enabled_p ())
8253 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8254 return false;
8257 /* lane-reducing operations have to go through vect_transform_reduction.
8258 For the other cases try without the single cycle optimization. */
8259 if (!ok)
8261 if (lane_reduc_code_p)
8262 return false;
8263 else
8264 single_defuse_cycle = false;
8267 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8269 /* If the reduction stmt is one of the patterns that have lane
8270 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8271 if ((ncopies > 1 && ! single_defuse_cycle)
8272 && lane_reduc_code_p)
8274 if (dump_enabled_p ())
8275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8276 "multi def-use cycle not possible for lane-reducing "
8277 "reduction operation\n");
8278 return false;
8281 if (slp_node
8282 && !(!single_defuse_cycle
8283 && !lane_reduc_code_p
8284 && reduction_type != FOLD_LEFT_REDUCTION))
8285 for (i = 0; i < (int) op.num_ops; i++)
8286 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8288 if (dump_enabled_p ())
8289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8290 "incompatible vector types for invariants\n");
8291 return false;
8294 if (slp_node)
8295 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8296 else
8297 vec_num = 1;
8299 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8300 reduction_type, ncopies, cost_vec);
8301 /* Cost the reduction op inside the loop if transformed via
8302 vect_transform_reduction. Otherwise this is costed by the
8303 separate vectorizable_* routines. */
8304 if (single_defuse_cycle || lane_reduc_code_p)
8306 int factor = 1;
8307 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8308 /* Three dot-products and a subtraction. */
8309 factor = 4;
8310 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8311 stmt_info, 0, vect_body);
8314 if (dump_enabled_p ()
8315 && reduction_type == FOLD_LEFT_REDUCTION)
8316 dump_printf_loc (MSG_NOTE, vect_location,
8317 "using an in-order (fold-left) reduction.\n");
8318 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8319 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8320 reductions go through their own vectorizable_* routines. */
8321 if (!single_defuse_cycle
8322 && !lane_reduc_code_p
8323 && reduction_type != FOLD_LEFT_REDUCTION)
8325 stmt_vec_info tem
8326 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8327 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8329 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8330 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8332 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8333 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8335 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8337 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8338 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8339 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8341 if (reduction_type != FOLD_LEFT_REDUCTION
8342 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8343 && (cond_fn == IFN_LAST
8344 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8345 OPTIMIZE_FOR_SPEED)))
8347 if (dump_enabled_p ())
8348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8349 "can't operate on partial vectors because"
8350 " no conditional operation is available.\n");
8351 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8353 else if (reduction_type == FOLD_LEFT_REDUCTION
8354 && reduc_fn == IFN_LAST
8355 && !expand_vec_cond_expr_p (vectype_in,
8356 truth_type_for (vectype_in),
8357 SSA_NAME))
8359 if (dump_enabled_p ())
8360 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8361 "can't operate on partial vectors because"
8362 " no conditional operation is available.\n");
8363 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8365 else if (reduction_type == FOLD_LEFT_REDUCTION
8366 && internal_fn_mask_index (reduc_fn) == -1
8367 && FLOAT_TYPE_P (vectype_in)
8368 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8370 if (dump_enabled_p ())
8371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8372 "can't operate on partial vectors because"
8373 " signed zeros cannot be preserved.\n");
8374 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8376 else
8378 internal_fn mask_reduc_fn
8379 = get_masked_reduction_fn (reduc_fn, vectype_in);
8381 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8382 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8383 vectype_in, 1);
8384 else
8385 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8386 vectype_in, NULL);
8389 return true;
8392 /* STMT_INFO is a dot-product reduction whose multiplication operands
8393 have different signs. Emit a sequence to emulate the operation
8394 using a series of signed DOT_PROD_EXPRs and return the last
8395 statement generated. VEC_DEST is the result of the vector operation
8396 and VOP lists its inputs. */
8398 static gassign *
8399 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8400 gimple_stmt_iterator *gsi, tree vec_dest,
8401 tree vop[3])
8403 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8404 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8405 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8406 gimple *new_stmt;
8408 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8409 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8410 std::swap (vop[0], vop[1]);
8412 /* Convert all inputs to signed types. */
8413 for (int i = 0; i < 3; ++i)
8414 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8416 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8417 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8418 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8419 vop[i] = tmp;
8422 /* In the comments below we assume 8-bit inputs for simplicity,
8423 but the approach works for any full integer type. */
8425 /* Create a vector of -128. */
8426 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8427 tree min_narrow = build_vector_from_val (narrow_vectype,
8428 min_narrow_elttype);
8430 /* Create a vector of 64. */
8431 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8432 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8433 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8435 /* Emit: SUB_RES = VOP[0] - 128. */
8436 tree sub_res = make_ssa_name (narrow_vectype);
8437 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8438 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8440 /* Emit:
8442 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8443 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8444 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8446 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8447 Doing the two 64 * y steps first allows more time to compute x. */
8448 tree stage1 = make_ssa_name (wide_vectype);
8449 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8450 vop[1], half_narrow, vop[2]);
8451 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8453 tree stage2 = make_ssa_name (wide_vectype);
8454 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8455 vop[1], half_narrow, stage1);
8456 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8458 tree stage3 = make_ssa_name (wide_vectype);
8459 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8460 sub_res, vop[1], stage2);
8461 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8463 /* Convert STAGE3 to the reduction type. */
8464 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8467 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8468 value. */
8470 bool
8471 vect_transform_reduction (loop_vec_info loop_vinfo,
8472 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8473 gimple **vec_stmt, slp_tree slp_node)
8475 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8476 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8477 int i;
8478 int ncopies;
8479 int vec_num;
8481 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8482 gcc_assert (reduc_info->is_reduc_info);
8484 if (nested_in_vect_loop_p (loop, stmt_info))
8486 loop = loop->inner;
8487 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8490 gimple_match_op op;
8491 if (!gimple_extract_op (stmt_info->stmt, &op))
8492 gcc_unreachable ();
8494 /* All uses but the last are expected to be defined in the loop.
8495 The last use is the reduction variable. In case of nested cycle this
8496 assumption is not true: we use reduc_index to record the index of the
8497 reduction variable. */
8498 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8499 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8500 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8501 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8503 if (slp_node)
8505 ncopies = 1;
8506 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8508 else
8510 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8511 vec_num = 1;
8514 code_helper code = canonicalize_code (op.code, op.type);
8515 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8517 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8518 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8519 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8521 /* Transform. */
8522 tree new_temp = NULL_TREE;
8523 auto_vec<tree> vec_oprnds0;
8524 auto_vec<tree> vec_oprnds1;
8525 auto_vec<tree> vec_oprnds2;
8526 tree def0;
8528 if (dump_enabled_p ())
8529 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8531 /* FORNOW: Multiple types are not supported for condition. */
8532 if (code == COND_EXPR)
8533 gcc_assert (ncopies == 1);
8535 /* A binary COND_OP reduction must have the same definition and else
8536 value. */
8537 bool cond_fn_p = code.is_internal_fn ()
8538 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8539 if (cond_fn_p)
8541 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8542 || code == IFN_COND_MUL || code == IFN_COND_AND
8543 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8544 gcc_assert (op.num_ops == 4
8545 && (op.ops[reduc_index]
8546 == op.ops[internal_fn_else_index ((internal_fn) code)]));
8549 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8551 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8552 if (reduction_type == FOLD_LEFT_REDUCTION)
8554 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8555 gcc_assert (code.is_tree_code () || cond_fn_p);
8556 return vectorize_fold_left_reduction
8557 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8558 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8559 reduc_index, masks, lens);
8562 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8563 gcc_assert (single_defuse_cycle
8564 || code == DOT_PROD_EXPR
8565 || code == WIDEN_SUM_EXPR
8566 || code == SAD_EXPR);
8568 /* Create the destination vector */
8569 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8570 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8572 /* Get NCOPIES vector definitions for all operands except the reduction
8573 definition. */
8574 if (!cond_fn_p)
8576 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8577 single_defuse_cycle && reduc_index == 0
8578 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8579 single_defuse_cycle && reduc_index == 1
8580 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8581 op.num_ops == 3
8582 && !(single_defuse_cycle && reduc_index == 2)
8583 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8585 else
8587 /* For a conditional operation pass the truth type as mask
8588 vectype. */
8589 gcc_assert (single_defuse_cycle
8590 && (reduc_index == 1 || reduc_index == 2));
8591 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8592 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
8593 reduc_index == 1 ? NULL_TREE : op.ops[1],
8594 NULL_TREE, &vec_oprnds1,
8595 reduc_index == 2 ? NULL_TREE : op.ops[2],
8596 NULL_TREE, &vec_oprnds2);
8599 /* For single def-use cycles get one copy of the vectorized reduction
8600 definition. */
8601 if (single_defuse_cycle)
8603 gcc_assert (!slp_node);
8604 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8605 op.ops[reduc_index],
8606 reduc_index == 0 ? &vec_oprnds0
8607 : (reduc_index == 1 ? &vec_oprnds1
8608 : &vec_oprnds2));
8611 bool emulated_mixed_dot_prod
8612 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8613 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8615 gimple *new_stmt;
8616 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8617 if (masked_loop_p && !mask_by_cond_expr)
8619 /* No conditional ifns have been defined for dot-product yet. */
8620 gcc_assert (code != DOT_PROD_EXPR);
8622 /* Make sure that the reduction accumulator is vop[0]. */
8623 if (reduc_index == 1)
8625 gcc_assert (commutative_binary_op_p (code, op.type));
8626 std::swap (vop[0], vop[1]);
8628 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8629 vec_num * ncopies, vectype_in, i);
8630 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8631 vop[0], vop[1], vop[0]);
8632 new_temp = make_ssa_name (vec_dest, call);
8633 gimple_call_set_lhs (call, new_temp);
8634 gimple_call_set_nothrow (call, true);
8635 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8636 new_stmt = call;
8638 else
8640 if (op.num_ops >= 3)
8641 vop[2] = vec_oprnds2[i];
8643 if (masked_loop_p && mask_by_cond_expr)
8645 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8646 vec_num * ncopies, vectype_in, i);
8647 build_vect_cond_expr (code, vop, mask, gsi);
8650 if (emulated_mixed_dot_prod)
8651 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8652 vec_dest, vop);
8654 else if (code.is_internal_fn () && !cond_fn_p)
8655 new_stmt = gimple_build_call_internal (internal_fn (code),
8656 op.num_ops,
8657 vop[0], vop[1], vop[2]);
8658 else if (code.is_internal_fn () && cond_fn_p)
8659 new_stmt = gimple_build_call_internal (internal_fn (code),
8660 op.num_ops,
8661 vop[0], vop[1], vop[2],
8662 vop[1]);
8663 else
8664 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8665 vop[0], vop[1], vop[2]);
8666 new_temp = make_ssa_name (vec_dest, new_stmt);
8667 gimple_set_lhs (new_stmt, new_temp);
8668 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8671 if (slp_node)
8672 slp_node->push_vec_def (new_stmt);
8673 else if (single_defuse_cycle
8674 && i < ncopies - 1)
8676 if (reduc_index == 0)
8677 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8678 else if (reduc_index == 1)
8679 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8680 else if (reduc_index == 2)
8681 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8683 else
8684 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8687 if (!slp_node)
8688 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8690 return true;
8693 /* Transform phase of a cycle PHI. */
8695 bool
8696 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8697 stmt_vec_info stmt_info, gimple **vec_stmt,
8698 slp_tree slp_node, slp_instance slp_node_instance)
8700 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8701 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8702 int i;
8703 int ncopies;
8704 int j;
8705 bool nested_cycle = false;
8706 int vec_num;
8708 if (nested_in_vect_loop_p (loop, stmt_info))
8710 loop = loop->inner;
8711 nested_cycle = true;
8714 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8715 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8716 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8717 gcc_assert (reduc_info->is_reduc_info);
8719 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8720 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8721 /* Leave the scalar phi in place. */
8722 return true;
8724 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8725 /* For a nested cycle we do not fill the above. */
8726 if (!vectype_in)
8727 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8728 gcc_assert (vectype_in);
8730 if (slp_node)
8732 /* The size vect_schedule_slp_instance computes is off for us. */
8733 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8734 * SLP_TREE_LANES (slp_node), vectype_in);
8735 ncopies = 1;
8737 else
8739 vec_num = 1;
8740 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8743 /* Check whether we should use a single PHI node and accumulate
8744 vectors to one before the backedge. */
8745 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8746 ncopies = 1;
8748 /* Create the destination vector */
8749 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8750 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8751 vectype_out);
8753 /* Get the loop-entry arguments. */
8754 tree vec_initial_def = NULL_TREE;
8755 auto_vec<tree> vec_initial_defs;
8756 if (slp_node)
8758 vec_initial_defs.reserve (vec_num);
8759 if (nested_cycle)
8761 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8762 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8763 &vec_initial_defs);
8765 else
8767 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8768 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8769 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8771 unsigned int num_phis = stmts.length ();
8772 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8773 num_phis = 1;
8774 initial_values.reserve (num_phis);
8775 for (unsigned int i = 0; i < num_phis; ++i)
8777 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8778 initial_values.quick_push (vect_phi_initial_value (this_phi));
8780 if (vec_num == 1)
8781 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8782 if (!initial_values.is_empty ())
8784 tree initial_value
8785 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8786 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8787 tree neutral_op
8788 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8789 code, initial_value);
8790 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8791 &vec_initial_defs, vec_num,
8792 stmts.length (), neutral_op);
8796 else
8798 /* Get at the scalar def before the loop, that defines the initial
8799 value of the reduction variable. */
8800 tree initial_def = vect_phi_initial_value (phi);
8801 reduc_info->reduc_initial_values.safe_push (initial_def);
8802 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8803 and we can't use zero for induc_val, use initial_def. Similarly
8804 for REDUC_MIN and initial_def larger than the base. */
8805 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8807 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8808 if (TREE_CODE (initial_def) == INTEGER_CST
8809 && !integer_zerop (induc_val)
8810 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8811 && tree_int_cst_lt (initial_def, induc_val))
8812 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8813 && tree_int_cst_lt (induc_val, initial_def))))
8815 induc_val = initial_def;
8816 /* Communicate we used the initial_def to epilouge
8817 generation. */
8818 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8820 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8822 else if (nested_cycle)
8824 /* Do not use an adjustment def as that case is not supported
8825 correctly if ncopies is not one. */
8826 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8827 ncopies, initial_def,
8828 &vec_initial_defs);
8830 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8831 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8832 /* Fill the initial vector with the initial scalar value. */
8833 vec_initial_def
8834 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8835 initial_def, initial_def);
8836 else
8838 if (ncopies == 1)
8839 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8840 if (!reduc_info->reduc_initial_values.is_empty ())
8842 initial_def = reduc_info->reduc_initial_values[0];
8843 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8844 tree neutral_op
8845 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8846 code, initial_def);
8847 gcc_assert (neutral_op);
8848 /* Try to simplify the vector initialization by applying an
8849 adjustment after the reduction has been performed. */
8850 if (!reduc_info->reused_accumulator
8851 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8852 && !operand_equal_p (neutral_op, initial_def))
8854 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8855 = initial_def;
8856 initial_def = neutral_op;
8858 vec_initial_def
8859 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8860 initial_def, neutral_op);
8865 if (vec_initial_def)
8867 vec_initial_defs.create (ncopies);
8868 for (i = 0; i < ncopies; ++i)
8869 vec_initial_defs.quick_push (vec_initial_def);
8872 if (auto *accumulator = reduc_info->reused_accumulator)
8874 tree def = accumulator->reduc_input;
8875 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8877 unsigned int nreduc;
8878 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8879 (TREE_TYPE (def)),
8880 TYPE_VECTOR_SUBPARTS (vectype_out),
8881 &nreduc);
8882 gcc_assert (res);
8883 gimple_seq stmts = NULL;
8884 /* Reduce the single vector to a smaller one. */
8885 if (nreduc != 1)
8887 /* Perform the reduction in the appropriate type. */
8888 tree rvectype = vectype_out;
8889 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8890 TREE_TYPE (TREE_TYPE (def))))
8891 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8892 TYPE_VECTOR_SUBPARTS
8893 (vectype_out));
8894 def = vect_create_partial_epilog (def, rvectype,
8895 STMT_VINFO_REDUC_CODE
8896 (reduc_info),
8897 &stmts);
8899 /* The epilogue loop might use a different vector mode, like
8900 VNx2DI vs. V2DI. */
8901 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8903 tree reduc_type = build_vector_type_for_mode
8904 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8905 def = gimple_convert (&stmts, reduc_type, def);
8907 /* Adjust the input so we pick up the partially reduced value
8908 for the skip edge in vect_create_epilog_for_reduction. */
8909 accumulator->reduc_input = def;
8910 /* And the reduction could be carried out using a different sign. */
8911 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8912 def = gimple_convert (&stmts, vectype_out, def);
8913 if (loop_vinfo->main_loop_edge)
8915 /* While we'd like to insert on the edge this will split
8916 blocks and disturb bookkeeping, we also will eventually
8917 need this on the skip edge. Rely on sinking to
8918 fixup optimal placement and insert in the pred. */
8919 gimple_stmt_iterator gsi
8920 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8921 /* Insert before a cond that eventually skips the
8922 epilogue. */
8923 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8924 gsi_prev (&gsi);
8925 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8927 else
8928 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8929 stmts);
8931 if (loop_vinfo->main_loop_edge)
8932 vec_initial_defs[0]
8933 = vect_get_main_loop_result (loop_vinfo, def,
8934 vec_initial_defs[0]);
8935 else
8936 vec_initial_defs.safe_push (def);
8939 /* Generate the reduction PHIs upfront. */
8940 for (i = 0; i < vec_num; i++)
8942 tree vec_init_def = vec_initial_defs[i];
8943 for (j = 0; j < ncopies; j++)
8945 /* Create the reduction-phi that defines the reduction
8946 operand. */
8947 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8949 /* Set the loop-entry arg of the reduction-phi. */
8950 if (j != 0 && nested_cycle)
8951 vec_init_def = vec_initial_defs[j];
8952 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8953 UNKNOWN_LOCATION);
8955 /* The loop-latch arg is set in epilogue processing. */
8957 if (slp_node)
8958 slp_node->push_vec_def (new_phi);
8959 else
8961 if (j == 0)
8962 *vec_stmt = new_phi;
8963 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8968 return true;
8971 /* Vectorizes LC PHIs. */
8973 bool
8974 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8975 stmt_vec_info stmt_info, gimple **vec_stmt,
8976 slp_tree slp_node)
8978 if (!loop_vinfo
8979 || !is_a <gphi *> (stmt_info->stmt)
8980 || gimple_phi_num_args (stmt_info->stmt) != 1)
8981 return false;
8983 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8984 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8985 return false;
8987 if (!vec_stmt) /* transformation not required. */
8989 /* Deal with copies from externs or constants that disguise as
8990 loop-closed PHI nodes (PR97886). */
8991 if (slp_node
8992 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8993 SLP_TREE_VECTYPE (slp_node)))
8995 if (dump_enabled_p ())
8996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8997 "incompatible vector types for invariants\n");
8998 return false;
9000 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9001 return true;
9004 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9005 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9006 basic_block bb = gimple_bb (stmt_info->stmt);
9007 edge e = single_pred_edge (bb);
9008 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9009 auto_vec<tree> vec_oprnds;
9010 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9011 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9012 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9013 for (unsigned i = 0; i < vec_oprnds.length (); i++)
9015 /* Create the vectorized LC PHI node. */
9016 gphi *new_phi = create_phi_node (vec_dest, bb);
9017 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9018 if (slp_node)
9019 slp_node->push_vec_def (new_phi);
9020 else
9021 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9023 if (!slp_node)
9024 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9026 return true;
9029 /* Vectorizes PHIs. */
9031 bool
9032 vectorizable_phi (vec_info *,
9033 stmt_vec_info stmt_info, gimple **vec_stmt,
9034 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9036 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9037 return false;
9039 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9040 return false;
9042 tree vectype = SLP_TREE_VECTYPE (slp_node);
9044 if (!vec_stmt) /* transformation not required. */
9046 slp_tree child;
9047 unsigned i;
9048 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9049 if (!child)
9051 if (dump_enabled_p ())
9052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9053 "PHI node with unvectorized backedge def\n");
9054 return false;
9056 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9058 if (dump_enabled_p ())
9059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9060 "incompatible vector types for invariants\n");
9061 return false;
9063 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9064 && !useless_type_conversion_p (vectype,
9065 SLP_TREE_VECTYPE (child)))
9067 /* With bools we can have mask and non-mask precision vectors
9068 or different non-mask precisions. while pattern recog is
9069 supposed to guarantee consistency here bugs in it can cause
9070 mismatches (PR103489 and PR103800 for example).
9071 Deal with them here instead of ICEing later. */
9072 if (dump_enabled_p ())
9073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9074 "incompatible vector type setup from "
9075 "bool pattern detection\n");
9076 return false;
9079 /* For single-argument PHIs assume coalescing which means zero cost
9080 for the scalar and the vector PHIs. This avoids artificially
9081 favoring the vector path (but may pessimize it in some cases). */
9082 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9083 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9084 vector_stmt, stmt_info, vectype, 0, vect_body);
9085 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9086 return true;
9089 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9090 basic_block bb = gimple_bb (stmt_info->stmt);
9091 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9092 auto_vec<gphi *> new_phis;
9093 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9095 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9097 /* Skip not yet vectorized defs. */
9098 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9099 && SLP_TREE_VEC_DEFS (child).is_empty ())
9100 continue;
9102 auto_vec<tree> vec_oprnds;
9103 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9104 if (!new_phis.exists ())
9106 new_phis.create (vec_oprnds.length ());
9107 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9109 /* Create the vectorized LC PHI node. */
9110 new_phis.quick_push (create_phi_node (vec_dest, bb));
9111 slp_node->push_vec_def (new_phis[j]);
9114 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9115 for (unsigned j = 0; j < vec_oprnds.length (); j++)
9116 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9118 /* We should have at least one already vectorized child. */
9119 gcc_assert (new_phis.exists ());
9121 return true;
9124 /* Vectorizes first order recurrences. An overview of the transformation
9125 is described below. Suppose we have the following loop.
9127 int t = 0;
9128 for (int i = 0; i < n; ++i)
9130 b[i] = a[i] - t;
9131 t = a[i];
9134 There is a first-order recurrence on 'a'. For this loop, the scalar IR
9135 looks (simplified) like:
9137 scalar.preheader:
9138 init = 0;
9140 scalar.body:
9141 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9142 _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9143 _1 = a[i]
9144 b[i] = _1 - _2
9145 if (i < n) goto scalar.body
9147 In this example, _2 is a recurrence because it's value depends on the
9148 previous iteration. We vectorize this as (VF = 4)
9150 vector.preheader:
9151 vect_init = vect_cst(..., ..., ..., 0)
9153 vector.body
9154 i = PHI <0(vector.preheader), i+4(vector.body)>
9155 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9156 vect_2 = a[i, i+1, i+2, i+3];
9157 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9158 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9159 if (..) goto vector.body
9161 In this function, vectorizable_recurr, we code generate both the
9162 vector PHI node and the permute since those together compute the
9163 vectorized value of the scalar PHI. We do not yet have the
9164 backedge value to fill in there nor into the vec_perm. Those
9165 are filled in maybe_set_vectorized_backedge_value and
9166 vect_schedule_scc.
9168 TODO: Since the scalar loop does not have a use of the recurrence
9169 outside of the loop the natural way to implement peeling via
9170 vectorizing the live value doesn't work. For now peeling of loops
9171 with a recurrence is not implemented. For SLP the supported cases
9172 are restricted to those requiring a single vector recurrence PHI. */
9174 bool
9175 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9176 gimple **vec_stmt, slp_tree slp_node,
9177 stmt_vector_for_cost *cost_vec)
9179 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9180 return false;
9182 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9184 /* So far we only support first-order recurrence auto-vectorization. */
9185 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9186 return false;
9188 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9189 unsigned ncopies;
9190 if (slp_node)
9191 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9192 else
9193 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9194 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9195 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9196 /* We need to be able to make progress with a single vector. */
9197 if (maybe_gt (dist * 2, nunits))
9199 if (dump_enabled_p ())
9200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9201 "first order recurrence exceeds half of "
9202 "a vector\n");
9203 return false;
9206 /* First-order recurrence autovectorization needs to handle permutation
9207 with indices = [nunits-1, nunits, nunits+1, ...]. */
9208 vec_perm_builder sel (nunits, 1, 3);
9209 for (int i = 0; i < 3; ++i)
9210 sel.quick_push (nunits - dist + i);
9211 vec_perm_indices indices (sel, 2, nunits);
9213 if (!vec_stmt) /* transformation not required. */
9215 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9216 indices))
9217 return false;
9219 if (slp_node)
9221 /* We eventually need to set a vector type on invariant
9222 arguments. */
9223 unsigned j;
9224 slp_tree child;
9225 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9226 if (!vect_maybe_update_slp_op_vectype
9227 (child, SLP_TREE_VECTYPE (slp_node)))
9229 if (dump_enabled_p ())
9230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9231 "incompatible vector types for "
9232 "invariants\n");
9233 return false;
9236 /* The recurrence costs the initialization vector and one permute
9237 for each copy. */
9238 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9239 stmt_info, 0, vect_prologue);
9240 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9241 stmt_info, 0, vect_body);
9242 if (dump_enabled_p ())
9243 dump_printf_loc (MSG_NOTE, vect_location,
9244 "vectorizable_recurr: inside_cost = %d, "
9245 "prologue_cost = %d .\n", inside_cost,
9246 prologue_cost);
9248 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9249 return true;
9252 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9253 basic_block bb = gimple_bb (phi);
9254 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9255 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9257 gimple_seq stmts = NULL;
9258 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9259 gsi_insert_seq_on_edge_immediate (pe, stmts);
9261 tree vec_init = build_vector_from_val (vectype, preheader);
9262 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9264 /* Create the vectorized first-order PHI node. */
9265 tree vec_dest = vect_get_new_vect_var (vectype,
9266 vect_simple_var, "vec_recur_");
9267 gphi *new_phi = create_phi_node (vec_dest, bb);
9268 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9270 /* Insert shuffles the first-order recurrence autovectorization.
9271 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9272 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9274 /* Insert the required permute after the latch definition. The
9275 second and later operands are tentative and will be updated when we have
9276 vectorized the latch definition. */
9277 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9278 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9279 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9280 gsi_next (&gsi2);
9282 for (unsigned i = 0; i < ncopies; ++i)
9284 vec_dest = make_ssa_name (vectype);
9285 gassign *vperm
9286 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9287 i == 0 ? gimple_phi_result (new_phi) : NULL,
9288 NULL, perm);
9289 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9291 if (slp_node)
9292 slp_node->push_vec_def (vperm);
9293 else
9294 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9297 if (!slp_node)
9298 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9299 return true;
9302 /* Return true if VECTYPE represents a vector that requires lowering
9303 by the vector lowering pass. */
9305 bool
9306 vect_emulated_vector_p (tree vectype)
9308 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9309 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9310 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9313 /* Return true if we can emulate CODE on an integer mode representation
9314 of a vector. */
9316 bool
9317 vect_can_vectorize_without_simd_p (tree_code code)
9319 switch (code)
9321 case PLUS_EXPR:
9322 case MINUS_EXPR:
9323 case NEGATE_EXPR:
9324 case BIT_AND_EXPR:
9325 case BIT_IOR_EXPR:
9326 case BIT_XOR_EXPR:
9327 case BIT_NOT_EXPR:
9328 return true;
9330 default:
9331 return false;
9335 /* Likewise, but taking a code_helper. */
9337 bool
9338 vect_can_vectorize_without_simd_p (code_helper code)
9340 return (code.is_tree_code ()
9341 && vect_can_vectorize_without_simd_p (tree_code (code)));
9344 /* Create vector init for vectorized iv. */
9345 static tree
9346 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9347 tree step_expr, poly_uint64 nunits,
9348 tree vectype,
9349 enum vect_induction_op_type induction_type)
9351 unsigned HOST_WIDE_INT const_nunits;
9352 tree vec_shift, vec_init, new_name;
9353 unsigned i;
9354 tree itype = TREE_TYPE (vectype);
9356 /* iv_loop is the loop to be vectorized. Create:
9357 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9358 new_name = gimple_convert (stmts, itype, init_expr);
9359 switch (induction_type)
9361 case vect_step_op_shr:
9362 case vect_step_op_shl:
9363 /* Build the Initial value from shift_expr. */
9364 vec_init = gimple_build_vector_from_val (stmts,
9365 vectype,
9366 new_name);
9367 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9368 build_zero_cst (itype), step_expr);
9369 vec_init = gimple_build (stmts,
9370 (induction_type == vect_step_op_shr
9371 ? RSHIFT_EXPR : LSHIFT_EXPR),
9372 vectype, vec_init, vec_shift);
9373 break;
9375 case vect_step_op_neg:
9377 vec_init = gimple_build_vector_from_val (stmts,
9378 vectype,
9379 new_name);
9380 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9381 vectype, vec_init);
9382 /* The encoding has 2 interleaved stepped patterns. */
9383 vec_perm_builder sel (nunits, 2, 3);
9384 sel.quick_grow (6);
9385 for (i = 0; i < 3; i++)
9387 sel[2 * i] = i;
9388 sel[2 * i + 1] = i + nunits;
9390 vec_perm_indices indices (sel, 2, nunits);
9391 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9392 fail when vec_init is const vector. In that situation vec_perm is not
9393 really needed. */
9394 tree perm_mask_even
9395 = vect_gen_perm_mask_any (vectype, indices);
9396 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9397 vectype,
9398 vec_init, vec_neg,
9399 perm_mask_even);
9401 break;
9403 case vect_step_op_mul:
9405 /* Use unsigned mult to avoid UD integer overflow. */
9406 gcc_assert (nunits.is_constant (&const_nunits));
9407 tree utype = unsigned_type_for (itype);
9408 tree uvectype = build_vector_type (utype,
9409 TYPE_VECTOR_SUBPARTS (vectype));
9410 new_name = gimple_convert (stmts, utype, new_name);
9411 vec_init = gimple_build_vector_from_val (stmts,
9412 uvectype,
9413 new_name);
9414 tree_vector_builder elts (uvectype, const_nunits, 1);
9415 tree elt_step = build_one_cst (utype);
9417 elts.quick_push (elt_step);
9418 for (i = 1; i < const_nunits; i++)
9420 /* Create: new_name_i = new_name + step_expr. */
9421 elt_step = gimple_build (stmts, MULT_EXPR,
9422 utype, elt_step, step_expr);
9423 elts.quick_push (elt_step);
9425 /* Create a vector from [new_name_0, new_name_1, ...,
9426 new_name_nunits-1]. */
9427 tree vec_mul = gimple_build_vector (stmts, &elts);
9428 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9429 vec_init, vec_mul);
9430 vec_init = gimple_convert (stmts, vectype, vec_init);
9432 break;
9434 default:
9435 gcc_unreachable ();
9438 return vec_init;
9441 /* Peel init_expr by skip_niter for induction_type. */
9442 tree
9443 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9444 tree skip_niters, tree step_expr,
9445 enum vect_induction_op_type induction_type)
9447 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9448 tree type = TREE_TYPE (init_expr);
9449 unsigned prec = TYPE_PRECISION (type);
9450 switch (induction_type)
9452 case vect_step_op_neg:
9453 if (TREE_INT_CST_LOW (skip_niters) % 2)
9454 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9455 /* else no change. */
9456 break;
9458 case vect_step_op_shr:
9459 case vect_step_op_shl:
9460 skip_niters = gimple_convert (stmts, type, skip_niters);
9461 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9462 /* When shift mount >= precision, need to avoid UD.
9463 In the original loop, there's no UD, and according to semantic,
9464 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9465 if (!tree_fits_uhwi_p (step_expr)
9466 || tree_to_uhwi (step_expr) >= prec)
9468 if (induction_type == vect_step_op_shl
9469 || TYPE_UNSIGNED (type))
9470 init_expr = build_zero_cst (type);
9471 else
9472 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9473 init_expr,
9474 wide_int_to_tree (type, prec - 1));
9476 else
9477 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9478 ? RSHIFT_EXPR : LSHIFT_EXPR),
9479 type, init_expr, step_expr);
9480 break;
9482 case vect_step_op_mul:
9484 tree utype = unsigned_type_for (type);
9485 init_expr = gimple_convert (stmts, utype, init_expr);
9486 wide_int skipn = wi::to_wide (skip_niters);
9487 wide_int begin = wi::to_wide (step_expr);
9488 auto_mpz base, exp, mod, res;
9489 wi::to_mpz (begin, base, TYPE_SIGN (type));
9490 wi::to_mpz (skipn, exp, UNSIGNED);
9491 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9492 mpz_powm (res, base, exp, mod);
9493 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9494 tree mult_expr = wide_int_to_tree (utype, begin);
9495 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9496 init_expr, mult_expr);
9497 init_expr = gimple_convert (stmts, type, init_expr);
9499 break;
9501 default:
9502 gcc_unreachable ();
9505 return init_expr;
9508 /* Create vector step for vectorized iv. */
9509 static tree
9510 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9511 poly_uint64 vf,
9512 enum vect_induction_op_type induction_type)
9514 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9515 tree new_name = NULL;
9516 /* Step should be pow (step, vf) for mult induction. */
9517 if (induction_type == vect_step_op_mul)
9519 gcc_assert (vf.is_constant ());
9520 wide_int begin = wi::to_wide (step_expr);
9522 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9523 begin = wi::mul (begin, wi::to_wide (step_expr));
9525 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9527 else if (induction_type == vect_step_op_neg)
9528 /* Do nothing. */
9530 else
9531 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9532 expr, step_expr);
9533 return new_name;
9536 static tree
9537 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9538 stmt_vec_info stmt_info,
9539 tree new_name, tree vectype,
9540 enum vect_induction_op_type induction_type)
9542 /* No step is needed for neg induction. */
9543 if (induction_type == vect_step_op_neg)
9544 return NULL;
9546 tree t = unshare_expr (new_name);
9547 gcc_assert (CONSTANT_CLASS_P (new_name)
9548 || TREE_CODE (new_name) == SSA_NAME);
9549 tree new_vec = build_vector_from_val (vectype, t);
9550 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9551 new_vec, vectype, NULL);
9552 return vec_step;
9555 /* Update vectorized iv with vect_step, induc_def is init. */
9556 static tree
9557 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9558 tree induc_def, tree vec_step,
9559 enum vect_induction_op_type induction_type)
9561 tree vec_def = induc_def;
9562 switch (induction_type)
9564 case vect_step_op_mul:
9566 /* Use unsigned mult to avoid UD integer overflow. */
9567 tree uvectype
9568 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9569 TYPE_VECTOR_SUBPARTS (vectype));
9570 vec_def = gimple_convert (stmts, uvectype, vec_def);
9571 vec_step = gimple_convert (stmts, uvectype, vec_step);
9572 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9573 vec_def, vec_step);
9574 vec_def = gimple_convert (stmts, vectype, vec_def);
9576 break;
9578 case vect_step_op_shr:
9579 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9580 vec_def, vec_step);
9581 break;
9583 case vect_step_op_shl:
9584 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9585 vec_def, vec_step);
9586 break;
9587 case vect_step_op_neg:
9588 vec_def = induc_def;
9589 /* Do nothing. */
9590 break;
9591 default:
9592 gcc_unreachable ();
9595 return vec_def;
9599 /* Function vectorizable_induction
9601 Check if STMT_INFO performs an nonlinear induction computation that can be
9602 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9603 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9604 basic block.
9605 Return true if STMT_INFO is vectorizable in this way. */
9607 static bool
9608 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9609 stmt_vec_info stmt_info,
9610 gimple **vec_stmt, slp_tree slp_node,
9611 stmt_vector_for_cost *cost_vec)
9613 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9614 unsigned ncopies;
9615 bool nested_in_vect_loop = false;
9616 class loop *iv_loop;
9617 tree vec_def;
9618 edge pe = loop_preheader_edge (loop);
9619 basic_block new_bb;
9620 tree vec_init, vec_step;
9621 tree new_name;
9622 gimple *new_stmt;
9623 gphi *induction_phi;
9624 tree induc_def, vec_dest;
9625 tree init_expr, step_expr;
9626 tree niters_skip;
9627 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9628 unsigned i;
9629 gimple_stmt_iterator si;
9631 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9633 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9634 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9635 enum vect_induction_op_type induction_type
9636 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9638 gcc_assert (induction_type > vect_step_op_add);
9640 if (slp_node)
9641 ncopies = 1;
9642 else
9643 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9644 gcc_assert (ncopies >= 1);
9646 /* FORNOW. Only handle nonlinear induction in the same loop. */
9647 if (nested_in_vect_loop_p (loop, stmt_info))
9649 if (dump_enabled_p ())
9650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9651 "nonlinear induction in nested loop.\n");
9652 return false;
9655 iv_loop = loop;
9656 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9658 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9659 update for each iv and a permutation to generate wanted vector iv. */
9660 if (slp_node)
9662 if (dump_enabled_p ())
9663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9664 "SLP induction not supported for nonlinear"
9665 " induction.\n");
9666 return false;
9669 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9671 if (dump_enabled_p ())
9672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9673 "floating point nonlinear induction vectorization"
9674 " not supported.\n");
9675 return false;
9678 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9679 init_expr = vect_phi_initial_value (phi);
9680 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9681 && TREE_CODE (step_expr) == INTEGER_CST);
9682 /* step_expr should be aligned with init_expr,
9683 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
9684 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9686 if (TREE_CODE (init_expr) == INTEGER_CST)
9687 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9688 else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
9690 /* INIT_EXPR could be a bit_field, bail out for such case. */
9691 if (dump_enabled_p ())
9692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9693 "nonlinear induction vectorization failed:"
9694 " component type of vectype is not a nop conversion"
9695 " from type of init_expr.\n");
9696 return false;
9699 switch (induction_type)
9701 case vect_step_op_neg:
9702 if (TREE_CODE (init_expr) != INTEGER_CST
9703 && TREE_CODE (init_expr) != REAL_CST)
9705 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9706 if (!directly_supported_p (NEGATE_EXPR, vectype))
9707 return false;
9709 /* The encoding has 2 interleaved stepped patterns. */
9710 vec_perm_builder sel (nunits, 2, 3);
9711 machine_mode mode = TYPE_MODE (vectype);
9712 sel.quick_grow (6);
9713 for (i = 0; i < 3; i++)
9715 sel[i * 2] = i;
9716 sel[i * 2 + 1] = i + nunits;
9718 vec_perm_indices indices (sel, 2, nunits);
9719 if (!can_vec_perm_const_p (mode, mode, indices))
9720 return false;
9722 break;
9724 case vect_step_op_mul:
9726 /* Check for backend support of MULT_EXPR. */
9727 if (!directly_supported_p (MULT_EXPR, vectype))
9728 return false;
9730 /* ?? How to construct vector step for variable number vector.
9731 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9732 if (!vf.is_constant ())
9733 return false;
9735 break;
9737 case vect_step_op_shr:
9738 /* Check for backend support of RSHIFT_EXPR. */
9739 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9740 return false;
9742 /* Don't shift more than type precision to avoid UD. */
9743 if (!tree_fits_uhwi_p (step_expr)
9744 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9745 TYPE_PRECISION (TREE_TYPE (init_expr))))
9746 return false;
9747 break;
9749 case vect_step_op_shl:
9750 /* Check for backend support of RSHIFT_EXPR. */
9751 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9752 return false;
9754 /* Don't shift more than type precision to avoid UD. */
9755 if (!tree_fits_uhwi_p (step_expr)
9756 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9757 TYPE_PRECISION (TREE_TYPE (init_expr))))
9758 return false;
9760 break;
9762 default:
9763 gcc_unreachable ();
9766 if (!vec_stmt) /* transformation not required. */
9768 unsigned inside_cost = 0, prologue_cost = 0;
9769 /* loop cost for vec_loop. Neg induction doesn't have any
9770 inside_cost. */
9771 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9772 stmt_info, 0, vect_body);
9774 /* loop cost for vec_loop. Neg induction doesn't have any
9775 inside_cost. */
9776 if (induction_type == vect_step_op_neg)
9777 inside_cost = 0;
9779 /* prologue cost for vec_init and vec_step. */
9780 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9781 stmt_info, 0, vect_prologue);
9783 if (dump_enabled_p ())
9784 dump_printf_loc (MSG_NOTE, vect_location,
9785 "vect_model_induction_cost: inside_cost = %d, "
9786 "prologue_cost = %d. \n", inside_cost,
9787 prologue_cost);
9789 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9790 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9791 return true;
9794 /* Transform. */
9796 /* Compute a vector variable, initialized with the first VF values of
9797 the induction variable. E.g., for an iv with IV_PHI='X' and
9798 evolution S, for a vector of 4 units, we want to compute:
9799 [X, X + S, X + 2*S, X + 3*S]. */
9801 if (dump_enabled_p ())
9802 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9804 pe = loop_preheader_edge (iv_loop);
9805 /* Find the first insertion point in the BB. */
9806 basic_block bb = gimple_bb (phi);
9807 si = gsi_after_labels (bb);
9809 gimple_seq stmts = NULL;
9811 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9812 /* If we are using the loop mask to "peel" for alignment then we need
9813 to adjust the start value here. */
9814 if (niters_skip != NULL_TREE)
9815 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9816 step_expr, induction_type);
9818 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9819 step_expr, nunits, vectype,
9820 induction_type);
9821 if (stmts)
9823 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9824 gcc_assert (!new_bb);
9827 stmts = NULL;
9828 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9829 vf, induction_type);
9830 if (stmts)
9832 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9833 gcc_assert (!new_bb);
9836 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9837 new_name, vectype,
9838 induction_type);
9839 /* Create the following def-use cycle:
9840 loop prolog:
9841 vec_init = ...
9842 vec_step = ...
9843 loop:
9844 vec_iv = PHI <vec_init, vec_loop>
9846 STMT
9848 vec_loop = vec_iv + vec_step; */
9850 /* Create the induction-phi that defines the induction-operand. */
9851 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9852 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9853 induc_def = PHI_RESULT (induction_phi);
9855 /* Create the iv update inside the loop. */
9856 stmts = NULL;
9857 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9858 induc_def, vec_step,
9859 induction_type);
9861 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9862 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9864 /* Set the arguments of the phi node: */
9865 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9866 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9867 UNKNOWN_LOCATION);
9869 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9870 *vec_stmt = induction_phi;
9872 /* In case that vectorization factor (VF) is bigger than the number
9873 of elements that we can fit in a vectype (nunits), we have to generate
9874 more than one vector stmt - i.e - we need to "unroll" the
9875 vector stmt by a factor VF/nunits. For more details see documentation
9876 in vectorizable_operation. */
9878 if (ncopies > 1)
9880 stmts = NULL;
9881 /* FORNOW. This restriction should be relaxed. */
9882 gcc_assert (!nested_in_vect_loop);
9884 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9885 nunits, induction_type);
9887 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9888 new_name, vectype,
9889 induction_type);
9890 vec_def = induc_def;
9891 for (i = 1; i < ncopies; i++)
9893 /* vec_i = vec_prev + vec_step. */
9894 stmts = NULL;
9895 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9896 vec_def, vec_step,
9897 induction_type);
9898 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9899 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9900 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9904 if (dump_enabled_p ())
9905 dump_printf_loc (MSG_NOTE, vect_location,
9906 "transform induction: created def-use cycle: %G%G",
9907 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9909 return true;
9912 /* Function vectorizable_induction
9914 Check if STMT_INFO performs an induction computation that can be vectorized.
9915 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9916 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9917 Return true if STMT_INFO is vectorizable in this way. */
9919 bool
9920 vectorizable_induction (loop_vec_info loop_vinfo,
9921 stmt_vec_info stmt_info,
9922 gimple **vec_stmt, slp_tree slp_node,
9923 stmt_vector_for_cost *cost_vec)
9925 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9926 unsigned ncopies;
9927 bool nested_in_vect_loop = false;
9928 class loop *iv_loop;
9929 tree vec_def;
9930 edge pe = loop_preheader_edge (loop);
9931 basic_block new_bb;
9932 tree new_vec, vec_init, vec_step, t;
9933 tree new_name;
9934 gimple *new_stmt;
9935 gphi *induction_phi;
9936 tree induc_def, vec_dest;
9937 tree init_expr, step_expr;
9938 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9939 unsigned i;
9940 tree expr;
9941 gimple_stmt_iterator si;
9942 enum vect_induction_op_type induction_type
9943 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9945 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9946 if (!phi)
9947 return false;
9949 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9950 return false;
9952 /* Make sure it was recognized as induction computation. */
9953 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9954 return false;
9956 /* Handle nonlinear induction in a separate place. */
9957 if (induction_type != vect_step_op_add)
9958 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9959 vec_stmt, slp_node, cost_vec);
9961 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9962 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9964 if (slp_node)
9965 ncopies = 1;
9966 else
9967 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9968 gcc_assert (ncopies >= 1);
9970 /* FORNOW. These restrictions should be relaxed. */
9971 if (nested_in_vect_loop_p (loop, stmt_info))
9973 imm_use_iterator imm_iter;
9974 use_operand_p use_p;
9975 gimple *exit_phi;
9976 edge latch_e;
9977 tree loop_arg;
9979 if (ncopies > 1)
9981 if (dump_enabled_p ())
9982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9983 "multiple types in nested loop.\n");
9984 return false;
9987 exit_phi = NULL;
9988 latch_e = loop_latch_edge (loop->inner);
9989 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9990 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9992 gimple *use_stmt = USE_STMT (use_p);
9993 if (is_gimple_debug (use_stmt))
9994 continue;
9996 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9998 exit_phi = use_stmt;
9999 break;
10002 if (exit_phi)
10004 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10005 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10006 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10008 if (dump_enabled_p ())
10009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10010 "inner-loop induction only used outside "
10011 "of the outer vectorized loop.\n");
10012 return false;
10016 nested_in_vect_loop = true;
10017 iv_loop = loop->inner;
10019 else
10020 iv_loop = loop;
10021 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10023 if (slp_node && !nunits.is_constant ())
10025 /* The current SLP code creates the step value element-by-element. */
10026 if (dump_enabled_p ())
10027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10028 "SLP induction not supported for variable-length"
10029 " vectors.\n");
10030 return false;
10033 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10035 if (dump_enabled_p ())
10036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10037 "floating point induction vectorization disabled\n");
10038 return false;
10041 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10042 gcc_assert (step_expr != NULL_TREE);
10043 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10044 && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10046 if (dump_enabled_p ())
10047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10048 "bit-precision induction vectorization not "
10049 "supported.\n");
10050 return false;
10052 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10054 /* Check for backend support of PLUS/MINUS_EXPR. */
10055 if (!directly_supported_p (PLUS_EXPR, step_vectype)
10056 || !directly_supported_p (MINUS_EXPR, step_vectype))
10057 return false;
10059 if (!vec_stmt) /* transformation not required. */
10061 unsigned inside_cost = 0, prologue_cost = 0;
10062 if (slp_node)
10064 /* We eventually need to set a vector type on invariant
10065 arguments. */
10066 unsigned j;
10067 slp_tree child;
10068 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10069 if (!vect_maybe_update_slp_op_vectype
10070 (child, SLP_TREE_VECTYPE (slp_node)))
10072 if (dump_enabled_p ())
10073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10074 "incompatible vector types for "
10075 "invariants\n");
10076 return false;
10078 /* loop cost for vec_loop. */
10079 inside_cost
10080 = record_stmt_cost (cost_vec,
10081 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10082 vector_stmt, stmt_info, 0, vect_body);
10083 /* prologue cost for vec_init (if not nested) and step. */
10084 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10085 scalar_to_vec,
10086 stmt_info, 0, vect_prologue);
10088 else /* if (!slp_node) */
10090 /* loop cost for vec_loop. */
10091 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10092 stmt_info, 0, vect_body);
10093 /* prologue cost for vec_init and vec_step. */
10094 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10095 stmt_info, 0, vect_prologue);
10097 if (dump_enabled_p ())
10098 dump_printf_loc (MSG_NOTE, vect_location,
10099 "vect_model_induction_cost: inside_cost = %d, "
10100 "prologue_cost = %d .\n", inside_cost,
10101 prologue_cost);
10103 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10104 DUMP_VECT_SCOPE ("vectorizable_induction");
10105 return true;
10108 /* Transform. */
10110 /* Compute a vector variable, initialized with the first VF values of
10111 the induction variable. E.g., for an iv with IV_PHI='X' and
10112 evolution S, for a vector of 4 units, we want to compute:
10113 [X, X + S, X + 2*S, X + 3*S]. */
10115 if (dump_enabled_p ())
10116 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10118 pe = loop_preheader_edge (iv_loop);
10119 /* Find the first insertion point in the BB. */
10120 basic_block bb = gimple_bb (phi);
10121 si = gsi_after_labels (bb);
10123 /* For SLP induction we have to generate several IVs as for example
10124 with group size 3 we need
10125 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10126 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10127 if (slp_node)
10129 /* Enforced above. */
10130 unsigned int const_nunits = nunits.to_constant ();
10132 /* The initial values are vectorized, but any lanes > group_size
10133 need adjustment. */
10134 slp_tree init_node
10135 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10137 /* Gather steps. Since we do not vectorize inductions as
10138 cycles we have to reconstruct the step from SCEV data. */
10139 unsigned group_size = SLP_TREE_LANES (slp_node);
10140 tree *steps = XALLOCAVEC (tree, group_size);
10141 tree *inits = XALLOCAVEC (tree, group_size);
10142 stmt_vec_info phi_info;
10143 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10145 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10146 if (!init_node)
10147 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10148 pe->dest_idx);
10151 /* Now generate the IVs. */
10152 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10153 gcc_assert ((const_nunits * nvects) % group_size == 0);
10154 unsigned nivs;
10155 if (nested_in_vect_loop)
10156 nivs = nvects;
10157 else
10159 /* Compute the number of distinct IVs we need. First reduce
10160 group_size if it is a multiple of const_nunits so we get
10161 one IV for a group_size of 4 but const_nunits 2. */
10162 unsigned group_sizep = group_size;
10163 if (group_sizep % const_nunits == 0)
10164 group_sizep = group_sizep / const_nunits;
10165 nivs = least_common_multiple (group_sizep,
10166 const_nunits) / const_nunits;
10168 tree stept = TREE_TYPE (step_vectype);
10169 tree lupdate_mul = NULL_TREE;
10170 if (!nested_in_vect_loop)
10172 /* The number of iterations covered in one vector iteration. */
10173 unsigned lup_mul = (nvects * const_nunits) / group_size;
10174 lupdate_mul
10175 = build_vector_from_val (step_vectype,
10176 SCALAR_FLOAT_TYPE_P (stept)
10177 ? build_real_from_wide (stept, lup_mul,
10178 UNSIGNED)
10179 : build_int_cstu (stept, lup_mul));
10181 tree peel_mul = NULL_TREE;
10182 gimple_seq init_stmts = NULL;
10183 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10185 if (SCALAR_FLOAT_TYPE_P (stept))
10186 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10187 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10188 else
10189 peel_mul = gimple_convert (&init_stmts, stept,
10190 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10191 peel_mul = gimple_build_vector_from_val (&init_stmts,
10192 step_vectype, peel_mul);
10194 unsigned ivn;
10195 auto_vec<tree> vec_steps;
10196 for (ivn = 0; ivn < nivs; ++ivn)
10198 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10199 tree_vector_builder init_elts (vectype, const_nunits, 1);
10200 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10201 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10203 /* The scalar steps of the IVs. */
10204 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10205 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10206 step_elts.quick_push (elt);
10207 if (!init_node)
10209 /* The scalar inits of the IVs if not vectorized. */
10210 elt = inits[(ivn*const_nunits + eltn) % group_size];
10211 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10212 TREE_TYPE (elt)))
10213 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10214 TREE_TYPE (vectype), elt);
10215 init_elts.quick_push (elt);
10217 /* The number of steps to add to the initial values. */
10218 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10219 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10220 ? build_real_from_wide (stept,
10221 mul_elt, UNSIGNED)
10222 : build_int_cstu (stept, mul_elt));
10224 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10225 vec_steps.safe_push (vec_step);
10226 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10227 if (peel_mul)
10228 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10229 step_mul, peel_mul);
10230 if (!init_node)
10231 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10233 /* Create the induction-phi that defines the induction-operand. */
10234 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10235 "vec_iv_");
10236 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10237 induc_def = PHI_RESULT (induction_phi);
10239 /* Create the iv update inside the loop */
10240 tree up = vec_step;
10241 if (lupdate_mul)
10242 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10243 vec_step, lupdate_mul);
10244 gimple_seq stmts = NULL;
10245 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10246 vec_def = gimple_build (&stmts,
10247 PLUS_EXPR, step_vectype, vec_def, up);
10248 vec_def = gimple_convert (&stmts, vectype, vec_def);
10249 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10250 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10251 UNKNOWN_LOCATION);
10253 if (init_node)
10254 vec_init = vect_get_slp_vect_def (init_node, ivn);
10255 if (!nested_in_vect_loop
10256 && !integer_zerop (step_mul))
10258 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10259 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10260 vec_step, step_mul);
10261 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10262 vec_def, up);
10263 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10266 /* Set the arguments of the phi node: */
10267 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10269 slp_node->push_vec_def (induction_phi);
10271 if (!nested_in_vect_loop)
10273 /* Fill up to the number of vectors we need for the whole group. */
10274 nivs = least_common_multiple (group_size,
10275 const_nunits) / const_nunits;
10276 vec_steps.reserve (nivs-ivn);
10277 for (; ivn < nivs; ++ivn)
10279 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10280 vec_steps.quick_push (vec_steps[0]);
10284 /* Re-use IVs when we can. We are generating further vector
10285 stmts by adding VF' * stride to the IVs generated above. */
10286 if (ivn < nvects)
10288 unsigned vfp
10289 = least_common_multiple (group_size, const_nunits) / group_size;
10290 tree lupdate_mul
10291 = build_vector_from_val (step_vectype,
10292 SCALAR_FLOAT_TYPE_P (stept)
10293 ? build_real_from_wide (stept,
10294 vfp, UNSIGNED)
10295 : build_int_cstu (stept, vfp));
10296 for (; ivn < nvects; ++ivn)
10298 gimple *iv
10299 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10300 tree def = gimple_get_lhs (iv);
10301 if (ivn < 2*nivs)
10302 vec_steps[ivn - nivs]
10303 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10304 vec_steps[ivn - nivs], lupdate_mul);
10305 gimple_seq stmts = NULL;
10306 def = gimple_convert (&stmts, step_vectype, def);
10307 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10308 def, vec_steps[ivn % nivs]);
10309 def = gimple_convert (&stmts, vectype, def);
10310 if (gimple_code (iv) == GIMPLE_PHI)
10311 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10312 else
10314 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10315 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10317 slp_node->push_vec_def (def);
10321 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10322 gcc_assert (!new_bb);
10324 return true;
10327 init_expr = vect_phi_initial_value (phi);
10329 gimple_seq stmts = NULL;
10330 if (!nested_in_vect_loop)
10332 /* Convert the initial value to the IV update type. */
10333 tree new_type = TREE_TYPE (step_expr);
10334 init_expr = gimple_convert (&stmts, new_type, init_expr);
10336 /* If we are using the loop mask to "peel" for alignment then we need
10337 to adjust the start value here. */
10338 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10339 if (skip_niters != NULL_TREE)
10341 if (FLOAT_TYPE_P (vectype))
10342 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10343 skip_niters);
10344 else
10345 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10346 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10347 skip_niters, step_expr);
10348 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10349 init_expr, skip_step);
10353 if (stmts)
10355 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10356 gcc_assert (!new_bb);
10359 /* Create the vector that holds the initial_value of the induction. */
10360 if (nested_in_vect_loop)
10362 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10363 been created during vectorization of previous stmts. We obtain it
10364 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10365 auto_vec<tree> vec_inits;
10366 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10367 init_expr, &vec_inits);
10368 vec_init = vec_inits[0];
10369 /* If the initial value is not of proper type, convert it. */
10370 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10372 new_stmt
10373 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10374 vect_simple_var,
10375 "vec_iv_"),
10376 VIEW_CONVERT_EXPR,
10377 build1 (VIEW_CONVERT_EXPR, vectype,
10378 vec_init));
10379 vec_init = gimple_assign_lhs (new_stmt);
10380 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10381 new_stmt);
10382 gcc_assert (!new_bb);
10385 else
10387 /* iv_loop is the loop to be vectorized. Create:
10388 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10389 stmts = NULL;
10390 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10392 unsigned HOST_WIDE_INT const_nunits;
10393 if (nunits.is_constant (&const_nunits))
10395 tree_vector_builder elts (step_vectype, const_nunits, 1);
10396 elts.quick_push (new_name);
10397 for (i = 1; i < const_nunits; i++)
10399 /* Create: new_name_i = new_name + step_expr */
10400 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10401 new_name, step_expr);
10402 elts.quick_push (new_name);
10404 /* Create a vector from [new_name_0, new_name_1, ...,
10405 new_name_nunits-1] */
10406 vec_init = gimple_build_vector (&stmts, &elts);
10408 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10409 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10410 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10411 new_name, step_expr);
10412 else
10414 /* Build:
10415 [base, base, base, ...]
10416 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10417 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10418 gcc_assert (flag_associative_math);
10419 tree index = build_index_vector (step_vectype, 0, 1);
10420 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10421 new_name);
10422 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10423 step_expr);
10424 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10425 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10426 vec_init, step_vec);
10427 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10428 vec_init, base_vec);
10430 vec_init = gimple_convert (&stmts, vectype, vec_init);
10432 if (stmts)
10434 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10435 gcc_assert (!new_bb);
10440 /* Create the vector that holds the step of the induction. */
10441 gimple_stmt_iterator *step_iv_si = NULL;
10442 if (nested_in_vect_loop)
10443 /* iv_loop is nested in the loop to be vectorized. Generate:
10444 vec_step = [S, S, S, S] */
10445 new_name = step_expr;
10446 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10448 /* When we're using loop_len produced by SELEC_VL, the non-final
10449 iterations are not always processing VF elements. So vectorize
10450 induction variable instead of
10452 _21 = vect_vec_iv_.6_22 + { VF, ... };
10454 We should generate:
10456 _35 = .SELECT_VL (ivtmp_33, VF);
10457 vect_cst__22 = [vec_duplicate_expr] _35;
10458 _21 = vect_vec_iv_.6_22 + vect_cst__22; */
10459 gcc_assert (!slp_node);
10460 gimple_seq seq = NULL;
10461 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10462 tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10463 expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10464 unshare_expr (len)),
10465 &seq, true, NULL_TREE);
10466 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10467 step_expr);
10468 gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10469 step_iv_si = &si;
10471 else
10473 /* iv_loop is the loop to be vectorized. Generate:
10474 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10475 gimple_seq seq = NULL;
10476 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10478 expr = build_int_cst (integer_type_node, vf);
10479 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10481 else
10482 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10483 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10484 expr, step_expr);
10485 if (seq)
10487 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10488 gcc_assert (!new_bb);
10492 t = unshare_expr (new_name);
10493 gcc_assert (CONSTANT_CLASS_P (new_name)
10494 || TREE_CODE (new_name) == SSA_NAME);
10495 new_vec = build_vector_from_val (step_vectype, t);
10496 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10497 new_vec, step_vectype, step_iv_si);
10500 /* Create the following def-use cycle:
10501 loop prolog:
10502 vec_init = ...
10503 vec_step = ...
10504 loop:
10505 vec_iv = PHI <vec_init, vec_loop>
10507 STMT
10509 vec_loop = vec_iv + vec_step; */
10511 /* Create the induction-phi that defines the induction-operand. */
10512 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10513 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10514 induc_def = PHI_RESULT (induction_phi);
10516 /* Create the iv update inside the loop */
10517 stmts = NULL;
10518 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10519 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10520 vec_def = gimple_convert (&stmts, vectype, vec_def);
10521 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10522 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10524 /* Set the arguments of the phi node: */
10525 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10526 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10527 UNKNOWN_LOCATION);
10529 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10530 *vec_stmt = induction_phi;
10532 /* In case that vectorization factor (VF) is bigger than the number
10533 of elements that we can fit in a vectype (nunits), we have to generate
10534 more than one vector stmt - i.e - we need to "unroll" the
10535 vector stmt by a factor VF/nunits. For more details see documentation
10536 in vectorizable_operation. */
10538 if (ncopies > 1)
10540 gimple_seq seq = NULL;
10541 /* FORNOW. This restriction should be relaxed. */
10542 gcc_assert (!nested_in_vect_loop);
10543 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
10544 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10546 /* Create the vector that holds the step of the induction. */
10547 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10549 expr = build_int_cst (integer_type_node, nunits);
10550 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10552 else
10553 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10554 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10555 expr, step_expr);
10556 if (seq)
10558 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10559 gcc_assert (!new_bb);
10562 t = unshare_expr (new_name);
10563 gcc_assert (CONSTANT_CLASS_P (new_name)
10564 || TREE_CODE (new_name) == SSA_NAME);
10565 new_vec = build_vector_from_val (step_vectype, t);
10566 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10567 new_vec, step_vectype, NULL);
10569 vec_def = induc_def;
10570 for (i = 1; i < ncopies + 1; i++)
10572 /* vec_i = vec_prev + vec_step */
10573 gimple_seq stmts = NULL;
10574 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10575 vec_def = gimple_build (&stmts,
10576 PLUS_EXPR, step_vectype, vec_def, vec_step);
10577 vec_def = gimple_convert (&stmts, vectype, vec_def);
10579 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10580 if (i < ncopies)
10582 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10583 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10585 else
10587 /* vec_1 = vec_iv + (VF/n * S)
10588 vec_2 = vec_1 + (VF/n * S)
10590 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10592 vec_n is used as vec_loop to save the large step register and
10593 related operations. */
10594 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10595 UNKNOWN_LOCATION);
10600 if (dump_enabled_p ())
10601 dump_printf_loc (MSG_NOTE, vect_location,
10602 "transform induction: created def-use cycle: %G%G",
10603 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10605 return true;
10608 /* Function vectorizable_live_operation_1.
10610 helper function for vectorizable_live_operation. */
10612 static tree
10613 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10614 stmt_vec_info stmt_info, basic_block exit_bb,
10615 tree vectype, int ncopies, slp_tree slp_node,
10616 tree bitsize, tree bitstart, tree vec_lhs,
10617 tree lhs_type, gimple_stmt_iterator *exit_gsi)
10619 gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10621 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10622 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10623 for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10624 SET_PHI_ARG_DEF (phi, i, vec_lhs);
10626 gimple_seq stmts = NULL;
10627 tree new_tree;
10629 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
10630 if (integer_zerop (bitstart))
10632 tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10633 vec_lhs_phi, bitsize, bitstart);
10635 /* Convert the extracted vector element to the scalar type. */
10636 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10638 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10640 /* Emit:
10642 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10644 where VEC_LHS is the vectorized live-out result and MASK is
10645 the loop mask for the final iteration. */
10646 gcc_assert (ncopies == 1 && !slp_node);
10647 gimple_seq tem = NULL;
10648 gimple_stmt_iterator gsi = gsi_last (tem);
10649 tree len = vect_get_loop_len (loop_vinfo, &gsi,
10650 &LOOP_VINFO_LENS (loop_vinfo),
10651 1, vectype, 0, 0);
10653 /* BIAS - 1. */
10654 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10655 tree bias_minus_one
10656 = int_const_binop (MINUS_EXPR,
10657 build_int_cst (TREE_TYPE (len), biasval),
10658 build_one_cst (TREE_TYPE (len)));
10660 /* LAST_INDEX = LEN + (BIAS - 1). */
10661 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10662 len, bias_minus_one);
10664 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10665 tree scalar_res
10666 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10667 vec_lhs_phi, last_index);
10669 /* Convert the extracted vector element to the scalar type. */
10670 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10672 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10674 /* Emit:
10676 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10678 where VEC_LHS is the vectorized live-out result and MASK is
10679 the loop mask for the final iteration. */
10680 gcc_assert (!slp_node);
10681 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10682 gimple_seq tem = NULL;
10683 gimple_stmt_iterator gsi = gsi_last (tem);
10684 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10685 &LOOP_VINFO_MASKS (loop_vinfo),
10686 1, vectype, 0);
10687 tree scalar_res;
10688 gimple_seq_add_seq (&stmts, tem);
10690 scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10691 mask, vec_lhs_phi);
10693 /* Convert the extracted vector element to the scalar type. */
10694 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10696 else
10698 tree bftype = TREE_TYPE (vectype);
10699 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10700 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10701 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
10702 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10703 &stmts, true, NULL_TREE);
10706 *exit_gsi = gsi_after_labels (exit_bb);
10707 if (stmts)
10708 gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
10710 return new_tree;
10713 /* Find the edge that's the final one in the path from SRC to DEST and
10714 return it. This edge must exist in at most one forwarder edge between. */
10716 static edge
10717 find_connected_edge (edge src, basic_block dest)
10719 if (src->dest == dest)
10720 return src;
10722 return find_edge (src->dest, dest);
10725 /* Function vectorizable_live_operation.
10727 STMT_INFO computes a value that is used outside the loop. Check if
10728 it can be supported. */
10730 bool
10731 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10732 slp_tree slp_node, slp_instance slp_node_instance,
10733 int slp_index, bool vec_stmt_p,
10734 stmt_vector_for_cost *cost_vec)
10736 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10737 imm_use_iterator imm_iter;
10738 tree lhs, lhs_type, bitsize;
10739 tree vectype = (slp_node
10740 ? SLP_TREE_VECTYPE (slp_node)
10741 : STMT_VINFO_VECTYPE (stmt_info));
10742 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10743 int ncopies;
10744 gimple *use_stmt;
10745 use_operand_p use_p;
10746 auto_vec<tree> vec_oprnds;
10747 int vec_entry = 0;
10748 poly_uint64 vec_index = 0;
10750 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10751 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10753 /* If a stmt of a reduction is live, vectorize it via
10754 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10755 validity so just trigger the transform here. */
10756 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10758 if (!vec_stmt_p)
10759 return true;
10760 if (slp_node)
10762 /* For reduction chains the meta-info is attached to
10763 the group leader. */
10764 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10765 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10766 /* For SLP reductions we vectorize the epilogue for
10767 all involved stmts together. */
10768 else if (slp_index != 0)
10769 return true;
10771 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10772 gcc_assert (reduc_info->is_reduc_info);
10773 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10774 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10775 return true;
10777 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10778 slp_node_instance,
10779 LOOP_VINFO_IV_EXIT (loop_vinfo));
10781 /* If early break we only have to materialize the reduction on the merge
10782 block, but we have to find an alternate exit first. */
10783 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10785 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10786 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10788 vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
10789 slp_node, slp_node_instance,
10790 exit);
10791 break;
10795 return true;
10798 /* If STMT is not relevant and it is a simple assignment and its inputs are
10799 invariant then it can remain in place, unvectorized. The original last
10800 scalar value that it computes will be used. */
10801 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10803 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10804 if (dump_enabled_p ())
10805 dump_printf_loc (MSG_NOTE, vect_location,
10806 "statement is simple and uses invariant. Leaving in "
10807 "place.\n");
10808 return true;
10811 if (slp_node)
10812 ncopies = 1;
10813 else
10814 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10816 if (slp_node)
10818 gcc_assert (slp_index >= 0);
10820 /* Get the last occurrence of the scalar index from the concatenation of
10821 all the slp vectors. Calculate which slp vector it is and the index
10822 within. */
10823 int num_scalar = SLP_TREE_LANES (slp_node);
10824 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10825 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10827 /* Calculate which vector contains the result, and which lane of
10828 that vector we need. */
10829 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10831 if (dump_enabled_p ())
10832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10833 "Cannot determine which vector holds the"
10834 " final result.\n");
10835 return false;
10839 if (!vec_stmt_p)
10841 /* No transformation required. */
10842 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10844 if (slp_node)
10846 if (dump_enabled_p ())
10847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10848 "can't operate on partial vectors "
10849 "because an SLP statement is live after "
10850 "the loop.\n");
10851 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10853 else if (ncopies > 1)
10855 if (dump_enabled_p ())
10856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10857 "can't operate on partial vectors "
10858 "because ncopies is greater than 1.\n");
10859 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10861 else
10863 gcc_assert (ncopies == 1 && !slp_node);
10864 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10865 OPTIMIZE_FOR_SPEED))
10866 vect_record_loop_mask (loop_vinfo,
10867 &LOOP_VINFO_MASKS (loop_vinfo),
10868 1, vectype, NULL);
10869 else if (can_vec_extract_var_idx_p (
10870 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10871 vect_record_loop_len (loop_vinfo,
10872 &LOOP_VINFO_LENS (loop_vinfo),
10873 1, vectype, 1);
10874 else
10876 if (dump_enabled_p ())
10877 dump_printf_loc (
10878 MSG_MISSED_OPTIMIZATION, vect_location,
10879 "can't operate on partial vectors "
10880 "because the target doesn't support extract "
10881 "last reduction.\n");
10882 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10886 /* ??? Enable for loop costing as well. */
10887 if (!loop_vinfo)
10888 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10889 0, vect_epilogue);
10890 return true;
10893 /* Use the lhs of the original scalar statement. */
10894 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10895 if (dump_enabled_p ())
10896 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10897 "stmt %G", stmt);
10899 lhs = gimple_get_lhs (stmt);
10900 lhs_type = TREE_TYPE (lhs);
10902 bitsize = vector_element_bits_tree (vectype);
10904 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10905 tree vec_lhs, vec_lhs0, bitstart;
10906 gimple *vec_stmt, *vec_stmt0;
10907 if (slp_node)
10909 gcc_assert (!loop_vinfo
10910 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10911 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10913 /* Get the correct slp vectorized stmt. */
10914 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10915 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10917 /* In case we need to early break vectorize also get the first stmt. */
10918 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10919 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10921 /* Get entry to use. */
10922 bitstart = bitsize_int (vec_index);
10923 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10925 else
10927 /* For multiple copies, get the last copy. */
10928 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10929 vec_lhs = gimple_get_lhs (vec_stmt);
10931 /* In case we need to early break vectorize also get the first stmt. */
10932 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10933 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10935 /* Get the last lane in the vector. */
10936 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10939 if (loop_vinfo)
10941 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10942 requirement, insert one phi node for it. It looks like:
10943 loop;
10945 # lhs' = PHI <lhs>
10947 loop;
10949 # vec_lhs' = PHI <vec_lhs>
10950 new_tree = lane_extract <vec_lhs', ...>;
10951 lhs' = new_tree; */
10953 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10954 /* Check if we have a loop where the chosen exit is not the main exit,
10955 in these cases for an early break we restart the iteration the vector code
10956 did. For the live values we want the value at the start of the iteration
10957 rather than at the end. */
10958 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10959 bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10960 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10961 if (!is_gimple_debug (use_stmt)
10962 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10963 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10965 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10966 phi_arg_index_from_use (use_p));
10967 bool main_exit_edge = e == main_e
10968 || find_connected_edge (main_e, e->src);
10970 /* Early exits have an merge block, we want the merge block itself
10971 so use ->src. For main exit the merge block is the
10972 destination. */
10973 basic_block dest = main_exit_edge ? main_e->dest : e->src;
10974 tree tmp_vec_lhs = vec_lhs;
10975 tree tmp_bitstart = bitstart;
10977 /* For early exit where the exit is not in the BB that leads
10978 to the latch then we're restarting the iteration in the
10979 scalar loop. So get the first live value. */
10980 restart_loop = restart_loop || !main_exit_edge;
10981 if (restart_loop
10982 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10984 tmp_vec_lhs = vec_lhs0;
10985 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10988 gimple_stmt_iterator exit_gsi;
10989 tree new_tree
10990 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10991 dest, vectype, ncopies,
10992 slp_node, bitsize,
10993 tmp_bitstart, tmp_vec_lhs,
10994 lhs_type, &exit_gsi);
10996 if (gimple_phi_num_args (use_stmt) == 1)
10998 auto gsi = gsi_for_stmt (use_stmt);
10999 remove_phi_node (&gsi, false);
11000 tree lhs_phi = gimple_phi_result (use_stmt);
11001 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11002 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11004 else
11005 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, new_tree);
11008 /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
11009 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11010 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11012 else
11014 /* For basic-block vectorization simply insert the lane-extraction. */
11015 tree bftype = TREE_TYPE (vectype);
11016 if (VECTOR_BOOLEAN_TYPE_P (vectype))
11017 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11018 tree new_tree = build3 (BIT_FIELD_REF, bftype,
11019 vec_lhs, bitsize, bitstart);
11020 gimple_seq stmts = NULL;
11021 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11022 &stmts, true, NULL_TREE);
11023 if (TREE_CODE (new_tree) == SSA_NAME
11024 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11025 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11026 if (is_a <gphi *> (vec_stmt))
11028 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11029 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11031 else
11033 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11034 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11037 /* Replace use of lhs with newly computed result. If the use stmt is a
11038 single arg PHI, just replace all uses of PHI result. It's necessary
11039 because lcssa PHI defining lhs may be before newly inserted stmt. */
11040 use_operand_p use_p;
11041 stmt_vec_info use_stmt_info;
11042 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11043 if (!is_gimple_debug (use_stmt)
11044 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11045 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11047 /* ??? This can happen when the live lane ends up being
11048 rooted in a vector construction code-generated by an
11049 external SLP node (and code-generation for that already
11050 happened). See gcc.dg/vect/bb-slp-47.c.
11051 Doing this is what would happen if that vector CTOR
11052 were not code-generated yet so it is not too bad.
11053 ??? In fact we'd likely want to avoid this situation
11054 in the first place. */
11055 if (TREE_CODE (new_tree) == SSA_NAME
11056 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11057 && gimple_code (use_stmt) != GIMPLE_PHI
11058 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11059 use_stmt))
11061 if (dump_enabled_p ())
11062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11063 "Using original scalar computation for "
11064 "live lane because use preceeds vector "
11065 "def\n");
11066 continue;
11068 /* ??? It can also happen that we end up pulling a def into
11069 a loop where replacing out-of-loop uses would require
11070 a new LC SSA PHI node. Retain the original scalar in
11071 those cases as well. PR98064. */
11072 if (TREE_CODE (new_tree) == SSA_NAME
11073 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11074 && (gimple_bb (use_stmt)->loop_father
11075 != gimple_bb (vec_stmt)->loop_father)
11076 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11077 gimple_bb (use_stmt)->loop_father))
11079 if (dump_enabled_p ())
11080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11081 "Using original scalar computation for "
11082 "live lane because there is an out-of-loop "
11083 "definition for it\n");
11084 continue;
11086 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11087 SET_USE (use_p, new_tree);
11088 update_stmt (use_stmt);
11092 return true;
11095 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11097 static void
11098 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11100 ssa_op_iter op_iter;
11101 imm_use_iterator imm_iter;
11102 def_operand_p def_p;
11103 gimple *ustmt;
11105 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11107 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11109 basic_block bb;
11111 if (!is_gimple_debug (ustmt))
11112 continue;
11114 bb = gimple_bb (ustmt);
11116 if (!flow_bb_inside_loop_p (loop, bb))
11118 if (gimple_debug_bind_p (ustmt))
11120 if (dump_enabled_p ())
11121 dump_printf_loc (MSG_NOTE, vect_location,
11122 "killing debug use\n");
11124 gimple_debug_bind_reset_value (ustmt);
11125 update_stmt (ustmt);
11127 else
11128 gcc_unreachable ();
11134 /* Given loop represented by LOOP_VINFO, return true if computation of
11135 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11136 otherwise. */
11138 static bool
11139 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11141 /* Constant case. */
11142 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11144 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11145 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11147 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11148 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11149 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11150 return true;
11153 widest_int max;
11154 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11155 /* Check the upper bound of loop niters. */
11156 if (get_max_loop_iterations (loop, &max))
11158 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11159 signop sgn = TYPE_SIGN (type);
11160 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11161 if (max < type_max)
11162 return true;
11164 return false;
11167 /* Return a mask type with half the number of elements as OLD_TYPE,
11168 given that it should have mode NEW_MODE. */
11170 tree
11171 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11173 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11174 return build_truth_vector_type_for_mode (nunits, new_mode);
11177 /* Return a mask type with twice as many elements as OLD_TYPE,
11178 given that it should have mode NEW_MODE. */
11180 tree
11181 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11183 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11184 return build_truth_vector_type_for_mode (nunits, new_mode);
11187 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11188 contain a sequence of NVECTORS masks that each control a vector of type
11189 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11190 these vector masks with the vector version of SCALAR_MASK. */
11192 void
11193 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11194 unsigned int nvectors, tree vectype, tree scalar_mask)
11196 gcc_assert (nvectors != 0);
11198 if (scalar_mask)
11200 scalar_cond_masked_key cond (scalar_mask, nvectors);
11201 loop_vinfo->scalar_cond_masked_set.add (cond);
11204 masks->mask_set.add (std::make_pair (vectype, nvectors));
11207 /* Given a complete set of masks MASKS, extract mask number INDEX
11208 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11209 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11211 See the comment above vec_loop_masks for more details about the mask
11212 arrangement. */
11214 tree
11215 vect_get_loop_mask (loop_vec_info loop_vinfo,
11216 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11217 unsigned int nvectors, tree vectype, unsigned int index)
11219 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11220 == vect_partial_vectors_while_ult)
11222 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11223 tree mask_type = rgm->type;
11225 /* Populate the rgroup's mask array, if this is the first time we've
11226 used it. */
11227 if (rgm->controls.is_empty ())
11229 rgm->controls.safe_grow_cleared (nvectors, true);
11230 for (unsigned int i = 0; i < nvectors; ++i)
11232 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11233 /* Provide a dummy definition until the real one is available. */
11234 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11235 rgm->controls[i] = mask;
11239 tree mask = rgm->controls[index];
11240 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11241 TYPE_VECTOR_SUBPARTS (vectype)))
11243 /* A loop mask for data type X can be reused for data type Y
11244 if X has N times more elements than Y and if Y's elements
11245 are N times bigger than X's. In this case each sequence
11246 of N elements in the loop mask will be all-zero or all-one.
11247 We can then view-convert the mask so that each sequence of
11248 N elements is replaced by a single element. */
11249 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11250 TYPE_VECTOR_SUBPARTS (vectype)));
11251 gimple_seq seq = NULL;
11252 mask_type = truth_type_for (vectype);
11253 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11254 if (seq)
11255 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11257 return mask;
11259 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11260 == vect_partial_vectors_avx512)
11262 /* The number of scalars per iteration and the number of vectors are
11263 both compile-time constants. */
11264 unsigned int nscalars_per_iter
11265 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11266 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11268 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11270 /* The stored nV is dependent on the mask type produced. */
11271 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11272 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11273 == rgm->factor);
11274 nvectors = rgm->factor;
11276 /* Populate the rgroup's mask array, if this is the first time we've
11277 used it. */
11278 if (rgm->controls.is_empty ())
11280 rgm->controls.safe_grow_cleared (nvectors, true);
11281 for (unsigned int i = 0; i < nvectors; ++i)
11283 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11284 /* Provide a dummy definition until the real one is available. */
11285 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11286 rgm->controls[i] = mask;
11289 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11290 TYPE_VECTOR_SUBPARTS (vectype)))
11291 return rgm->controls[index];
11293 /* Split the vector if needed. Since we are dealing with integer mode
11294 masks with AVX512 we can operate on the integer representation
11295 performing the whole vector shifting. */
11296 unsigned HOST_WIDE_INT factor;
11297 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11298 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11299 gcc_assert (ok);
11300 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11301 tree mask_type = truth_type_for (vectype);
11302 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11303 unsigned vi = index / factor;
11304 unsigned vpart = index % factor;
11305 tree vec = rgm->controls[vi];
11306 gimple_seq seq = NULL;
11307 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11308 lang_hooks.types.type_for_mode
11309 (TYPE_MODE (rgm->type), 1), vec);
11310 /* For integer mode masks simply shift the right bits into position. */
11311 if (vpart != 0)
11312 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11313 build_int_cst (integer_type_node,
11314 (TYPE_VECTOR_SUBPARTS (vectype)
11315 * vpart)));
11316 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11317 (TYPE_MODE (mask_type), 1), vec);
11318 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11319 if (seq)
11320 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11321 return vec;
11323 else
11324 gcc_unreachable ();
11327 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11328 lengths for controlling an operation on VECTYPE. The operation splits
11329 each element of VECTYPE into FACTOR separate subelements, measuring the
11330 length as a number of these subelements. */
11332 void
11333 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11334 unsigned int nvectors, tree vectype, unsigned int factor)
11336 gcc_assert (nvectors != 0);
11337 if (lens->length () < nvectors)
11338 lens->safe_grow_cleared (nvectors, true);
11339 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11341 /* The number of scalars per iteration, scalar occupied bytes and
11342 the number of vectors are both compile-time constants. */
11343 unsigned int nscalars_per_iter
11344 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11345 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11347 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11349 /* For now, we only support cases in which all loads and stores fall back
11350 to VnQI or none do. */
11351 gcc_assert (!rgl->max_nscalars_per_iter
11352 || (rgl->factor == 1 && factor == 1)
11353 || (rgl->max_nscalars_per_iter * rgl->factor
11354 == nscalars_per_iter * factor));
11355 rgl->max_nscalars_per_iter = nscalars_per_iter;
11356 rgl->type = vectype;
11357 rgl->factor = factor;
11361 /* Given a complete set of lengths LENS, extract length number INDEX
11362 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11363 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11364 multipled by the number of elements that should be processed.
11365 Insert any set-up statements before GSI. */
11367 tree
11368 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11369 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11370 unsigned int index, unsigned int factor)
11372 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11373 bool use_bias_adjusted_len =
11374 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11376 /* Populate the rgroup's len array, if this is the first time we've
11377 used it. */
11378 if (rgl->controls.is_empty ())
11380 rgl->controls.safe_grow_cleared (nvectors, true);
11381 for (unsigned int i = 0; i < nvectors; ++i)
11383 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11384 gcc_assert (len_type != NULL_TREE);
11386 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11388 /* Provide a dummy definition until the real one is available. */
11389 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11390 rgl->controls[i] = len;
11392 if (use_bias_adjusted_len)
11394 gcc_assert (i == 0);
11395 tree adjusted_len =
11396 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11397 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11398 rgl->bias_adjusted_ctrl = adjusted_len;
11403 if (use_bias_adjusted_len)
11404 return rgl->bias_adjusted_ctrl;
11406 tree loop_len = rgl->controls[index];
11407 if (rgl->factor == 1 && factor == 1)
11409 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11410 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11411 if (maybe_ne (nunits1, nunits2))
11413 /* A loop len for data type X can be reused for data type Y
11414 if X has N times more elements than Y and if Y's elements
11415 are N times bigger than X's. */
11416 gcc_assert (multiple_p (nunits1, nunits2));
11417 factor = exact_div (nunits1, nunits2).to_constant ();
11418 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11419 gimple_seq seq = NULL;
11420 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11421 build_int_cst (iv_type, factor));
11422 if (seq)
11423 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11426 return loop_len;
11429 /* Scale profiling counters by estimation for LOOP which is vectorized
11430 by factor VF.
11431 If FLAT is true, the loop we started with had unrealistically flat
11432 profile. */
11434 static void
11435 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11437 /* For flat profiles do not scale down proportionally by VF and only
11438 cap by known iteration count bounds. */
11439 if (flat)
11441 if (dump_file && (dump_flags & TDF_DETAILS))
11442 fprintf (dump_file,
11443 "Vectorized loop profile seems flat; not scaling iteration "
11444 "count down by the vectorization factor %i\n", vf);
11445 scale_loop_profile (loop, profile_probability::always (),
11446 get_likely_max_loop_iterations_int (loop));
11447 return;
11449 /* Loop body executes VF fewer times and exit increases VF times. */
11450 profile_count entry_count = loop_preheader_edge (loop)->count ();
11452 /* If we have unreliable loop profile avoid dropping entry
11453 count bellow header count. This can happen since loops
11454 has unrealistically low trip counts. */
11455 while (vf > 1
11456 && loop->header->count > entry_count
11457 && loop->header->count < entry_count * vf)
11459 if (dump_file && (dump_flags & TDF_DETAILS))
11460 fprintf (dump_file,
11461 "Vectorization factor %i seems too large for profile "
11462 "prevoiusly believed to be consistent; reducing.\n", vf);
11463 vf /= 2;
11466 if (entry_count.nonzero_p ())
11467 set_edge_probability_and_rescale_others
11468 (exit_e,
11469 entry_count.probability_in (loop->header->count / vf));
11470 /* Avoid producing very large exit probability when we do not have
11471 sensible profile. */
11472 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11473 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11474 loop->latch->count = single_pred_edge (loop->latch)->count ();
11476 scale_loop_profile (loop, profile_probability::always () / vf,
11477 get_likely_max_loop_iterations_int (loop));
11480 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11481 latch edge values originally defined by it. */
11483 static void
11484 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11485 stmt_vec_info def_stmt_info)
11487 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11488 if (!def || TREE_CODE (def) != SSA_NAME)
11489 return;
11490 stmt_vec_info phi_info;
11491 imm_use_iterator iter;
11492 use_operand_p use_p;
11493 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11495 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11496 if (!phi)
11497 continue;
11498 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11499 && (phi_info = loop_vinfo->lookup_stmt (phi))
11500 && STMT_VINFO_RELEVANT_P (phi_info)))
11501 continue;
11502 loop_p loop = gimple_bb (phi)->loop_father;
11503 edge e = loop_latch_edge (loop);
11504 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11505 continue;
11507 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11508 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11509 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11511 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11512 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11513 gcc_assert (phi_defs.length () == latch_defs.length ());
11514 for (unsigned i = 0; i < phi_defs.length (); ++i)
11515 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11516 gimple_get_lhs (latch_defs[i]), e,
11517 gimple_phi_arg_location (phi, e->dest_idx));
11519 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11521 /* For first order recurrences we have to update both uses of
11522 the latch definition, the one in the PHI node and the one
11523 in the generated VEC_PERM_EXPR. */
11524 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11525 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11526 gcc_assert (phi_defs.length () == latch_defs.length ());
11527 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11528 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11529 for (unsigned i = 0; i < phi_defs.length (); ++i)
11531 gassign *perm = as_a <gassign *> (phi_defs[i]);
11532 if (i > 0)
11533 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11534 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11535 update_stmt (perm);
11537 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11538 gimple_phi_arg_location (phi, e->dest_idx));
11543 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11544 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11545 stmt_vec_info. */
11547 static bool
11548 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11549 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11551 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11552 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11554 if (dump_enabled_p ())
11555 dump_printf_loc (MSG_NOTE, vect_location,
11556 "------>vectorizing statement: %G", stmt_info->stmt);
11558 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11559 vect_loop_kill_debug_uses (loop, stmt_info);
11561 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11562 && !STMT_VINFO_LIVE_P (stmt_info))
11564 if (is_gimple_call (stmt_info->stmt)
11565 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11567 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11568 *seen_store = stmt_info;
11569 return false;
11571 return false;
11574 if (STMT_VINFO_VECTYPE (stmt_info))
11576 poly_uint64 nunits
11577 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11578 if (!STMT_SLP_TYPE (stmt_info)
11579 && maybe_ne (nunits, vf)
11580 && dump_enabled_p ())
11581 /* For SLP VF is set according to unrolling factor, and not
11582 to vector size, hence for SLP this print is not valid. */
11583 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11586 /* Pure SLP statements have already been vectorized. We still need
11587 to apply loop vectorization to hybrid SLP statements. */
11588 if (PURE_SLP_STMT (stmt_info))
11589 return false;
11591 if (dump_enabled_p ())
11592 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11594 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11595 *seen_store = stmt_info;
11597 return true;
11600 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11601 in the hash_map with its corresponding values. */
11603 static tree
11604 find_in_mapping (tree t, void *context)
11606 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11608 tree *value = mapping->get (t);
11609 return value ? *value : t;
11612 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11613 original loop that has now been vectorized.
11615 The inits of the data_references need to be advanced with the number of
11616 iterations of the main loop. This has been computed in vect_do_peeling and
11617 is stored in parameter ADVANCE. We first restore the data_references
11618 initial offset with the values recored in ORIG_DRS_INIT.
11620 Since the loop_vec_info of this EPILOGUE was constructed for the original
11621 loop, its stmt_vec_infos all point to the original statements. These need
11622 to be updated to point to their corresponding copies as well as the SSA_NAMES
11623 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11625 The data_reference's connections also need to be updated. Their
11626 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11627 stmt_vec_infos, their statements need to point to their corresponding copy,
11628 if they are gather loads or scatter stores then their reference needs to be
11629 updated to point to its corresponding copy and finally we set
11630 'base_misaligned' to false as we have already peeled for alignment in the
11631 prologue of the main loop. */
11633 static void
11634 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11636 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11637 auto_vec<gimple *> stmt_worklist;
11638 hash_map<tree,tree> mapping;
11639 gimple *orig_stmt, *new_stmt;
11640 gimple_stmt_iterator epilogue_gsi;
11641 gphi_iterator epilogue_phi_gsi;
11642 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11643 basic_block *epilogue_bbs = get_loop_body (epilogue);
11644 unsigned i;
11646 free (LOOP_VINFO_BBS (epilogue_vinfo));
11647 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11649 /* Advance data_reference's with the number of iterations of the previous
11650 loop and its prologue. */
11651 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11654 /* The EPILOGUE loop is a copy of the original loop so they share the same
11655 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11656 point to the copied statements. We also create a mapping of all LHS' in
11657 the original loop and all the LHS' in the EPILOGUE and create worklists to
11658 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11659 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11661 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11662 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11664 new_stmt = epilogue_phi_gsi.phi ();
11666 gcc_assert (gimple_uid (new_stmt) > 0);
11667 stmt_vinfo
11668 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11670 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11671 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11673 mapping.put (gimple_phi_result (orig_stmt),
11674 gimple_phi_result (new_stmt));
11675 /* PHI nodes can not have patterns or related statements. */
11676 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11677 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11680 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11681 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11683 new_stmt = gsi_stmt (epilogue_gsi);
11684 if (is_gimple_debug (new_stmt))
11685 continue;
11687 gcc_assert (gimple_uid (new_stmt) > 0);
11688 stmt_vinfo
11689 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11691 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11692 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11694 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11695 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11697 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11699 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11700 for (gimple_stmt_iterator gsi = gsi_start (seq);
11701 !gsi_end_p (gsi); gsi_next (&gsi))
11702 stmt_worklist.safe_push (gsi_stmt (gsi));
11705 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11706 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11708 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11709 stmt_worklist.safe_push (stmt);
11710 /* Set BB such that the assert in
11711 'get_initial_def_for_reduction' is able to determine that
11712 the BB of the related stmt is inside this loop. */
11713 gimple_set_bb (stmt,
11714 gimple_bb (new_stmt));
11715 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11716 gcc_assert (related_vinfo == NULL
11717 || related_vinfo == stmt_vinfo);
11722 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11723 using the original main loop and thus need to be updated to refer to the
11724 cloned variables used in the epilogue. */
11725 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11727 gimple *stmt = stmt_worklist[i];
11728 tree *new_op;
11730 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11732 tree op = gimple_op (stmt, j);
11733 if ((new_op = mapping.get(op)))
11734 gimple_set_op (stmt, j, *new_op);
11735 else
11737 /* PR92429: The last argument of simplify_replace_tree disables
11738 folding when replacing arguments. This is required as
11739 otherwise you might end up with different statements than the
11740 ones analyzed in vect_loop_analyze, leading to different
11741 vectorization. */
11742 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11743 &find_in_mapping, &mapping, false);
11744 gimple_set_op (stmt, j, op);
11749 struct data_reference *dr;
11750 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11751 FOR_EACH_VEC_ELT (datarefs, i, dr)
11753 orig_stmt = DR_STMT (dr);
11754 gcc_assert (gimple_uid (orig_stmt) > 0);
11755 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11756 /* Data references for gather loads and scatter stores do not use the
11757 updated offset we set using ADVANCE. Instead we have to make sure the
11758 reference in the data references point to the corresponding copy of
11759 the original in the epilogue. Make sure to update both
11760 gather/scatters recognized by dataref analysis and also other
11761 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11762 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11763 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11764 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11766 DR_REF (dr)
11767 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11768 &find_in_mapping, &mapping);
11769 DR_BASE_ADDRESS (dr)
11770 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11771 &find_in_mapping, &mapping);
11773 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11774 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11775 /* The vector size of the epilogue is smaller than that of the main loop
11776 so the alignment is either the same or lower. This means the dr will
11777 thus by definition be aligned. */
11778 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11781 epilogue_vinfo->shared->datarefs_copy.release ();
11782 epilogue_vinfo->shared->save_datarefs ();
11785 /* When vectorizing early break statements instructions that happen before
11786 the early break in the current BB need to be moved to after the early
11787 break. This function deals with that and assumes that any validity
11788 checks has already been performed.
11790 While moving the instructions if it encounters a VUSE or VDEF it then
11791 corrects the VUSES as it moves the statements along. GDEST is the location
11792 in which to insert the new statements. */
11794 static void
11795 move_early_exit_stmts (loop_vec_info loop_vinfo)
11797 DUMP_VECT_SCOPE ("move_early_exit_stmts");
11799 if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
11800 return;
11802 /* Move all stmts that need moving. */
11803 basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
11804 gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
11806 for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
11808 /* Check to see if statement is still required for vect or has been
11809 elided. */
11810 auto stmt_info = loop_vinfo->lookup_stmt (stmt);
11811 if (!stmt_info)
11812 continue;
11814 if (dump_enabled_p ())
11815 dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
11817 gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
11818 gsi_move_before (&stmt_gsi, &dest_gsi);
11819 gsi_prev (&dest_gsi);
11822 /* Update all the stmts with their new reaching VUSES. */
11823 tree vuse
11824 = gimple_vuse (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).last ());
11825 for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
11827 if (dump_enabled_p ())
11828 dump_printf_loc (MSG_NOTE, vect_location,
11829 "updating vuse to %T for load %G", vuse, p);
11830 gimple_set_vuse (p, vuse);
11831 update_stmt (p);
11834 /* And update the LC PHIs on exits. */
11835 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11836 if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
11837 if (gphi *phi = get_virtual_phi (e->dest))
11838 SET_PHI_ARG_DEF_ON_EDGE (phi, e, vuse);
11841 /* Function vect_transform_loop.
11843 The analysis phase has determined that the loop is vectorizable.
11844 Vectorize the loop - created vectorized stmts to replace the scalar
11845 stmts in the loop, and update the loop exit condition.
11846 Returns scalar epilogue loop if any. */
11848 class loop *
11849 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11851 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11852 class loop *epilogue = NULL;
11853 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11854 int nbbs = loop->num_nodes;
11855 int i;
11856 tree niters_vector = NULL_TREE;
11857 tree step_vector = NULL_TREE;
11858 tree niters_vector_mult_vf = NULL_TREE;
11859 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11860 unsigned int lowest_vf = constant_lower_bound (vf);
11861 gimple *stmt;
11862 bool check_profitability = false;
11863 unsigned int th;
11864 bool flat = maybe_flat_loop_profile (loop);
11866 DUMP_VECT_SCOPE ("vec_transform_loop");
11868 loop_vinfo->shared->check_datarefs ();
11870 /* Use the more conservative vectorization threshold. If the number
11871 of iterations is constant assume the cost check has been performed
11872 by our caller. If the threshold makes all loops profitable that
11873 run at least the (estimated) vectorization factor number of times
11874 checking is pointless, too. */
11875 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11876 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11878 if (dump_enabled_p ())
11879 dump_printf_loc (MSG_NOTE, vect_location,
11880 "Profitability threshold is %d loop iterations.\n",
11881 th);
11882 check_profitability = true;
11885 /* Make sure there exists a single-predecessor exit bb. Do this before
11886 versioning. */
11887 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11888 if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11890 split_loop_exit_edge (e, true);
11891 if (dump_enabled_p ())
11892 dump_printf (MSG_NOTE, "split exit edge\n");
11895 /* Version the loop first, if required, so the profitability check
11896 comes first. */
11898 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11900 class loop *sloop
11901 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11902 sloop->force_vectorize = false;
11903 check_profitability = false;
11906 /* Make sure there exists a single-predecessor exit bb also on the
11907 scalar loop copy. Do this after versioning but before peeling
11908 so CFG structure is fine for both scalar and if-converted loop
11909 to make slpeel_duplicate_current_defs_from_edges face matched
11910 loop closed PHI nodes on the exit. */
11911 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11913 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11914 if (! single_pred_p (e->dest))
11916 split_loop_exit_edge (e, true);
11917 if (dump_enabled_p ())
11918 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11922 tree niters = vect_build_loop_niters (loop_vinfo);
11923 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11924 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11925 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11926 tree advance;
11927 drs_init_vec orig_drs_init;
11929 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11930 &step_vector, &niters_vector_mult_vf, th,
11931 check_profitability, niters_no_overflow,
11932 &advance);
11933 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11934 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11936 /* Ifcvt duplicates loop preheader, loop body and produces an basic
11937 block after loop exit. We need to scale all that. */
11938 basic_block preheader
11939 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11940 preheader->count
11941 = preheader->count.apply_probability
11942 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11943 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11944 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11945 LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
11948 if (niters_vector == NULL_TREE)
11950 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11951 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11952 && known_eq (lowest_vf, vf))
11954 niters_vector
11955 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11956 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11957 step_vector = build_one_cst (TREE_TYPE (niters));
11959 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11960 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11961 &step_vector, niters_no_overflow);
11962 else
11963 /* vect_do_peeling subtracted the number of peeled prologue
11964 iterations from LOOP_VINFO_NITERS. */
11965 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11966 &niters_vector, &step_vector,
11967 niters_no_overflow);
11970 /* 1) Make sure the loop header has exactly two entries
11971 2) Make sure we have a preheader basic block. */
11973 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11975 split_edge (loop_preheader_edge (loop));
11977 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11978 /* This will deal with any possible peeling. */
11979 vect_prepare_for_masked_peels (loop_vinfo);
11981 /* Handle any code motion that we need to for early-break vectorization after
11982 we've done peeling but just before we start vectorizing. */
11983 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11984 move_early_exit_stmts (loop_vinfo);
11986 /* Schedule the SLP instances first, then handle loop vectorization
11987 below. */
11988 if (!loop_vinfo->slp_instances.is_empty ())
11990 DUMP_VECT_SCOPE ("scheduling SLP instances");
11991 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11994 /* FORNOW: the vectorizer supports only loops which body consist
11995 of one basic block (header + empty latch). When the vectorizer will
11996 support more involved loop forms, the order by which the BBs are
11997 traversed need to be reconsidered. */
11999 for (i = 0; i < nbbs; i++)
12001 basic_block bb = bbs[i];
12002 stmt_vec_info stmt_info;
12004 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12005 gsi_next (&si))
12007 gphi *phi = si.phi ();
12008 if (dump_enabled_p ())
12009 dump_printf_loc (MSG_NOTE, vect_location,
12010 "------>vectorizing phi: %G", (gimple *) phi);
12011 stmt_info = loop_vinfo->lookup_stmt (phi);
12012 if (!stmt_info)
12013 continue;
12015 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12016 vect_loop_kill_debug_uses (loop, stmt_info);
12018 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12019 && !STMT_VINFO_LIVE_P (stmt_info))
12020 continue;
12022 if (STMT_VINFO_VECTYPE (stmt_info)
12023 && (maybe_ne
12024 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12025 && dump_enabled_p ())
12026 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12028 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12029 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12030 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12031 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12032 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12033 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12034 && ! PURE_SLP_STMT (stmt_info))
12036 if (dump_enabled_p ())
12037 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12038 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12042 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12043 gsi_next (&si))
12045 gphi *phi = si.phi ();
12046 stmt_info = loop_vinfo->lookup_stmt (phi);
12047 if (!stmt_info)
12048 continue;
12050 if (!STMT_VINFO_RELEVANT_P (stmt_info)
12051 && !STMT_VINFO_LIVE_P (stmt_info))
12052 continue;
12054 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12055 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12056 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12057 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12058 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12059 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12060 && ! PURE_SLP_STMT (stmt_info))
12061 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12064 for (gimple_stmt_iterator si = gsi_start_bb (bb);
12065 !gsi_end_p (si);)
12067 stmt = gsi_stmt (si);
12068 /* During vectorization remove existing clobber stmts. */
12069 if (gimple_clobber_p (stmt))
12071 unlink_stmt_vdef (stmt);
12072 gsi_remove (&si, true);
12073 release_defs (stmt);
12075 else
12077 /* Ignore vector stmts created in the outer loop. */
12078 stmt_info = loop_vinfo->lookup_stmt (stmt);
12080 /* vector stmts created in the outer-loop during vectorization of
12081 stmts in an inner-loop may not have a stmt_info, and do not
12082 need to be vectorized. */
12083 stmt_vec_info seen_store = NULL;
12084 if (stmt_info)
12086 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12088 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12089 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12090 !gsi_end_p (subsi); gsi_next (&subsi))
12092 stmt_vec_info pat_stmt_info
12093 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12094 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12095 &si, &seen_store);
12097 stmt_vec_info pat_stmt_info
12098 = STMT_VINFO_RELATED_STMT (stmt_info);
12099 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12100 &si, &seen_store))
12101 maybe_set_vectorized_backedge_value (loop_vinfo,
12102 pat_stmt_info);
12104 else
12106 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12107 &seen_store))
12108 maybe_set_vectorized_backedge_value (loop_vinfo,
12109 stmt_info);
12112 gsi_next (&si);
12113 if (seen_store)
12115 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12116 /* Interleaving. If IS_STORE is TRUE, the
12117 vectorization of the interleaving chain was
12118 completed - free all the stores in the chain. */
12119 vect_remove_stores (loop_vinfo,
12120 DR_GROUP_FIRST_ELEMENT (seen_store));
12121 else
12122 /* Free the attached stmt_vec_info and remove the stmt. */
12123 loop_vinfo->remove_stmt (stmt_info);
12128 /* Stub out scalar statements that must not survive vectorization.
12129 Doing this here helps with grouped statements, or statements that
12130 are involved in patterns. */
12131 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12132 !gsi_end_p (gsi); gsi_next (&gsi))
12134 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12135 if (!call || !gimple_call_internal_p (call))
12136 continue;
12137 internal_fn ifn = gimple_call_internal_fn (call);
12138 if (ifn == IFN_MASK_LOAD)
12140 tree lhs = gimple_get_lhs (call);
12141 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12143 tree zero = build_zero_cst (TREE_TYPE (lhs));
12144 gimple *new_stmt = gimple_build_assign (lhs, zero);
12145 gsi_replace (&gsi, new_stmt, true);
12148 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12150 tree lhs = gimple_get_lhs (call);
12151 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12153 tree else_arg
12154 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12155 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12156 gsi_replace (&gsi, new_stmt, true);
12160 } /* BBs in loop */
12162 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12163 a zero NITERS becomes a nonzero NITERS_VECTOR. */
12164 if (integer_onep (step_vector))
12165 niters_no_overflow = true;
12166 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12167 niters_vector, step_vector, niters_vector_mult_vf,
12168 !niters_no_overflow);
12170 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12172 /* True if the final iteration might not handle a full vector's
12173 worth of scalar iterations. */
12174 bool final_iter_may_be_partial
12175 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
12176 /* The minimum number of iterations performed by the epilogue. This
12177 is 1 when peeling for gaps because we always need a final scalar
12178 iteration. */
12179 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12180 /* +1 to convert latch counts to loop iteration counts,
12181 -min_epilogue_iters to remove iterations that cannot be performed
12182 by the vector code. */
12183 int bias_for_lowest = 1 - min_epilogue_iters;
12184 int bias_for_assumed = bias_for_lowest;
12185 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12186 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12188 /* When the amount of peeling is known at compile time, the first
12189 iteration will have exactly alignment_npeels active elements.
12190 In the worst case it will have at least one. */
12191 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12192 bias_for_lowest += lowest_vf - min_first_active;
12193 bias_for_assumed += assumed_vf - min_first_active;
12195 /* In these calculations the "- 1" converts loop iteration counts
12196 back to latch counts. */
12197 if (loop->any_upper_bound)
12199 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12200 loop->nb_iterations_upper_bound
12201 = (final_iter_may_be_partial
12202 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12203 lowest_vf) - 1
12204 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12205 lowest_vf) - 1);
12206 if (main_vinfo
12207 /* Both peeling for alignment and peeling for gaps can end up
12208 with the scalar epilogue running for more than VF-1 iterations. */
12209 && !main_vinfo->peeling_for_alignment
12210 && !main_vinfo->peeling_for_gaps)
12212 unsigned int bound;
12213 poly_uint64 main_iters
12214 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12215 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12216 main_iters
12217 = upper_bound (main_iters,
12218 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12219 if (can_div_away_from_zero_p (main_iters,
12220 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12221 &bound))
12222 loop->nb_iterations_upper_bound
12223 = wi::umin ((bound_wide_int) (bound - 1),
12224 loop->nb_iterations_upper_bound);
12227 if (loop->any_likely_upper_bound)
12228 loop->nb_iterations_likely_upper_bound
12229 = (final_iter_may_be_partial
12230 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12231 + bias_for_lowest, lowest_vf) - 1
12232 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12233 + bias_for_lowest, lowest_vf) - 1);
12234 if (loop->any_estimate)
12235 loop->nb_iterations_estimate
12236 = (final_iter_may_be_partial
12237 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12238 assumed_vf) - 1
12239 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12240 assumed_vf) - 1);
12241 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12242 assumed_vf, flat);
12244 if (dump_enabled_p ())
12246 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12248 dump_printf_loc (MSG_NOTE, vect_location,
12249 "LOOP VECTORIZED\n");
12250 if (loop->inner)
12251 dump_printf_loc (MSG_NOTE, vect_location,
12252 "OUTER LOOP VECTORIZED\n");
12253 dump_printf (MSG_NOTE, "\n");
12255 else
12256 dump_printf_loc (MSG_NOTE, vect_location,
12257 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12258 GET_MODE_NAME (loop_vinfo->vector_mode));
12261 /* Loops vectorized with a variable factor won't benefit from
12262 unrolling/peeling. */
12263 if (!vf.is_constant ())
12265 loop->unroll = 1;
12266 if (dump_enabled_p ())
12267 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12268 " variable-length vectorization factor\n");
12270 /* Free SLP instances here because otherwise stmt reference counting
12271 won't work. */
12272 slp_instance instance;
12273 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12274 vect_free_slp_instance (instance);
12275 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12276 /* Clear-up safelen field since its value is invalid after vectorization
12277 since vectorized loop can have loop-carried dependencies. */
12278 loop->safelen = 0;
12280 if (epilogue)
12282 update_epilogue_loop_vinfo (epilogue, advance);
12284 epilogue->simduid = loop->simduid;
12285 epilogue->force_vectorize = loop->force_vectorize;
12286 epilogue->dont_vectorize = false;
12289 return epilogue;
12292 /* The code below is trying to perform simple optimization - revert
12293 if-conversion for masked stores, i.e. if the mask of a store is zero
12294 do not perform it and all stored value producers also if possible.
12295 For example,
12296 for (i=0; i<n; i++)
12297 if (c[i])
12299 p1[i] += 1;
12300 p2[i] = p3[i] +2;
12302 this transformation will produce the following semi-hammock:
12304 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12306 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12307 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12308 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12309 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12310 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12311 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12315 void
12316 optimize_mask_stores (class loop *loop)
12318 basic_block *bbs = get_loop_body (loop);
12319 unsigned nbbs = loop->num_nodes;
12320 unsigned i;
12321 basic_block bb;
12322 class loop *bb_loop;
12323 gimple_stmt_iterator gsi;
12324 gimple *stmt;
12325 auto_vec<gimple *> worklist;
12326 auto_purge_vect_location sentinel;
12328 vect_location = find_loop_location (loop);
12329 /* Pick up all masked stores in loop if any. */
12330 for (i = 0; i < nbbs; i++)
12332 bb = bbs[i];
12333 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12334 gsi_next (&gsi))
12336 stmt = gsi_stmt (gsi);
12337 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12338 worklist.safe_push (stmt);
12342 free (bbs);
12343 if (worklist.is_empty ())
12344 return;
12346 /* Loop has masked stores. */
12347 while (!worklist.is_empty ())
12349 gimple *last, *last_store;
12350 edge e, efalse;
12351 tree mask;
12352 basic_block store_bb, join_bb;
12353 gimple_stmt_iterator gsi_to;
12354 tree vdef, new_vdef;
12355 gphi *phi;
12356 tree vectype;
12357 tree zero;
12359 last = worklist.pop ();
12360 mask = gimple_call_arg (last, 2);
12361 bb = gimple_bb (last);
12362 /* Create then_bb and if-then structure in CFG, then_bb belongs to
12363 the same loop as if_bb. It could be different to LOOP when two
12364 level loop-nest is vectorized and mask_store belongs to the inner
12365 one. */
12366 e = split_block (bb, last);
12367 bb_loop = bb->loop_father;
12368 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12369 join_bb = e->dest;
12370 store_bb = create_empty_bb (bb);
12371 add_bb_to_loop (store_bb, bb_loop);
12372 e->flags = EDGE_TRUE_VALUE;
12373 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12374 /* Put STORE_BB to likely part. */
12375 efalse->probability = profile_probability::likely ();
12376 e->probability = efalse->probability.invert ();
12377 store_bb->count = efalse->count ();
12378 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12379 if (dom_info_available_p (CDI_DOMINATORS))
12380 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12381 if (dump_enabled_p ())
12382 dump_printf_loc (MSG_NOTE, vect_location,
12383 "Create new block %d to sink mask stores.",
12384 store_bb->index);
12385 /* Create vector comparison with boolean result. */
12386 vectype = TREE_TYPE (mask);
12387 zero = build_zero_cst (vectype);
12388 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12389 gsi = gsi_last_bb (bb);
12390 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12391 /* Create new PHI node for vdef of the last masked store:
12392 .MEM_2 = VDEF <.MEM_1>
12393 will be converted to
12394 .MEM.3 = VDEF <.MEM_1>
12395 and new PHI node will be created in join bb
12396 .MEM_2 = PHI <.MEM_1, .MEM_3>
12398 vdef = gimple_vdef (last);
12399 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12400 gimple_set_vdef (last, new_vdef);
12401 phi = create_phi_node (vdef, join_bb);
12402 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12404 /* Put all masked stores with the same mask to STORE_BB if possible. */
12405 while (true)
12407 gimple_stmt_iterator gsi_from;
12408 gimple *stmt1 = NULL;
12410 /* Move masked store to STORE_BB. */
12411 last_store = last;
12412 gsi = gsi_for_stmt (last);
12413 gsi_from = gsi;
12414 /* Shift GSI to the previous stmt for further traversal. */
12415 gsi_prev (&gsi);
12416 gsi_to = gsi_start_bb (store_bb);
12417 gsi_move_before (&gsi_from, &gsi_to);
12418 /* Setup GSI_TO to the non-empty block start. */
12419 gsi_to = gsi_start_bb (store_bb);
12420 if (dump_enabled_p ())
12421 dump_printf_loc (MSG_NOTE, vect_location,
12422 "Move stmt to created bb\n%G", last);
12423 /* Move all stored value producers if possible. */
12424 while (!gsi_end_p (gsi))
12426 tree lhs;
12427 imm_use_iterator imm_iter;
12428 use_operand_p use_p;
12429 bool res;
12431 /* Skip debug statements. */
12432 if (is_gimple_debug (gsi_stmt (gsi)))
12434 gsi_prev (&gsi);
12435 continue;
12437 stmt1 = gsi_stmt (gsi);
12438 /* Do not consider statements writing to memory or having
12439 volatile operand. */
12440 if (gimple_vdef (stmt1)
12441 || gimple_has_volatile_ops (stmt1))
12442 break;
12443 gsi_from = gsi;
12444 gsi_prev (&gsi);
12445 lhs = gimple_get_lhs (stmt1);
12446 if (!lhs)
12447 break;
12449 /* LHS of vectorized stmt must be SSA_NAME. */
12450 if (TREE_CODE (lhs) != SSA_NAME)
12451 break;
12453 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12455 /* Remove dead scalar statement. */
12456 if (has_zero_uses (lhs))
12458 gsi_remove (&gsi_from, true);
12459 continue;
12463 /* Check that LHS does not have uses outside of STORE_BB. */
12464 res = true;
12465 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12467 gimple *use_stmt;
12468 use_stmt = USE_STMT (use_p);
12469 if (is_gimple_debug (use_stmt))
12470 continue;
12471 if (gimple_bb (use_stmt) != store_bb)
12473 res = false;
12474 break;
12477 if (!res)
12478 break;
12480 if (gimple_vuse (stmt1)
12481 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12482 break;
12484 /* Can move STMT1 to STORE_BB. */
12485 if (dump_enabled_p ())
12486 dump_printf_loc (MSG_NOTE, vect_location,
12487 "Move stmt to created bb\n%G", stmt1);
12488 gsi_move_before (&gsi_from, &gsi_to);
12489 /* Shift GSI_TO for further insertion. */
12490 gsi_prev (&gsi_to);
12492 /* Put other masked stores with the same mask to STORE_BB. */
12493 if (worklist.is_empty ()
12494 || gimple_call_arg (worklist.last (), 2) != mask
12495 || worklist.last () != stmt1)
12496 break;
12497 last = worklist.pop ();
12499 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12503 /* Decide whether it is possible to use a zero-based induction variable
12504 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12505 the value that the induction variable must be able to hold in order
12506 to ensure that the rgroups eventually have no active vector elements.
12507 Return -1 otherwise. */
12509 widest_int
12510 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12512 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12513 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12514 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12516 /* Calculate the value that the induction variable must be able
12517 to hit in order to ensure that we end the loop with an all-false mask.
12518 This involves adding the maximum number of inactive trailing scalar
12519 iterations. */
12520 widest_int iv_limit = -1;
12521 if (max_loop_iterations (loop, &iv_limit))
12523 if (niters_skip)
12525 /* Add the maximum number of skipped iterations to the
12526 maximum iteration count. */
12527 if (TREE_CODE (niters_skip) == INTEGER_CST)
12528 iv_limit += wi::to_widest (niters_skip);
12529 else
12530 iv_limit += max_vf - 1;
12532 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12533 /* Make a conservatively-correct assumption. */
12534 iv_limit += max_vf - 1;
12536 /* IV_LIMIT is the maximum number of latch iterations, which is also
12537 the maximum in-range IV value. Round this value down to the previous
12538 vector alignment boundary and then add an extra full iteration. */
12539 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12540 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12542 return iv_limit;
12545 /* For the given rgroup_controls RGC, check whether an induction variable
12546 would ever hit a value that produces a set of all-false masks or zero
12547 lengths before wrapping around. Return true if it's possible to wrap
12548 around before hitting the desirable value, otherwise return false. */
12550 bool
12551 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12553 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12555 if (iv_limit == -1)
12556 return true;
12558 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12559 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12560 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12562 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12563 return true;
12565 return false;