gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 #include "langhooks.h"
60 /* Loop Vectorization Pass.
62 This pass tries to vectorize loops.
64 For example, the vectorizer transforms the following simple loop:
66 short a[N]; short b[N]; short c[N]; int i;
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
72 as if it was manually vectorized by rewriting the source code into:
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
94 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of the following
117 stmts that use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
120 For example, say stmt S1 was vectorized into stmt VS1:
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
136 Operands that are not SSA_NAMEs are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
139 Target modeling:
140 =================
141 Currently the only target specific information that is used is the
142 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
143 Targets that can support different sizes of vectors will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
145 flexibility will be added in the future.
147 Since we only vectorize operations whose vector form can be
148 expressed using existing tree codes, to verify that an operation is
149 supported, the vectorizer checks the relevant optab at the relevant
150 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html */
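/* As a rough sketch (simplified for illustration; the real driver in
   tree-vectorizer.cc has more steps), the flow described above is:

     vectorize_loops ()
       for each candidate loop L:
         loop_vinfo = vect_analyze_loop (L, ...)        analysis phase
         if analysis succeeded:
           vect_transform_loop (loop_vinfo, ...)        transformation phase  */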
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
159 unsigned *);
160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
161 bool *, bool *, bool);
163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
164 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
165 may already be set for general statements (not just data refs). */
167 static opt_result
168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
169 bool vectype_maybe_set_p,
170 poly_uint64 *vf)
172 gimple *stmt = stmt_info->stmt;
174 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
176 || gimple_clobber_p (stmt))
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
180 return opt_result::success ();
183 tree stmt_vectype, nunits_vectype;
184 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
185 &stmt_vectype,
186 &nunits_vectype);
187 if (!res)
188 return res;
190 if (stmt_vectype)
192 if (STMT_VINFO_VECTYPE (stmt_info))
193 /* The only case when a vectype had already been set is for stmts
194 that contain a data ref, or for "pattern-stmts" (stmts generated
195 by the vectorizer to represent/replace a certain idiom). */
196 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
197 || vectype_maybe_set_p)
198 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
199 else
200 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
203 if (nunits_vectype)
204 vect_update_max_nunits (vf, nunits_vectype);
206 return opt_result::success ();
209 /* Subroutine of vect_determine_vectorization_factor. Set the vector
210 types of STMT_INFO and all attached pattern statements and update
211 the vectorization factor VF accordingly. Return true on success
212 or false if something prevented vectorization. */
214 static opt_result
215 vect_determine_vf_for_stmt (vec_info *vinfo,
216 stmt_vec_info stmt_info, poly_uint64 *vf)
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
222 if (!res)
223 return res;
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: %G",
239 def_stmt_info->stmt);
240 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
241 if (!res)
242 return res;
245 if (dump_enabled_p ())
246 dump_printf_loc (MSG_NOTE, vect_location,
247 "==> examining pattern statement: %G",
248 stmt_info->stmt);
249 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
250 if (!res)
251 return res;
254 return opt_result::success ();
257 /* Function vect_determine_vectorization_factor
259 Determine the vectorization factor (VF). VF is the number of data elements
260 that are operated upon in parallel in a single iteration of the vectorized
261 loop. For example, when vectorizing a loop that operates on 4-byte elements,
262 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
263 elements can fit in a single vector register.
265 We currently support vectorization of loops in which all types operated upon
266 are of the same size. Therefore this function currently sets VF according to
267 the size of the types operated upon, and fails if there are multiple sizes
268 in the loop.
270 VF is also the factor by which the loop iterations are strip-mined, e.g.:
271 original loop:
272 for (i=0; i<N; i++){
273 a[i] = b[i] + c[i];
276 vectorized loop:
277 for (i=0; i<N; i+=VF){
278 a[i:VF] = b[i:VF] + c[i:VF];
282 static opt_result
283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
287 unsigned nbbs = loop->num_nodes;
288 poly_uint64 vectorization_factor = 1;
289 tree scalar_type = NULL_TREE;
290 gphi *phi;
291 tree vectype;
292 stmt_vec_info stmt_info;
293 unsigned i;
295 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
297 for (i = 0; i < nbbs; i++)
299 basic_block bb = bbs[i];
301 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
302 gsi_next (&si))
304 phi = si.phi ();
305 stmt_info = loop_vinfo->lookup_stmt (phi);
306 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
308 (gimple *) phi);
310 gcc_assert (stmt_info);
312 if (STMT_VINFO_RELEVANT_P (stmt_info)
313 || STMT_VINFO_LIVE_P (stmt_info))
315 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
316 scalar_type = TREE_TYPE (PHI_RESULT (phi));
318 if (dump_enabled_p ())
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "get vectype for scalar type: %T\n",
321 scalar_type);
323 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
324 if (!vectype)
325 return opt_result::failure_at (phi,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 STMT_VINFO_VECTYPE (stmt_info) = vectype;
331 if (dump_enabled_p ())
332 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
333 vectype);
335 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
338 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
339 dump_printf (MSG_NOTE, "\n");
342 vect_update_max_nunits (&vectorization_factor, vectype);
346 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
347 gsi_next (&si))
349 if (is_gimple_debug (gsi_stmt (si)))
350 continue;
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 opt_result res
353 = vect_determine_vf_for_stmt (loop_vinfo,
354 stmt_info, &vectorization_factor);
355 if (!res)
356 return res;
360 /* TODO: Analyze cost. Decide if worth while to vectorize. */
361 if (dump_enabled_p ())
363 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
364 dump_dec (MSG_NOTE, vectorization_factor);
365 dump_printf (MSG_NOTE, "\n");
368 if (known_le (vectorization_factor, 1U))
369 return opt_result::failure_at (vect_location,
370 "not vectorized: unsupported data-type\n");
371 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
372 return opt_result::success ();
376 /* Function vect_is_simple_iv_evolution.
378 FORNOW: A simple evolution of an induction variable in the loop is
379 considered a polynomial evolution. */
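/* For illustration (hypothetical example, using scev's chrec notation):
   for an IV updated as  i = i + 4  in loop number LOOP_NB with initial
   value i_0, ACCESS_FN is the chrec {i_0, +, 4}_LOOP_NB, so *INIT is
   set to i_0 and *STEP to 4.  */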
381 static bool
382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
383 tree * step)
385 tree init_expr;
386 tree step_expr;
387 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
388 basic_block bb;
390 /* When there is no evolution in this loop, the evolution function
391 is not "simple". */
392 if (evolution_part == NULL_TREE)
393 return false;
395 /* When the evolution is a polynomial of degree >= 2
396 the evolution function is not "simple". */
397 if (tree_is_chrec (evolution_part))
398 return false;
400 step_expr = evolution_part;
401 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
405 step_expr, init_expr);
407 *init = init_expr;
408 *step = step_expr;
410 if (TREE_CODE (step_expr) != INTEGER_CST
411 && (TREE_CODE (step_expr) != SSA_NAME
412 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
413 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
414 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
415 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
416 || !flag_associative_math)))
417 && (TREE_CODE (step_expr) != REAL_CST
418 || !flag_associative_math))
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "step unknown.\n");
423 return false;
426 return true;
429 /* Function vect_is_nonlinear_iv_evolution
431 Nonlinear induction is supported only for integer types, in these forms:
432 1. neg
433 2. mul by constant
434 3. lshift/rshift by constant.
436 For neg induction, return a fake step of integer -1 (see the example loops below). */
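/* Illustrative loop bodies (assumed examples, not from the sources) for
   the supported nonlinear inductions, where x is an integer carried by
   the loop PHI node:

     x = -x;         neg     (fake step -1)
     x = x * 3;      mul     (step 3)
     x = x << 1;     lshift  (step 1)
     x = x >> 2;     rshift  (step 2)  */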
437 static bool
438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
439 gphi* loop_phi_node, tree *init, tree *step)
441 tree init_expr, ev_expr, result, op1, op2;
442 gimple* def;
444 if (gimple_phi_num_args (loop_phi_node) != 2)
445 return false;
447 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
448 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
450 /* Support nonlinear induction only for integer type. */
451 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
452 return false;
454 *init = init_expr;
455 result = PHI_RESULT (loop_phi_node);
457 if (TREE_CODE (ev_expr) != SSA_NAME
458 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
459 || !is_gimple_assign (def))
460 return false;
462 enum tree_code t_code = gimple_assign_rhs_code (def);
463 switch (t_code)
465 case NEGATE_EXPR:
466 if (gimple_assign_rhs1 (def) != result)
467 return false;
468 *step = build_int_cst (TREE_TYPE (init_expr), -1);
469 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
470 break;
472 case RSHIFT_EXPR:
473 case LSHIFT_EXPR:
474 case MULT_EXPR:
475 op1 = gimple_assign_rhs1 (def);
476 op2 = gimple_assign_rhs2 (def);
477 if (TREE_CODE (op2) != INTEGER_CST
478 || op1 != result)
479 return false;
480 *step = op2;
481 if (t_code == LSHIFT_EXPR)
482 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
483 else if (t_code == RSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
485 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
486 else
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
488 break;
490 default:
491 return false;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
495 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
497 return true;
500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
501 what we are assuming is a double reduction. For example, given
502 a structure like this:
504 outer1:
505 x_1 = PHI <x_4(outer2), ...>;
508 inner:
509 x_2 = PHI <x_1(outer1), ...>;
511 x_3 = ...;
514 outer2:
515 x_4 = PHI <x_3(inner)>;
518 outer loop analysis would treat x_1 as a double reduction phi and
519 this function would then return true for x_2. */
521 static bool
522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
524 use_operand_p use_p;
525 ssa_op_iter op_iter;
526 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
527 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
528 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
529 return true;
530 return false;
533 /* Returns true if Phi is a first-order recurrence. A first-order
534 recurrence is a non-reduction recurrence relation in which the value of
535 the recurrence in the current loop iteration equals a value defined in
536 the previous iteration. */
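/* Illustrative example (assumed, not from the sources) of a first-order
   recurrence accepted here:

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   Each iteration uses the value of t defined in the previous
   iteration.  */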
538 static bool
539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
540 gphi *phi)
542 /* A nested cycle isn't vectorizable as a first-order recurrence. */
543 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
544 return false;
546 /* Ensure the loop latch definition is from within the loop. */
547 edge latch = loop_latch_edge (loop);
548 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
549 if (TREE_CODE (ldef) != SSA_NAME
550 || SSA_NAME_IS_DEFAULT_DEF (ldef)
551 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
552 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
553 return false;
555 tree def = gimple_phi_result (phi);
557 /* Ensure every use_stmt of the phi node is dominated by the latch
558 definition. */
559 imm_use_iterator imm_iter;
560 use_operand_p use_p;
561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
562 if (!is_gimple_debug (USE_STMT (use_p))
563 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
564 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
565 USE_STMT (use_p))))
566 return false;
568 /* First-order recurrence autovectorization needs a vector shuffle. */
569 tree scalar_type = TREE_TYPE (def);
570 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
571 if (!vectype)
572 return false;
574 return true;
577 /* Function vect_analyze_scalar_cycles_1.
579 Examine the cross iteration def-use cycles of scalar variables
580 in LOOP. LOOP_VINFO represents the loop that is now being
581 considered for vectorization (can be LOOP, or an outer-loop
582 enclosing LOOP). SLP indicates whether there will be subsequent
583 SLP analyses. */
585 static void
586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
587 bool slp)
589 basic_block bb = loop->header;
590 tree init, step;
591 auto_vec<stmt_vec_info, 64> worklist;
592 gphi_iterator gsi;
593 bool double_reduc, reduc_chain;
595 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
597 /* First - identify all inductions. Reduction detection assumes that all the
598 inductions have been identified, therefore, this order must not be
599 changed. */
600 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
602 gphi *phi = gsi.phi ();
603 tree access_fn = NULL;
604 tree def = PHI_RESULT (phi);
605 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
609 (gimple *) phi);
611 /* Skip virtual phi's. The data dependences that are associated with
612 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
613 if (virtual_operand_p (def))
614 continue;
616 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
618 /* Analyze the evolution function. */
619 access_fn = analyze_scalar_evolution (loop, def);
620 if (access_fn)
622 STRIP_NOPS (access_fn);
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_NOTE, vect_location,
625 "Access function of PHI: %T\n", access_fn);
626 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
627 = initial_condition_in_loop_num (access_fn, loop->num);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
629 = evolution_part_in_loop_num (access_fn, loop->num);
632 if ((!access_fn
633 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
634 || !vect_is_simple_iv_evolution (loop->num, access_fn,
635 &init, &step)
636 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
637 && TREE_CODE (step) != INTEGER_CST))
638 /* Only handle nonlinear iv for same loop. */
639 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
640 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
641 phi, &init, &step)))
643 worklist.safe_push (stmt_vinfo);
644 continue;
647 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
648 != NULL_TREE);
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
651 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
653 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
657 /* Second - identify all reductions and nested cycles. */
658 while (worklist.length () > 0)
660 stmt_vec_info stmt_vinfo = worklist.pop ();
661 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
662 tree def = PHI_RESULT (phi);
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
666 (gimple *) phi);
668 gcc_assert (!virtual_operand_p (def)
669 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
671 stmt_vec_info reduc_stmt_info
672 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
673 &reduc_chain, slp);
674 if (reduc_stmt_info)
676 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
677 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
678 if (double_reduc)
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected double reduction.\n");
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
685 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
687 else
689 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location,
693 "Detected vectorizable nested cycle.\n");
695 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
697 else
699 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location,
701 "Detected reduction.\n");
703 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
704 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
705 /* Store the reduction cycles for possible vectorization in
706 loop-aware SLP if it was not detected as reduction
707 chain. */
708 if (! reduc_chain)
709 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
710 (reduc_stmt_info);
714 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
715 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
716 else
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Unknown def-use cycle pattern.\n");
724 /* Function vect_analyze_scalar_cycles.
726 Examine the cross iteration def-use cycles of scalar variables, by
727 analyzing the loop-header PHIs of scalar variables. Classify each
728 cycle as one of the following: invariant, induction, reduction, unknown.
729 We do that for the loop represented by LOOP_VINFO, and also for its
730 inner loop, if one exists.
731 Examples for scalar cycles:
733 Example1: reduction:
735 loop1:
736 for (i=0; i<N; i++)
737 sum += a[i];
739 Example2: induction:
741 loop2:
742 for (i=0; i<N; i++)
743 a[i] = i; */
745 static void
746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
748 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
750 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
752 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
753 Reductions in such an inner loop therefore have different properties than
754 the reductions in the nest that gets vectorized:
755 1. When vectorized, they are executed in the same order as in the original
756 scalar loop, so we can't change the order of computation when
757 vectorizing them.
758 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
759 current checks are too strict. */
761 if (loop->inner)
762 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
765 /* Transfer group and reduction information from STMT_INFO to its
766 pattern stmt. */
768 static void
769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
771 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
772 stmt_vec_info stmtp;
773 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
774 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
775 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
778 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
779 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
780 == STMT_VINFO_DEF_TYPE (stmt_info));
781 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
782 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
783 if (stmt_info)
784 REDUC_GROUP_NEXT_ELEMENT (stmtp)
785 = STMT_VINFO_RELATED_STMT (stmt_info);
787 while (stmt_info);
790 /* Fixup scalar cycles that now have their stmts detected as patterns. */
792 static void
793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
795 stmt_vec_info first;
796 unsigned i;
798 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
800 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
801 while (next)
803 if ((STMT_VINFO_IN_PATTERN_P (next)
804 != STMT_VINFO_IN_PATTERN_P (first))
805 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
806 break;
807 next = REDUC_GROUP_NEXT_ELEMENT (next);
809 /* If all reduction chain members are well-formed patterns adjust
810 the group to group the pattern stmts instead. */
811 if (! next
812 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
814 if (STMT_VINFO_IN_PATTERN_P (first))
816 vect_fixup_reduc_chain (first);
817 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
818 = STMT_VINFO_RELATED_STMT (first);
821 /* If not all stmts in the chain are patterns, or if we failed
822 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
823 it as a regular reduction instead. */
824 else
826 stmt_vec_info vinfo = first;
827 stmt_vec_info last = NULL;
828 while (vinfo)
830 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
831 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
832 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
833 last = vinfo;
834 vinfo = next;
836 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
837 = vect_internal_def;
838 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
839 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
840 --i;
845 /* Function vect_get_loop_niters.
847 Determine how many iterations the loop executes and place it
848 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
849 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
850 niter information holds in ASSUMPTIONS.
852 Return the loop exit condition. */
855 static gcond *
856 vect_get_loop_niters (class loop *loop, tree *assumptions,
857 tree *number_of_iterations, tree *number_of_iterationsm1)
859 edge exit = single_exit (loop);
860 class tree_niter_desc niter_desc;
861 tree niter_assumptions, niter, may_be_zero;
862 gcond *cond = get_loop_exit_condition (loop);
864 *assumptions = boolean_true_node;
865 *number_of_iterationsm1 = chrec_dont_know;
866 *number_of_iterations = chrec_dont_know;
867 DUMP_VECT_SCOPE ("get_loop_niters");
869 if (!exit)
870 return cond;
872 may_be_zero = NULL_TREE;
873 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
874 || chrec_contains_undetermined (niter_desc.niter))
875 return cond;
877 niter_assumptions = niter_desc.assumptions;
878 may_be_zero = niter_desc.may_be_zero;
879 niter = niter_desc.niter;
881 if (may_be_zero && integer_zerop (may_be_zero))
882 may_be_zero = NULL_TREE;
884 if (may_be_zero)
886 if (COMPARISON_CLASS_P (may_be_zero))
888 /* Try to combine may_be_zero with assumptions, since this can simplify
889 the computation of the niter expression. */
890 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
891 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
892 niter_assumptions,
893 fold_build1 (TRUTH_NOT_EXPR,
894 boolean_type_node,
895 may_be_zero));
896 else
897 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
898 build_int_cst (TREE_TYPE (niter), 0),
899 rewrite_to_non_trapping_overflow (niter));
901 may_be_zero = NULL_TREE;
903 else if (integer_nonzerop (may_be_zero))
905 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
906 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
907 return cond;
909 else
910 return cond;
913 *assumptions = niter_assumptions;
914 *number_of_iterationsm1 = niter;
916 /* We want the number of loop header executions which is the number
917 of latch executions plus one.
918 ??? For UINT_MAX latch executions this number overflows to zero
919 for loops like do { n++; } while (n != 0); */
920 if (niter && !chrec_contains_undetermined (niter))
921 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
922 build_int_cst (TREE_TYPE (niter), 1));
923 *number_of_iterations = niter;
925 return cond;
928 /* Function bb_in_loop_p
930 Used as predicate for dfs order traversal of the loop bbs. */
932 static bool
933 bb_in_loop_p (const_basic_block bb, const void *data)
935 const class loop *const loop = (const class loop *)data;
936 if (flow_bb_inside_loop_p (loop, bb))
937 return true;
938 return false;
942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
943 stmt_vec_info structs for all the stmts in LOOP_IN. */
945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
946 : vec_info (vec_info::loop, shared),
947 loop (loop_in),
948 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
949 num_itersm1 (NULL_TREE),
950 num_iters (NULL_TREE),
951 num_iters_unchanged (NULL_TREE),
952 num_iters_assumptions (NULL_TREE),
953 vector_costs (nullptr),
954 scalar_costs (nullptr),
955 th (0),
956 versioning_threshold (0),
957 vectorization_factor (0),
958 main_loop_edge (nullptr),
959 skip_main_loop_edge (nullptr),
960 skip_this_loop_edge (nullptr),
961 reusable_accumulators (),
962 suggested_unroll_factor (1),
963 max_vectorization_factor (0),
964 mask_skip_niters (NULL_TREE),
965 rgroup_compare_type (NULL_TREE),
966 simd_if_cond (NULL_TREE),
967 partial_vector_style (vect_partial_vectors_none),
968 unaligned_dr (NULL),
969 peeling_for_alignment (0),
970 ptr_mask (0),
971 ivexpr_map (NULL),
972 scan_map (NULL),
973 slp_unrolling_factor (1),
974 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
975 vectorizable (false),
976 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
977 using_partial_vectors_p (false),
978 using_decrementing_iv_p (false),
979 using_select_vl_p (false),
980 epil_using_partial_vectors_p (false),
981 partial_load_store_bias (0),
982 peeling_for_gaps (false),
983 peeling_for_niter (false),
984 no_data_dependencies (false),
985 has_mask_store (false),
986 scalar_loop_scaling (profile_probability::uninitialized ()),
987 scalar_loop (NULL),
988 orig_loop_info (NULL)
990 /* CHECKME: We want to visit all BBs before their successors (except for
991 latch blocks, for which this assertion wouldn't hold). In the simple
992 case of the loop forms we allow, a dfs order of the BBs would be the same
993 as reversed postorder traversal, so we are safe. */
995 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
996 bbs, loop->num_nodes, loop);
997 gcc_assert (nbbs == loop->num_nodes);
999 for (unsigned int i = 0; i < nbbs; i++)
1001 basic_block bb = bbs[i];
1002 gimple_stmt_iterator si;
1004 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1006 gimple *phi = gsi_stmt (si);
1007 gimple_set_uid (phi, 0);
1008 add_stmt (phi);
1011 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1013 gimple *stmt = gsi_stmt (si);
1014 gimple_set_uid (stmt, 0);
1015 if (is_gimple_debug (stmt))
1016 continue;
1017 add_stmt (stmt);
1018 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1019 third argument is the #pragma omp simd if (x) condition: when it is 0,
1020 the loop shouldn't be vectorized; when it is a non-zero constant, it
1021 should be vectorized normally; otherwise the loop is versioned, with the
1022 vectorized copy used if the condition is non-zero at runtime. */
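/* For example (illustrative source form), a loop annotated with
   "#pragma omp simd if (x)" is lowered so that its .GOMP_SIMD_LANE
   call carries x as that third argument.  */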
1023 if (loop_in->simduid
1024 && is_gimple_call (stmt)
1025 && gimple_call_internal_p (stmt)
1026 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027 && gimple_call_num_args (stmt) >= 3
1028 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029 && (loop_in->simduid
1030 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1032 tree arg = gimple_call_arg (stmt, 2);
1033 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034 simd_if_cond = arg;
1035 else
1036 gcc_assert (integer_nonzerop (arg));
1041 epilogue_vinfos.create (6);
1044 /* Free all levels of rgroup CONTROLS. */
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1049 rgroup_controls *rgc;
1050 unsigned int i;
1051 FOR_EACH_VEC_ELT (*controls, i, rgc)
1052 rgc->controls.release ();
1053 controls->release ();
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057 stmt_vec_info structs of all the stmts in the loop. */
1059 _loop_vec_info::~_loop_vec_info ()
1061 free (bbs);
1063 release_vec_loop_controls (&masks.rgc_vec);
1064 release_vec_loop_controls (&lens);
1065 delete ivexpr_map;
1066 delete scan_map;
1067 epilogue_vinfos.release ();
1068 delete scalar_costs;
1069 delete vector_costs;
1071 /* When we release an epilogue vinfo that we do not intend to use,
1072 avoid clearing AUX of the main loop, which should continue to
1073 point to the main loop vinfo; otherwise we'd leak that. */
1074 if (loop->aux == this)
1075 loop->aux = NULL;
1078 /* Return an invariant or register for EXPR and emit necessary
1079 computations in the LOOP_VINFO loop preheader. */
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1084 if (is_gimple_reg (expr)
1085 || is_gimple_min_invariant (expr))
1086 return expr;
1088 if (! loop_vinfo->ivexpr_map)
1089 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091 if (! cached)
1093 gimple_seq stmts = NULL;
1094 cached = force_gimple_operand (unshare_expr (expr),
1095 &stmts, true, NULL_TREE);
1096 if (stmts)
1098 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099 gsi_insert_seq_on_edge_immediate (e, stmts);
1102 return cached;
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106 all masks required to mask LOOP_VINFO. */
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1111 rgroup_controls *rgm;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114 if (rgm->type != NULL_TREE
1115 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116 cmp_type, rgm->type,
1117 OPTIMIZE_FOR_SPEED))
1118 return false;
1119 return true;
1122 /* Calculate the maximum number of scalars per iteration for every
1123 rgroup in LOOP_VINFO. */
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1128 unsigned int res = 1;
1129 unsigned int i;
1130 rgroup_controls *rgm;
1131 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132 res = MAX (res, rgm->max_nscalars_per_iter);
1133 return res;
1136 /* Calculate the minimum precision necessary to represent:
1138 MAX_NITERS * FACTOR
1140 as an unsigned integer, where MAX_NITERS is the maximum number of
1141 loop header iterations for the original scalar form of LOOP_VINFO. */
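/* Worked example with illustrative numbers: if the scalar loop runs at
   most MAX_NITERS = 1000 header iterations and FACTOR = 4, the product
   4000 fits in 12 bits, so the function returns 12.  */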
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1148 /* Get the maximum number of iterations that is representable
1149 in the counter type. */
1150 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1153 /* Get a more refined estimate for the number of iterations. */
1154 widest_int max_back_edges;
1155 if (max_loop_iterations (loop, &max_back_edges))
1156 max_ni = wi::smin (max_ni, max_back_edges + 1);
1158 /* Work out how many bits we need to represent the limit. */
1159 return wi::min_precision (max_ni * factor, UNSIGNED);
1162 /* True if the loop needs peeling or partial vectors when vectorized. */
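/* For instance (illustrative numbers): with a known niters of 100, no
   peeling for alignment or gaps, and a constant VF of 8, 100 is not a
   multiple of 8, so 4 iterations are left over and the function
   returns true.  */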
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1167 unsigned HOST_WIDE_INT const_vf;
1168 HOST_WIDE_INT max_niter
1169 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1171 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174 (loop_vinfo));
1176 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1179 /* Work out the (constant) number of iterations that need to be
1180 peeled for reasons other than niters. */
1181 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183 peel_niter += 1;
1184 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186 return true;
1188 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189 /* ??? When peeling for gaps but not alignment, we could
1190 try to check whether the (variable) niters is known to be
1191 VF * N + 1. That's something of a niche case though. */
1192 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195 < (unsigned) exact_log2 (const_vf))
1196 /* In case of versioning, check if the maximum number of
1197 iterations is greater than th. If they are identical,
1198 the epilogue is unnecessary. */
1199 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200 || ((unsigned HOST_WIDE_INT) max_niter
1201 > (th / const_vf) * const_vf))))
1202 return true;
1204 return false;
1207 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1208 whether we can actually generate the masks required. Return true if so,
1209 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
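/* Conceptually (simplified sketch, not the exact IL that is generated),
   with WHILE_ULT-based masking the mask for vector number K of an
   rgroup is computed as

     mask_K = WHILE_ULT (scalar_iv + K * nitems_per_vector, niters)

   i.e. lane J of mask_K is true iff the corresponding scalar iteration
   is still below the iteration bound.  */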
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1214 unsigned int min_ni_width;
1216 /* Use a normal loop if there are no statements that need masking.
1217 This only happens in rare degenerate cases: it means that the loop
1218 has no loads, no stores, and no live-out values. */
1219 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220 return false;
1222 /* Produce the rgroup controls. */
1223 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1225 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226 tree vectype = mask.first;
1227 unsigned nvectors = mask.second;
1229 if (masks->rgc_vec.length () < nvectors)
1230 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232 /* The number of scalars per iteration and the number of vectors are
1233 both compile-time constants. */
1234 unsigned int nscalars_per_iter
1235 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1238 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1240 rgm->max_nscalars_per_iter = nscalars_per_iter;
1241 rgm->type = truth_type_for (vectype);
1242 rgm->factor = 1;
1246 unsigned int max_nscalars_per_iter
1247 = vect_get_max_nscalars_per_iter (loop_vinfo);
1249 /* Work out how many bits we need to represent the limit. */
1250 min_ni_width
1251 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1253 /* Find a scalar mode for which WHILE_ULT is supported. */
1254 opt_scalar_int_mode cmp_mode_iter;
1255 tree cmp_type = NULL_TREE;
1256 tree iv_type = NULL_TREE;
1257 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258 unsigned int iv_precision = UINT_MAX;
1260 if (iv_limit != -1)
1261 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262 UNSIGNED);
1264 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1266 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267 if (cmp_bits >= min_ni_width
1268 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1270 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271 if (this_type
1272 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1274 /* Although we could stop as soon as we find a valid mode,
1275 there are at least two reasons why that's not always the
1276 best choice:
1278 - An IV that's Pmode or wider is more likely to be reusable
1279 in address calculations than an IV that's narrower than
1280 Pmode.
1282 - Doing the comparison in IV_PRECISION or wider allows
1283 a natural 0-based IV, whereas using a narrower comparison
1284 type requires mitigations against wrap-around.
1286 Conversely, if the IV limit is variable, doing the comparison
1287 in a wider type than the original type can introduce
1288 unnecessary extensions, so picking the widest valid mode
1289 is not always a good choice either.
1291 Here we prefer the first IV type that's Pmode or wider,
1292 and the first comparison type that's IV_PRECISION or wider.
1293 (The comparison type must be no wider than the IV type,
1294 to avoid extensions in the vector loop.)
1296 ??? We might want to try continuing beyond Pmode for ILP32
1297 targets if CMP_BITS < IV_PRECISION. */
1298 iv_type = this_type;
1299 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300 cmp_type = this_type;
1301 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302 break;
1307 if (!cmp_type)
1309 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310 return false;
1313 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316 return true;
1319 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1320 whether we can actually generate AVX512 style masks. Return true if so,
1321 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1326 /* Produce a differently organized rgc_vec and check differently
1327 whether we can produce the masks. */
1329 /* Use a normal loop if there are no statements that need masking.
1330 This only happens in rare degenerate cases: it means that the loop
1331 has no loads, no stores, and no live-out values. */
1332 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333 return false;
1335 /* For the decrementing IV we need to represent all values in
1336 [0, niter + niter_skip], where niter_skip is the number of elements we
1337 skip in the first iteration for prologue peeling. */
1338 tree iv_type = NULL_TREE;
1339 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340 unsigned int iv_precision = UINT_MAX;
1341 if (iv_limit != -1)
1342 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1344 /* First compute the type for the IV we use to track the remaining
1345 scalar iterations. */
1346 opt_scalar_int_mode cmp_mode_iter;
1347 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1349 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350 if (cmp_bits >= iv_precision
1351 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1353 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354 if (iv_type)
1355 break;
1358 if (!iv_type)
1359 return false;
1361 /* Produce the rgroup controls. */
1362 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1364 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365 tree vectype = mask.first;
1366 unsigned nvectors = mask.second;
1368 /* The number of scalars per iteration and the number of vectors are
1369 both compile-time constants. */
1370 unsigned int nscalars_per_iter
1371 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1374 /* We index the rgroup_controls vector with nscalars_per_iter
1375 which we keep constant and instead have a varying nvectors,
1376 remembering the vector mask with the fewest nV. */
1377 if (masks->rgc_vec.length () < nscalars_per_iter)
1378 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1381 if (!rgm->type || rgm->factor > nvectors)
1383 rgm->type = truth_type_for (vectype);
1384 rgm->compare_type = NULL_TREE;
1385 rgm->max_nscalars_per_iter = nscalars_per_iter;
1386 rgm->factor = nvectors;
1387 rgm->bias_adjusted_ctrl = NULL_TREE;
1391 /* There is no fixed compare type we are going to use but we have to
1392 be able to get at one for each mask group. */
1393 unsigned int min_ni_width
1394 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1396 bool ok = true;
1397 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1399 tree mask_type = rgc.type;
1400 if (!mask_type)
1401 continue;
1403 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1405 ok = false;
1406 break;
1409 /* If iv_type is usable as the compare type, use that; we can elide the
1410 saturation in that case. */
1411 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1413 tree cmp_vectype
1414 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416 rgc.compare_type = cmp_vectype;
1418 if (!rgc.compare_type)
1419 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1421 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422 if (cmp_bits >= min_ni_width
1423 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1425 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426 if (!cmp_type)
1427 continue;
1429 /* Check whether we can produce the mask with cmp_type. */
1430 tree cmp_vectype
1431 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1434 rgc.compare_type = cmp_vectype;
1435 break;
1439 if (!rgc.compare_type)
1441 ok = false;
1442 break;
1445 if (!ok)
1447 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448 return false;
1451 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454 return true;
1457 /* Check whether we can use vector accesses with length based on precision
1458 comparison. So far, to keep it simple, we only allow the case where the
1459 precision of the target-supported length is larger than the precision
1460 required by the loop niters. */
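/* Conceptually (simplified sketch; the exact IL depends on the target's
   length bias), with length-based partial vectors each access of an
   rgroup uses a per-iteration length roughly of the form

     len = MIN (nitems_remaining, nitems_per_vector) + bias

   so the final iteration simply processes fewer elements instead of
   requiring a scalar epilogue.  */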
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1465 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466 return false;
1468 machine_mode len_load_mode = get_len_load_store_mode
1469 (loop_vinfo->vector_mode, true).require ();
1470 machine_mode len_store_mode = get_len_load_store_mode
1471 (loop_vinfo->vector_mode, false).require ();
1473 signed char partial_load_bias = internal_len_load_store_bias
1474 (IFN_LEN_LOAD, len_load_mode);
1476 signed char partial_store_bias = internal_len_load_store_bias
1477 (IFN_LEN_STORE, len_store_mode);
1479 gcc_assert (partial_load_bias == partial_store_bias);
1481 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482 return false;
1484 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485 len_loads with a length of zero. To avoid that, we prohibit
1486 more than one loop length here. */
1487 if (partial_load_bias == -1
1488 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489 return false;
1491 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1493 unsigned int max_nitems_per_iter = 1;
1494 unsigned int i;
1495 rgroup_controls *rgl;
1496 /* Find the maximum number of items per iteration for every rgroup. */
1497 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1499 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1503 /* Work out how many bits we need to represent the length limit. */
1504 unsigned int min_ni_prec
1505 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1507 /* Now use the maximum of the precisions below for one suitable IV type:
1508 - the IV's natural precision
1509 - the precision needed to hold: the maximum number of scalar
1510 iterations multiplied by the scale factor (min_ni_prec above)
1511 - the Pmode precision
1513 If min_ni_prec is less than the precision of the current niters,
1514 we prefer to still use the niters type. Prefer to use Pmode and a
1515 wider IV to avoid narrow conversions. */
1517 unsigned int ni_prec
1518 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519 min_ni_prec = MAX (min_ni_prec, ni_prec);
1520 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1522 tree iv_type = NULL_TREE;
1523 opt_scalar_int_mode tmode_iter;
1524 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1526 scalar_mode tmode = tmode_iter.require ();
1527 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1529 /* ??? Do we really want to construct one IV whose precision exceeds
1530 BITS_PER_WORD? */
1531 if (tbits > BITS_PER_WORD)
1532 break;
1534 /* Find the first available standard integral type. */
1535 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1537 iv_type = build_nonstandard_integer_type (tbits, true);
1538 break;
1542 if (!iv_type)
1544 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "can't vectorize with length-based partial vectors"
1547 " because there is no suitable iv type.\n");
1548 return false;
1551 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1555 return true;
1558 /* Calculate the cost of one scalar iteration of the loop. */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564 int nbbs = loop->num_nodes, factor;
1565 int innerloop_iters, i;
1567 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1569 /* Gather costs for statements in the scalar loop. */
1571 /* FORNOW. */
1572 innerloop_iters = 1;
1573 if (loop->inner)
1574 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1576 for (i = 0; i < nbbs; i++)
1578 gimple_stmt_iterator si;
1579 basic_block bb = bbs[i];
1581 if (bb->loop_father == loop->inner)
1582 factor = innerloop_iters;
1583 else
1584 factor = 1;
1586 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1588 gimple *stmt = gsi_stmt (si);
1589 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1591 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592 continue;
1594 /* Skip stmts that are not vectorized inside the loop. */
1595 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597 && (!STMT_VINFO_LIVE_P (vstmt_info)
1598 || !VECTORIZABLE_CYCLE_DEF
1599 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600 continue;
1602 vect_cost_for_stmt kind;
1603 if (STMT_VINFO_DATA_REF (stmt_info))
1605 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606 kind = scalar_load;
1607 else
1608 kind = scalar_store;
1610 else if (vect_nop_conversion_p (stmt_info))
1611 continue;
1612 else
1613 kind = scalar_stmt;
1615 /* We are using vect_prologue here to avoid scaling twice
1616 by the inner loop factor. */
1617 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618 factor, kind, stmt_info, 0, vect_prologue);
1622 /* Now accumulate cost. */
1623 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624 add_stmt_costs (loop_vinfo->scalar_costs,
1625 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626 loop_vinfo->scalar_costs->finish_cost (nullptr);
1630 /* Function vect_analyze_loop_form.
1632 Verify that certain CFG restrictions hold, including:
1633 - the loop has a pre-header
1634 - the loop has a single entry and exit
1635 - the loop exit condition is simple enough
1636 - the number of iterations can be analyzed, i.e., a countable loop. The
1637 niter could be analyzed under some assumptions. */
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1642 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1644 /* Different restrictions apply when we are considering an inner-most loop,
1645 vs. an outer (nested) loop.
1646 (FORNOW. May want to relax some of these restrictions in the future). */
1648 info->inner_loop_cond = NULL;
1649 if (!loop->inner)
1651 /* Inner-most loop. We currently require that the number of BBs is
1652 exactly 2 (the header and latch). Vectorizable inner-most loops
1653 look like this:
1655 (pre-header)
1657 header <--------+
1658 | | |
1659 | +--> latch --+
1661 (exit-bb) */
1663 if (loop->num_nodes != 2)
1664 return opt_result::failure_at (vect_location,
1665 "not vectorized:"
1666 " control flow in loop.\n");
1668 if (empty_block_p (loop->header))
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized: empty loop.\n");
1672 else
1674 class loop *innerloop = loop->inner;
1675 edge entryedge;
1677 /* Nested loop. We currently require that the loop is doubly-nested,
1678 contains a single inner loop, and the number of BBs is exactly 5.
1679 Vectorizable outer-loops look like this:
1681 (pre-header)
1683 header <---+
1685 inner-loop |
1687 tail ------+
1689 (exit-bb)
1691 The inner-loop has the properties expected of inner-most loops
1692 as described above. */
1694 if ((loop->inner)->inner || (loop->inner)->next)
1695 return opt_result::failure_at (vect_location,
1696 "not vectorized:"
1697 " multiple nested loops.\n");
1699 if (loop->num_nodes != 5)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " control flow in loop.\n");
1704 entryedge = loop_preheader_edge (innerloop);
1705 if (entryedge->src != loop->header
1706 || !single_exit (innerloop)
1707 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708 return opt_result::failure_at (vect_location,
1709 "not vectorized:"
1710 " unsupported outerloop form.\n");
1712 /* Analyze the inner-loop. */
1713 vect_loop_form_info inner;
1714 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715 if (!res)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: Bad inner loop.\n");
1720 return res;
1723 /* Don't support analyzing niter under assumptions for inner
1724 loop. */
1725 if (!integer_onep (inner.assumptions))
1726 return opt_result::failure_at (vect_location,
1727 "not vectorized: Bad inner loop.\n");
1729 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730 return opt_result::failure_at (vect_location,
1731 "not vectorized: inner-loop count not"
1732 " invariant.\n");
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Considering outer-loop vectorization.\n");
1737 info->inner_loop_cond = inner.loop_cond;
1740 if (!single_exit (loop))
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized: multiple exits.\n");
1743 if (EDGE_COUNT (loop->header->preds) != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " too many incoming edges.\n");
1748 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1749 that the loop is represented as a do-while (with a proper if-guard
1750 before the loop if needed), where the loop header contains all the
1751 executable statements, and the latch is empty. */
1752 if (!empty_block_p (loop->latch)
1753 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754 return opt_result::failure_at (vect_location,
1755 "not vectorized: latch block not empty.\n");
1757 /* Make sure the exit is not abnormal. */
1758 edge e = single_exit (loop);
1759 if (e->flags & EDGE_ABNORMAL)
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized:"
1762 " abnormal loop exit edge.\n");
1764 info->loop_cond
1765 = vect_get_loop_niters (loop, &info->assumptions,
1766 &info->number_of_iterations,
1767 &info->number_of_iterationsm1);
1768 if (!info->loop_cond)
1769 return opt_result::failure_at
1770 (vect_location,
1771 "not vectorized: complicated exit condition.\n");
1773 if (integer_zerop (info->assumptions)
1774 || !info->number_of_iterations
1775 || chrec_contains_undetermined (info->number_of_iterations))
1776 return opt_result::failure_at
1777 (info->loop_cond,
1778 "not vectorized: number of iterations cannot be computed.\n");
1780 if (integer_zerop (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations = 0.\n");
1785 if (!(tree_fits_shwi_p (info->number_of_iterations)
1786 && tree_to_shwi (info->number_of_iterations) > 0))
1788 if (dump_enabled_p ())
1790 dump_printf_loc (MSG_NOTE, vect_location,
1791 "Symbolic number of iterations is ");
1792 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793 dump_printf (MSG_NOTE, "\n");
1797 return opt_result::success ();
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801 vect_analyze_loop_form result. */
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805 const vect_loop_form_info *info,
1806 loop_vec_info main_loop_info)
1808 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813 /* Also record the assumptions for versioning. */
1814 if (!integer_onep (info->assumptions) && !main_loop_info)
1815 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1817 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819 if (info->inner_loop_cond)
1821 stmt_vec_info inner_loop_cond_info
1822 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 /* If we have an estimate on the number of iterations of the inner
1825 loop, use that to limit the scale for costing; otherwise use
1826 --param vect-inner-loop-cost-factor literally. */
1827 widest_int nit;
1828 if (estimated_stmt_executions (loop->inner, &nit))
1829 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1833 return loop_vinfo;
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839 statements update the vectorization factor. */
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846 int nbbs = loop->num_nodes;
1847 poly_uint64 vectorization_factor;
1848 int i;
1850 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1852 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853 gcc_assert (known_ne (vectorization_factor, 0U));
1855 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1856 the vectorization factor of the loop is the unrolling factor required
1857 by the SLP instances. If that unrolling factor is 1, we say that we
1858 perform pure SLP on the loop - cross-iteration parallelism is not
1859 exploited. */
1860 bool only_slp_in_loop = true;
1861 for (i = 0; i < nbbs; i++)
1863 basic_block bb = bbs[i];
1864 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865 gsi_next (&si))
1867 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868 if (!stmt_info)
1869 continue;
1870 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872 && !PURE_SLP_STMT (stmt_info))
1873 /* STMT needs both SLP and loop-based vectorization. */
1874 only_slp_in_loop = false;
1876 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877 gsi_next (&si))
1879 if (is_gimple_debug (gsi_stmt (si)))
1880 continue;
1881 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882 stmt_info = vect_stmt_to_vectorize (stmt_info);
1883 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885 && !PURE_SLP_STMT (stmt_info))
1886 /* STMT needs both SLP and loop-based vectorization. */
1887 only_slp_in_loop = false;
1891 if (only_slp_in_loop)
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_NOTE, vect_location,
1895 "Loop contains only SLP stmts\n");
1896 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1898 else
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Loop contains SLP and non-SLP stmts\n");
1903 /* Both the vectorization factor and unroll factor have the form
1904 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905 so they must have a common multiple. */
1906 vectorization_factor
1907 = force_common_multiple (vectorization_factor,
1908 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
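      /* As an illustrative example with constant factors: a loop VF of 4
	 combined with an SLP unrolling factor of 6 has a least common
	 multiple of 12, so the loop would be unrolled further to keep the
	 non-SLP and SLP parts consistent.  */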
1911 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912 if (dump_enabled_p ())
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "Updating vectorization factor to ");
1916 dump_dec (MSG_NOTE, vectorization_factor);
1917 dump_printf (MSG_NOTE, ".\n");
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922 the other phi in the reduction is also relevant for vectorization.
1923 This rejects cases such as:
1925 outer1:
1926 x_1 = PHI <x_3(outer2), ...>;
1929 inner:
1930 x_2 = ...;
1933 outer2:
1934 x_3 = PHI <x_2(inner)>;
1936 if nothing in x_2 or elsewhere makes x_1 relevant. */
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942 return false;
1944 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1947 /* Function vect_analyze_loop_operations.
1949 Scan the loop stmts and make sure they are all vectorizable. */
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1954 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956 int nbbs = loop->num_nodes;
1957 int i;
1958 stmt_vec_info stmt_info;
1959 bool need_to_vectorize = false;
1960 bool ok;
1962 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1964 auto_vec<stmt_info_for_cost> cost_vec;
1966 for (i = 0; i < nbbs; i++)
1968 basic_block bb = bbs[i];
1970 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971 gsi_next (&si))
1973 gphi *phi = si.phi ();
1974 ok = true;
1976 stmt_info = loop_vinfo->lookup_stmt (phi);
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979 (gimple *) phi);
1980 if (virtual_operand_p (gimple_phi_result (phi)))
1981 continue;
1983 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984 (i.e., a phi in the tail of the outer-loop). */
1985 if (! is_loop_header_bb_p (bb))
1987 /* FORNOW: we currently don't support the case that these phis
1988 are not used in the outer loop (unless it is a double reduction,
1989 i.e., this phi is vect_reduction_def), because this case
1990 requires us to actually do something here. */
1991 if (STMT_VINFO_LIVE_P (stmt_info)
1992 && !vect_active_double_reduction_p (stmt_info))
1993 return opt_result::failure_at (phi,
1994 "Unsupported loop-closed phi"
1995 " in outer-loop.\n");
1997 /* If PHI is used in the outer loop, we check that its operand
1998 is defined in the inner loop. */
1999 if (STMT_VINFO_RELEVANT_P (stmt_info))
2001 tree phi_op;
2003 if (gimple_phi_num_args (phi) != 1)
2004 return opt_result::failure_at (phi, "unsupported phi");
2006 phi_op = PHI_ARG_DEF (phi, 0);
2007 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008 if (!op_def_info)
2009 return opt_result::failure_at (phi, "unsupported phi\n");
2011 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012 && (STMT_VINFO_RELEVANT (op_def_info)
2013 != vect_used_in_outer_by_reduction))
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2016 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017 || (STMT_VINFO_DEF_TYPE (stmt_info)
2018 == vect_double_reduction_def))
2019 && !vectorizable_lc_phi (loop_vinfo,
2020 stmt_info, NULL, NULL))
2021 return opt_result::failure_at (phi, "unsupported phi\n");
2024 continue;
2027 gcc_assert (stmt_info);
2029 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030 || STMT_VINFO_LIVE_P (stmt_info))
2031 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033 /* A scalar-dependence cycle that we don't support. */
2034 return opt_result::failure_at (phi,
2035 "not vectorized:"
2036 " scalar dependence cycle.\n");
2038 if (STMT_VINFO_RELEVANT_P (stmt_info))
2040 need_to_vectorize = true;
2041 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042 && ! PURE_SLP_STMT (stmt_info))
2043 ok = vectorizable_induction (loop_vinfo,
2044 stmt_info, NULL, NULL,
2045 &cost_vec);
2046 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047 || (STMT_VINFO_DEF_TYPE (stmt_info)
2048 == vect_double_reduction_def)
2049 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050 && ! PURE_SLP_STMT (stmt_info))
2051 ok = vectorizable_reduction (loop_vinfo,
2052 stmt_info, NULL, NULL, &cost_vec);
2053 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054 == vect_first_order_recurrence)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057 &cost_vec);
2060 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2061 if (ok
2062 && STMT_VINFO_LIVE_P (stmt_info)
2063 && !PURE_SLP_STMT (stmt_info))
2064 ok = vectorizable_live_operation (loop_vinfo,
2065 stmt_info, NULL, NULL, NULL,
2066 -1, false, &cost_vec);
2068 if (!ok)
2069 return opt_result::failure_at (phi,
2070 "not vectorized: relevant phi not "
2071 "supported: %G",
2072 static_cast <gimple *> (phi));
2075 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2076 gsi_next (&si))
2078 gimple *stmt = gsi_stmt (si);
2079 if (!gimple_clobber_p (stmt)
2080 && !is_gimple_debug (stmt))
2082 opt_result res
2083 = vect_analyze_stmt (loop_vinfo,
2084 loop_vinfo->lookup_stmt (stmt),
2085 &need_to_vectorize,
2086 NULL, NULL, &cost_vec);
2087 if (!res)
2088 return res;
2091 } /* bbs */
2093 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2095 /* All operations in the loop are either irrelevant (deal with loop
2096 control, or dead), or only used outside the loop and can be moved
2097 out of the loop (e.g. invariants, inductions). The loop can be
2098 optimized away by scalar optimizations. We're better off not
2099 touching this loop. */
2100 if (!need_to_vectorize)
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "All the computation can be taken out of the loop.\n");
2105 return opt_result::failure_at
2106 (vect_location,
2107 "not vectorized: redundant loop. no profit to vectorize.\n");
2110 return opt_result::success ();
2113 /* Return true if we know that the iteration count is smaller than the
2114 vectorization factor. Return false if it isn't, or if we can't be sure
2115 either way. */
2117 static bool
2118 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2120 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2122 HOST_WIDE_INT max_niter;
2123 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2124 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2125 else
2126 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2128 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2129 return true;
2131 return false;
2134 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2135 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2136 definitely no, or -1 if it's worth retrying. */
2138 static int
2139 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2140 unsigned *suggested_unroll_factor)
2142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2143 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2145 /* Only loops that can handle partially-populated vectors can have iteration
2146 counts less than the vectorization factor. */
2147 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2149 if (vect_known_niters_smaller_than_vf (loop_vinfo))
2151 if (dump_enabled_p ())
2152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153 "not vectorized: iteration count smaller than "
2154 "vectorization factor.\n");
2155 return 0;
2159 /* If using the "very cheap" model, reject cases in which we'd keep
2160 a copy of the scalar code (even if we might be able to vectorize it). */
2161 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2162 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2163 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2164 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2166 if (dump_enabled_p ())
2167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168 "some scalar iterations would need to be peeled\n");
2169 return 0;
2172 int min_profitable_iters, min_profitable_estimate;
2173 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2174 &min_profitable_estimate,
2175 suggested_unroll_factor);
2177 if (min_profitable_iters < 0)
2179 if (dump_enabled_p ())
2180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2181 "not vectorized: vectorization not profitable.\n");
2182 if (dump_enabled_p ())
2183 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184 "not vectorized: vector version will never be "
2185 "profitable.\n");
2186 return -1;
2189 int min_scalar_loop_bound = (param_min_vect_loop_bound
2190 * assumed_vf);
2192 /* Use the cost model only if it is more conservative than the user-specified
2193 threshold. */
2194 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2195 min_profitable_iters);
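  /* Illustrative example (made-up numbers): with --param min-vect-loop-bound=2
     and an assumed VF of 8, min_scalar_loop_bound is 16; if the cost model
     reports min_profitable_iters of 10, the threshold TH becomes 16.  */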
2197 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2199 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2200 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "not vectorized: vectorization not profitable.\n");
2205 if (dump_enabled_p ())
2206 dump_printf_loc (MSG_NOTE, vect_location,
2207 "not vectorized: iteration count smaller than user "
2208 "specified loop bound parameter or minimum profitable "
2209 "iterations (whichever is more conservative).\n");
2210 return 0;
2213 /* The static profitability threshold min_profitable_estimate includes
2214 the cost of having to check at runtime whether the scalar loop
2215 should be used instead. If it turns out that we don't need or want
2216 such a check, the threshold we should use for the static estimate
2217 is simply the point at which the vector loop becomes more profitable
2218 than the scalar loop. */
2219 if (min_profitable_estimate > min_profitable_iters
2220 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2221 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2222 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2223 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2225 if (dump_enabled_p ())
2226 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2227 " choice between the scalar and vector loops\n");
2228 min_profitable_estimate = min_profitable_iters;
2231 /* If the vector loop needs multiple iterations to be beneficial then
2232 things are probably too close to call, and the conservative thing
2233 would be to stick with the scalar code. */
2234 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2235 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 "one iteration of the vector loop would be"
2240 " more expensive than the equivalent number of"
2241 " iterations of the scalar loop\n");
2242 return 0;
2245 HOST_WIDE_INT estimated_niter;
2247 /* If we are vectorizing an epilogue then we know the maximum number of
2248 scalar iterations it will cover is at least one lower than the
2249 vectorization factor of the main loop. */
2250 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2251 estimated_niter
2252 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
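    /* For example, if the main loop was vectorized with a VF of 16, this
       epilogue can cover at most 15 scalar iterations, so 15 is used as the
       estimate here.  */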
2253 else
2255 estimated_niter = estimated_stmt_executions_int (loop);
2256 if (estimated_niter == -1)
2257 estimated_niter = likely_max_stmt_executions_int (loop);
2259 if (estimated_niter != -1
2260 && ((unsigned HOST_WIDE_INT) estimated_niter
2261 < MAX (th, (unsigned) min_profitable_estimate)))
2263 if (dump_enabled_p ())
2264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2265 "not vectorized: estimated iteration count too "
2266 "small.\n");
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_NOTE, vect_location,
2269 "not vectorized: estimated iteration count smaller "
2270 "than specified loop bound parameter or minimum "
2271 "profitable iterations (whichever is more "
2272 "conservative).\n");
2273 return -1;
2276 return 1;
2279 static opt_result
2280 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2281 vec<data_reference_p> *datarefs,
2282 unsigned int *n_stmts)
2284 *n_stmts = 0;
2285 for (unsigned i = 0; i < loop->num_nodes; i++)
2286 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2287 !gsi_end_p (gsi); gsi_next (&gsi))
2289 gimple *stmt = gsi_stmt (gsi);
2290 if (is_gimple_debug (stmt))
2291 continue;
2292 ++(*n_stmts);
2293 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2294 NULL, 0);
2295 if (!res)
2297 if (is_gimple_call (stmt) && loop->safelen)
2299 tree fndecl = gimple_call_fndecl (stmt), op;
2300 if (fndecl == NULL_TREE
2301 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2303 fndecl = gimple_call_arg (stmt, 0);
2304 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2305 fndecl = TREE_OPERAND (fndecl, 0);
2306 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2308 if (fndecl != NULL_TREE)
2310 cgraph_node *node = cgraph_node::get (fndecl);
2311 if (node != NULL && node->simd_clones != NULL)
2313 unsigned int j, n = gimple_call_num_args (stmt);
2314 for (j = 0; j < n; j++)
2316 op = gimple_call_arg (stmt, j);
2317 if (DECL_P (op)
2318 || (REFERENCE_CLASS_P (op)
2319 && get_base_address (op)))
2320 break;
2322 op = gimple_call_lhs (stmt);
2323 /* Ignore #pragma omp declare simd functions
2324 if they don't have data references in the
2325 call stmt itself. */
2326 if (j == n
2327 && !(op
2328 && (DECL_P (op)
2329 || (REFERENCE_CLASS_P (op)
2330 && get_base_address (op)))))
2331 continue;
2335 return res;
2337 /* If dependence analysis will give up due to the limit on the
2338 number of datarefs, stop here and fail fatally. */
2339 if (datarefs->length ()
2340 > (unsigned)param_loop_max_datarefs_for_datadeps)
2341 return opt_result::failure_at (stmt, "exceeded param "
2342 "loop-max-datarefs-for-datadeps\n");
2344 return opt_result::success ();
2347 /* Look for SLP-only access groups and turn each individual access into its own
2348 group. */
2349 static void
2350 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2352 unsigned int i;
2353 struct data_reference *dr;
2355 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2357 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2358 FOR_EACH_VEC_ELT (datarefs, i, dr)
2360 gcc_assert (DR_REF (dr));
2361 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2363 /* Check if the load is a part of an interleaving chain. */
2364 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2366 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2367 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2368 unsigned int group_size = DR_GROUP_SIZE (first_element);
2370 /* Check whether this is an SLP-only group. */
2371 if (!STMT_SLP_TYPE (stmt_info)
2372 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2374 /* Dissolve the group. */
2375 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2377 stmt_vec_info vinfo = first_element;
2378 while (vinfo)
2380 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2381 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2382 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2383 DR_GROUP_SIZE (vinfo) = 1;
2384 if (STMT_VINFO_STRIDED_P (first_element))
2385 DR_GROUP_GAP (vinfo) = 0;
2386 else
2387 DR_GROUP_GAP (vinfo) = group_size - 1;
2388 /* Duplicate and adjust alignment info; it needs to
2389 be present on each group leader (see dr_misalignment). */
2390 if (vinfo != first_element)
2392 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2393 dr_info2->target_alignment = dr_info->target_alignment;
2394 int misalignment = dr_info->misalignment;
2395 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2397 HOST_WIDE_INT diff
2398 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2399 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2400 unsigned HOST_WIDE_INT align_c
2401 = dr_info->target_alignment.to_constant ();
2402 misalignment = (misalignment + diff) % align_c;
2404 dr_info2->misalignment = misalignment;
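		    /* Worked example (illustrative numbers): if the group
		       leader has misalignment 8 against a 16-byte target
		       alignment and this member's DR_INIT is 4 bytes larger,
		       its misalignment becomes (8 + 4) % 16 = 12.  */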
2406 vinfo = next;
2413 /* Determine if operating on full vectors for LOOP_VINFO might leave
2414 some scalar iterations still to do. If so, decide how we should
2415 handle those scalar iterations. The possibilities are:
2417 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2418 In this case:
2420 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2421 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2422 LOOP_VINFO_PEELING_FOR_NITER == false
2424 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2425 to handle the remaining scalar iterations. In this case:
2427 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2428 LOOP_VINFO_PEELING_FOR_NITER == true
2430 There are two choices:
2432 (2a) Consider vectorizing the epilogue loop at the same VF as the
2433 main loop, but using partial vectors instead of full vectors.
2434 In this case:
2436 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2438 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2439 In this case:
2441 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2443 When FOR_EPILOGUE_P is true, make this determination based on the
2444 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2445 based on the assumption that LOOP_VINFO is the main loop. The caller
2446 has made sure that the number of iterations is set appropriately for
2447 this value of FOR_EPILOGUE_P. */
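/* As an illustrative example, with a VF of 16 and 100 scalar iterations,
   96 iterations fill six full vectors and 4 are left over.  Case (1)
   handles those 4 with partial vectors in this loop itself; case (2)
   leaves them to an epilogue loop, which may in turn use partial vectors
   at the same VF (2a) or be vectorized at a lower VF such as 8 or 4 (2b).  */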
2449 opt_result
2450 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2451 bool for_epilogue_p)
2453 /* Determine whether there would be any scalar iterations left over. */
2454 bool need_peeling_or_partial_vectors_p
2455 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2457 /* Decide whether to vectorize the loop with partial vectors. */
2458 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2459 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2460 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2461 && need_peeling_or_partial_vectors_p)
2463 /* For partial-vector-usage=1, try to push the handling of partial
2464 vectors to the epilogue, with the main loop continuing to operate
2465 on full vectors.
2467 If we are unrolling we also do not want to use partial vectors. This
2468 is to avoid the overhead of generating multiple masks and also to
2469 avoid having to execute entire iterations of FALSE masked instructions
2470 when dealing with one or fewer full iterations.
2472 ??? We could then end up failing to use partial vectors if we
2473 decide to peel iterations into a prologue, and if the main loop
2474 then ends up processing fewer than VF iterations. */
2475 if ((param_vect_partial_vector_usage == 1
2476 || loop_vinfo->suggested_unroll_factor > 1)
2477 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2478 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2479 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2480 else
2481 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2484 if (dump_enabled_p ())
2486 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "operating on partial vectors%s.\n",
2489 for_epilogue_p ? " for epilogue loop" : "");
2490 else
2491 dump_printf_loc (MSG_NOTE, vect_location,
2492 "operating only on full vectors%s.\n",
2493 for_epilogue_p ? " for epilogue loop" : "");
2496 if (for_epilogue_p)
2498 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2499 gcc_assert (orig_loop_vinfo);
2500 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2501 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2502 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2505 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2506 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2508 /* Check that the loop processes at least one full vector. */
2509 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2510 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2511 if (known_lt (wi::to_widest (scalar_niters), vf))
2512 return opt_result::failure_at (vect_location,
2513 "loop does not have enough iterations"
2514 " to support vectorization.\n");
2516 /* If we need to peel an extra epilogue iteration to handle data
2517 accesses with gaps, check that there are enough scalar iterations
2518 available.
2520 The check above is redundant with this one when peeling for gaps,
2521 but the distinction is useful for diagnostics. */
2522 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2523 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2524 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2525 return opt_result::failure_at (vect_location,
2526 "loop does not have enough iterations"
2527 " to support peeling for gaps.\n");
2530 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2531 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2532 && need_peeling_or_partial_vectors_p);
2534 return opt_result::success ();
2537 /* Function vect_analyze_loop_2.
2539 Apply a set of analyses on the loop specified by LOOP_VINFO; the
2540 different analyses will record information in some members of
2541 LOOP_VINFO. FATAL indicates whether some analysis hits a fatal error.
2542 If a non-NULL pointer SUGGESTED_UNROLL_FACTOR is provided, it is
2543 filled with the worked-out suggested unroll factor, while a NULL
2544 pointer indicates that the suggested unroll factor is being applied.
2545 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the
2546 suggested unroll factor was worked out. */
2547 static opt_result
2548 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2549 unsigned *suggested_unroll_factor,
2550 bool& slp_done_for_suggested_uf)
2552 opt_result ok = opt_result::success ();
2553 int res;
2554 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2555 poly_uint64 min_vf = 2;
2556 loop_vec_info orig_loop_vinfo = NULL;
2558 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2559 loop_vec_info of the first vectorized loop. */
2560 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2561 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2562 else
2563 orig_loop_vinfo = loop_vinfo;
2564 gcc_assert (orig_loop_vinfo);
2566 /* The first group of checks is independent of the vector size. */
2567 fatal = true;
2569 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2570 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2571 return opt_result::failure_at (vect_location,
2572 "not vectorized: simd if(0)\n");
2574 /* Find all data references in the loop (which correspond to vdefs/vuses)
2575 and analyze their evolution in the loop. */
2577 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2579 /* Gather the data references and count stmts in the loop. */
2580 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2582 opt_result res
2583 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2584 &LOOP_VINFO_DATAREFS (loop_vinfo),
2585 &LOOP_VINFO_N_STMTS (loop_vinfo));
2586 if (!res)
2588 if (dump_enabled_p ())
2589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2590 "not vectorized: loop contains function "
2591 "calls or data references that cannot "
2592 "be analyzed\n");
2593 return res;
2595 loop_vinfo->shared->save_datarefs ();
2597 else
2598 loop_vinfo->shared->check_datarefs ();
2600 /* Analyze the data references and also adjust the minimal
2601 vectorization factor according to the loads and stores. */
2603 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2604 if (!ok)
2606 if (dump_enabled_p ())
2607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2608 "bad data references.\n");
2609 return ok;
2612 /* Check if we are applying unroll factor now. */
2613 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2614 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2616 /* If the SLP decision was false when the suggested unroll factor was
2617 worked out, and we are applying the suggested unroll factor, we can
2618 simply skip all SLP-related analyses this time. */
2619 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2621 /* Classify all cross-iteration scalar data-flow cycles.
2622 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2623 vect_analyze_scalar_cycles (loop_vinfo, slp);
2625 vect_pattern_recog (loop_vinfo);
2627 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2629 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2630 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2632 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2633 if (!ok)
2635 if (dump_enabled_p ())
2636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2637 "bad data access.\n");
2638 return ok;
2641 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2643 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2644 if (!ok)
2646 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2648 "unexpected pattern.\n");
2649 return ok;
2652 /* While the rest of the analysis below depends on it in some way. */
2653 fatal = false;
2655 /* Analyze data dependences between the data-refs in the loop
2656 and adjust the maximum vectorization factor according to
2657 the dependences.
2658 FORNOW: fail at the first data dependence that we encounter. */
2660 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2661 if (!ok)
2663 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2665 "bad data dependence.\n");
2666 return ok;
2668 if (max_vf != MAX_VECTORIZATION_FACTOR
2669 && maybe_lt (max_vf, min_vf))
2670 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2671 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2673 ok = vect_determine_vectorization_factor (loop_vinfo);
2674 if (!ok)
2676 if (dump_enabled_p ())
2677 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2678 "can't determine vectorization factor.\n");
2679 return ok;
2681 if (max_vf != MAX_VECTORIZATION_FACTOR
2682 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2683 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2685 /* Compute the scalar iteration cost. */
2686 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2688 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2690 if (slp)
2692 /* Check the SLP opportunities in the loop, analyze and build
2693 SLP trees. */
2694 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2695 if (!ok)
2696 return ok;
2698 /* If there are any SLP instances mark them as pure_slp. */
2699 slp = vect_make_slp_decision (loop_vinfo);
2700 if (slp)
2702 /* Find stmts that need to be both vectorized and SLPed. */
2703 vect_detect_hybrid_slp (loop_vinfo);
2705 /* Update the vectorization factor based on the SLP decision. */
2706 vect_update_vf_for_slp (loop_vinfo);
2708 /* Optimize the SLP graph with the vectorization factor fixed. */
2709 vect_optimize_slp (loop_vinfo);
2711 /* Gather the loads reachable from the SLP graph entries. */
2712 vect_gather_slp_loads (loop_vinfo);
2716 bool saved_can_use_partial_vectors_p
2717 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2719 /* We don't expect to have to roll back to anything other than an empty
2720 set of rgroups. */
2721 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2723 /* This is the point where we can re-start analysis with SLP forced off. */
2724 start_over:
2726 /* Apply the suggested unrolling factor; this was determined by the backend
2727 during finish_cost the first time we ran the analysis for this
2728 vector mode. */
2729 if (applying_suggested_uf)
2730 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
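  /* For example, a vectorization factor of 4 combined with a suggested
     unroll factor of 2 results in a final vectorization factor of 8
     (illustrative numbers).  */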
2732 /* Now the vectorization factor is final. */
2733 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2734 gcc_assert (known_ne (vectorization_factor, 0U));
2736 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2738 dump_printf_loc (MSG_NOTE, vect_location,
2739 "vectorization_factor = ");
2740 dump_dec (MSG_NOTE, vectorization_factor);
2741 dump_printf (MSG_NOTE, ", niters = %wd\n",
2742 LOOP_VINFO_INT_NITERS (loop_vinfo));
2745 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2747 /* Analyze the alignment of the data-refs in the loop.
2748 Fail if a data reference is found that cannot be vectorized. */
2750 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2751 if (!ok)
2753 if (dump_enabled_p ())
2754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2755 "bad data alignment.\n");
2756 return ok;
2759 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2760 It is important to call pruning after vect_analyze_data_ref_accesses,
2761 since we use grouping information gathered by interleaving analysis. */
2762 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2763 if (!ok)
2764 return ok;
2766 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2767 vectorization, since we do not want to add extra peeling or
2768 add versioning for alignment. */
2769 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2770 /* This pass will decide on using loop versioning and/or loop peeling in
2771 order to enhance the alignment of data references in the loop. */
2772 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2773 if (!ok)
2774 return ok;
2776 if (slp)
2778 /* Analyze operations in the SLP instances. Note this may
2779 remove unsupported SLP instances which makes the above
2780 SLP kind detection invalid. */
2781 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2782 vect_slp_analyze_operations (loop_vinfo);
2783 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2785 ok = opt_result::failure_at (vect_location,
2786 "unsupported SLP instances\n");
2787 goto again;
2790 /* Check whether any load in ALL SLP instances is possibly permuted. */
2791 slp_tree load_node, slp_root;
2792 unsigned i, x;
2793 slp_instance instance;
2794 bool can_use_lanes = true;
2795 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2797 slp_root = SLP_INSTANCE_TREE (instance);
2798 int group_size = SLP_TREE_LANES (slp_root);
2799 tree vectype = SLP_TREE_VECTYPE (slp_root);
2800 bool loads_permuted = false;
2801 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2803 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2804 continue;
2805 unsigned j;
2806 stmt_vec_info load_info;
2807 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2808 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2810 loads_permuted = true;
2811 break;
2815 /* If the loads and stores can be handled with load/store-lane
2816 instructions record it and move on to the next instance. */
2817 if (loads_permuted
2818 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2819 && vect_store_lanes_supported (vectype, group_size, false))
2821 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2823 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2824 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2825 /* Use SLP for strided accesses (or if we can't
2826 use load-lanes). */
2827 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2828 || ! vect_load_lanes_supported
2829 (STMT_VINFO_VECTYPE (stmt_vinfo),
2830 DR_GROUP_SIZE (stmt_vinfo), false))
2831 break;
2834 can_use_lanes
2835 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2837 if (can_use_lanes && dump_enabled_p ())
2838 dump_printf_loc (MSG_NOTE, vect_location,
2839 "SLP instance %p can use load/store-lanes\n",
2840 (void *) instance);
2842 else
2844 can_use_lanes = false;
2845 break;
2849 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2850 with SLP disabled. */
2851 if (can_use_lanes)
2853 ok = opt_result::failure_at (vect_location,
2854 "Built SLP cancelled: can use "
2855 "load/store-lanes\n");
2856 if (dump_enabled_p ())
2857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2858 "Built SLP cancelled: all SLP instances support "
2859 "load/store-lanes\n");
2860 goto again;
2864 /* Dissolve SLP-only groups. */
2865 vect_dissolve_slp_only_groups (loop_vinfo);
2867 /* Scan all the remaining operations in the loop that are not subject
2868 to SLP and make sure they are vectorizable. */
2869 ok = vect_analyze_loop_operations (loop_vinfo);
2870 if (!ok)
2872 if (dump_enabled_p ())
2873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2874 "bad operation or unsupported loop bound.\n");
2875 return ok;
2878 /* For now, we don't expect to mix both masking and length approaches for
2879 one loop; disable partial vectors if both are recorded. */
2880 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2881 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2882 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2886 "can't vectorize a loop with partial vectors"
2887 " because we don't expect to mix different"
2888 " approaches with partial vectors for the"
2889 " same loop.\n");
2890 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2893 /* If we still have the option of using partial vectors,
2894 check whether we can generate the necessary loop controls. */
2895 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2897 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2899 if (!vect_verify_full_masking (loop_vinfo)
2900 && !vect_verify_full_masking_avx512 (loop_vinfo))
2901 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2903 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2904 if (!vect_verify_loop_lens (loop_vinfo))
2905 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2908 /* If we're vectorizing a loop that uses length "controls" and
2909 can iterate more than once, we apply the decrementing IV approach
2910 in loop control. */
2911 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2912 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2913 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2914 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2915 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2916 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2917 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2919 /* If a loop uses length controls and has a decrementing loop control IV,
2920 we will normally pass that IV through a MIN_EXPR to calculate the
2921 basis for the length controls. E.g. in a loop that processes one
2922 element per scalar iteration, the number of elements would be
2923 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2925 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2926 step, since only the final iteration of the vector loop can have
2927 inactive lanes.
2929 However, some targets have a dedicated instruction for calculating the
2930 preferred length, given the total number of elements that still need to
2931 be processed. This is encapsulated in the SELECT_VL internal function.
2933 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2934 to determine the basis for the length controls. However, unlike the
2935 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2936 lanes inactive in any iteration of the vector loop, not just the last
2937 iteration. This SELECT_VL approach therefore requires us to use pointer
2938 IVs with variable steps.
2940 Once we've decided how many elements should be processed by one
2941 iteration of the vector loop, we need to populate the rgroup controls.
2942 If a loop has multiple rgroups, we need to make sure that those rgroups
2943 "line up" (that is, they must be consistent about which elements are
2944 active and which aren't). This is done by vect_adjust_loop_lens_control.
2946 In principle, it would be possible to use vect_adjust_loop_lens_control
2947 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2948 However:
2950 (1) In practice, it only makes sense to use SELECT_VL when a vector
2951 operation will be controlled directly by the result. It is not
2952 worth using SELECT_VL if it would only be the input to other
2953 calculations.
2955 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2956 pointer IV will need N updates by a variable amount (N-1 updates
2957 within the iteration and 1 update to move to the next iteration).
2959 Because of this, we prefer to use the MIN_EXPR approach whenever there
2960 is more than one length control.
2962 In addition, SELECT_VL always operates to a granularity of 1 unit.
2963 If we wanted to use it to control an SLP operation on N consecutive
2964 elements, we would need to make the SELECT_VL inputs measure scalar
2965 iterations (rather than elements) and then multiply the SELECT_VL
2966 result by N. But using SELECT_VL this way is inefficient because
2967 of (1) above.
2969 Finally, we don't apply SELECT_VL to a single rgroup when both (1) and
2970 (2) are satisfied:
2972 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2973 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2975 In that case SELECT_VL (variable step) would make SCEV analysis fail and
2976 we would lose the benefit of the subsequent unroll optimizations, so we
2977 prefer using the MIN_EXPR approach in this situation. */
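  /* Illustrative sketch of the two schemes for a single length control
     (pseudo-gimple, names for exposition only):

	MIN_EXPR:    len = MIN <remaining, VF>;
		     ptr = ptr + VF * step;	// invariant step

	SELECT_VL:   len = .SELECT_VL (remaining, VF);
		     ptr = ptr + len * step;	// variable step  */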
2978 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2980 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2981 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
2982 OPTIMIZE_FOR_SPEED)
2983 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
2984 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
2985 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2986 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2987 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2990 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2991 to be able to handle fewer than VF scalars, or needs to have a lower VF
2992 than the main loop. */
2993 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2994 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2995 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2996 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2997 return opt_result::failure_at (vect_location,
2998 "Vectorization factor too high for"
2999 " epilogue loop.\n");
3001 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3002 assuming that the loop will be used as a main loop. We will redo
3003 this analysis later if we instead decide to use the loop as an
3004 epilogue loop. */
3005 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
3006 if (!ok)
3007 return ok;
3009 /* Check the costings of the loop make vectorizing worthwhile. */
3010 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3011 if (res < 0)
3013 ok = opt_result::failure_at (vect_location,
3014 "Loop costings may not be worthwhile.\n");
3015 goto again;
3017 if (!res)
3018 return opt_result::failure_at (vect_location,
3019 "Loop costings not worthwhile.\n");
3021 /* If an epilogue loop is required make sure we can create one. */
3022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3023 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3025 if (dump_enabled_p ())
3026 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3027 if (!vect_can_advance_ivs_p (loop_vinfo)
3028 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3029 single_exit (LOOP_VINFO_LOOP
3030 (loop_vinfo))))
3032 ok = opt_result::failure_at (vect_location,
3033 "not vectorized: can't create required "
3034 "epilog loop\n");
3035 goto again;
3039 /* During peeling, we need to check whether the number of loop iterations
3040 is enough for both the peeled prolog loop and the vector loop. This check
3041 can be merged along with threshold check of loop versioning, so
3042 increase threshold for this case if necessary.
3044 If we are analyzing an epilogue we still want to check what its
3045 versioning threshold would be. If we decide to vectorize the epilogues we
3046 will want to use the lowest versioning threshold of all epilogues and main
3047 loop. This will enable us to enter a vectorized epilogue even when
3048 versioning the loop. We can't simply check whether the epilogue requires
3049 versioning though since we may have skipped some versioning checks when
3050 analyzing the epilogue. For instance, checks for alias versioning will be
3051 skipped when dealing with epilogues as we assume we already checked them
3052 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3053 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3055 poly_uint64 niters_th = 0;
3056 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3058 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3060 /* Niters for peeled prolog loop. */
3061 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3063 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3064 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3065 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3067 else
3068 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3071 /* Niters for at least one iteration of vectorized loop. */
3072 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3073 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3074 /* One additional iteration because of peeling for gap. */
3075 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3076 niters_th += 1;
3078 /* Use the same condition as vect_transform_loop to decide when to use
3079 the cost to determine a versioning threshold. */
3080 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3081 && ordered_p (th, niters_th))
3082 niters_th = ordered_max (poly_uint64 (th), niters_th);
3084 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
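      /* Illustrative example: peeling 3 prologue iterations for alignment,
	 a VF of 8 without partial vectors, and peeling for gaps give
	 niters_th = 3 + 8 + 1 = 12, possibly raised to the cost-model
	 threshold TH when the runtime profitability check applies.  */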
3087 gcc_assert (known_eq (vectorization_factor,
3088 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3090 slp_done_for_suggested_uf = slp;
3092 /* Ok to vectorize! */
3093 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3094 return opt_result::success ();
3096 again:
3097 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3098 gcc_assert (!ok);
3100 /* Try again with SLP forced off, but if we didn't do any SLP there is
3101 no point in re-trying. */
3102 if (!slp)
3103 return ok;
3105 /* If the SLP decision was true when the suggested unroll factor was
3106 worked out, and we are applying the suggested unroll factor, we don't
3107 need to re-try any more. */
3108 if (applying_suggested_uf && slp_done_for_suggested_uf)
3109 return ok;
3111 /* If there are reduction chains re-trying will fail anyway. */
3112 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3113 return ok;
3115 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3116 via interleaving or lane instructions. */
3117 slp_instance instance;
3118 slp_tree node;
3119 unsigned i, j;
3120 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3122 stmt_vec_info vinfo;
3123 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3124 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3125 continue;
3126 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3127 unsigned int size = DR_GROUP_SIZE (vinfo);
3128 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3129 if (! vect_store_lanes_supported (vectype, size, false)
3130 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3131 && ! vect_grouped_store_supported (vectype, size))
3132 return opt_result::failure_at (vinfo->stmt,
3133 "unsupported grouped store\n");
3134 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3136 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3137 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3138 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3139 size = DR_GROUP_SIZE (vinfo);
3140 vectype = STMT_VINFO_VECTYPE (vinfo);
3141 if (! vect_load_lanes_supported (vectype, size, false)
3142 && ! vect_grouped_load_supported (vectype, single_element_p,
3143 size))
3144 return opt_result::failure_at (vinfo->stmt,
3145 "unsupported grouped load\n");
3149 if (dump_enabled_p ())
3150 dump_printf_loc (MSG_NOTE, vect_location,
3151 "re-trying with SLP disabled\n");
3153 /* Roll back state appropriately. No SLP this time. */
3154 slp = false;
3155 /* Restore vectorization factor as it were without SLP. */
3156 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3157 /* Free the SLP instances. */
3158 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3159 vect_free_slp_instance (instance);
3160 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3161 /* Reset SLP type to loop_vect on all stmts. */
3162 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3164 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3165 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3166 !gsi_end_p (si); gsi_next (&si))
3168 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3169 STMT_SLP_TYPE (stmt_info) = loop_vect;
3170 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3171 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3173 /* vectorizable_reduction adjusts reduction stmt def-types;
3174 restore them to that of the PHI. */
3175 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3176 = STMT_VINFO_DEF_TYPE (stmt_info);
3177 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3178 (STMT_VINFO_REDUC_DEF (stmt_info)))
3179 = STMT_VINFO_DEF_TYPE (stmt_info);
3182 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3183 !gsi_end_p (si); gsi_next (&si))
3185 if (is_gimple_debug (gsi_stmt (si)))
3186 continue;
3187 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3188 STMT_SLP_TYPE (stmt_info) = loop_vect;
3189 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3191 stmt_vec_info pattern_stmt_info
3192 = STMT_VINFO_RELATED_STMT (stmt_info);
3193 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3194 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3196 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3197 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3198 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3199 !gsi_end_p (pi); gsi_next (&pi))
3200 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3201 = loop_vect;
3205 /* Free optimized alias test DDRS. */
3206 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3207 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3208 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3209 /* Reset target cost data. */
3210 delete loop_vinfo->vector_costs;
3211 loop_vinfo->vector_costs = nullptr;
3212 /* Reset accumulated rgroup information. */
3213 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3214 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3215 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3216 /* Reset assorted flags. */
3217 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3218 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3219 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3220 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3221 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3222 = saved_can_use_partial_vectors_p;
3224 goto start_over;
3227 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3228 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3229 OLD_LOOP_VINFO is better unless something specifically indicates
3230 otherwise.
3232 Note that this deliberately isn't a partial order. */
3234 static bool
3235 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3236 loop_vec_info old_loop_vinfo)
3238 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3239 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3241 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3242 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3244 /* Always prefer a VF of loop->simdlen over any other VF. */
3245 if (loop->simdlen)
3247 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3248 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3249 if (new_simdlen_p != old_simdlen_p)
3250 return new_simdlen_p;
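      /* For example, under '#pragma omp simd simdlen(8)', a candidate whose
	 VF is 8 is preferred over one whose VF is not, regardless of the
	 cost comparison below.  */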
3253 const auto *old_costs = old_loop_vinfo->vector_costs;
3254 const auto *new_costs = new_loop_vinfo->vector_costs;
3255 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3256 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3258 return new_costs->better_main_loop_than_p (old_costs);
3261 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3262 true if we should. */
3264 static bool
3265 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3266 loop_vec_info old_loop_vinfo)
3268 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3269 return false;
3271 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_NOTE, vect_location,
3273 "***** Preferring vector mode %s to vector mode %s\n",
3274 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3275 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3276 return true;
3279 /* Analyze LOOP with VECTOR_MODES[MODE_I], as an epilogue if MAIN_LOOP_VINFO
3280 is not NULL. Set AUTODETECTED_VECTOR_MODE if the current mode is VOIDmode,
3281 and advance MODE_I to the next mode useful to analyze.
3282 Return the loop_vinfo on success and wrapped null on failure. */
3284 static opt_loop_vec_info
3285 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3286 const vect_loop_form_info *loop_form_info,
3287 loop_vec_info main_loop_vinfo,
3288 const vector_modes &vector_modes, unsigned &mode_i,
3289 machine_mode &autodetected_vector_mode,
3290 bool &fatal)
3292 loop_vec_info loop_vinfo
3293 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3295 machine_mode vector_mode = vector_modes[mode_i];
3296 loop_vinfo->vector_mode = vector_mode;
3297 unsigned int suggested_unroll_factor = 1;
3298 bool slp_done_for_suggested_uf;
3300 /* Run the main analysis. */
3301 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3302 &suggested_unroll_factor,
3303 slp_done_for_suggested_uf);
3304 if (dump_enabled_p ())
3305 dump_printf_loc (MSG_NOTE, vect_location,
3306 "***** Analysis %s with vector mode %s\n",
3307 res ? "succeeded" : " failed",
3308 GET_MODE_NAME (loop_vinfo->vector_mode));
3310 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3312 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "***** Re-trying analysis for unrolling"
3315 " with unroll factor %d and slp %s.\n",
3316 suggested_unroll_factor,
3317 slp_done_for_suggested_uf ? "on" : "off");
3318 loop_vec_info unroll_vinfo
3319 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3320 unroll_vinfo->vector_mode = vector_mode;
3321 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3322 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3323 slp_done_for_suggested_uf);
3324 if (new_res)
3326 delete loop_vinfo;
3327 loop_vinfo = unroll_vinfo;
3329 else
3330 delete unroll_vinfo;
3333 /* Remember the autodetected vector mode. */
3334 if (vector_mode == VOIDmode)
3335 autodetected_vector_mode = loop_vinfo->vector_mode;
3337 /* Advance mode_i, first skipping modes that would result in the
3338 same analysis result. */
3339 while (mode_i + 1 < vector_modes.length ()
3340 && vect_chooses_same_modes_p (loop_vinfo,
3341 vector_modes[mode_i + 1]))
3343 if (dump_enabled_p ())
3344 dump_printf_loc (MSG_NOTE, vect_location,
3345 "***** The result for vector mode %s would"
3346 " be the same\n",
3347 GET_MODE_NAME (vector_modes[mode_i + 1]));
3348 mode_i += 1;
3350 if (mode_i + 1 < vector_modes.length ()
3351 && VECTOR_MODE_P (autodetected_vector_mode)
3352 && (related_vector_mode (vector_modes[mode_i + 1],
3353 GET_MODE_INNER (autodetected_vector_mode))
3354 == autodetected_vector_mode)
3355 && (related_vector_mode (autodetected_vector_mode,
3356 GET_MODE_INNER (vector_modes[mode_i + 1]))
3357 == vector_modes[mode_i + 1]))
3359 if (dump_enabled_p ())
3360 dump_printf_loc (MSG_NOTE, vect_location,
3361 "***** Skipping vector mode %s, which would"
3362 " repeat the analysis for %s\n",
3363 GET_MODE_NAME (vector_modes[mode_i + 1]),
3364 GET_MODE_NAME (autodetected_vector_mode));
3365 mode_i += 1;
3367 mode_i++;
3369 if (!res)
3371 delete loop_vinfo;
3372 if (fatal)
3373 gcc_checking_assert (main_loop_vinfo == NULL);
3374 return opt_loop_vec_info::propagate_failure (res);
3377 return opt_loop_vec_info::success (loop_vinfo);
3380 /* Function vect_analyze_loop.
3382 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3383 for it. The different analyses will record information in the
3384 loop_vec_info struct. */
3385 opt_loop_vec_info
3386 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3388 DUMP_VECT_SCOPE ("analyze_loop_nest");
3390 if (loop_outer (loop)
3391 && loop_vec_info_for_loop (loop_outer (loop))
3392 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3393 return opt_loop_vec_info::failure_at (vect_location,
3394 "outer-loop already vectorized.\n");
3396 if (!find_loop_nest (loop, &shared->loop_nest))
3397 return opt_loop_vec_info::failure_at
3398 (vect_location,
3399 "not vectorized: loop nest containing two or more consecutive inner"
3400 " loops cannot be vectorized\n");
3402 /* Analyze the loop form. */
3403 vect_loop_form_info loop_form_info;
3404 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3405 if (!res)
3407 if (dump_enabled_p ())
3408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3409 "bad loop form.\n");
3410 return opt_loop_vec_info::propagate_failure (res);
3412 if (!integer_onep (loop_form_info.assumptions))
3414 /* We consider vectorizing this loop by versioning it under
3415 some assumptions. In order to do this, we need to clear
3416 existing information computed by scev and niter analyzer. */
3417 scev_reset_htab ();
3418 free_numbers_of_iterations_estimates (loop);
3419 /* Also set the flag for this loop so that subsequent scev and niter
3420 analyses are done under the assumptions. */
3421 loop_constraint_set (loop, LOOP_C_FINITE);
3424 auto_vector_modes vector_modes;
3425 /* Autodetect first vector size we try. */
3426 vector_modes.safe_push (VOIDmode);
3427 unsigned int autovec_flags
3428 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3429 loop->simdlen != 0);
3430 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3431 && !unlimited_cost_model (loop));
3432 machine_mode autodetected_vector_mode = VOIDmode;
3433 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3434 unsigned int mode_i = 0;
3435 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3437 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3438 a mode has not been analyzed. */
3439 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3440 for (unsigned i = 0; i < vector_modes.length (); ++i)
3441 cached_vf_per_mode.safe_push (0);
3443 /* First determine the main loop vectorization mode, either the first
3444 one that works, starting with auto-detecting the vector mode and then
3445 following the target's order of preference, or the one with the
3446 lowest cost if pick_lowest_cost_p. */
3447 while (1)
3449 bool fatal;
3450 unsigned int last_mode_i = mode_i;
3451 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3452 failed. */
3453 cached_vf_per_mode[last_mode_i] = -1;
3454 opt_loop_vec_info loop_vinfo
3455 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3456 NULL, vector_modes, mode_i,
3457 autodetected_vector_mode, fatal);
3458 if (fatal)
3459 break;
3461 if (loop_vinfo)
3463 /* Analysis has been successful so update the VF value. The
3464 VF should always be a multiple of unroll_factor and we want to
3465 capture the original VF here. */
3466 cached_vf_per_mode[last_mode_i]
3467 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3468 loop_vinfo->suggested_unroll_factor);
3469 /* Once we hit the desired simdlen for the first time,
3470 discard any previous attempts. */
3471 if (simdlen
3472 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3474 delete first_loop_vinfo;
3475 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3476 simdlen = 0;
3478 else if (pick_lowest_cost_p
3479 && first_loop_vinfo
3480 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3482 /* Pick loop_vinfo over first_loop_vinfo. */
3483 delete first_loop_vinfo;
3484 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3486 if (first_loop_vinfo == NULL)
3487 first_loop_vinfo = loop_vinfo;
3488 else
3490 delete loop_vinfo;
3491 loop_vinfo = opt_loop_vec_info::success (NULL);
3494 /* Commit to first_loop_vinfo if we have no reason to try
3495 alternatives. */
3496 if (!simdlen && !pick_lowest_cost_p)
3497 break;
3499 if (mode_i == vector_modes.length ()
3500 || autodetected_vector_mode == VOIDmode)
3501 break;
3503 /* Try the next biggest vector size. */
3504 if (dump_enabled_p ())
3505 dump_printf_loc (MSG_NOTE, vect_location,
3506 "***** Re-trying analysis with vector mode %s\n",
3507 GET_MODE_NAME (vector_modes[mode_i]));
3509 if (!first_loop_vinfo)
3510 return opt_loop_vec_info::propagate_failure (res);
3512 if (dump_enabled_p ())
3513 dump_printf_loc (MSG_NOTE, vect_location,
3514 "***** Choosing vector mode %s\n",
3515 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3517 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3518 enabled, SIMDUID is not set, it is the innermost loop and we have
3519 either already found the loop's SIMDLEN or there was no SIMDLEN to
3520 begin with.
3521 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3522 bool vect_epilogues = (!simdlen
3523 && loop->inner == NULL
3524 && param_vect_epilogues_nomask
3525 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3526 && !loop->simduid);
3527 if (!vect_epilogues)
3528 return first_loop_vinfo;
3530 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3531 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3533 /* For epilogues start the analysis from the first mode. The motivation
3534 behind starting from the beginning comes from cases where the VECTOR_MODES
3535 array may contain length-agnostic and length-specific modes. Their
3536 ordering is not guaranteed, so we could end up picking a mode for the main
3537 loop that is after the epilogue's optimal mode. */
3538 vector_modes[0] = autodetected_vector_mode;
3539 mode_i = 0;
3541 bool supports_partial_vectors =
3542 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3543 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3545 while (1)
3547 /* If the target does not support partial vectors we can shorten the
3548 number of modes to analyze for the epilogue as we know we can't pick a
3549 mode that would lead to a VF at least as big as the
3550 FIRST_VINFO_VF. */
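/* E.g. if the main loop was vectorized with a VF of 16, a mode whose
   cached VF is also 16 or more cannot yield a useful epilogue, because
   without partial vectors the epilogue only ever handles fewer than
   FIRST_VINFO_VF iterations.  */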
3551 if (!supports_partial_vectors
3552 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3554 mode_i++;
3555 if (mode_i == vector_modes.length ())
3556 break;
3557 continue;
3560 if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "***** Re-trying epilogue analysis with vector "
3563 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3565 bool fatal;
3566 opt_loop_vec_info loop_vinfo
3567 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3568 first_loop_vinfo,
3569 vector_modes, mode_i,
3570 autodetected_vector_mode, fatal);
3571 if (fatal)
3572 break;
3574 if (loop_vinfo)
3576 if (pick_lowest_cost_p)
3578 /* Keep trying to roll back vectorization attempts while the
3579 loop_vec_infos they produced were worse than this one. */
3580 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3581 while (!vinfos.is_empty ()
3582 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3584 gcc_assert (vect_epilogues);
3585 delete vinfos.pop ();
3588 /* For now only allow one epilogue loop. */
3589 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3591 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3592 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3593 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3594 || maybe_ne (lowest_th, 0U));
3595 /* Keep track of the known smallest versioning
3596 threshold. */
3597 if (ordered_p (lowest_th, th))
3598 lowest_th = ordered_min (lowest_th, th);
3600 else
3602 delete loop_vinfo;
3603 loop_vinfo = opt_loop_vec_info::success (NULL);
3606 /* For now only allow one epilogue loop, but allow
3607 pick_lowest_cost_p to replace it, so commit to the
3608 first epilogue if we have no reason to try alternatives. */
3609 if (!pick_lowest_cost_p)
3610 break;
3613 if (mode_i == vector_modes.length ())
3614 break;
3618 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3620 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3621 if (dump_enabled_p ())
3622 dump_printf_loc (MSG_NOTE, vect_location,
3623 "***** Choosing epilogue vector mode %s\n",
3624 GET_MODE_NAME
3625 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3628 return first_loop_vinfo;
3631 /* Return true if there is an in-order reduction function for CODE, storing
3632 it in *REDUC_FN if so. */
3634 static bool
3635 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3637 if (code == PLUS_EXPR)
3639 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3640 return true;
3642 return false;
3645 /* Function reduction_fn_for_scalar_code
3647 Input:
3648 CODE - tree_code of a reduction operation.
3650 Output:
3651 REDUC_FN - the corresponding internal function to be used to reduce the
3652 vector of partial results into a single scalar result, or IFN_LAST
3653 if the operation is a supported reduction operation, but does not have
3654 such an internal function.
3656 Return FALSE if CODE currently cannot be vectorized as a reduction. */
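/* For example, a MAX_EXPR reduction maps to IFN_REDUC_MAX below, whereas a
   MULT_EXPR reduction is still vectorizable but has no such internal
   function, so *REDUC_FN is set to IFN_LAST and the epilogue has to
   compute the final scalar value by other means.  */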
3658 bool
3659 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3661 if (code.is_tree_code ())
3662 switch (tree_code (code))
3664 case MAX_EXPR:
3665 *reduc_fn = IFN_REDUC_MAX;
3666 return true;
3668 case MIN_EXPR:
3669 *reduc_fn = IFN_REDUC_MIN;
3670 return true;
3672 case PLUS_EXPR:
3673 *reduc_fn = IFN_REDUC_PLUS;
3674 return true;
3676 case BIT_AND_EXPR:
3677 *reduc_fn = IFN_REDUC_AND;
3678 return true;
3680 case BIT_IOR_EXPR:
3681 *reduc_fn = IFN_REDUC_IOR;
3682 return true;
3684 case BIT_XOR_EXPR:
3685 *reduc_fn = IFN_REDUC_XOR;
3686 return true;
3688 case MULT_EXPR:
3689 case MINUS_EXPR:
3690 *reduc_fn = IFN_LAST;
3691 return true;
3693 default:
3694 return false;
3696 else
3697 switch (combined_fn (code))
3699 CASE_CFN_FMAX:
3700 *reduc_fn = IFN_REDUC_FMAX;
3701 return true;
3703 CASE_CFN_FMIN:
3704 *reduc_fn = IFN_REDUC_FMIN;
3705 return true;
3707 default:
3708 return false;
3712 /* If there is a neutral value X such that a reduction would not be affected
3713 by the introduction of additional X elements, return that X, otherwise
3714 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3715 of the scalar elements. If the reduction has just a single initial value
3716 then INITIAL_VALUE is that value, otherwise it is null. */
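/* For example, the neutral value is 0 for PLUS_EXPR, 1 for MULT_EXPR and
   all-ones for BIT_AND_EXPR, while MIN_EXPR and MAX_EXPR have no universal
   neutral element, so the only safe filler is the single initial value
   itself, when one is known.  */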
3718 tree
3719 neutral_op_for_reduction (tree scalar_type, code_helper code,
3720 tree initial_value)
3722 if (code.is_tree_code ())
3723 switch (tree_code (code))
3725 case WIDEN_SUM_EXPR:
3726 case DOT_PROD_EXPR:
3727 case SAD_EXPR:
3728 case PLUS_EXPR:
3729 case MINUS_EXPR:
3730 case BIT_IOR_EXPR:
3731 case BIT_XOR_EXPR:
3732 return build_zero_cst (scalar_type);
3734 case MULT_EXPR:
3735 return build_one_cst (scalar_type);
3737 case BIT_AND_EXPR:
3738 return build_all_ones_cst (scalar_type);
3740 case MAX_EXPR:
3741 case MIN_EXPR:
3742 return initial_value;
3744 default:
3745 return NULL_TREE;
3747 else
3748 switch (combined_fn (code))
3750 CASE_CFN_FMIN:
3751 CASE_CFN_FMAX:
3752 return initial_value;
3754 default:
3755 return NULL_TREE;
3759 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3760 STMT is printed with a message MSG. */
3762 static void
3763 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3765 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3768 /* Return true if we need an in-order (fold-left) reduction for operation
3769 CODE on type TYPE, i.e. if the reduction operations cannot be
3770 reassociated. */
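/* For example, a float summation has to be evaluated in the original order
   unless -fassociative-math is in effect, whereas fmin/fmax (and MIN/MAX)
   reductions can be reassociated freely.  */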
3772 bool
3773 needs_fold_left_reduction_p (tree type, code_helper code)
3775 /* CHECKME: check for !flag_finite_math_only too? */
3776 if (SCALAR_FLOAT_TYPE_P (type))
3778 if (code.is_tree_code ())
3779 switch (tree_code (code))
3781 case MIN_EXPR:
3782 case MAX_EXPR:
3783 return false;
3785 default:
3786 return !flag_associative_math;
3788 else
3789 switch (combined_fn (code))
3791 CASE_CFN_FMIN:
3792 CASE_CFN_FMAX:
3793 return false;
3795 default:
3796 return !flag_associative_math;
3800 if (INTEGRAL_TYPE_P (type))
3801 return (!code.is_tree_code ()
3802 || !operation_no_trapping_overflow (type, tree_code (code)));
3804 if (SAT_FIXED_POINT_TYPE_P (type))
3805 return true;
3807 return false;
3810 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3811 has a handled computation expression. Store the main reduction
3812 operation in *CODE. */
3814 static bool
3815 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3816 tree loop_arg, code_helper *code,
3817 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3819 auto_bitmap visited;
3820 tree lookfor = PHI_RESULT (phi);
3821 ssa_op_iter curri;
3822 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3823 while (USE_FROM_PTR (curr) != loop_arg)
3824 curr = op_iter_next_use (&curri);
3825 curri.i = curri.numops;
3828 path.safe_push (std::make_pair (curri, curr));
3829 tree use = USE_FROM_PTR (curr);
3830 if (use == lookfor)
3831 break;
3832 gimple *def = SSA_NAME_DEF_STMT (use);
3833 if (gimple_nop_p (def)
3834 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3836 pop:
3839 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3840 curri = x.first;
3841 curr = x.second;
3843 curr = op_iter_next_use (&curri);
3844 /* Skip already visited or non-SSA operands (from iterating
3845 over PHI args). */
3846 while (curr != NULL_USE_OPERAND_P
3847 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3848 || ! bitmap_set_bit (visited,
3849 SSA_NAME_VERSION
3850 (USE_FROM_PTR (curr)))));
3852 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3853 if (curr == NULL_USE_OPERAND_P)
3854 break;
3856 else
3858 if (gimple_code (def) == GIMPLE_PHI)
3859 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3860 else
3861 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3862 while (curr != NULL_USE_OPERAND_P
3863 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3864 || ! bitmap_set_bit (visited,
3865 SSA_NAME_VERSION
3866 (USE_FROM_PTR (curr)))))
3867 curr = op_iter_next_use (&curri);
3868 if (curr == NULL_USE_OPERAND_P)
3869 goto pop;
3872 while (1);
3873 if (dump_file && (dump_flags & TDF_DETAILS))
3875 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3876 unsigned i;
3877 std::pair<ssa_op_iter, use_operand_p> *x;
3878 FOR_EACH_VEC_ELT (path, i, x)
3879 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3880 dump_printf (MSG_NOTE, "\n");
3883 /* Check whether the reduction path detected is valid. */
3884 bool fail = path.length () == 0;
3885 bool neg = false;
3886 int sign = -1;
3887 *code = ERROR_MARK;
3888 for (unsigned i = 1; i < path.length (); ++i)
3890 gimple *use_stmt = USE_STMT (path[i].second);
3891 gimple_match_op op;
3892 if (!gimple_extract_op (use_stmt, &op))
3894 fail = true;
3895 break;
3897 unsigned int opi = op.num_ops;
3898 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3900 /* The following makes sure we can compute the operand index
3901 easily; it also mostly disallows chaining via COND_EXPR condition
3902 operands. */
3903 for (opi = 0; opi < op.num_ops; ++opi)
3904 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3905 break;
3907 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3909 for (opi = 0; opi < op.num_ops; ++opi)
3910 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3911 break;
3913 if (opi == op.num_ops)
3915 fail = true;
3916 break;
3918 op.code = canonicalize_code (op.code, op.type);
3919 if (op.code == MINUS_EXPR)
3921 op.code = PLUS_EXPR;
3922 /* Track whether we negate the reduction value each iteration. */
3923 if (op.ops[1] == op.ops[opi])
3924 neg = ! neg;
3926 if (CONVERT_EXPR_CODE_P (op.code)
3927 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3929 else if (*code == ERROR_MARK)
3931 *code = op.code;
3932 sign = TYPE_SIGN (op.type);
3934 else if (op.code != *code)
3936 fail = true;
3937 break;
3939 else if ((op.code == MIN_EXPR
3940 || op.code == MAX_EXPR)
3941 && sign != TYPE_SIGN (op.type))
3943 fail = true;
3944 break;
3946 /* Check that the op is used in only a single stmt. For the
3947 non-value-changing tail and the last stmt, allow out-of-loop uses.
3948 ??? We could relax this and handle arbitrary live stmts by
3949 forcing a scalar epilogue for example. */
3950 imm_use_iterator imm_iter;
3951 gimple *op_use_stmt;
3952 unsigned cnt = 0;
3953 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3954 if (!is_gimple_debug (op_use_stmt)
3955 && (*code != ERROR_MARK
3956 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3958 /* We want to allow x + x but not x < 1 ? x : 2. */
3959 if (is_gimple_assign (op_use_stmt)
3960 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3962 use_operand_p use_p;
3963 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3964 cnt++;
3966 else
3967 cnt++;
3969 if (cnt != 1)
3971 fail = true;
3972 break;
3975 return ! fail && ! neg && *code != ERROR_MARK;
3978 bool
3979 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3980 tree loop_arg, enum tree_code code)
3982 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3983 code_helper code_;
3984 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3985 && code_ == code);
3990 /* Function vect_is_simple_reduction
3992 (1) Detect a cross-iteration def-use cycle that represents a simple
3993 reduction computation. We look for the following pattern:
3995 loop_header:
3996 a1 = phi < a0, a2 >
3997 a3 = ...
3998 a2 = operation (a3, a1)
4002 a3 = ...
4003 loop_header:
4004 a1 = phi < a0, a2 >
4005 a2 = operation (a3, a1)
4007 such that:
4008 1. operation is commutative and associative and it is safe to
4009 change the order of the computation
4010 2. no uses for a2 in the loop (a2 is used out of the loop)
4011 3. no uses of a1 in the loop besides the reduction operation
4012 4. no uses of a1 outside the loop.
4014 Conditions 1,4 are tested here.
4015 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4017 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4018 nested cycles.
4020 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4021 reductions:
4023 a1 = phi < a0, a2 >
4024 inner loop (def of a3)
4025 a2 = phi < a3 >
4027 (4) Detect condition expressions, i.e.:
4028 for (int i = 0; i < N; i++)
4029 if (a[i] < val)
4030 ret_val = a[i];
4034 static stmt_vec_info
4035 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4036 bool *double_reduc, bool *reduc_chain_p, bool slp)
4038 gphi *phi = as_a <gphi *> (phi_info->stmt);
4039 gimple *phi_use_stmt = NULL;
4040 imm_use_iterator imm_iter;
4041 use_operand_p use_p;
4043 *double_reduc = false;
4044 *reduc_chain_p = false;
4045 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4047 tree phi_name = PHI_RESULT (phi);
4048 /* ??? If there are no uses of the PHI result the inner loop reduction
4049 won't be detected as possibly double-reduction by vectorizable_reduction
4050 because that tries to walk the PHI arg from the preheader edge which
4051 can be constant. See PR60382. */
4052 if (has_zero_uses (phi_name))
4053 return NULL;
4054 class loop *loop = (gimple_bb (phi))->loop_father;
4055 unsigned nphi_def_loop_uses = 0;
4056 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4058 gimple *use_stmt = USE_STMT (use_p);
4059 if (is_gimple_debug (use_stmt))
4060 continue;
4062 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4064 if (dump_enabled_p ())
4065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4066 "intermediate value used outside loop.\n");
4068 return NULL;
4071 nphi_def_loop_uses++;
4072 phi_use_stmt = use_stmt;
4075 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4076 if (TREE_CODE (latch_def) != SSA_NAME)
4078 if (dump_enabled_p ())
4079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4080 "reduction: not ssa_name: %T\n", latch_def);
4081 return NULL;
4084 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4085 if (!def_stmt_info
4086 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4087 return NULL;
4089 bool nested_in_vect_loop
4090 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4091 unsigned nlatch_def_loop_uses = 0;
4092 auto_vec<gphi *, 3> lcphis;
4093 bool inner_loop_of_double_reduc = false;
4094 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4096 gimple *use_stmt = USE_STMT (use_p);
4097 if (is_gimple_debug (use_stmt))
4098 continue;
4099 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4100 nlatch_def_loop_uses++;
4101 else
4103 /* We can have more than one loop-closed PHI. */
4104 lcphis.safe_push (as_a <gphi *> (use_stmt));
4105 if (nested_in_vect_loop
4106 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4107 == vect_double_reduction_def))
4108 inner_loop_of_double_reduc = true;
4112 /* If we are vectorizing an inner reduction we execute it in the
4113 original order only if we are not dealing with a double
4114 reduction. */
4115 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4117 if (dump_enabled_p ())
4118 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4119 "detected nested cycle: ");
4120 return def_stmt_info;
4123 /* When the inner loop of a double reduction ends up with more than
4124 one loop-closed PHI we have failed to classify the other such
4125 PHIs as double reductions, leading to wrong code. See PR103237. */
4126 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4128 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4130 "unhandle double reduction\n");
4131 return NULL;
4134 /* If this isn't a nested cycle or if the nested cycle reduction value
4135 is used outside of the inner loop we cannot handle uses of the reduction
4136 value. */
4137 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4139 if (dump_enabled_p ())
4140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4141 "reduction used in loop.\n");
4142 return NULL;
4145 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4146 defined in the inner loop. */
4147 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4149 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4150 if (gimple_phi_num_args (def_stmt) != 1
4151 || TREE_CODE (op1) != SSA_NAME)
4153 if (dump_enabled_p ())
4154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4155 "unsupported phi node definition.\n");
4157 return NULL;
4160 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4161 and the latch definition op1. */
4162 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4163 if (gimple_bb (def1)
4164 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4165 && loop->inner
4166 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4167 && (is_gimple_assign (def1) || is_gimple_call (def1))
4168 && is_a <gphi *> (phi_use_stmt)
4169 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4170 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4171 loop_latch_edge (loop->inner))))
4173 if (dump_enabled_p ())
4174 report_vect_op (MSG_NOTE, def_stmt,
4175 "detected double reduction: ");
4177 *double_reduc = true;
4178 return def_stmt_info;
4181 return NULL;
4184 /* Look for the expression computing latch_def from the loop PHI result. */
4185 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4186 code_helper code;
4187 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4188 path))
4190 STMT_VINFO_REDUC_CODE (phi_info) = code;
4191 if (code == COND_EXPR && !nested_in_vect_loop)
4192 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4194 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4195 reduction chain for which the additional restriction is that
4196 all operations in the chain are the same. */
4197 auto_vec<stmt_vec_info, 8> reduc_chain;
4198 unsigned i;
4199 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4200 for (i = path.length () - 1; i >= 1; --i)
4202 gimple *stmt = USE_STMT (path[i].second);
4203 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4204 gimple_match_op op;
4205 if (!gimple_extract_op (stmt, &op))
4206 gcc_unreachable ();
4207 if (gassign *assign = dyn_cast<gassign *> (stmt))
4208 STMT_VINFO_REDUC_IDX (stmt_info)
4209 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4210 else
4212 gcall *call = as_a<gcall *> (stmt);
4213 STMT_VINFO_REDUC_IDX (stmt_info)
4214 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4216 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4217 && (i == 1 || i == path.length () - 1));
4218 if ((op.code != code && !leading_conversion)
4219 /* We can only handle the final value in epilogue
4220 generation for reduction chains. */
4221 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4222 is_slp_reduc = false;
4223 /* For reduction chains we support trailing/leading
4224 conversions. We do not store those in the actual chain. */
4225 if (leading_conversion)
4226 continue;
4227 reduc_chain.safe_push (stmt_info);
4229 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4231 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4233 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4234 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4236 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4237 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4239 /* Save the chain for further analysis in SLP detection. */
4240 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4241 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4243 *reduc_chain_p = true;
4244 if (dump_enabled_p ())
4245 dump_printf_loc (MSG_NOTE, vect_location,
4246 "reduction: detected reduction chain\n");
4248 else if (dump_enabled_p ())
4249 dump_printf_loc (MSG_NOTE, vect_location,
4250 "reduction: detected reduction\n");
4252 return def_stmt_info;
4255 if (dump_enabled_p ())
4256 dump_printf_loc (MSG_NOTE, vect_location,
4257 "reduction: unknown pattern\n");
4259 return NULL;
4262 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4263 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4264 or -1 if not known. */
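/* As an illustration (made-up numbers): with NITERS = 103, an assumed VF
   of 8 and PEEL_ITERS_PROLOGUE = 3, the epilogue handles
   (103 - 3) % 8 = 4 iterations; when the iteration count is unknown we
   conservatively assume VF / 2 = 4 peeled iterations instead.  */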
4266 static int
4267 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4269 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4270 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4272 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_NOTE, vect_location,
4274 "cost model: epilogue peel iters set to vf/2 "
4275 "because loop iterations are unknown .\n");
4276 return assumed_vf / 2;
4278 else
4280 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4281 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4282 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4283 /* If we need to peel for gaps but the computed epilogue peel count
4284 is zero, we have to peel VF iterations. */
4285 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4286 peel_iters_epilogue = assumed_vf;
4287 return peel_iters_epilogue;
4291 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4292 int
4293 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4294 int *peel_iters_epilogue,
4295 stmt_vector_for_cost *scalar_cost_vec,
4296 stmt_vector_for_cost *prologue_cost_vec,
4297 stmt_vector_for_cost *epilogue_cost_vec)
4299 int retval = 0;
4301 *peel_iters_epilogue
4302 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4304 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4306 /* If peeled iterations are known but the number of scalar loop
4307 iterations is unknown, count a taken branch per peeled loop. */
4308 if (peel_iters_prologue > 0)
4309 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4310 vect_prologue);
4311 if (*peel_iters_epilogue > 0)
4312 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4313 vect_epilogue);
4316 stmt_info_for_cost *si;
4317 int j;
4318 if (peel_iters_prologue)
4319 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4320 retval += record_stmt_cost (prologue_cost_vec,
4321 si->count * peel_iters_prologue,
4322 si->kind, si->stmt_info, si->misalign,
4323 vect_prologue);
4324 if (*peel_iters_epilogue)
4325 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4326 retval += record_stmt_cost (epilogue_cost_vec,
4327 si->count * *peel_iters_epilogue,
4328 si->kind, si->stmt_info, si->misalign,
4329 vect_epilogue);
4331 return retval;
4334 /* Function vect_estimate_min_profitable_iters
4336 Return the number of iterations required for the vector version of the
4337 loop to be profitable relative to the cost of the scalar version of the
4338 loop.
4340 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4341 of iterations for vectorization. -1 value means loop vectorization
4342 is not profitable. This returned value may be used for dynamic
4343 profitability check.
4345 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4346 for static check against estimated number of iterations. */
4348 static void
4349 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4350 int *ret_min_profitable_niters,
4351 int *ret_min_profitable_estimate,
4352 unsigned *suggested_unroll_factor)
4354 int min_profitable_iters;
4355 int min_profitable_estimate;
4356 int peel_iters_prologue;
4357 int peel_iters_epilogue;
4358 unsigned vec_inside_cost = 0;
4359 int vec_outside_cost = 0;
4360 unsigned vec_prologue_cost = 0;
4361 unsigned vec_epilogue_cost = 0;
4362 int scalar_single_iter_cost = 0;
4363 int scalar_outside_cost = 0;
4364 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4365 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4366 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4368 /* Cost model disabled. */
4369 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4371 if (dump_enabled_p ())
4372 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4373 *ret_min_profitable_niters = 0;
4374 *ret_min_profitable_estimate = 0;
4375 return;
4378 /* Requires loop versioning tests to handle misalignment. */
4379 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4381 /* FIXME: Make cost depend on complexity of individual check. */
4382 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4383 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4384 if (dump_enabled_p ())
4385 dump_printf (MSG_NOTE,
4386 "cost model: Adding cost of checks for loop "
4387 "versioning to treat misalignment.\n");
4390 /* Requires loop versioning with alias checks. */
4391 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4393 /* FIXME: Make cost depend on complexity of individual check. */
4394 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4395 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4396 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4397 if (len)
4398 /* Count LEN - 1 ANDs and LEN comparisons. */
4399 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4400 scalar_stmt, vect_prologue);
4401 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4402 if (len)
4404 /* Count LEN - 1 ANDs and LEN comparisons. */
4405 unsigned int nstmts = len * 2 - 1;
4406 /* +1 for each bias that needs adding. */
4407 for (unsigned int i = 0; i < len; ++i)
4408 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4409 nstmts += 1;
4410 (void) add_stmt_cost (target_cost_data, nstmts,
4411 scalar_stmt, vect_prologue);
4413 if (dump_enabled_p ())
4414 dump_printf (MSG_NOTE,
4415 "cost model: Adding cost of checks for loop "
4416 "versioning aliasing.\n");
4419 /* Requires loop versioning with niter checks. */
4420 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4422 /* FIXME: Make cost depend on complexity of individual check. */
4423 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4424 NULL, NULL, NULL_TREE, 0, vect_prologue);
4425 if (dump_enabled_p ())
4426 dump_printf (MSG_NOTE,
4427 "cost model: Adding cost of checks for loop "
4428 "versioning niters.\n");
4431 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4432 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4433 vect_prologue);
4435 /* Count statements in scalar loop. Using this as scalar cost for a single
4436 iteration for now.
4438 TODO: Add outer loop support.
4440 TODO: Consider assigning different costs to different scalar
4441 statements. */
4443 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4445 /* Add additional cost for the peeled instructions in prologue and epilogue
4446 loop. (For fully-masked loops there will be no peeling.)
4448 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4449 at compile time, we assume it's vf/2 (the worst case would be vf-1).
4451 TODO: Build an expression that represents peel_iters for prologue and
4452 epilogue to be used in a run-time test. */
4454 bool prologue_need_br_taken_cost = false;
4455 bool prologue_need_br_not_taken_cost = false;
4457 /* Calculate peel_iters_prologue. */
4458 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4459 peel_iters_prologue = 0;
4460 else if (npeel < 0)
4462 peel_iters_prologue = assumed_vf / 2;
4463 if (dump_enabled_p ())
4464 dump_printf (MSG_NOTE, "cost model: "
4465 "prologue peel iters set to vf/2.\n");
4467 /* If peeled iterations are unknown, count a taken branch and a not taken
4468 branch per peeled loop. Even if scalar loop iterations are known,
4469 vector iterations are not known since peeled prologue iterations are
4470 not known. Hence guards remain the same. */
4471 prologue_need_br_taken_cost = true;
4472 prologue_need_br_not_taken_cost = true;
4474 else
4476 peel_iters_prologue = npeel;
4477 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4478 /* If peeled iterations are known but the number of scalar loop
4479 iterations is unknown, count a taken branch per peeled loop. */
4480 prologue_need_br_taken_cost = true;
4483 bool epilogue_need_br_taken_cost = false;
4484 bool epilogue_need_br_not_taken_cost = false;
4486 /* Calculate peel_iters_epilogue. */
4487 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4488 /* We need to peel exactly one iteration for gaps. */
4489 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4490 else if (npeel < 0)
4492 /* If peeling for alignment is unknown, the loop bound of the main
4493 loop becomes unknown. */
4494 peel_iters_epilogue = assumed_vf / 2;
4495 if (dump_enabled_p ())
4496 dump_printf (MSG_NOTE, "cost model: "
4497 "epilogue peel iters set to vf/2 because "
4498 "peeling for alignment is unknown.\n");
4500 /* See the same reason above in peel_iters_prologue calculation. */
4501 epilogue_need_br_taken_cost = true;
4502 epilogue_need_br_not_taken_cost = true;
4504 else
4506 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4507 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4508 /* If peeled iterations are known but the number of scalar loop
4509 iterations is unknown, count a taken branch per peeled loop. */
4510 epilogue_need_br_taken_cost = true;
4513 stmt_info_for_cost *si;
4514 int j;
4515 /* Add costs associated with peel_iters_prologue. */
4516 if (peel_iters_prologue)
4517 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4519 (void) add_stmt_cost (target_cost_data,
4520 si->count * peel_iters_prologue, si->kind,
4521 si->stmt_info, si->node, si->vectype,
4522 si->misalign, vect_prologue);
4525 /* Add costs associated with peel_iters_epilogue. */
4526 if (peel_iters_epilogue)
4527 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4529 (void) add_stmt_cost (target_cost_data,
4530 si->count * peel_iters_epilogue, si->kind,
4531 si->stmt_info, si->node, si->vectype,
4532 si->misalign, vect_epilogue);
4535 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4537 if (prologue_need_br_taken_cost)
4538 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4539 vect_prologue);
4541 if (prologue_need_br_not_taken_cost)
4542 (void) add_stmt_cost (target_cost_data, 1,
4543 cond_branch_not_taken, vect_prologue);
4545 if (epilogue_need_br_taken_cost)
4546 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4547 vect_epilogue);
4549 if (epilogue_need_br_not_taken_cost)
4550 (void) add_stmt_cost (target_cost_data, 1,
4551 cond_branch_not_taken, vect_epilogue);
4553 /* Take care of special costs for rgroup controls of partial vectors. */
4554 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4555 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4556 == vect_partial_vectors_avx512))
4558 /* Calculate how many masks we need to generate. */
4559 unsigned int num_masks = 0;
4560 bool need_saturation = false;
4561 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4562 if (rgm.type)
4564 unsigned nvectors = rgm.factor;
4565 num_masks += nvectors;
4566 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4567 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4568 need_saturation = true;
4571 /* ??? The target isn't able to identify the costs below as
4572 producing masks so it cannot penalize cases where we'd run
4573 out of mask registers for example. */
4575 /* ??? We are also failing to account for smaller vector masks
4576 we generate by splitting larger masks in vect_get_loop_mask. */
4578 /* In the worst case, we need to generate each mask in the prologue
4579 and in the loop body. We need one splat per group and one
4580 compare per mask.
4582 Sometimes the prologue mask will fold to a constant,
4583 so the actual prologue cost might be smaller. However, it's
4584 simpler and safer to use the worst-case cost; if this ends up
4585 being the tie-breaker between vectorizing or not, then it's
4586 probably better not to vectorize. */
4587 (void) add_stmt_cost (target_cost_data,
4588 num_masks
4589 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4590 vector_stmt, NULL, NULL, NULL_TREE, 0,
4591 vect_prologue);
4592 (void) add_stmt_cost (target_cost_data,
4593 num_masks
4594 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4595 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4597 /* When we need saturation we need it both in the prologue and
4598 the epilogue. */
4599 if (need_saturation)
4601 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4602 NULL, NULL, NULL_TREE, 0, vect_prologue);
4603 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4604 NULL, NULL, NULL_TREE, 0, vect_body);
4607 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4608 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4609 == vect_partial_vectors_while_ult))
4611 /* Calculate how many masks we need to generate. */
4612 unsigned int num_masks = 0;
4613 rgroup_controls *rgm;
4614 unsigned int num_vectors_m1;
4615 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4616 num_vectors_m1, rgm)
4617 if (rgm->type)
4618 num_masks += num_vectors_m1 + 1;
4619 gcc_assert (num_masks > 0);
4621 /* In the worst case, we need to generate each mask in the prologue
4622 and in the loop body. One of the loop body mask instructions
4623 replaces the comparison in the scalar loop, and since we don't
4624 count the scalar comparison against the scalar body, we shouldn't
4625 count that vector instruction against the vector body either.
4627 Sometimes we can use unpacks instead of generating prologue
4628 masks and sometimes the prologue mask will fold to a constant,
4629 so the actual prologue cost might be smaller. However, it's
4630 simpler and safer to use the worst-case cost; if this ends up
4631 being the tie-breaker between vectorizing or not, then it's
4632 probably better not to vectorize. */
4633 (void) add_stmt_cost (target_cost_data, num_masks,
4634 vector_stmt, NULL, NULL, NULL_TREE, 0,
4635 vect_prologue);
4636 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4637 vector_stmt, NULL, NULL, NULL_TREE, 0,
4638 vect_body);
4640 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4642 /* Referring to the functions vect_set_loop_condition_partial_vectors
4643 and vect_set_loop_controls_directly, we need to generate each
4644 length in the prologue and in the loop body if required. Although
4645 there are some possible optimizations, we consider the worst case
4646 here. */
4648 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4649 signed char partial_load_store_bias
4650 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4651 bool need_iterate_p
4652 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4653 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4655 /* Calculate how many statements need to be added. */
4656 unsigned int prologue_stmts = 0;
4657 unsigned int body_stmts = 0;
4659 rgroup_controls *rgc;
4660 unsigned int num_vectors_m1;
4661 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4662 if (rgc->type)
4664 /* May need one SHIFT for nitems_total computation. */
4665 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4666 if (nitems != 1 && !niters_known_p)
4667 prologue_stmts += 1;
4669 /* May need one MAX and one MINUS for wrap around. */
4670 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4671 prologue_stmts += 2;
4673 /* Need one MAX and one MINUS for each batch limit except for
4674 the first one. */
4675 prologue_stmts += num_vectors_m1 * 2;
4677 unsigned int num_vectors = num_vectors_m1 + 1;
4679 /* Need to set up lengths in prologue, only one MIN required
4680 for each since start index is zero. */
4681 prologue_stmts += num_vectors;
4683 /* If we have a non-zero partial load bias, we need one PLUS
4684 to adjust the load length. */
4685 if (partial_load_store_bias != 0)
4686 body_stmts += 1;
4688 /* Each may need two MINs and one MINUS to update lengths in body
4689 for next iteration. */
4690 if (need_iterate_p)
4691 body_stmts += 3 * num_vectors;
4694 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4695 scalar_stmt, vect_prologue);
4696 (void) add_stmt_cost (target_cost_data, body_stmts,
4697 scalar_stmt, vect_body);
4700 /* FORNOW: The scalar outside cost is incremented in one of the
4701 following ways:
4703 1. The vectorizer checks for alignment and aliasing and generates
4704 a condition that allows dynamic vectorization. A cost model
4705 check is ANDED with the versioning condition. Hence scalar code
4706 path now has the added cost of the versioning check.
4708 if (cost > th & versioning_check)
4709 jmp to vector code
4711 Hence run-time scalar is incremented by not-taken branch cost.
4713 2. The vectorizer then checks if a prologue is required. If the
4714 cost model check was not done before during versioning, it has to
4715 be done before the prologue check.
4717 if (cost <= th)
4718 prologue = scalar_iters
4719 if (prologue == 0)
4720 jmp to vector code
4721 else
4722 execute prologue
4723 if (prologue == num_iters)
4724 go to exit
4726 Hence the run-time scalar cost is incremented by a taken branch,
4727 plus a not-taken branch, plus a taken branch cost.
4729 3. The vectorizer then checks if an epilogue is required. If the
4730 cost model check was not done before during prologue check, it
4731 has to be done with the epilogue check.
4733 if (prologue == 0)
4734 jmp to vector code
4735 else
4736 execute prologue
4737 if (prologue == num_iters)
4738 go to exit
4739 vector code:
4740 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4741 jmp to epilogue
4743 Hence the run-time scalar cost should be incremented by 2 taken
4744 branches.
4746 TODO: The back end may reorder the BBs differently and reverse
4747 conditions/branch directions. Change the estimates below to
4748 something more reasonable. */
4750 /* If the number of iterations is known and we do not do versioning, we can
4751 decide whether to vectorize at compile time. Hence the scalar version
4752 does not carry cost model guard costs. */
4753 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4754 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4756 /* Cost model check occurs at versioning. */
4757 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4758 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4759 else
4761 /* Cost model check occurs at prologue generation. */
4762 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4763 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4764 + vect_get_stmt_cost (cond_branch_not_taken);
4765 /* Cost model check occurs at epilogue generation. */
4766 else
4767 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4771 /* Complete the target-specific cost calculations. */
4772 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4773 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4774 suggested_unroll_factor);
4776 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4777 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4778 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4779 *suggested_unroll_factor,
4780 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4782 if (dump_enabled_p ())
4783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4784 "can't unroll as unrolled vectorization factor larger"
4785 " than maximum vectorization factor: "
4786 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4787 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4788 *suggested_unroll_factor = 1;
4791 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4793 if (dump_enabled_p ())
4795 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4796 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4797 vec_inside_cost);
4798 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4799 vec_prologue_cost);
4800 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4801 vec_epilogue_cost);
4802 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4803 scalar_single_iter_cost);
4804 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4805 scalar_outside_cost);
4806 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4807 vec_outside_cost);
4808 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4809 peel_iters_prologue);
4810 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4811 peel_iters_epilogue);
4814 /* Calculate number of iterations required to make the vector version
4815 profitable, relative to the loop bodies only. The following condition
4816 must hold true:
4817 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4818 where
4819 SIC = scalar iteration cost, VIC = vector iteration cost,
4820 VOC = vector outside cost, VF = vectorization factor,
4821 NPEEL = prologue iterations + epilogue iterations,
4822 SOC = scalar outside cost for run time cost model check. */
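/* A worked example with made-up costs: let SIC = 4, VIC = 8, VF = 4,
   VOC = 40, SOC = 6 and NPEEL = 2.  Treating the division as exact,
   the condition 4 * niters + 6 > 8 * (niters - 2) / 4 + 40 simplifies
   to 2 * niters > 30, i.e. the vector loop only starts to win once the
   scalar loop would run more than 15 iterations.  */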
4824 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4825 - vec_inside_cost);
4826 if (saving_per_viter <= 0)
4828 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4829 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4830 "vectorization did not happen for a simd loop");
4832 if (dump_enabled_p ())
4833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4834 "cost model: the vector iteration cost = %d "
4835 "divided by the scalar iteration cost = %d "
4836 "is greater or equal to the vectorization factor = %d"
4837 ".\n",
4838 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4839 *ret_min_profitable_niters = -1;
4840 *ret_min_profitable_estimate = -1;
4841 return;
4844 /* ??? The "if" arm is written to handle all cases; see below for what
4845 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4846 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4848 /* Rewriting the condition above in terms of the number of
4849 vector iterations (vniters) rather than the number of
4850 scalar iterations (niters) gives:
4852 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4854 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4856 For integer N, X and Y when X > 0:
4858 N * X > Y <==> N >= (Y /[floor] X) + 1. */
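/* E.g. with X = 8 and Y = 26, N * 8 > 26 holds exactly when
   N >= 26 / 8 + 1 = 4, which is how min_vec_niters is computed below.  */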
4859 int outside_overhead = (vec_outside_cost
4860 - scalar_single_iter_cost * peel_iters_prologue
4861 - scalar_single_iter_cost * peel_iters_epilogue
4862 - scalar_outside_cost);
4863 /* We're only interested in cases that require at least one
4864 vector iteration. */
4865 int min_vec_niters = 1;
4866 if (outside_overhead > 0)
4867 min_vec_niters = outside_overhead / saving_per_viter + 1;
4869 if (dump_enabled_p ())
4870 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4871 min_vec_niters);
4873 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4875 /* Now that we know the minimum number of vector iterations,
4876 find the minimum niters for which the scalar cost is larger:
4878 SIC * niters > VIC * vniters + VOC - SOC
4880 We know that the minimum niters is no more than
4881 vniters * VF + NPEEL, but it might be (and often is) less
4882 than that if a partial vector iteration is cheaper than the
4883 equivalent scalar code. */
4884 int threshold = (vec_inside_cost * min_vec_niters
4885 + vec_outside_cost
4886 - scalar_outside_cost);
4887 if (threshold <= 0)
4888 min_profitable_iters = 1;
4889 else
4890 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4892 else
4893 /* Convert the number of vector iterations into a number of
4894 scalar iterations. */
4895 min_profitable_iters = (min_vec_niters * assumed_vf
4896 + peel_iters_prologue
4897 + peel_iters_epilogue);
4899 else
4901 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4902 * assumed_vf
4903 - vec_inside_cost * peel_iters_prologue
4904 - vec_inside_cost * peel_iters_epilogue);
4905 if (min_profitable_iters <= 0)
4906 min_profitable_iters = 0;
4907 else
4909 min_profitable_iters /= saving_per_viter;
4911 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4912 <= (((int) vec_inside_cost * min_profitable_iters)
4913 + (((int) vec_outside_cost - scalar_outside_cost)
4914 * assumed_vf)))
4915 min_profitable_iters++;
4919 if (dump_enabled_p ())
4920 dump_printf (MSG_NOTE,
4921 " Calculated minimum iters for profitability: %d\n",
4922 min_profitable_iters);
4924 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4925 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4926 /* We want the vectorized loop to execute at least once. */
4927 min_profitable_iters = assumed_vf + peel_iters_prologue;
4928 else if (min_profitable_iters < peel_iters_prologue)
4929 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4930 vectorized loop executes at least once. */
4931 min_profitable_iters = peel_iters_prologue;
4933 if (dump_enabled_p ())
4934 dump_printf_loc (MSG_NOTE, vect_location,
4935 " Runtime profitability threshold = %d\n",
4936 min_profitable_iters);
4938 *ret_min_profitable_niters = min_profitable_iters;
4940 /* Calculate number of iterations required to make the vector version
4941 profitable, relative to the loop bodies only.
4943 The non-vectorized variant is SIC * niters and it must win over the
4944 vector variant on the expected loop trip count. The following condition must hold true:
4945 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4947 if (vec_outside_cost <= 0)
4948 min_profitable_estimate = 0;
4949 /* ??? This "else if" arm is written to handle all cases; see below for
4950 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4951 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4953 /* This is a repeat of the code above, but with + SOC rather
4954 than - SOC. */
4955 int outside_overhead = (vec_outside_cost
4956 - scalar_single_iter_cost * peel_iters_prologue
4957 - scalar_single_iter_cost * peel_iters_epilogue
4958 + scalar_outside_cost);
4959 int min_vec_niters = 1;
4960 if (outside_overhead > 0)
4961 min_vec_niters = outside_overhead / saving_per_viter + 1;
4963 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4965 int threshold = (vec_inside_cost * min_vec_niters
4966 + vec_outside_cost
4967 + scalar_outside_cost);
4968 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4970 else
4971 min_profitable_estimate = (min_vec_niters * assumed_vf
4972 + peel_iters_prologue
4973 + peel_iters_epilogue);
4975 else
4977 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4978 * assumed_vf
4979 - vec_inside_cost * peel_iters_prologue
4980 - vec_inside_cost * peel_iters_epilogue)
4981 / ((scalar_single_iter_cost * assumed_vf)
4982 - vec_inside_cost);
4984 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4985 if (dump_enabled_p ())
4986 dump_printf_loc (MSG_NOTE, vect_location,
4987 " Static estimate profitability threshold = %d\n",
4988 min_profitable_estimate);
4990 *ret_min_profitable_estimate = min_profitable_estimate;
4993 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4994 vector elements (not bits) for a vector with NELT elements. */
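/* For instance, with OFFSET = 2 and NELT = 8 the three encoded elements
   are { 2, 3, 4 } and vec_perm_indices extends the stepped series to
   { 2, 3, 4, 5, 6, 7, 8, 9 }, so the low elements of the result come
   from elements 2..7 of the first input and the rest from the second
   input.  */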
4995 static void
4996 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4997 vec_perm_builder *sel)
4999 /* The encoding is a single stepped pattern. Any wrap-around is handled
5000 by vec_perm_indices. */
5001 sel->new_vector (nelt, 1, 3);
5002 for (unsigned int i = 0; i < 3; i++)
5003 sel->quick_push (i + offset);
5006 /* Checks whether the target supports whole-vector shifts for vectors of mode
5007 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5008 it supports vec_perm_const with masks for all necessary shift amounts. */
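/* E.g. for a fixed-length mode with 8 elements this checks vec_perm
   support for shifts by 4, 2 and 1 elements, the amounts needed when a
   reduction epilogue repeatedly halves the vector.  */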
5009 static bool
5010 have_whole_vector_shift (machine_mode mode)
5012 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5013 return true;
5015 /* Variable-length vectors should be handled via the optab. */
5016 unsigned int nelt;
5017 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5018 return false;
5020 vec_perm_builder sel;
5021 vec_perm_indices indices;
5022 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5024 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5025 indices.new_vector (sel, 2, nelt);
5026 if (!can_vec_perm_const_p (mode, mode, indices, false))
5027 return false;
5029 return true;
5032 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5033 multiplication operands have differing signs and (b) we intend
5034 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5035 See vect_emulate_mixed_dot_prod for the actual sequence used. */
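/* A typical case would be a dot-product of a signed char vector with an
   unsigned char vector, accumulated into an int, on a target that only
   provides same-sign DOT_PROD_EXPR support.  */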
5037 static bool
5038 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5039 stmt_vec_info stmt_info)
5041 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5042 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5043 return false;
5045 tree rhs1 = gimple_assign_rhs1 (assign);
5046 tree rhs2 = gimple_assign_rhs2 (assign);
5047 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5048 return false;
5050 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5051 gcc_assert (reduc_info->is_reduc_info);
5052 return !directly_supported_p (DOT_PROD_EXPR,
5053 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5054 optab_vector_mixed_sign);
5057 /* TODO: There is a close dependency between vect_model_*_cost and the
5058 vectorizable_* functions.  Design this better to avoid maintenance issues. */
5060 /* Function vect_model_reduction_cost.
5062 Models cost for a reduction operation, including the vector ops
5063 generated within the strip-mine loop in some cases, the initial
5064 definition before the loop, and the epilogue code that must be generated. */
5066 static void
5067 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5068 stmt_vec_info stmt_info, internal_fn reduc_fn,
5069 vect_reduction_type reduction_type,
5070 int ncopies, stmt_vector_for_cost *cost_vec)
5072 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5073 tree vectype;
5074 machine_mode mode;
5075 class loop *loop = NULL;
5077 if (loop_vinfo)
5078 loop = LOOP_VINFO_LOOP (loop_vinfo);
5080 /* Condition reductions generate two reductions in the loop. */
5081 if (reduction_type == COND_REDUCTION)
5082 ncopies *= 2;
5084 vectype = STMT_VINFO_VECTYPE (stmt_info);
5085 mode = TYPE_MODE (vectype);
5086 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5088 gimple_match_op op;
5089 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5090 gcc_unreachable ();
5092 bool emulated_mixed_dot_prod
5093 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5094 if (reduction_type == EXTRACT_LAST_REDUCTION)
5095 /* No extra instructions are needed in the prologue. The loop body
5096 operations are costed in vectorizable_condition. */
5097 inside_cost = 0;
5098 else if (reduction_type == FOLD_LEFT_REDUCTION)
5100 /* No extra instructions needed in the prologue. */
5101 prologue_cost = 0;
5103 if (reduc_fn != IFN_LAST)
5104 /* Count one reduction-like operation per vector. */
5105 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5106 stmt_info, 0, vect_body);
5107 else
5109 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5110 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5111 inside_cost = record_stmt_cost (cost_vec, nelements,
5112 vec_to_scalar, stmt_info, 0,
5113 vect_body);
5114 inside_cost += record_stmt_cost (cost_vec, nelements,
5115 scalar_stmt, stmt_info, 0,
5116 vect_body);
5119 else
5121 /* Add in the cost of the initial definitions. */
5122 int prologue_stmts;
5123 if (reduction_type == COND_REDUCTION)
5124 /* For cond reductions we have four vectors: initial index, step,
5125 initial result of the data reduction, initial value of the index
5126 reduction. */
5127 prologue_stmts = 4;
5128 else if (emulated_mixed_dot_prod)
5129 /* We need the initial reduction value and two invariants:
5130 one that contains the minimum signed value and one that
5131 contains half of its negative. */
5132 prologue_stmts = 3;
5133 else
5134 prologue_stmts = 1;
5135 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5136 scalar_to_vec, stmt_info, 0,
5137 vect_prologue);
5140 /* Determine cost of epilogue code.
5142 We have a reduction operator that will reduce the vector in one statement.
5143 Also requires scalar extract. */
5145 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5147 if (reduc_fn != IFN_LAST)
5149 if (reduction_type == COND_REDUCTION)
5151 /* An EQ stmt and a COND_EXPR stmt.  */
5152 epilogue_cost += record_stmt_cost (cost_vec, 2,
5153 vector_stmt, stmt_info, 0,
5154 vect_epilogue);
5155 /* Reduction of the max index and a reduction of the found
5156 values. */
5157 epilogue_cost += record_stmt_cost (cost_vec, 2,
5158 vec_to_scalar, stmt_info, 0,
5159 vect_epilogue);
5160 /* A broadcast of the max value. */
5161 epilogue_cost += record_stmt_cost (cost_vec, 1,
5162 scalar_to_vec, stmt_info, 0,
5163 vect_epilogue);
5165 else
5167 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5168 stmt_info, 0, vect_epilogue);
5169 epilogue_cost += record_stmt_cost (cost_vec, 1,
5170 vec_to_scalar, stmt_info, 0,
5171 vect_epilogue);
5174 else if (reduction_type == COND_REDUCTION)
5176 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5177 /* Extraction of scalar elements. */
5178 epilogue_cost += record_stmt_cost (cost_vec,
5179 2 * estimated_nunits,
5180 vec_to_scalar, stmt_info, 0,
5181 vect_epilogue);
5182 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5183 epilogue_cost += record_stmt_cost (cost_vec,
5184 2 * estimated_nunits - 3,
5185 scalar_stmt, stmt_info, 0,
5186 vect_epilogue);
5188 else if (reduction_type == EXTRACT_LAST_REDUCTION
5189 || reduction_type == FOLD_LEFT_REDUCTION)
5190 /* No extra instructions are needed in the epilogue.  */
5192 else
5194 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5195 tree bitsize = TYPE_SIZE (op.type);
5196 int element_bitsize = tree_to_uhwi (bitsize);
5197 int nelements = vec_size_in_bits / element_bitsize;
5199 if (op.code == COND_EXPR)
5200 op.code = MAX_EXPR;
5202 /* We have a whole vector shift available. */
5203 if (VECTOR_MODE_P (mode)
5204 && directly_supported_p (op.code, vectype)
5205 && have_whole_vector_shift (mode))
5207 /* Final reduction via vector shifts and the reduction operator.
5208 Also requires scalar extract. */
5209 epilogue_cost += record_stmt_cost (cost_vec,
5210 exact_log2 (nelements) * 2,
5211 vector_stmt, stmt_info, 0,
5212 vect_epilogue);
5213 epilogue_cost += record_stmt_cost (cost_vec, 1,
5214 vec_to_scalar, stmt_info, 0,
5215 vect_epilogue);
5217 else
5218 /* Use extracts and reduction op for final reduction. For N
5219 elements, we have N extracts and N-1 reduction ops. */
5220 epilogue_cost += record_stmt_cost (cost_vec,
5221 nelements + nelements - 1,
5222 vector_stmt, stmt_info, 0,
5223 vect_epilogue);
5227 if (dump_enabled_p ())
5228 dump_printf (MSG_NOTE,
5229 "vect_model_reduction_cost: inside_cost = %d, "
5230 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5231 prologue_cost, epilogue_cost);
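   /* Worked example with a hypothetical reduction: for a plain
      TREE_CODE_REDUCTION sum with NCOPIES == 1 and a directly supported
      reduc_fn, the costs recorded above are one scalar_to_vec in the
      prologue (building the initial vector), no extra statements in the
      loop body, and one vector_stmt plus one vec_to_scalar in the
      epilogue (the reduction call and the final scalar extract).  */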
5234 /* SEQ is a sequence of instructions that initialize the reduction
5235 described by REDUC_INFO. Emit them in the appropriate place. */
5237 static void
5238 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5239 stmt_vec_info reduc_info, gimple *seq)
5241 if (reduc_info->reused_accumulator)
5243 /* When reusing an accumulator from the main loop, we only need
5244 initialization instructions if the main loop can be skipped.
5245 In that case, emit the initialization instructions at the end
5246 of the guard block that does the skip. */
5247 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5248 gcc_assert (skip_edge);
5249 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5250 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5252 else
5254 /* The normal case: emit the initialization instructions on the
5255 preheader edge. */
5256 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5257 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5261 /* Function get_initial_def_for_reduction
5263 Input:
5264 REDUC_INFO - the info_for_reduction
5265 INIT_VAL - the initial value of the reduction variable
5266 NEUTRAL_OP - a value that has no effect on the reduction, as per
5267 neutral_op_for_reduction
5269 Output:
5270 Return a vector variable, initialized according to the operation that
5271 STMT_VINFO performs. This vector will be used as the initial value
5272 of the vector of partial results.
5274 The value we need is a vector in which element 0 has value INIT_VAL
5275 and every other element has value NEUTRAL_OP. */
5277 static tree
5278 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5279 stmt_vec_info reduc_info,
5280 tree init_val, tree neutral_op)
5282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5283 tree scalar_type = TREE_TYPE (init_val);
5284 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5285 tree init_def;
5286 gimple_seq stmts = NULL;
5288 gcc_assert (vectype);
5290 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5291 || SCALAR_FLOAT_TYPE_P (scalar_type));
5293 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5294 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5296 if (operand_equal_p (init_val, neutral_op))
5298 /* If both elements are equal then the vector described above is
5299 just a splat. */
5300 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5301 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5303 else
5305 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5306 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5307 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5309 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5310 element 0. */
5311 init_def = gimple_build_vector_from_val (&stmts, vectype,
5312 neutral_op);
5313 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5314 vectype, init_def, init_val);
5316 else
5318 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5319 tree_vector_builder elts (vectype, 1, 2);
5320 elts.quick_push (init_val);
5321 elts.quick_push (neutral_op);
5322 init_def = gimple_build_vector (&stmts, &elts);
5326 if (stmts)
5327 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5328 return init_def;
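   /* Illustrative examples with hypothetical values: for a signed-int PLUS
      reduction with INIT_VAL 5, NEUTRAL_OP is 0 and the routine builds
      { 5, 0, 0, 0 } for V4SI; for a MAX reduction NEUTRAL_OP equals
      INIT_VAL, so the result is simply the splat { 5, 5, 5, 5 }.  */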
5331 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5332 which performs a reduction involving GROUP_SIZE scalar statements.
5333 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5334 is nonnull, introducing extra elements of that value will not change the
5335 result. */
5337 static void
5338 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5339 stmt_vec_info reduc_info,
5340 vec<tree> *vec_oprnds,
5341 unsigned int number_of_vectors,
5342 unsigned int group_size, tree neutral_op)
5344 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5345 unsigned HOST_WIDE_INT nunits;
5346 unsigned j, number_of_places_left_in_vector;
5347 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5348 unsigned int i;
5350 gcc_assert (group_size == initial_values.length () || neutral_op);
5352 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5353 created vectors. It is greater than 1 if unrolling is performed.
5355 For example, we have two scalar operands, s1 and s2 (e.g., group of
5356 strided accesses of size two), while NUNITS is four (i.e., four scalars
5357 of this type can be packed in a vector). The output vector will contain
5358 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5359 will be 2).
5361 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5362 vectors containing the operands.
5364 For example, NUNITS is four as before, and the group size is 8
5365 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5366 {s5, s6, s7, s8}. */
5368 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5369 nunits = group_size;
5371 number_of_places_left_in_vector = nunits;
5372 bool constant_p = true;
5373 tree_vector_builder elts (vector_type, nunits, 1);
5374 elts.quick_grow (nunits);
5375 gimple_seq ctor_seq = NULL;
5376 for (j = 0; j < nunits * number_of_vectors; ++j)
5378 tree op;
5379 i = j % group_size;
5381 /* Get the def before the loop.  In a reduction chain we have only
5382 one initial value.  Otherwise we have as many initial values as PHIs in the group.  */
5383 if (i >= initial_values.length () || (j > i && neutral_op))
5384 op = neutral_op;
5385 else
5386 op = initial_values[i];
5388 /* Create 'vect_ = {op0,op1,...,opn}'. */
5389 number_of_places_left_in_vector--;
5390 elts[nunits - number_of_places_left_in_vector - 1] = op;
5391 if (!CONSTANT_CLASS_P (op))
5392 constant_p = false;
5394 if (number_of_places_left_in_vector == 0)
5396 tree init;
5397 if (constant_p && !neutral_op
5398 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5399 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5400 /* Build the vector directly from ELTS. */
5401 init = gimple_build_vector (&ctor_seq, &elts);
5402 else if (neutral_op)
5404 /* Build a vector of the neutral value and shift the
5405 other elements into place. */
5406 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5407 neutral_op);
5408 int k = nunits;
5409 while (k > 0 && elts[k - 1] == neutral_op)
5410 k -= 1;
5411 while (k > 0)
5413 k -= 1;
5414 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5415 vector_type, init, elts[k]);
5418 else
5420 /* First time round, duplicate ELTS to fill the
5421 required number of vectors. */
5422 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5423 elts, number_of_vectors, *vec_oprnds);
5424 break;
5426 vec_oprnds->quick_push (init);
5428 number_of_places_left_in_vector = nunits;
5429 elts.new_vector (vector_type, nunits, 1);
5430 elts.quick_grow (nunits);
5431 constant_p = true;
5434 if (ctor_seq != NULL)
5435 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
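   /* Illustrative trace of the VEC_SHL_INSERT path with hypothetical
      values: for NUNITS == 4, neutral value 0 and ELTS == { s1, s2, 0, 0 },
      the code starts from the splat { 0, 0, 0, 0 }, skips the trailing
      neutral elements, then inserts s2 and finally s1, each insert moving
      the existing lanes up by one and placing the scalar in lane 0,
      yielding { s1, s2, 0, 0 } as required.  */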
5438 /* For a statement STMT_INFO taking part in a reduction operation return
5439 the stmt_vec_info the meta information is stored on. */
5441 stmt_vec_info
5442 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5444 stmt_info = vect_orig_stmt (stmt_info);
5445 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5446 if (!is_a <gphi *> (stmt_info->stmt)
5447 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5448 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5449 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5450 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5452 if (gimple_phi_num_args (phi) == 1)
5453 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5455 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5457 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5458 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5459 stmt_info = info;
5461 return stmt_info;
5464 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5465 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5466 return false. */
5468 static bool
5469 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5470 stmt_vec_info reduc_info)
5472 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5473 if (!main_loop_vinfo)
5474 return false;
5476 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5477 return false;
5479 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5480 auto_vec<tree, 16> main_loop_results (num_phis);
5481 auto_vec<tree, 16> initial_values (num_phis);
5482 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5484 /* The epilogue loop can be entered either from the main loop or
5485 from an earlier guard block. */
5486 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5487 for (tree incoming_value : reduc_info->reduc_initial_values)
5489 /* Look for:
5491 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5492 INITIAL_VALUE(guard block)>. */
5493 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5495 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5496 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5498 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5499 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5501 main_loop_results.quick_push (from_main_loop);
5502 initial_values.quick_push (from_skip);
5505 else
5506 /* The main loop dominates the epilogue loop. */
5507 main_loop_results.splice (reduc_info->reduc_initial_values);
5509 /* See if the main loop has the kind of accumulator we need. */
5510 vect_reusable_accumulator *accumulator
5511 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5512 if (!accumulator
5513 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5514 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5515 accumulator->reduc_info->reduc_scalar_results.begin ()))
5516 return false;
5518 /* Handle the case where we can reduce wider vectors to narrower ones. */
5519 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5520 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5521 unsigned HOST_WIDE_INT m;
5522 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5523 TYPE_VECTOR_SUBPARTS (vectype), &m))
5524 return false;
5525 /* Check the intermediate vector types and operations are available. */
5526 tree prev_vectype = old_vectype;
5527 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5528 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5530 intermediate_nunits = exact_div (intermediate_nunits, 2);
5531 tree intermediate_vectype = get_related_vectype_for_scalar_type
5532 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5533 if (!intermediate_vectype
5534 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5535 intermediate_vectype)
5536 || !can_vec_extract (TYPE_MODE (prev_vectype),
5537 TYPE_MODE (intermediate_vectype)))
5538 return false;
5539 prev_vectype = intermediate_vectype;
5542 /* Non-SLP reductions might apply an adjustment after the reduction
5543 operation, in order to simplify the initialization of the accumulator.
5544 If the epilogue loop carries on from where the main loop left off,
5545 it should apply the same adjustment to the final reduction result.
5547 If the epilogue loop can also be entered directly (rather than via
5548 the main loop), we need to be able to handle that case in the same way,
5549 with the same adjustment. (In principle we could add a PHI node
5550 to select the correct adjustment, but in practice that shouldn't be
5551 necessary.) */
5552 tree main_adjustment
5553 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5554 if (loop_vinfo->main_loop_edge && main_adjustment)
5556 gcc_assert (num_phis == 1);
5557 tree initial_value = initial_values[0];
5558 /* Check that we can use INITIAL_VALUE as the adjustment and
5559 initialize the accumulator with a neutral value instead. */
5560 if (!operand_equal_p (initial_value, main_adjustment))
5561 return false;
5562 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5563 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5564 code, initial_value);
5566 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5567 reduc_info->reduc_initial_values.truncate (0);
5568 reduc_info->reduc_initial_values.splice (initial_values);
5569 reduc_info->reused_accumulator = accumulator;
5570 return true;
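   /* Illustrative case with hypothetical modes: if the main loop
      accumulated in V8SI and this epilogue loop uses V4SI, then M is 2 and
      the checks above cover one intermediate step: the reduction code must
      be directly supported on V4SI and a V4SI half must be extractable
      from a V8SI vector.  */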
5573 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5574 CODE, emitting the generated stmts into SEQ.  Returns a vector def of VECTYPE.  */
5576 static tree
5577 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5578 gimple_seq *seq)
5580 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5581 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5582 tree stype = TREE_TYPE (vectype);
5583 tree new_temp = vec_def;
5584 while (nunits > nunits1)
5586 nunits /= 2;
5587 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5588 stype, nunits);
5589 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5591 /* The target has to make sure we support lowpart/highpart
5592 extraction, either via direct vector extract or through
5593 punning to an integer mode.  */
5594 tree dst1, dst2;
5595 gimple *epilog_stmt;
5596 if (convert_optab_handler (vec_extract_optab,
5597 TYPE_MODE (TREE_TYPE (new_temp)),
5598 TYPE_MODE (vectype1))
5599 != CODE_FOR_nothing)
5601 /* Extract sub-vectors directly once vec_extract becomes
5602 a conversion optab. */
5603 dst1 = make_ssa_name (vectype1);
5604 epilog_stmt
5605 = gimple_build_assign (dst1, BIT_FIELD_REF,
5606 build3 (BIT_FIELD_REF, vectype1,
5607 new_temp, TYPE_SIZE (vectype1),
5608 bitsize_int (0)));
5609 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5610 dst2 = make_ssa_name (vectype1);
5611 epilog_stmt
5612 = gimple_build_assign (dst2, BIT_FIELD_REF,
5613 build3 (BIT_FIELD_REF, vectype1,
5614 new_temp, TYPE_SIZE (vectype1),
5615 bitsize_int (bitsize)));
5616 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5618 else
5620 /* Extract via punning to appropriately sized integer mode
5621 vector. */
5622 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5623 tree etype = build_vector_type (eltype, 2);
5624 gcc_assert (convert_optab_handler (vec_extract_optab,
5625 TYPE_MODE (etype),
5626 TYPE_MODE (eltype))
5627 != CODE_FOR_nothing);
5628 tree tem = make_ssa_name (etype);
5629 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5630 build1 (VIEW_CONVERT_EXPR,
5631 etype, new_temp));
5632 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5633 new_temp = tem;
5634 tem = make_ssa_name (eltype);
5635 epilog_stmt
5636 = gimple_build_assign (tem, BIT_FIELD_REF,
5637 build3 (BIT_FIELD_REF, eltype,
5638 new_temp, TYPE_SIZE (eltype),
5639 bitsize_int (0)));
5640 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5641 dst1 = make_ssa_name (vectype1);
5642 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5643 build1 (VIEW_CONVERT_EXPR,
5644 vectype1, tem));
5645 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5646 tem = make_ssa_name (eltype);
5647 epilog_stmt
5648 = gimple_build_assign (tem, BIT_FIELD_REF,
5649 build3 (BIT_FIELD_REF, eltype,
5650 new_temp, TYPE_SIZE (eltype),
5651 bitsize_int (bitsize)));
5652 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5653 dst2 = make_ssa_name (vectype1);
5654 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5655 build1 (VIEW_CONVERT_EXPR,
5656 vectype1, tem));
5657 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5660 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5663 return new_temp;
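   /* Illustrative step with hypothetical types: reducing a V8SI VEC_DEF to
      a V4SI VECTYPE with PLUS takes a single iteration of the loop above;
      the low and high V4SI halves are extracted (directly, or by punning
      through a two-element integer-mode vector) and added, giving a V4SI
      partial result.  */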
5666 /* Function vect_create_epilog_for_reduction
5668 Create code at the loop-epilog to finalize the result of a reduction
5669 computation.
5671 STMT_INFO is the scalar reduction stmt that is being vectorized.
5672 SLP_NODE is an SLP node containing a group of reduction statements. The
5673 first one in this group is STMT_INFO.
5674 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5675 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5676 (counting from 0)
5678 This function:
5679 1. Completes the reduction def-use cycles.
5680 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5681 by calling the function specified by REDUC_FN if available, or by
5682 other means (whole-vector shifts or a scalar loop).
5683 The function also creates a new phi node at the loop exit to preserve
5684 loop-closed form, as illustrated below.
5686 The flow at the entry to this function:
5688 loop:
5689 vec_def = phi <vec_init, null> # REDUCTION_PHI
5690 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5691 s_loop = scalar_stmt # (scalar) STMT_INFO
5692 loop_exit:
5693 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5694 use <s_out0>
5695 use <s_out0>
5697 The above is transformed by this function into:
5699 loop:
5700 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5701 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5702 s_loop = scalar_stmt # (scalar) STMT_INFO
5703 loop_exit:
5704 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5705 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5706 v_out2 = reduce <v_out1>
5707 s_out3 = extract_field <v_out2, 0>
5708 s_out4 = adjust_result <s_out3>
5709 use <s_out4>
5710 use <s_out4>
5713 static void
5714 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5715 stmt_vec_info stmt_info,
5716 slp_tree slp_node,
5717 slp_instance slp_node_instance)
5719 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5720 gcc_assert (reduc_info->is_reduc_info);
5721 /* For double reductions we need to get at the inner loop reduction
5722 stmt which has the meta info attached. Our stmt_info is that of the
5723 loop-closed PHI of the inner loop which we remember as
5724 def for the reduction PHI generation. */
5725 bool double_reduc = false;
5726 stmt_vec_info rdef_info = stmt_info;
5727 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5729 gcc_assert (!slp_node);
5730 double_reduc = true;
5731 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5732 (stmt_info->stmt, 0));
5733 stmt_info = vect_stmt_to_vectorize (stmt_info);
5735 gphi *reduc_def_stmt
5736 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5737 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5738 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5739 tree vectype;
5740 machine_mode mode;
5741 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5742 basic_block exit_bb;
5743 tree scalar_dest;
5744 tree scalar_type;
5745 gimple *new_phi = NULL, *phi;
5746 gimple_stmt_iterator exit_gsi;
5747 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5748 gimple *epilog_stmt = NULL;
5749 gimple *exit_phi;
5750 tree bitsize;
5751 tree def;
5752 tree orig_name, scalar_result;
5753 imm_use_iterator imm_iter, phi_imm_iter;
5754 use_operand_p use_p, phi_use_p;
5755 gimple *use_stmt;
5756 auto_vec<tree> reduc_inputs;
5757 int j, i;
5758 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5759 unsigned int group_size = 1, k;
5760 auto_vec<gimple *> phis;
5761 /* SLP reduction without reduction chain, e.g.,
5762 # a1 = phi <a2, a0>
5763 # b1 = phi <b2, b0>
5764 a2 = operation (a1)
5765 b2 = operation (b1) */
5766 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5767 bool direct_slp_reduc;
5768 tree induction_index = NULL_TREE;
5770 if (slp_node)
5771 group_size = SLP_TREE_LANES (slp_node);
5773 if (nested_in_vect_loop_p (loop, stmt_info))
5775 outer_loop = loop;
5776 loop = loop->inner;
5777 gcc_assert (!slp_node && double_reduc);
5780 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5781 gcc_assert (vectype);
5782 mode = TYPE_MODE (vectype);
5784 tree induc_val = NULL_TREE;
5785 tree adjustment_def = NULL;
5786 if (slp_node)
5788 else
5790 /* Optimize: for induction condition reduction, if we can't use zero
5791 for induc_val, use initial_def. */
5792 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5793 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5794 else if (double_reduc)
5796 else
5797 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5800 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5801 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5802 if (slp_reduc)
5803 /* All statements produce live-out values. */
5804 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5805 else if (slp_node)
5807 /* The last statement in the reduction chain produces the live-out
5808 value. Note SLP optimization can shuffle scalar stmts to
5809 optimize permutations so we have to search for the last stmt. */
5810 for (k = 0; k < group_size; ++k)
5811 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5813 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5814 break;
5818 unsigned vec_num;
5819 int ncopies;
5820 if (slp_node)
5822 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5823 ncopies = 1;
5825 else
5827 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5828 vec_num = 1;
5829 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5832 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5833 which is updated with the current index of the loop for every match of
5834 the original loop's cond_expr (VEC_STMT). This results in a vector
5835 containing the last time the condition passed for that vector lane.
5836 The first match will be a 1 to allow 0 to be used for non-matching
5837 indexes. If there are no matches at all then the vector will be all
5838 zeroes.
5840 PR92772: This algorithm is broken for architectures that support
5841 masked vectors, but do not provide fold_extract_last. */
5842 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5844 auto_vec<std::pair<tree, bool>, 2> ccompares;
5845 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5846 cond_info = vect_stmt_to_vectorize (cond_info);
5847 while (cond_info != reduc_info)
5849 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5851 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5852 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5853 ccompares.safe_push
5854 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5855 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5857 cond_info
5858 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5859 1 + STMT_VINFO_REDUC_IDX
5860 (cond_info)));
5861 cond_info = vect_stmt_to_vectorize (cond_info);
5863 gcc_assert (ccompares.length () != 0);
5865 tree indx_before_incr, indx_after_incr;
5866 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5867 int scalar_precision
5868 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5869 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5870 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5871 (TYPE_MODE (vectype), cr_index_scalar_type,
5872 TYPE_VECTOR_SUBPARTS (vectype));
5874 /* First we create a simple vector induction variable which starts
5875 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5876 vector size (STEP). */
5878 /* Create a {1,2,3,...} vector. */
5879 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5881 /* Create a vector of the step value. */
5882 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5883 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5885 /* Create an induction variable. */
5886 gimple_stmt_iterator incr_gsi;
5887 bool insert_after;
5888 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5889 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5890 insert_after, &indx_before_incr, &indx_after_incr);
5892 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5893 filled with zeros (VEC_ZERO). */
5895 /* Create a vector of 0s. */
5896 tree zero = build_zero_cst (cr_index_scalar_type);
5897 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5899 /* Create a vector phi node. */
5900 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5901 new_phi = create_phi_node (new_phi_tree, loop->header);
5902 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5903 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5905 /* Now take the condition from the loop's original cond_exprs
5906 and produce a new cond_expr (INDEX_COND_EXPR) which for
5907 every match uses values from the induction variable
5908 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5909 (NEW_PHI_TREE).
5910 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5911 the new cond_expr (INDEX_COND_EXPR). */
5912 gimple_seq stmts = NULL;
5913 for (int i = ccompares.length () - 1; i != -1; --i)
5915 tree ccompare = ccompares[i].first;
5916 if (ccompares[i].second)
5917 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5918 cr_index_vector_type,
5919 ccompare,
5920 indx_before_incr, new_phi_tree);
5921 else
5922 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5923 cr_index_vector_type,
5924 ccompare,
5925 new_phi_tree, indx_before_incr);
5927 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5929 /* Update the phi with the vec cond. */
5930 induction_index = new_phi_tree;
5931 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5932 loop_latch_edge (loop), UNKNOWN_LOCATION);
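   /* Illustrative run with a hypothetical V4SI reduction: the IV produces
      { 1, 2, 3, 4 } in the first iteration and { 5, 6, 7, 8 } in the
      second.  A lane whose condition matches in both iterations keeps its
      second-iteration index, a lane that matched only in the first keeps a
      value from { 1, 2, 3, 4 }, and a lane that never matched stays 0,
      which is exactly the "last matching index, zero if none" property the
      epilogue relies on.  */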
5935 /* 2. Create epilog code.
5936 The reduction epilog code operates across the elements of the vector
5937 of partial results computed by the vectorized loop.
5938 The reduction epilog code consists of:
5940 step 1: compute the scalar result in a vector (v_out2)
5941 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5942 step 3: adjust the scalar result (s_out3) if needed.
5944 Step 1 can be accomplished using one of the following three schemes:
5945 (scheme 1) using reduc_fn, if available.
5946 (scheme 2) using whole-vector shifts, if available.
5947 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5948 combined.
5950 The overall epilog code looks like this:
5952 s_out0 = phi <s_loop> # original EXIT_PHI
5953 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5954 v_out2 = reduce <v_out1> # step 1
5955 s_out3 = extract_field <v_out2, 0> # step 2
5956 s_out4 = adjust_result <s_out3> # step 3
5958 (step 3 is optional, and steps 1 and 2 may be combined).
5959 Lastly, the uses of s_out0 are replaced by s_out4. */
5962 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5963 v_out1 = phi <VECT_DEF>
5964 Store them in NEW_PHIS. */
5965 if (double_reduc)
5966 loop = outer_loop;
5967 exit_bb = single_exit (loop)->dest;
5968 exit_gsi = gsi_after_labels (exit_bb);
5969 reduc_inputs.create (slp_node ? vec_num : ncopies);
5970 for (unsigned i = 0; i < vec_num; i++)
5972 gimple_seq stmts = NULL;
5973 if (slp_node)
5974 def = vect_get_slp_vect_def (slp_node, i);
5975 else
5976 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5977 for (j = 0; j < ncopies; j++)
5979 tree new_def = copy_ssa_name (def);
5980 phi = create_phi_node (new_def, exit_bb);
5981 if (j)
5982 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5983 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5984 new_def = gimple_convert (&stmts, vectype, new_def);
5985 reduc_inputs.quick_push (new_def);
5987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5990 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5991 (i.e. when reduc_fn is not available) and in the final adjustment
5992 code (if needed). Also get the original scalar reduction variable as
5993 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5994 represents a reduction pattern), the tree-code and scalar-def are
5995 taken from the original stmt that the pattern-stmt (STMT) replaces.
5996 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5997 are taken from STMT. */
5999 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6000 if (orig_stmt_info != stmt_info)
6002 /* Reduction pattern */
6003 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6004 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6007 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6008 scalar_type = TREE_TYPE (scalar_dest);
6009 scalar_results.truncate (0);
6010 scalar_results.reserve_exact (group_size);
6011 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6012 bitsize = TYPE_SIZE (scalar_type);
6014 /* True if we should implement SLP_REDUC using native reduction operations
6015 instead of scalar operations. */
6016 direct_slp_reduc = (reduc_fn != IFN_LAST
6017 && slp_reduc
6018 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6020 /* In case of reduction chain, e.g.,
6021 # a1 = phi <a3, a0>
6022 a2 = operation (a1)
6023 a3 = operation (a2),
6025 we may end up with more than one vector result. Here we reduce them
6026 to one vector.
6028 The same is true for a SLP reduction, e.g.,
6029 # a1 = phi <a2, a0>
6030 # b1 = phi <b2, b0>
6031 a2 = operation (a1)
6032 b2 = operation (a2),
6034 where we can end up with more than one vector as well. We can
6035 easily accumulate vectors when the number of vector elements is
6036 a multiple of the SLP group size.
6038 The same is true if we couldn't use a single def-use cycle.  */
6039 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6040 || direct_slp_reduc
6041 || (slp_reduc
6042 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6043 || ncopies > 1)
6045 gimple_seq stmts = NULL;
6046 tree single_input = reduc_inputs[0];
6047 for (k = 1; k < reduc_inputs.length (); k++)
6048 single_input = gimple_build (&stmts, code, vectype,
6049 single_input, reduc_inputs[k]);
6050 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6052 reduc_inputs.truncate (0);
6053 reduc_inputs.safe_push (single_input);
6056 tree orig_reduc_input = reduc_inputs[0];
6058 /* If this loop is an epilogue loop that can be skipped after the
6059 main loop, we can only share a reduction operation between the
6060 main loop and the epilogue if we put it at the target of the
6061 skip edge.
6063 We can still reuse accumulators if this check fails. Doing so has
6064 the minor(?) benefit of making the epilogue loop's scalar result
6065 independent of the main loop's scalar result. */
6066 bool unify_with_main_loop_p = false;
6067 if (reduc_info->reused_accumulator
6068 && loop_vinfo->skip_this_loop_edge
6069 && single_succ_p (exit_bb)
6070 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6072 unify_with_main_loop_p = true;
6074 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6075 reduc_inputs[0] = make_ssa_name (vectype);
6076 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6077 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6078 UNKNOWN_LOCATION);
6079 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6080 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6081 exit_gsi = gsi_after_labels (reduc_block);
6084 /* Shouldn't be used beyond this point. */
6085 exit_bb = nullptr;
6087 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6088 && reduc_fn != IFN_LAST)
6090 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6091 various data values where the condition matched and another vector
6092 (INDUCTION_INDEX) containing all the indexes of those matches. We
6093 need to extract the last matching index (which will be the index with
6094 highest value) and use this to index into the data vector.
6095 For the case where there were no matches, the data vector will contain
6096 all default values and the index vector will be all zeros. */
6098 /* Get various versions of the type of the vector of indexes. */
6099 tree index_vec_type = TREE_TYPE (induction_index);
6100 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6101 tree index_scalar_type = TREE_TYPE (index_vec_type);
6102 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6104 /* Get an unsigned integer version of the type of the data vector. */
6105 int scalar_precision
6106 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6107 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6108 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6109 vectype);
6111 /* First we need to create a vector (ZERO_VEC) of zeros and another
6112 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6113 can create using a MAX reduction and then expanding.
6114 In the case where the loop never made any matches, the max index will
6115 be zero. */
6117 /* Vector of {0, 0, 0,...}. */
6118 tree zero_vec = build_zero_cst (vectype);
6120 /* Find maximum value from the vector of found indexes. */
6121 tree max_index = make_ssa_name (index_scalar_type);
6122 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6123 1, induction_index);
6124 gimple_call_set_lhs (max_index_stmt, max_index);
6125 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6127 /* Vector of {max_index, max_index, max_index,...}. */
6128 tree max_index_vec = make_ssa_name (index_vec_type);
6129 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6130 max_index);
6131 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6132 max_index_vec_rhs);
6133 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6135 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6136 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6137 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6138 otherwise. Only one value should match, resulting in a vector
6139 (VEC_COND) with one data value and the rest zeros.
6140 In the case where the loop never made any matches, every index will
6141 match, resulting in a vector with all data values (which will all be
6142 the default value). */
6144 /* Compare the max index vector to the vector of found indexes to find
6145 the position of the max value. */
6146 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6147 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6148 induction_index,
6149 max_index_vec);
6150 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6152 /* Use the compare to choose either values from the data vector or
6153 zero. */
6154 tree vec_cond = make_ssa_name (vectype);
6155 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6156 vec_compare,
6157 reduc_inputs[0],
6158 zero_vec);
6159 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6161 /* Finally we need to extract the data value from the vector (VEC_COND)
6162 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6163 reduction, but because this doesn't exist, we can use a MAX reduction
6164 instead. The data value might be signed or a float so we need to cast
6165 it first.
6166 In the case where the loop never made any matches, the data values are
6167 all identical, and so will reduce down correctly. */
6169 /* Make the matched data values unsigned. */
6170 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6171 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6172 vec_cond);
6173 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6174 VIEW_CONVERT_EXPR,
6175 vec_cond_cast_rhs);
6176 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6178 /* Reduce down to a scalar value. */
6179 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6180 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6181 1, vec_cond_cast);
6182 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6183 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6185 /* Convert the reduced value back to the result type and set as the
6186 result. */
6187 gimple_seq stmts = NULL;
6188 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6189 data_reduc);
6190 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6191 scalar_results.safe_push (new_temp);
6193 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6194 && reduc_fn == IFN_LAST)
6196 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6197 idx = 0;
6198 idx_val = induction_index[0];
6199 val = data_reduc[0];
6200 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6201 if (induction_index[i] > idx_val)
6202 val = data_reduc[i], idx_val = induction_index[i];
6203 return val; */
6205 tree data_eltype = TREE_TYPE (vectype);
6206 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6207 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6208 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6209 /* Enforced by vectorizable_reduction, which ensures we have target
6210 support before allowing a conditional reduction on variable-length
6211 vectors. */
6212 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6213 tree idx_val = NULL_TREE, val = NULL_TREE;
6214 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6216 tree old_idx_val = idx_val;
6217 tree old_val = val;
6218 idx_val = make_ssa_name (idx_eltype);
6219 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6220 build3 (BIT_FIELD_REF, idx_eltype,
6221 induction_index,
6222 bitsize_int (el_size),
6223 bitsize_int (off)));
6224 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6225 val = make_ssa_name (data_eltype);
6226 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6227 build3 (BIT_FIELD_REF,
6228 data_eltype,
6229 reduc_inputs[0],
6230 bitsize_int (el_size),
6231 bitsize_int (off)));
6232 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6233 if (off != 0)
6235 tree new_idx_val = idx_val;
6236 if (off != v_size - el_size)
6238 new_idx_val = make_ssa_name (idx_eltype);
6239 epilog_stmt = gimple_build_assign (new_idx_val,
6240 MAX_EXPR, idx_val,
6241 old_idx_val);
6242 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6244 tree cond = make_ssa_name (boolean_type_node);
6245 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6246 idx_val, old_idx_val);
6247 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6248 tree new_val = make_ssa_name (data_eltype);
6249 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6250 cond, val, old_val);
6251 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6252 idx_val = new_idx_val;
6253 val = new_val;
6256 /* Convert the reduced value back to the result type and set as the
6257 result. */
6258 gimple_seq stmts = NULL;
6259 val = gimple_convert (&stmts, scalar_type, val);
6260 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6261 scalar_results.safe_push (val);
6264 /* 2.3 Create the reduction code, using one of the three schemes described
6265 above. In SLP we simply need to extract all the elements from the
6266 vector (without reducing them), so we use scalar shifts. */
6267 else if (reduc_fn != IFN_LAST && !slp_reduc)
6269 tree tmp;
6270 tree vec_elem_type;
6272 /* Case 1: Create:
6273 v_out2 = reduc_expr <v_out1> */
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_NOTE, vect_location,
6277 "Reduce using direct vector reduction.\n");
6279 gimple_seq stmts = NULL;
6280 vec_elem_type = TREE_TYPE (vectype);
6281 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6282 vec_elem_type, reduc_inputs[0]);
6283 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6284 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6286 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6287 && induc_val)
6289 /* Earlier we set the initial value to be a vector of induc_val
6290 values. Check the result and if it is induc_val then replace
6291 with the original initial value, unless induc_val is
6292 the same as initial_def already. */
6293 tree zcompare = make_ssa_name (boolean_type_node);
6294 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6295 new_temp, induc_val);
6296 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6297 tree initial_def = reduc_info->reduc_initial_values[0];
6298 tmp = make_ssa_name (new_scalar_dest);
6299 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6300 initial_def, new_temp);
6301 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6302 new_temp = tmp;
6305 scalar_results.safe_push (new_temp);
6307 else if (direct_slp_reduc)
6309 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6310 with the elements for other SLP statements replaced with the
6311 neutral value. We can then do a normal reduction on each vector. */
6313 /* Enforced by vectorizable_reduction. */
6314 gcc_assert (reduc_inputs.length () == 1);
6315 gcc_assert (pow2p_hwi (group_size));
6317 gimple_seq seq = NULL;
6319 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6320 and the same element size as VECTYPE. */
6321 tree index = build_index_vector (vectype, 0, 1);
6322 tree index_type = TREE_TYPE (index);
6323 tree index_elt_type = TREE_TYPE (index_type);
6324 tree mask_type = truth_type_for (index_type);
6326 /* Create a vector that, for each element, identifies which of
6327 the REDUC_GROUP_SIZE results should use it. */
6328 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6329 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6330 build_vector_from_val (index_type, index_mask));
6332 /* Get a neutral vector value. This is simply a splat of the neutral
6333 scalar value if we have one, otherwise the initial scalar value
6334 is itself a neutral value. */
6335 tree vector_identity = NULL_TREE;
6336 tree neutral_op = NULL_TREE;
6337 if (slp_node)
6339 tree initial_value = NULL_TREE;
6340 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6341 initial_value = reduc_info->reduc_initial_values[0];
6342 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6343 initial_value);
6345 if (neutral_op)
6346 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6347 neutral_op);
6348 for (unsigned int i = 0; i < group_size; ++i)
6350 /* If there's no universal neutral value, we can use the
6351 initial scalar value from the original PHI. This is used
6352 for MIN and MAX reduction, for example. */
6353 if (!neutral_op)
6355 tree scalar_value = reduc_info->reduc_initial_values[i];
6356 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6357 scalar_value);
6358 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6359 scalar_value);
6362 /* Calculate the equivalent of:
6364 sel[j] = (index[j] == i);
6366 which selects the elements of REDUC_INPUTS[0] that should
6367 be included in the result. */
6368 tree compare_val = build_int_cst (index_elt_type, i);
6369 compare_val = build_vector_from_val (index_type, compare_val);
6370 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6371 index, compare_val);
6373 /* Calculate the equivalent of:
6375 vec = sel ? reduc_inputs[0] : vector_identity;
6377 VEC is now suitable for a full vector reduction. */
6378 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6379 sel, reduc_inputs[0], vector_identity);
6381 /* Do the reduction and convert it to the appropriate type. */
6382 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6383 TREE_TYPE (vectype), vec);
6384 scalar = gimple_convert (&seq, scalar_type, scalar);
6385 scalar_results.safe_push (scalar);
6387 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
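      /* Illustrative case with a hypothetical group: with GROUP_SIZE == 2
	 the masked index vector is { 0, 1, 0, 1, ... }.  For i == 0 the
	 comparison selects the even lanes (the elements belonging to the
	 first scalar reduction) and substitutes the identity value in the
	 odd lanes before the full-vector reduction; i == 1 does the same
	 for the odd lanes.  */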
6389 else
6391 bool reduce_with_shift;
6392 tree vec_temp;
6394 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6396 /* See if the target wants to do the final (shift) reduction
6397 in a vector mode of smaller size and first reduce upper/lower
6398 halves against each other. */
6399 enum machine_mode mode1 = mode;
6400 tree stype = TREE_TYPE (vectype);
6401 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6402 unsigned nunits1 = nunits;
6403 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6404 && reduc_inputs.length () == 1)
6406 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6407 /* For SLP reductions we have to make sure lanes match up, but
6408 since we're doing an individual-element final reduction, reducing
6409 the vector width here is even more important.
6410 ??? We can also separate lanes with permutes, for the common
6411 case of a power-of-two group size, odd/even extracts would work.  */
6412 if (slp_reduc && nunits != nunits1)
6414 nunits1 = least_common_multiple (nunits1, group_size);
6415 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6418 if (!slp_reduc
6419 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6420 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6422 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6423 stype, nunits1);
6424 reduce_with_shift = have_whole_vector_shift (mode1);
6425 if (!VECTOR_MODE_P (mode1)
6426 || !directly_supported_p (code, vectype1))
6427 reduce_with_shift = false;
6429 /* First reduce the vector to the desired size to do the shift
6430 reduction on, by combining its upper and lower halves.  */
6431 gimple_seq stmts = NULL;
6432 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6433 code, &stmts);
6434 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6435 reduc_inputs[0] = new_temp;
6437 if (reduce_with_shift && !slp_reduc)
6439 int element_bitsize = tree_to_uhwi (bitsize);
6440 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6441 for variable-length vectors and also requires direct target support
6442 for loop reductions. */
6443 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6444 int nelements = vec_size_in_bits / element_bitsize;
6445 vec_perm_builder sel;
6446 vec_perm_indices indices;
6448 int elt_offset;
6450 tree zero_vec = build_zero_cst (vectype1);
6451 /* Case 2: Create:
6452 for (offset = nelements/2; offset >= 1; offset/=2)
6454 Create: va' = vec_shift <va, offset>
6455 Create: va = vop <va, va'>
6456 } */
6458 tree rhs;
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_NOTE, vect_location,
6462 "Reduce using vector shifts\n");
6464 gimple_seq stmts = NULL;
6465 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6466 for (elt_offset = nelements / 2;
6467 elt_offset >= 1;
6468 elt_offset /= 2)
6470 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6471 indices.new_vector (sel, 2, nelements);
6472 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6473 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6474 new_temp, zero_vec, mask);
6475 new_temp = gimple_build (&stmts, code,
6476 vectype1, new_name, new_temp);
6478 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6480 /* 2.4 Extract the final scalar result. Create:
6481 s_out3 = extract_field <v_out2, bitpos> */
6483 if (dump_enabled_p ())
6484 dump_printf_loc (MSG_NOTE, vect_location,
6485 "extract scalar result\n");
6487 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6488 bitsize, bitsize_zero_node);
6489 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6490 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6491 gimple_assign_set_lhs (epilog_stmt, new_temp);
6492 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6493 scalar_results.safe_push (new_temp);
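      /* Worked example with a hypothetical V4SI PLUS reduction of
	 { a0, a1, a2, a3 }: shifting by two elements and adding gives
	 { a0+a2, a1+a3, a2, a3 }; shifting that by one element and adding
	 leaves the full sum a0+a1+a2+a3 in element 0, which the
	 BIT_FIELD_REF above then extracts.  */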
6495 else
6497 /* Case 3: Create:
6498 s = extract_field <v_out2, 0>
6499 for (offset = element_size;
6500 offset < vector_size;
6501 offset += element_size;)
6503 Create: s' = extract_field <v_out2, offset>
6504 Create: s = op <s, s'> // For non SLP cases
6505 } */
6507 if (dump_enabled_p ())
6508 dump_printf_loc (MSG_NOTE, vect_location,
6509 "Reduce using scalar code.\n");
6511 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6512 int element_bitsize = tree_to_uhwi (bitsize);
6513 tree compute_type = TREE_TYPE (vectype);
6514 gimple_seq stmts = NULL;
6515 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6517 int bit_offset;
6518 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6519 vec_temp, bitsize, bitsize_zero_node);
6521 /* In SLP we don't need to apply the reduction operation, so we just
6522 collect s' values in SCALAR_RESULTS. */
6523 if (slp_reduc)
6524 scalar_results.safe_push (new_temp);
6526 for (bit_offset = element_bitsize;
6527 bit_offset < vec_size_in_bits;
6528 bit_offset += element_bitsize)
6530 tree bitpos = bitsize_int (bit_offset);
6531 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6532 compute_type, vec_temp,
6533 bitsize, bitpos);
6534 if (slp_reduc)
6536 /* In SLP we don't need to apply the reduction operation, so
6537 we just collect s' values in SCALAR_RESULTS. */
6538 new_temp = new_name;
6539 scalar_results.safe_push (new_name);
6541 else
6542 new_temp = gimple_build (&stmts, code, compute_type,
6543 new_name, new_temp);
6547 /* The only case where we need to reduce scalar results in SLP is
6548 unrolling.  If the size of SCALAR_RESULTS is greater than
6549 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6550 REDUC_GROUP_SIZE. */
6551 if (slp_reduc)
6553 tree res, first_res, new_res;
6555 /* Reduce multiple scalar results in case of SLP unrolling. */
6556 for (j = group_size; scalar_results.iterate (j, &res);
6557 j++)
6559 first_res = scalar_results[j % group_size];
6560 new_res = gimple_build (&stmts, code, compute_type,
6561 first_res, res);
6562 scalar_results[j % group_size] = new_res;
6564 scalar_results.truncate (group_size);
6565 for (k = 0; k < group_size; k++)
6566 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6567 scalar_results[k]);
6569 else
6571 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6572 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6573 scalar_results.safe_push (new_temp);
6576 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6579 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6580 && induc_val)
6582 /* Earlier we set the initial value to be a vector of induc_val
6583 values. Check the result and if it is induc_val then replace
6584 with the original initial value, unless induc_val is
6585 the same as initial_def already. */
6586 tree zcompare = make_ssa_name (boolean_type_node);
6587 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6588 induc_val);
6589 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6590 tree initial_def = reduc_info->reduc_initial_values[0];
6591 tree tmp = make_ssa_name (new_scalar_dest);
6592 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6593 initial_def, new_temp);
6594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6595 scalar_results[0] = tmp;
6599 /* 2.5 Adjust the final result by the initial value of the reduction
6600 variable. (When such adjustment is not needed, then
6601 'adjustment_def' is zero). For example, if code is PLUS we create:
6602 new_temp = loop_exit_def + adjustment_def */
6604 if (adjustment_def)
6606 gcc_assert (!slp_reduc);
6607 gimple_seq stmts = NULL;
6608 if (double_reduc)
6610 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6611 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6612 new_temp = gimple_build (&stmts, code, vectype,
6613 reduc_inputs[0], adjustment_def);
6615 else
6617 new_temp = scalar_results[0];
6618 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6619 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6620 adjustment_def);
6621 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6622 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6623 new_temp, adjustment_def);
6624 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6627 epilog_stmt = gimple_seq_last_stmt (stmts);
6628 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6629 scalar_results[0] = new_temp;
6632 /* Record this operation if it could be reused by the epilogue loop. */
6633 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6634 && reduc_inputs.length () == 1)
6635 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6636 { orig_reduc_input, reduc_info });
6638 if (double_reduc)
6639 loop = outer_loop;
6641 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6642 phis with new adjusted scalar results, i.e., replace use <s_out0>
6643 with use <s_out4>.
6645 Transform:
6646 loop_exit:
6647 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6648 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6649 v_out2 = reduce <v_out1>
6650 s_out3 = extract_field <v_out2, 0>
6651 s_out4 = adjust_result <s_out3>
6652 use <s_out0>
6653 use <s_out0>
6655 into:
6657 loop_exit:
6658 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6659 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6660 v_out2 = reduce <v_out1>
6661 s_out3 = extract_field <v_out2, 0>
6662 s_out4 = adjust_result <s_out3>
6663 use <s_out4>
6664 use <s_out4> */
6666 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6667 for (k = 0; k < live_out_stmts.size (); k++)
6669 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6670 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6672 phis.create (3);
6673 /* Find the loop-closed-use at the loop exit of the original scalar
6674 result. (The reduction result is expected to have two immediate uses,
6675 one at the latch block, and one at the loop exit). For double
6676 reductions we are looking for exit phis of the outer loop. */
6677 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6679 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6681 if (!is_gimple_debug (USE_STMT (use_p)))
6682 phis.safe_push (USE_STMT (use_p));
6684 else
6686 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6688 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6690 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6692 if (!flow_bb_inside_loop_p (loop,
6693 gimple_bb (USE_STMT (phi_use_p)))
6694 && !is_gimple_debug (USE_STMT (phi_use_p)))
6695 phis.safe_push (USE_STMT (phi_use_p));
6701 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6703 /* Replace the uses: */
6704 orig_name = PHI_RESULT (exit_phi);
6706 /* Look for a single use at the target of the skip edge. */
6707 if (unify_with_main_loop_p)
6709 use_operand_p use_p;
6710 gimple *user;
6711 if (!single_imm_use (orig_name, &use_p, &user))
6712 gcc_unreachable ();
6713 orig_name = gimple_get_lhs (user);
6716 scalar_result = scalar_results[k];
6717 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6719 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6720 SET_USE (use_p, scalar_result);
6721 update_stmt (use_stmt);
6725 phis.release ();
6729 /* Return a vector of type VECTYPE that is equal to the vector select
6730 operation "MASK ? VEC : IDENTITY". Insert the select statements
6731 before GSI. */
6733 static tree
6734 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6735 tree vec, tree identity)
6737 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6738 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6739 mask, vec, identity);
6740 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6741 return cond;
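/* Conceptually, for an N-element vector the select above computes
   (illustrative scalar sketch, names invented):

     for (int i = 0; i < N; ++i)
       cond[i] = mask[i] ? vec[i] : identity[i];

   so lanes that are masked off contribute the reduction's identity
   value instead of a real element.  */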
6744 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6745 order, starting with LHS. Insert the extraction statements before GSI and
6746 associate the new scalar SSA names with variable SCALAR_DEST.
6747 Return the SSA name for the result. */
6749 static tree
6750 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6751 tree_code code, tree lhs, tree vector_rhs)
6753 tree vectype = TREE_TYPE (vector_rhs);
6754 tree scalar_type = TREE_TYPE (vectype);
6755 tree bitsize = TYPE_SIZE (scalar_type);
6756 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6757 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6759 for (unsigned HOST_WIDE_INT bit_offset = 0;
6760 bit_offset < vec_size_in_bits;
6761 bit_offset += element_bitsize)
6763 tree bitpos = bitsize_int (bit_offset);
6764 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6765 bitsize, bitpos);
6767 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6768 rhs = make_ssa_name (scalar_dest, stmt);
6769 gimple_assign_set_lhs (stmt, rhs);
6770 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6772 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6773 tree new_name = make_ssa_name (scalar_dest, stmt);
6774 gimple_assign_set_lhs (stmt, new_name);
6775 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6776 lhs = new_name;
6778 return lhs;
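/* For example, with a four-element VECTOR_RHS the loop above emits the
   strictly left-to-right chain (illustrative sketch, SSA names invented):

     s0 = BIT_FIELD_REF <vector_rhs, bitsize, 0>;
     t0 = lhs op s0;
     s1 = BIT_FIELD_REF <vector_rhs, bitsize, bitsize>;
     t1 = t0 op s1;
     ...

   which preserves the evaluation order of the original scalar loop.  */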
6781 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6782 type of the vector input. */
6784 static internal_fn
6785 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6787 internal_fn mask_reduc_fn;
6789 switch (reduc_fn)
6791 case IFN_FOLD_LEFT_PLUS:
6792 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6793 break;
6795 default:
6796 return IFN_LAST;
6799 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6800 OPTIMIZE_FOR_SPEED))
6801 return mask_reduc_fn;
6802 return IFN_LAST;
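/* An in-order (fold-left) reduction, as handled below, typically arises
   from a floating-point accumulation compiled without reassociation
   (illustrative example, not taken from a testcase):

     double s = 0.0;
     for (int i = 0; i < n; ++i)
       s += a[i];

   Reassociating the additions could change the rounded result, so the
   vector elements have to be folded in strictly left-to-right.  */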
6805 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6806 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6807 statement. CODE is the operation performed by STMT_INFO and OPS are
6808 its scalar operands. REDUC_INDEX is the index of the operand in
6809 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6810 implements in-order reduction, or IFN_LAST if we should open-code it.
6811 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6812 that should be used to control the operation in a fully-masked loop. */
6814 static bool
6815 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6816 stmt_vec_info stmt_info,
6817 gimple_stmt_iterator *gsi,
6818 gimple **vec_stmt, slp_tree slp_node,
6819 gimple *reduc_def_stmt,
6820 tree_code code, internal_fn reduc_fn,
6821 tree ops[3], tree vectype_in,
6822 int reduc_index, vec_loop_masks *masks)
6824 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6825 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6826 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6828 int ncopies;
6829 if (slp_node)
6830 ncopies = 1;
6831 else
6832 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6834 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6835 gcc_assert (ncopies == 1);
6836 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6838 if (slp_node)
6839 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6840 TYPE_VECTOR_SUBPARTS (vectype_in)));
6842 tree op0 = ops[1 - reduc_index];
6844 int group_size = 1;
6845 stmt_vec_info scalar_dest_def_info;
6846 auto_vec<tree> vec_oprnds0;
6847 if (slp_node)
6849 auto_vec<vec<tree> > vec_defs (2);
6850 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6851 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6852 vec_defs[0].release ();
6853 vec_defs[1].release ();
6854 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6855 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6857 else
6859 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6860 op0, &vec_oprnds0);
6861 scalar_dest_def_info = stmt_info;
6864 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6865 tree scalar_type = TREE_TYPE (scalar_dest);
6866 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6868 int vec_num = vec_oprnds0.length ();
6869 gcc_assert (vec_num == 1 || slp_node);
6870 tree vec_elem_type = TREE_TYPE (vectype_out);
6871 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6873 tree vector_identity = NULL_TREE;
6874 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6875 vector_identity = build_zero_cst (vectype_out);
6877 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6878 int i;
6879 tree def0;
6880 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6882 gimple *new_stmt;
6883 tree mask = NULL_TREE;
6884 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6885 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6887 /* Handle MINUS by adding the negative. */
6888 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6890 tree negated = make_ssa_name (vectype_out);
6891 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6892 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6893 def0 = negated;
6896 if (mask && mask_reduc_fn == IFN_LAST)
6897 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6898 vector_identity);
6900 /* On the first iteration the input is simply the scalar phi
6901 result, and for subsequent iterations it is the output of
6902 the preceding operation. */
6903 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6905 if (mask && mask_reduc_fn != IFN_LAST)
6906 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6907 def0, mask);
6908 else
6909 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6910 def0);
6911 /* For chained SLP reductions the output of the previous reduction
6912 operation serves as the input of the next. For the final statement
6913 the output cannot be a temporary - we reuse the original
6914 scalar destination of the last statement. */
6915 if (i != vec_num - 1)
6917 gimple_set_lhs (new_stmt, scalar_dest_var);
6918 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6919 gimple_set_lhs (new_stmt, reduc_var);
6922 else
6924 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6925 reduc_var, def0);
6926 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6927 /* Remove the statement, so that we can use the same code paths
6928 as for statements that we've just created. */
6929 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6930 gsi_remove (&tmp_gsi, true);
6933 if (i == vec_num - 1)
6935 gimple_set_lhs (new_stmt, scalar_dest);
6936 vect_finish_replace_stmt (loop_vinfo,
6937 scalar_dest_def_info,
6938 new_stmt);
6940 else
6941 vect_finish_stmt_generation (loop_vinfo,
6942 scalar_dest_def_info,
6943 new_stmt, gsi);
6945 if (slp_node)
6946 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6947 else
6949 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6950 *vec_stmt = new_stmt;
6954 return true;
6957 /* Function is_nonwrapping_integer_induction.
6959 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6960 does not cause overflow. */
6962 static bool
6963 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6965 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6966 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6967 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6968 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6969 widest_int ni, max_loop_value, lhs_max;
6970 wi::overflow_type overflow = wi::OVF_NONE;
6972 /* Make sure the loop is integer based. */
6973 if (TREE_CODE (base) != INTEGER_CST
6974 || TREE_CODE (step) != INTEGER_CST)
6975 return false;
6977 /* Check that the max size of the loop will not wrap. */
6979 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6980 return true;
6982 if (! max_stmt_executions (loop, &ni))
6983 return false;
6985 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6986 &overflow);
6987 if (overflow)
6988 return false;
6990 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6991 TYPE_SIGN (lhs_type), &overflow);
6992 if (overflow)
6993 return false;
6995 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6996 <= TYPE_PRECISION (lhs_type));
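/* For instance (hypothetical numbers): with base 0, step 4 and at most
   1000 iterations the largest value produced is 0 + 4 * 1000 = 4000,
   which comfortably fits a 16-bit or wider induction variable, so the
   induction cannot wrap.  */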
6999 /* Check if masking can be supported by inserting a conditional expression.
7000 CODE is the code for the operation. COND_FN is the conditional internal
7001 function, if it exists. VECTYPE_IN is the type of the vector input. */
7002 static bool
7003 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7004 tree vectype_in)
7006 if (cond_fn != IFN_LAST
7007 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7008 OPTIMIZE_FOR_SPEED))
7009 return false;
7011 if (code.is_tree_code ())
7012 switch (tree_code (code))
7014 case DOT_PROD_EXPR:
7015 case SAD_EXPR:
7016 return true;
7018 default:
7019 break;
7021 return false;
7024 /* Insert a conditional expression to enable masked vectorization. CODE is the
7025 code for the operation. VOP is the array of operands. MASK is the loop
7026 mask. GSI is a statement iterator used to place the new conditional
7027 expression. */
7028 static void
7029 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7030 gimple_stmt_iterator *gsi)
7032 switch (tree_code (code))
7034 case DOT_PROD_EXPR:
7036 tree vectype = TREE_TYPE (vop[1]);
7037 tree zero = build_zero_cst (vectype);
7038 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7039 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7040 mask, vop[1], zero);
7041 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7042 vop[1] = masked_op1;
7043 break;
7046 case SAD_EXPR:
7048 tree vectype = TREE_TYPE (vop[1]);
7049 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7050 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7051 mask, vop[1], vop[0]);
7052 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7053 vop[1] = masked_op1;
7054 break;
7057 default:
7058 gcc_unreachable ();
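/* E.g. for DOT_PROD_EXPR the selects above make the masked operation
   behave like the scalar sketch (illustrative only):

     acc += mask[i] ? op0[i] * op1[i] : 0;

   by forcing the masked-off lanes of op1 to zero; for SAD_EXPR,
   selecting op0 in place of op1 makes the absolute difference of the
   masked-off lanes zero instead.  */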
7062 /* Function vectorizable_reduction.
7064 Check if STMT_INFO performs a reduction operation that can be vectorized.
7065 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7066 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7067 Return true if STMT_INFO is vectorizable in this way.
7069 This function also handles reduction idioms (patterns) that have been
7070 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7071 may be of this form:
7072 X = pattern_expr (arg0, arg1, ..., X)
7073 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7074 sequence that had been detected and replaced by the pattern-stmt
7075 (STMT_INFO).
7077 This function also handles reduction of condition expressions, for example:
7078 for (int i = 0; i < N; i++)
7079 if (a[i] < value)
7080 last = a[i];
7081 This is handled by vectorizing the loop and creating an additional vector
7082 containing the loop indexes for which "a[i] < value" was true. In the
7083 function epilogue this is reduced to a single max value and then used to
7084 index into the vector of results.
7086 In some cases of reduction patterns, the type of the reduction variable X is
7087 different than the type of the other arguments of STMT_INFO.
7088 In such cases, the vectype that is used when transforming STMT_INFO into
7089 a vector stmt is different than the vectype that is used to determine the
7090 vectorization factor, because it consists of a different number of elements
7091 than the actual number of elements that are being operated upon in parallel.
7093 For example, consider an accumulation of shorts into an int accumulator.
7094 On some targets it's possible to vectorize this pattern operating on 8
7095 shorts at a time (hence, the vectype for purposes of determining the
7096 vectorization factor should be V8HI); on the other hand, the vectype that
7097 is used to create the vector form is actually V4SI (the type of the result).
7099 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7100 indicates what is the actual level of parallelism (V8HI in the example), so
7101 that the right vectorization factor would be derived. This vectype
7102 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7103 be used to create the vectorized stmt. The right vectype for the vectorized
7104 stmt is obtained from the type of the result X:
7105 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7107 This means that, contrary to "regular" reductions (or "regular" stmts in
7108 general), the following equation:
7109 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7110 does *NOT* necessarily hold for reduction patterns. */
7112 bool
7113 vectorizable_reduction (loop_vec_info loop_vinfo,
7114 stmt_vec_info stmt_info, slp_tree slp_node,
7115 slp_instance slp_node_instance,
7116 stmt_vector_for_cost *cost_vec)
7118 tree vectype_in = NULL_TREE;
7119 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7120 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7121 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7122 stmt_vec_info cond_stmt_vinfo = NULL;
7123 int i;
7124 int ncopies;
7125 bool single_defuse_cycle = false;
7126 bool nested_cycle = false;
7127 bool double_reduc = false;
7128 int vec_num;
7129 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7130 tree cond_reduc_val = NULL_TREE;
7132 /* Make sure it was already recognized as a reduction computation. */
7133 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7134 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7135 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7136 return false;
7138 /* The stmt we store reduction analysis meta on. */
7139 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7140 reduc_info->is_reduc_info = true;
7142 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7144 if (is_a <gphi *> (stmt_info->stmt))
7146 if (slp_node)
7148 /* We eventually need to set a vector type on invariant
7149 arguments. */
7150 unsigned j;
7151 slp_tree child;
7152 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7153 if (!vect_maybe_update_slp_op_vectype
7154 (child, SLP_TREE_VECTYPE (slp_node)))
7156 if (dump_enabled_p ())
7157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7158 "incompatible vector types for "
7159 "invariants\n");
7160 return false;
7163 /* Analysis for double-reduction is done on the outer
7164 loop PHI; nested cycles have no further restrictions. */
7165 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7167 else
7168 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7169 return true;
7172 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7173 stmt_vec_info phi_info = stmt_info;
7174 if (!is_a <gphi *> (stmt_info->stmt))
7176 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7177 return true;
7179 if (slp_node)
7181 slp_node_instance->reduc_phis = slp_node;
7182 /* ??? We're leaving slp_node to point to the PHIs; we only
7183 need it to get at the number of vector stmts, which wasn't
7184 yet initialized for the instance root. */
7186 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7188 use_operand_p use_p;
7189 gimple *use_stmt;
7190 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7191 &use_p, &use_stmt);
7192 gcc_assert (res);
7193 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7196 /* PHIs should not participate in patterns. */
7197 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7198 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7200 /* Verify that following REDUC_IDX from the latch def leads us back to the
7201 PHI and compute the reduction chain length. Discover the real
7202 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7203 tree reduc_def
7204 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7205 loop_latch_edge
7206 (gimple_bb (reduc_def_phi)->loop_father));
7207 unsigned reduc_chain_length = 0;
7208 bool only_slp_reduc_chain = true;
7209 stmt_info = NULL;
7210 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7211 while (reduc_def != PHI_RESULT (reduc_def_phi))
7213 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7214 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7215 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7217 if (dump_enabled_p ())
7218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7219 "reduction chain broken by patterns.\n");
7220 return false;
7222 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7223 only_slp_reduc_chain = false;
7224 /* For epilogue generation, live members of the chain need
7225 to point back to the PHI via their original stmt for
7226 info_for_reduction to work. For SLP we need to look at
7227 all lanes here: even though we will only vectorize from
7228 the SLP node with live lane zero, the other live lanes also
7229 need to be identified as part of a reduction to be able
7230 to skip code generation for them. */
7231 if (slp_for_stmt_info)
7233 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7234 if (STMT_VINFO_LIVE_P (s))
7235 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7237 else if (STMT_VINFO_LIVE_P (vdef))
7238 STMT_VINFO_REDUC_DEF (def) = phi_info;
7239 gimple_match_op op;
7240 if (!gimple_extract_op (vdef->stmt, &op))
7242 if (dump_enabled_p ())
7243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7244 "reduction chain includes unsupported"
7245 " statement type.\n");
7246 return false;
7248 if (CONVERT_EXPR_CODE_P (op.code))
7250 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7252 if (dump_enabled_p ())
7253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7254 "conversion in the reduction chain.\n");
7255 return false;
7258 else if (!stmt_info)
7259 /* First non-conversion stmt. */
7260 stmt_info = vdef;
7261 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7262 reduc_chain_length++;
7263 if (!stmt_info && slp_node)
7264 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7266 /* PHIs should not participate in patterns. */
7267 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7269 if (nested_in_vect_loop_p (loop, stmt_info))
7271 loop = loop->inner;
7272 nested_cycle = true;
7275 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7276 element. */
7277 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7279 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7280 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7282 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7283 gcc_assert (slp_node
7284 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7286 /* 1. Is vectorizable reduction? */
7287 /* Not supportable if the reduction variable is used in the loop, unless
7288 it's a reduction chain. */
7289 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7290 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7291 return false;
7293 /* Reductions that are not used even in an enclosing outer-loop,
7294 are expected to be "live" (used out of the loop). */
7295 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7296 && !STMT_VINFO_LIVE_P (stmt_info))
7297 return false;
7299 /* 2. Has this been recognized as a reduction pattern?
7301 Check if STMT represents a pattern that has been recognized
7302 in earlier analysis stages. For stmts that represent a pattern,
7303 the STMT_VINFO_RELATED_STMT field records the last stmt in
7304 the original sequence that constitutes the pattern. */
7306 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7307 if (orig_stmt_info)
7309 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7310 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7313 /* 3. Check the operands of the operation. The first operands are defined
7314 inside the loop body. The last operand is the reduction variable,
7315 which is defined by the loop-header-phi. */
7317 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7318 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7319 gimple_match_op op;
7320 if (!gimple_extract_op (stmt_info->stmt, &op))
7321 gcc_unreachable ();
7322 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7323 || op.code == WIDEN_SUM_EXPR
7324 || op.code == SAD_EXPR);
7326 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7327 && !SCALAR_FLOAT_TYPE_P (op.type))
7328 return false;
7330 /* Do not try to vectorize bit-precision reductions. */
7331 if (!type_has_mode_precision_p (op.type))
7332 return false;
7334 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7335 which means the only use of such a PHI may be in the lane-reducing operation. */
7336 if (lane_reduc_code_p
7337 && reduc_chain_length != 1
7338 && !only_slp_reduc_chain)
7340 if (dump_enabled_p ())
7341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7342 "lane-reducing reduction with extra stmts.\n");
7343 return false;
7346 /* All uses but the last are expected to be defined in the loop.
7347 The last use is the reduction variable. In case of nested cycle this
7348 assumption is not true: we use reduc_index to record the index of the
7349 reduction variable. */
7350 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7351 /* We need to skip an extra operand for COND_EXPRs with embedded
7352 comparison. */
7353 unsigned opno_adjust = 0;
7354 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7355 opno_adjust = 1;
7356 for (i = 0; i < (int) op.num_ops; i++)
7358 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7359 if (i == 0 && op.code == COND_EXPR)
7360 continue;
7362 stmt_vec_info def_stmt_info;
7363 enum vect_def_type dt;
7364 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7365 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7366 &vectype_op[i], &def_stmt_info))
7368 if (dump_enabled_p ())
7369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7370 "use not simple.\n");
7371 return false;
7373 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7374 continue;
7376 /* There should be only one cycle def in the stmt, the one
7377 leading to reduc_def. */
7378 if (VECTORIZABLE_CYCLE_DEF (dt))
7379 return false;
7381 if (!vectype_op[i])
7382 vectype_op[i]
7383 = get_vectype_for_scalar_type (loop_vinfo,
7384 TREE_TYPE (op.ops[i]), slp_op[i]);
7386 /* To properly compute ncopies we are interested in the widest
7387 non-reduction input type in case we're looking at a widening
7388 accumulation that we later handle in vect_transform_reduction. */
7389 if (lane_reduc_code_p
7390 && vectype_op[i]
7391 && (!vectype_in
7392 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7393 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7394 vectype_in = vectype_op[i];
7396 if (op.code == COND_EXPR)
7398 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7399 if (dt == vect_constant_def)
7401 cond_reduc_dt = dt;
7402 cond_reduc_val = op.ops[i];
7404 if (dt == vect_induction_def
7405 && def_stmt_info
7406 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7408 cond_reduc_dt = dt;
7409 cond_stmt_vinfo = def_stmt_info;
7413 if (!vectype_in)
7414 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7415 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7417 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7418 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7419 /* If we have a condition reduction, see if we can simplify it further. */
7420 if (v_reduc_type == COND_REDUCTION)
7422 if (slp_node)
7423 return false;
7425 /* Fail if the reduction value is used in the condition itself. */
7426 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7428 if (dump_enabled_p ())
7429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7430 "condition depends on previous iteration\n");
7431 return false;
7434 if (reduc_chain_length == 1
7435 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7436 vectype_in, OPTIMIZE_FOR_SPEED))
7438 if (dump_enabled_p ())
7439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7440 "optimizing condition reduction with"
7441 " FOLD_EXTRACT_LAST.\n");
7442 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7444 else if (cond_reduc_dt == vect_induction_def)
7446 tree base
7447 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7448 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7450 gcc_assert (TREE_CODE (base) == INTEGER_CST
7451 && TREE_CODE (step) == INTEGER_CST);
7452 cond_reduc_val = NULL_TREE;
7453 enum tree_code cond_reduc_op_code = ERROR_MARK;
7454 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7455 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7457 /* Find a suitable value: below base for MAX_EXPR, above base for
7458 MIN_EXPR; for now punt if base is the minimum value of the type
7459 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7460 else if (tree_int_cst_sgn (step) == -1)
7462 cond_reduc_op_code = MIN_EXPR;
7463 if (tree_int_cst_sgn (base) == -1)
7464 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7465 else if (tree_int_cst_lt (base,
7466 TYPE_MAX_VALUE (TREE_TYPE (base))))
7467 cond_reduc_val
7468 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7470 else
7472 cond_reduc_op_code = MAX_EXPR;
7473 if (tree_int_cst_sgn (base) == 1)
7474 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7475 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7476 base))
7477 cond_reduc_val
7478 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7480 if (cond_reduc_val)
7482 if (dump_enabled_p ())
7483 dump_printf_loc (MSG_NOTE, vect_location,
7484 "condition expression based on "
7485 "integer induction.\n");
7486 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7487 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7488 = cond_reduc_val;
7489 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7492 else if (cond_reduc_dt == vect_constant_def)
7494 enum vect_def_type cond_initial_dt;
7495 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7496 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7497 if (cond_initial_dt == vect_constant_def
7498 && types_compatible_p (TREE_TYPE (cond_initial_val),
7499 TREE_TYPE (cond_reduc_val)))
7501 tree e = fold_binary (LE_EXPR, boolean_type_node,
7502 cond_initial_val, cond_reduc_val);
7503 if (e && (integer_onep (e) || integer_zerop (e)))
7505 if (dump_enabled_p ())
7506 dump_printf_loc (MSG_NOTE, vect_location,
7507 "condition expression based on "
7508 "compile time constant.\n");
7509 /* Record reduction code at analysis stage. */
7510 STMT_VINFO_REDUC_CODE (reduc_info)
7511 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7512 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7518 if (STMT_VINFO_LIVE_P (phi_info))
7519 return false;
7521 if (slp_node)
7522 ncopies = 1;
7523 else
7524 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7526 gcc_assert (ncopies >= 1);
7528 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7530 if (nested_cycle)
7532 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7533 == vect_double_reduction_def);
7534 double_reduc = true;
7537 /* 4.2. Check support for the epilog operation.
7539 If STMT represents a reduction pattern, then the type of the
7540 reduction variable may be different than the type of the rest
7541 of the arguments. For example, consider the case of accumulation
7542 of shorts into an int accumulator; the original code:
7543 S1: int_a = (int) short_a;
7544 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7546 was replaced with:
7547 STMT: int_acc = widen_sum <short_a, int_acc>
7549 This means that:
7550 1. The tree-code that is used to create the vector operation in the
7551 epilog code (that reduces the partial results) is not the
7552 tree-code of STMT, but is rather the tree-code of the original
7553 stmt from the pattern that STMT is replacing. I.e., in the example
7554 above we want to use 'widen_sum' in the loop, but 'plus' in the
7555 epilog.
7556 2. The type (mode) we use to check available target support
7557 for the vector operation to be created in the *epilog*, is
7558 determined by the type of the reduction variable (in the example
7559 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7560 However the type (mode) we use to check available target support
7561 for the vector operation to be created *inside the loop*, is
7562 determined by the type of the other arguments to STMT (in the
7563 example we'd check this: optab_handler (widen_sum_optab,
7564 vect_short_mode)).
7566 This is contrary to "regular" reductions, in which the types of all
7567 the arguments are the same as the type of the reduction variable.
7568 For "regular" reductions we can therefore use the same vector type
7569 (and also the same tree-code) when generating the epilog code and
7570 when generating the code inside the loop. */
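/* In source terms the widening pattern above corresponds to a loop
   like (illustrative example):

     short short_a[N];
     int int_acc = 0;
     for (int i = 0; i < N; ++i)
       int_acc += (int) short_a[i];

   where the loop body becomes a widen_sum on short elements while the
   epilogue reduces an int vector with a plain plus.  */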
7572 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7573 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7575 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7576 if (reduction_type == TREE_CODE_REDUCTION)
7578 /* Check whether it's ok to change the order of the computation.
7579 Generally, when vectorizing a reduction we change the order of the
7580 computation. This may change the behavior of the program in some
7581 cases, so we need to check that this is ok. One exception is when
7582 vectorizing an outer-loop: the inner-loop is executed sequentially,
7583 and therefore vectorizing reductions in the inner-loop during
7584 outer-loop vectorization is safe. Likewise when we are vectorizing
7585 a series of reductions using SLP and the VF is one, the reductions
7586 are performed in scalar order. */
7587 if (slp_node
7588 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7589 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7591 else if (needs_fold_left_reduction_p (op.type, orig_code))
7593 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7594 is not directly used in stmt. */
7595 if (!only_slp_reduc_chain
7596 && reduc_chain_length != 1)
7598 if (dump_enabled_p ())
7599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7600 "in-order reduction chain without SLP.\n");
7601 return false;
7603 STMT_VINFO_REDUC_TYPE (reduc_info)
7604 = reduction_type = FOLD_LEFT_REDUCTION;
7606 else if (!commutative_binary_op_p (orig_code, op.type)
7607 || !associative_binary_op_p (orig_code, op.type))
7609 if (dump_enabled_p ())
7610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7611 "reduction: not commutative/associative");
7612 return false;
7616 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7617 && ncopies > 1)
7619 if (dump_enabled_p ())
7620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7621 "multiple types in double reduction or condition "
7622 "reduction or fold-left reduction.\n");
7623 return false;
7626 internal_fn reduc_fn = IFN_LAST;
7627 if (reduction_type == TREE_CODE_REDUCTION
7628 || reduction_type == FOLD_LEFT_REDUCTION
7629 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7630 || reduction_type == CONST_COND_REDUCTION)
7632 if (reduction_type == FOLD_LEFT_REDUCTION
7633 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7634 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7636 if (reduc_fn != IFN_LAST
7637 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7638 OPTIMIZE_FOR_SPEED))
7640 if (dump_enabled_p ())
7641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7642 "reduc op not supported by target.\n");
7644 reduc_fn = IFN_LAST;
7647 else
7649 if (!nested_cycle || double_reduc)
7651 if (dump_enabled_p ())
7652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7653 "no reduc code for scalar code.\n");
7655 return false;
7659 else if (reduction_type == COND_REDUCTION)
7661 int scalar_precision
7662 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7663 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7664 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7665 vectype_out);
7667 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7668 OPTIMIZE_FOR_SPEED))
7669 reduc_fn = IFN_REDUC_MAX;
7671 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7673 if (reduction_type != EXTRACT_LAST_REDUCTION
7674 && (!nested_cycle || double_reduc)
7675 && reduc_fn == IFN_LAST
7676 && !nunits_out.is_constant ())
7678 if (dump_enabled_p ())
7679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7680 "missing target support for reduction on"
7681 " variable-length vectors.\n");
7682 return false;
7685 /* For SLP reductions, see if there is a neutral value we can use. */
7686 tree neutral_op = NULL_TREE;
7687 if (slp_node)
7689 tree initial_value = NULL_TREE;
7690 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7691 initial_value = vect_phi_initial_value (reduc_def_phi);
7692 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7693 orig_code, initial_value);
7696 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7698 /* We can't support in-order reductions of code such as this:
7700 for (int i = 0; i < n1; ++i)
7701 for (int j = 0; j < n2; ++j)
7702 l += a[j];
7704 since GCC effectively transforms the loop when vectorizing:
7706 for (int i = 0; i < n1 / VF; ++i)
7707 for (int j = 0; j < n2; ++j)
7708 for (int k = 0; k < VF; ++k)
7709 l += a[j];
7711 which is a reassociation of the original operation. */
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714 "in-order double reduction not supported.\n");
7716 return false;
7719 if (reduction_type == FOLD_LEFT_REDUCTION
7720 && slp_node
7721 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7723 /* We cannot use in-order reductions in this case because there is
7724 an implicit reassociation of the operations involved. */
7725 if (dump_enabled_p ())
7726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7727 "in-order unchained SLP reductions not supported.\n");
7728 return false;
7731 /* For double reductions, and for SLP reductions with a neutral value,
7732 we construct a variable-length initial vector by loading a vector
7733 full of the neutral value and then shift-and-inserting the start
7734 values into the low-numbered elements. */
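/* E.g. (illustrative): for a PLUS reduction with start values s0 and
   s1 we load { 0, 0, ..., 0 } and shift-insert s1 and then s0, giving
   { s0, s1, 0, ..., 0 } independently of the runtime vector length.  */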
7735 if ((double_reduc || neutral_op)
7736 && !nunits_out.is_constant ()
7737 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7738 vectype_out, OPTIMIZE_FOR_SPEED))
7740 if (dump_enabled_p ())
7741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742 "reduction on variable-length vectors requires"
7743 " target support for a vector-shift-and-insert"
7744 " operation.\n");
7745 return false;
7748 /* Check extra constraints for variable-length unchained SLP reductions. */
7749 if (slp_node
7750 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7751 && !nunits_out.is_constant ())
7753 /* We checked above that we could build the initial vector when
7754 there's a neutral element value. Check here for the case in
7755 which each SLP statement has its own initial value and in which
7756 that value needs to be repeated for every instance of the
7757 statement within the initial vector. */
7758 unsigned int group_size = SLP_TREE_LANES (slp_node);
7759 if (!neutral_op
7760 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7761 TREE_TYPE (vectype_out)))
7763 if (dump_enabled_p ())
7764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7765 "unsupported form of SLP reduction for"
7766 " variable-length vectors: cannot build"
7767 " initial vector.\n");
7768 return false;
7770 /* The epilogue code relies on the number of elements being a multiple
7771 of the group size. The duplicate-and-interleave approach to setting
7772 up the initial vector does too. */
7773 if (!multiple_p (nunits_out, group_size))
7775 if (dump_enabled_p ())
7776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7777 "unsupported form of SLP reduction for"
7778 " variable-length vectors: the vector size"
7779 " is not a multiple of the number of results.\n");
7780 return false;
7784 if (reduction_type == COND_REDUCTION)
7786 widest_int ni;
7788 if (! max_loop_iterations (loop, &ni))
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_NOTE, vect_location,
7792 "loop count not known, cannot create cond "
7793 "reduction.\n");
7794 return false;
7796 /* Convert backedges to iterations. */
7797 ni += 1;
7799 /* The additional index will have the same type as the condition. Check
7800 that the loop count fits into this type less one (because we'll use up
7801 the zero slot for when there are no matches). */
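/* For example (hypothetical 16-bit index type): the index can name at
   most 65535 distinct iterations, and since value 0 is reserved for
   "no match" the loop must run fewer than 65535 iterations.  */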
7802 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7803 if (wi::geu_p (ni, wi::to_widest (max_index)))
7805 if (dump_enabled_p ())
7806 dump_printf_loc (MSG_NOTE, vect_location,
7807 "loop size is greater than data size.\n");
7808 return false;
7812 /* In case the vectorization factor (VF) is bigger than the number
7813 of elements that we can fit in a vectype (nunits), we have to generate
7814 more than one vector stmt, i.e., we need to "unroll" the
7815 vector stmt by a factor VF/nunits. For more details see documentation
7816 in vectorizable_operation. */
7818 /* If the reduction is used in an outer loop we need to generate
7819 VF intermediate results, like so (e.g. for ncopies=2):
7820 r0 = phi (init, r0)
7821 r1 = phi (init, r1)
7822 r0 = x0 + r0;
7823 r1 = x1 + r1;
7824 (i.e. we generate VF results in 2 registers).
7825 In this case we have a separate def-use cycle for each copy, and therefore
7826 for each copy we get the vector def for the reduction variable from the
7827 respective phi node created for this copy.
7829 Otherwise (the reduction is unused in the loop nest), we can combine
7830 together intermediate results, like so (e.g. for ncopies=2):
7831 r = phi (init, r)
7832 r = x0 + r;
7833 r = x1 + r;
7834 (i.e. we generate VF/2 results in a single register).
7835 In this case for each copy we get the vector def for the reduction variable
7836 from the vectorized reduction operation generated in the previous iteration.
7838 This only works when we see both the reduction PHI and its only consumer
7839 in vectorizable_reduction and there are no intermediate stmts
7840 participating. When unrolling we want each unrolled iteration to have its
7841 own reduction accumulator since one of the main goals of unrolling a
7842 reduction is to reduce the aggregate loop-carried latency. */
7843 if (ncopies > 1
7844 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7845 && reduc_chain_length == 1
7846 && loop_vinfo->suggested_unroll_factor == 1)
7847 single_defuse_cycle = true;
7849 if (single_defuse_cycle || lane_reduc_code_p)
7851 gcc_assert (op.code != COND_EXPR);
7853 /* 4. Supportable by target? */
7854 bool ok = true;
7856 /* 4.1. Check support for the operation in the loop.
7858 This isn't necessary for the lane reduction codes, since they
7859 can only be produced by pattern matching, and it's up to the
7860 pattern matcher to test for support. The main reason for
7861 specifically skipping this step is to avoid rechecking whether
7862 mixed-sign dot-products can be implemented using signed
7863 dot-products. */
7864 machine_mode vec_mode = TYPE_MODE (vectype_in);
7865 if (!lane_reduc_code_p
7866 && !directly_supported_p (op.code, vectype_in, optab_vector))
7868 if (dump_enabled_p ())
7869 dump_printf (MSG_NOTE, "op not supported by target.\n");
7870 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7871 || !vect_can_vectorize_without_simd_p (op.code))
7872 ok = false;
7873 else
7874 if (dump_enabled_p ())
7875 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7878 if (vect_emulated_vector_p (vectype_in)
7879 && !vect_can_vectorize_without_simd_p (op.code))
7881 if (dump_enabled_p ())
7882 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7883 return false;
7886 /* Lane-reducing operations have to go through vect_transform_reduction.
7887 For the other cases try without the single cycle optimization. */
7888 if (!ok)
7890 if (lane_reduc_code_p)
7891 return false;
7892 else
7893 single_defuse_cycle = false;
7896 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7898 /* If the reduction stmt is one of the patterns that have lane
7899 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7900 if ((ncopies > 1 && ! single_defuse_cycle)
7901 && lane_reduc_code_p)
7903 if (dump_enabled_p ())
7904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7905 "multi def-use cycle not possible for lane-reducing "
7906 "reduction operation\n");
7907 return false;
7910 if (slp_node
7911 && !(!single_defuse_cycle
7912 && !lane_reduc_code_p
7913 && reduction_type != FOLD_LEFT_REDUCTION))
7914 for (i = 0; i < (int) op.num_ops; i++)
7915 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7917 if (dump_enabled_p ())
7918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7919 "incompatible vector types for invariants\n");
7920 return false;
7923 if (slp_node)
7924 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7925 else
7926 vec_num = 1;
7928 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7929 reduction_type, ncopies, cost_vec);
7930 /* Cost the reduction op inside the loop if transformed via
7931 vect_transform_reduction. Otherwise this is costed by the
7932 separate vectorizable_* routines. */
7933 if (single_defuse_cycle || lane_reduc_code_p)
7935 int factor = 1;
7936 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7937 /* Three dot-products and a subtraction. */
7938 factor = 4;
7939 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7940 stmt_info, 0, vect_body);
7943 if (dump_enabled_p ()
7944 && reduction_type == FOLD_LEFT_REDUCTION)
7945 dump_printf_loc (MSG_NOTE, vect_location,
7946 "using an in-order (fold-left) reduction.\n");
7947 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7948 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7949 reductions go through their own vectorizable_* routines. */
7950 if (!single_defuse_cycle
7951 && !lane_reduc_code_p
7952 && reduction_type != FOLD_LEFT_REDUCTION)
7954 stmt_vec_info tem
7955 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7956 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7958 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7959 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7961 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7962 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7964 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7966 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7967 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7969 if (reduction_type != FOLD_LEFT_REDUCTION
7970 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7971 && (cond_fn == IFN_LAST
7972 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7973 OPTIMIZE_FOR_SPEED)))
7975 if (dump_enabled_p ())
7976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7977 "can't operate on partial vectors because"
7978 " no conditional operation is available.\n");
7979 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7981 else if (reduction_type == FOLD_LEFT_REDUCTION
7982 && reduc_fn == IFN_LAST
7983 && !expand_vec_cond_expr_p (vectype_in,
7984 truth_type_for (vectype_in),
7985 SSA_NAME))
7987 if (dump_enabled_p ())
7988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7989 "can't operate on partial vectors because"
7990 " no conditional operation is available.\n");
7991 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7993 else
7994 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7995 vectype_in, NULL);
7997 return true;
8000 /* STMT_INFO is a dot-product reduction whose multiplication operands
8001 have different signs. Emit a sequence to emulate the operation
8002 using a series of signed DOT_PROD_EXPRs and return the last
8003 statement generated. VEC_DEST is the result of the vector operation
8004 and VOP lists its inputs. */
8006 static gassign *
8007 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8008 gimple_stmt_iterator *gsi, tree vec_dest,
8009 tree vop[3])
8011 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8012 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8013 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8014 gimple *new_stmt;
8016 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8017 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8018 std::swap (vop[0], vop[1]);
8020 /* Convert all inputs to signed types. */
8021 for (int i = 0; i < 3; ++i)
8022 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8024 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8025 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8026 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8027 vop[i] = tmp;
8030 /* In the comments below we assume 8-bit inputs for simplicity,
8031 but the approach works for any full integer type. */
8033 /* Create a vector of -128. */
8034 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8035 tree min_narrow = build_vector_from_val (narrow_vectype,
8036 min_narrow_elttype);
8038 /* Create a vector of 64. */
8039 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8040 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8041 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8043 /* Emit: SUB_RES = VOP[0] - 128. */
8044 tree sub_res = make_ssa_name (narrow_vectype);
8045 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8046 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8048 /* Emit:
8050 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8051 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8052 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8054 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8055 Doing the two 64 * y steps first allows more time to compute x. */
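/* As a quick check of the identity (hypothetical values): x = 200,
   y = -3 gives (200 - 128) * -3 + 64 * -3 + 64 * -3
   = -216 - 192 - 192 = -600 = 200 * -3.  */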
8056 tree stage1 = make_ssa_name (wide_vectype);
8057 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8058 vop[1], half_narrow, vop[2]);
8059 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8061 tree stage2 = make_ssa_name (wide_vectype);
8062 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8063 vop[1], half_narrow, stage1);
8064 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8066 tree stage3 = make_ssa_name (wide_vectype);
8067 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8068 sub_res, vop[1], stage2);
8069 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8071 /* Convert STAGE3 to the reduction type. */
8072 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
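/* The mixed-sign case emulated above typically comes from source like
   (illustrative example):

     unsigned char u[N];
     signed char s[N];
     int acc = 0;
     for (int i = 0; i < N; ++i)
       acc += u[i] * s[i];

   where only a signed DOT_PROD_EXPR is available on the target.  */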
8075 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8076 value. */
8078 bool
8079 vect_transform_reduction (loop_vec_info loop_vinfo,
8080 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8081 gimple **vec_stmt, slp_tree slp_node)
8083 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8084 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8085 int i;
8086 int ncopies;
8087 int vec_num;
8089 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8090 gcc_assert (reduc_info->is_reduc_info);
8092 if (nested_in_vect_loop_p (loop, stmt_info))
8094 loop = loop->inner;
8095 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8098 gimple_match_op op;
8099 if (!gimple_extract_op (stmt_info->stmt, &op))
8100 gcc_unreachable ();
8102 /* All uses but the last are expected to be defined in the loop.
8103 The last use is the reduction variable. In case of nested cycle this
8104 assumption is not true: we use reduc_index to record the index of the
8105 reduction variable. */
8106 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8107 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8108 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8109 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8111 if (slp_node)
8113 ncopies = 1;
8114 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8116 else
8118 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8119 vec_num = 1;
8122 code_helper code = canonicalize_code (op.code, op.type);
8123 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8124 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8125 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8127 /* Transform. */
8128 tree new_temp = NULL_TREE;
8129 auto_vec<tree> vec_oprnds0;
8130 auto_vec<tree> vec_oprnds1;
8131 auto_vec<tree> vec_oprnds2;
8132 tree def0;
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8137 /* FORNOW: Multiple types are not supported for condition. */
8138 if (code == COND_EXPR)
8139 gcc_assert (ncopies == 1);
8141 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8143 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8144 if (reduction_type == FOLD_LEFT_REDUCTION)
8146 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8147 gcc_assert (code.is_tree_code ());
8148 return vectorize_fold_left_reduction
8149 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8150 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
8153 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8154 gcc_assert (single_defuse_cycle
8155 || code == DOT_PROD_EXPR
8156 || code == WIDEN_SUM_EXPR
8157 || code == SAD_EXPR);
8159 /* Create the destination vector */
8160 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8161 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8163 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8164 single_defuse_cycle && reduc_index == 0
8165 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8166 single_defuse_cycle && reduc_index == 1
8167 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8168 op.num_ops == 3
8169 && !(single_defuse_cycle && reduc_index == 2)
8170 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8171 if (single_defuse_cycle)
8173 gcc_assert (!slp_node);
8174 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8175 op.ops[reduc_index],
8176 reduc_index == 0 ? &vec_oprnds0
8177 : (reduc_index == 1 ? &vec_oprnds1
8178 : &vec_oprnds2));
8181 bool emulated_mixed_dot_prod
8182 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8183 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8185 gimple *new_stmt;
8186 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8187 if (masked_loop_p && !mask_by_cond_expr)
8189 /* No conditional ifns have been defined for dot-product yet. */
8190 gcc_assert (code != DOT_PROD_EXPR);
8192 /* Make sure that the reduction accumulator is vop[0]. */
8193 if (reduc_index == 1)
8195 gcc_assert (commutative_binary_op_p (code, op.type));
8196 std::swap (vop[0], vop[1]);
8198 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8199 vec_num * ncopies, vectype_in, i);
8200 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8201 vop[0], vop[1], vop[0]);
8202 new_temp = make_ssa_name (vec_dest, call);
8203 gimple_call_set_lhs (call, new_temp);
8204 gimple_call_set_nothrow (call, true);
8205 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8206 new_stmt = call;
8208 else
8210 if (op.num_ops == 3)
8211 vop[2] = vec_oprnds2[i];
8213 if (masked_loop_p && mask_by_cond_expr)
8215 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8216 vec_num * ncopies, vectype_in, i);
8217 build_vect_cond_expr (code, vop, mask, gsi);
8220 if (emulated_mixed_dot_prod)
8221 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8222 vec_dest, vop);
8223 else if (code.is_internal_fn ())
8224 new_stmt = gimple_build_call_internal (internal_fn (code),
8225 op.num_ops,
8226 vop[0], vop[1], vop[2]);
8227 else
8228 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8229 vop[0], vop[1], vop[2]);
8230 new_temp = make_ssa_name (vec_dest, new_stmt);
8231 gimple_set_lhs (new_stmt, new_temp);
8232 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8235 if (slp_node)
8236 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
8237 else if (single_defuse_cycle
8238 && i < ncopies - 1)
8240 if (reduc_index == 0)
8241 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8242 else if (reduc_index == 1)
8243 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8244 else if (reduc_index == 2)
8245 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8247 else
8248 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8251 if (!slp_node)
8252 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8254 return true;
8257 /* Transform phase of a cycle PHI. */
8259 bool
8260 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8261 stmt_vec_info stmt_info, gimple **vec_stmt,
8262 slp_tree slp_node, slp_instance slp_node_instance)
8264 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8265 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8266 int i;
8267 int ncopies;
8268 int j;
8269 bool nested_cycle = false;
8270 int vec_num;
8272 if (nested_in_vect_loop_p (loop, stmt_info))
8274 loop = loop->inner;
8275 nested_cycle = true;
8278 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8279 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8280 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8281 gcc_assert (reduc_info->is_reduc_info);
8283 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8284 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8285 /* Leave the scalar phi in place. */
8286 return true;
8288 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8289 /* For a nested cycle we do not fill the above. */
8290 if (!vectype_in)
8291 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8292 gcc_assert (vectype_in);
8294 if (slp_node)
8296 /* The size vect_schedule_slp_instance computes is off for us. */
8297 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8298 * SLP_TREE_LANES (slp_node), vectype_in);
8299 ncopies = 1;
8301 else
8303 vec_num = 1;
8304 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8307 /* Check whether we should use a single PHI node and accumulate
8308 vectors to one before the backedge. */
8309 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8310 ncopies = 1;
8312 /* Create the destination vector */
8313 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8314 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8315 vectype_out);
8317 /* Get the loop-entry arguments. */
8318 tree vec_initial_def = NULL_TREE;
8319 auto_vec<tree> vec_initial_defs;
8320 if (slp_node)
8322 vec_initial_defs.reserve (vec_num);
8323 if (nested_cycle)
8325 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8326 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8327 &vec_initial_defs);
8329 else
8331 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8332 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8333 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8335 unsigned int num_phis = stmts.length ();
8336 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8337 num_phis = 1;
8338 initial_values.reserve (num_phis);
8339 for (unsigned int i = 0; i < num_phis; ++i)
8341 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8342 initial_values.quick_push (vect_phi_initial_value (this_phi));
8344 if (vec_num == 1)
8345 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8346 if (!initial_values.is_empty ())
8348 tree initial_value
8349 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8350 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8351 tree neutral_op
8352 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8353 code, initial_value);
8354 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8355 &vec_initial_defs, vec_num,
8356 stmts.length (), neutral_op);
8360 else
8362 /* Get at the scalar def before the loop, that defines the initial
8363 value of the reduction variable. */
8364 tree initial_def = vect_phi_initial_value (phi);
8365 reduc_info->reduc_initial_values.safe_push (initial_def);
8366 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8367 and we can't use zero for induc_val, use initial_def. Similarly
8368 for REDUC_MIN and initial_def larger than the base. */
8369 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8371 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8372 if (TREE_CODE (initial_def) == INTEGER_CST
8373 && !integer_zerop (induc_val)
8374 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8375 && tree_int_cst_lt (initial_def, induc_val))
8376 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8377 && tree_int_cst_lt (induc_val, initial_def))))
8379 induc_val = initial_def;
8380 /* Communicate to the epilogue generation that we used
8381 the initial_def. */
8382 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8384 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8386 else if (nested_cycle)
8388 /* Do not use an adjustment def as that case is not supported
8389 correctly if ncopies is not one. */
8390 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8391 ncopies, initial_def,
8392 &vec_initial_defs);
8394 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8395 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8396 /* Fill the initial vector with the initial scalar value. */
8397 vec_initial_def
8398 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8399 initial_def, initial_def);
8400 else
8402 if (ncopies == 1)
8403 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8404 if (!reduc_info->reduc_initial_values.is_empty ())
8406 initial_def = reduc_info->reduc_initial_values[0];
8407 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8408 tree neutral_op
8409 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8410 code, initial_def);
8411 gcc_assert (neutral_op);
8412 /* Try to simplify the vector initialization by applying an
8413 adjustment after the reduction has been performed. */
8414 if (!reduc_info->reused_accumulator
8415 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8416 && !operand_equal_p (neutral_op, initial_def))
8418 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8419 = initial_def;
8420 initial_def = neutral_op;
8422 vec_initial_def
8423 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8424 initial_def, neutral_op);
8429 if (vec_initial_def)
8431 vec_initial_defs.create (ncopies);
8432 for (i = 0; i < ncopies; ++i)
8433 vec_initial_defs.quick_push (vec_initial_def);
8436 if (auto *accumulator = reduc_info->reused_accumulator)
8438 tree def = accumulator->reduc_input;
8439 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8441 unsigned int nreduc;
8442 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8443 (TREE_TYPE (def)),
8444 TYPE_VECTOR_SUBPARTS (vectype_out),
8445 &nreduc);
8446 gcc_assert (res);
8447 gimple_seq stmts = NULL;
8448 /* Reduce the single vector to a smaller one. */
8449 if (nreduc != 1)
8451 /* Perform the reduction in the appropriate type. */
8452 tree rvectype = vectype_out;
8453 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8454 TREE_TYPE (TREE_TYPE (def))))
8455 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8456 TYPE_VECTOR_SUBPARTS
8457 (vectype_out));
8458 def = vect_create_partial_epilog (def, rvectype,
8459 STMT_VINFO_REDUC_CODE
8460 (reduc_info),
8461 &stmts);
8463 /* The epilogue loop might use a different vector mode, like
8464 VNx2DI vs. V2DI. */
8465 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8467 tree reduc_type = build_vector_type_for_mode
8468 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8469 def = gimple_convert (&stmts, reduc_type, def);
8471 /* Adjust the input so we pick up the partially reduced value
8472 for the skip edge in vect_create_epilog_for_reduction. */
8473 accumulator->reduc_input = def;
8474 /* And the reduction could be carried out using a different sign. */
8475 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8476 def = gimple_convert (&stmts, vectype_out, def);
8477 if (loop_vinfo->main_loop_edge)
8479 /* While we'd like to insert on the edge this will split
8480 blocks and disturb bookkeeping, we also will eventually
8481 need this on the skip edge. Rely on sinking to
8482 fixup optimal placement and insert in the pred. */
8483 gimple_stmt_iterator gsi
8484 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8485 /* Insert before a cond that eventually skips the
8486 epilogue. */
8487 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8488 gsi_prev (&gsi);
8489 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8491 else
8492 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8493 stmts);
8495 if (loop_vinfo->main_loop_edge)
8496 vec_initial_defs[0]
8497 = vect_get_main_loop_result (loop_vinfo, def,
8498 vec_initial_defs[0]);
8499 else
8500 vec_initial_defs.safe_push (def);
8503 /* Generate the reduction PHIs upfront. */
8504 for (i = 0; i < vec_num; i++)
8506 tree vec_init_def = vec_initial_defs[i];
8507 for (j = 0; j < ncopies; j++)
8509 /* Create the reduction-phi that defines the reduction
8510 operand. */
8511 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8513 /* Set the loop-entry arg of the reduction-phi. */
8514 if (j != 0 && nested_cycle)
8515 vec_init_def = vec_initial_defs[j];
8516 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8517 UNKNOWN_LOCATION);
8519 /* The loop-latch arg is set in epilogue processing. */
8521 if (slp_node)
8522 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8523 else
8525 if (j == 0)
8526 *vec_stmt = new_phi;
8527 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8532 return true;
8535 /* Vectorizes LC PHIs. */
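 /* A loop-closed (LC) PHI has exactly one argument; vectorizing it simply
    creates a single-argument vector PHI in the same block for each vector
    copy, fed by the vectorized (or invariant) defs of that argument.  */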
8537 bool
8538 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8539 stmt_vec_info stmt_info, gimple **vec_stmt,
8540 slp_tree slp_node)
8542 if (!loop_vinfo
8543 || !is_a <gphi *> (stmt_info->stmt)
8544 || gimple_phi_num_args (stmt_info->stmt) != 1)
8545 return false;
8547 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8548 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8549 return false;
8551 if (!vec_stmt) /* transformation not required. */
8553 /* Deal with copies from externs or constants that are disguised as
8554 loop-closed PHI nodes (PR97886). */
8555 if (slp_node
8556 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8557 SLP_TREE_VECTYPE (slp_node)))
8559 if (dump_enabled_p ())
8560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8561 "incompatible vector types for invariants\n");
8562 return false;
8564 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8565 return true;
8568 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8569 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8570 basic_block bb = gimple_bb (stmt_info->stmt);
8571 edge e = single_pred_edge (bb);
8572 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8573 auto_vec<tree> vec_oprnds;
8574 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8575 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8576 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8577 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8579 /* Create the vectorized LC PHI node. */
8580 gphi *new_phi = create_phi_node (vec_dest, bb);
8581 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8582 if (slp_node)
8583 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8584 else
8585 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8587 if (!slp_node)
8588 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8590 return true;
8593 /* Vectorizes PHIs. */
8595 bool
8596 vectorizable_phi (vec_info *,
8597 stmt_vec_info stmt_info, gimple **vec_stmt,
8598 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8600 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8601 return false;
8603 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8604 return false;
8606 tree vectype = SLP_TREE_VECTYPE (slp_node);
8608 if (!vec_stmt) /* transformation not required. */
8610 slp_tree child;
8611 unsigned i;
8612 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8613 if (!child)
8615 if (dump_enabled_p ())
8616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8617 "PHI node with unvectorized backedge def\n");
8618 return false;
8620 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8622 if (dump_enabled_p ())
8623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8624 "incompatible vector types for invariants\n");
8625 return false;
8627 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8628 && !useless_type_conversion_p (vectype,
8629 SLP_TREE_VECTYPE (child)))
8631 /* With bools we can have mask and non-mask precision vectors
8632 or different non-mask precisions.  While pattern recog is
8633 supposed to guarantee consistency here, bugs in it can cause
8634 mismatches (PR103489 and PR103800 for example).
8635 Deal with them here instead of ICEing later. */
8636 if (dump_enabled_p ())
8637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8638 "incompatible vector type setup from "
8639 "bool pattern detection\n");
8640 return false;
8643 /* For single-argument PHIs assume coalescing which means zero cost
8644 for the scalar and the vector PHIs. This avoids artificially
8645 favoring the vector path (but may pessimize it in some cases). */
8646 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8647 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8648 vector_stmt, stmt_info, vectype, 0, vect_body);
8649 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8650 return true;
8653 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8654 basic_block bb = gimple_bb (stmt_info->stmt);
8655 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8656 auto_vec<gphi *> new_phis;
8657 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8659 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8661 /* Skip not yet vectorized defs. */
8662 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8663 && SLP_TREE_VEC_STMTS (child).is_empty ())
8664 continue;
8666 auto_vec<tree> vec_oprnds;
8667 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8668 if (!new_phis.exists ())
8670 new_phis.create (vec_oprnds.length ());
8671 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8673 /* Create the vectorized PHI node. */
8674 new_phis.quick_push (create_phi_node (vec_dest, bb));
8675 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8678 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8679 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8680 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8682 /* We should have at least one already vectorized child. */
8683 gcc_assert (new_phis.exists ());
8685 return true;
8688 /* Vectorizes first order recurrences. An overview of the transformation
8689 is described below. Suppose we have the following loop.
8691 int t = 0;
8692 for (int i = 0; i < n; ++i)
8694 b[i] = a[i] - t;
8695 t = a[i];
8698 There is a first-order recurrence on 't'. For this loop, the scalar IR
8699 looks (simplified) like:
8701 scalar.preheader:
8702 init = 0;
8704 scalar.body:
8705 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8706 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8707 _1 = a[i]
8708 b[i] = _1 - _2
8709 if (i < n) goto scalar.body
8711 In this example, _2 is a recurrence because its value depends on the
8712 previous iteration. We vectorize this as (VF = 4)
8714 vector.preheader:
8715 vect_init = vect_cst(..., ..., ..., 0)
8717 vector.body
8718 i = PHI <0(vector.preheader), i+4(vector.body)>
8719 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8720 vect_2 = a[i, i+1, i+2, i+3];
8721 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8722 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8723 if (..) goto vector.body
8725 In this function, vectorizable_recurr, we code generate both the
8726 vector PHI node and the permute since those together compute the
8727 vectorized value of the scalar PHI. We do not yet have the
8728 backedge value to fill in there nor into the vec_perm. Those
8729 are filled in maybe_set_vectorized_backedge_value and
8730 vect_schedule_scc.
8732 TODO: Since the scalar loop does not have a use of the recurrence
8733 outside of the loop, the natural way to implement peeling via
8734 vectorizing the live value doesn't work. For now peeling of loops
8735 with a recurrence is not implemented. For SLP the supported cases
8736 are restricted to those requiring a single vector recurrence PHI. */
8738 bool
8739 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8740 gimple **vec_stmt, slp_tree slp_node,
8741 stmt_vector_for_cost *cost_vec)
8743 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8744 return false;
8746 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8748 /* So far we only support first-order recurrence auto-vectorization. */
8749 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8750 return false;
8752 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8753 unsigned ncopies;
8754 if (slp_node)
8755 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8756 else
8757 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8758 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8759 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8760 /* We need to be able to make progress with a single vector. */
8761 if (maybe_gt (dist * 2, nunits))
8763 if (dump_enabled_p ())
8764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8765 "first order recurrence exceeds half of "
8766 "a vector\n");
8767 return false;
8770 /* First-order recurrence autovectorization needs to handle permutation
8771 with indices = [nunits-1, nunits, nunits+1, ...]. */
8772 vec_perm_builder sel (nunits, 1, 3);
8773 for (int i = 0; i < 3; ++i)
8774 sel.quick_push (nunits - dist + i);
8775 vec_perm_indices indices (sel, 2, nunits);
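 /* For instance, with a single lane (dist == 1) and nunits == 4 this is
    { 3, 4, 5, 6 }: the last element of the previous vector followed by
    the first three elements of the current one.  With two SLP lanes and
    nunits == 8 it is { 6, 7, 8, 9, 10, 11, 12, 13 }.  */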
8777 if (!vec_stmt) /* transformation not required. */
8779 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8780 indices))
8781 return false;
8783 if (slp_node)
8785 /* We eventually need to set a vector type on invariant
8786 arguments. */
8787 unsigned j;
8788 slp_tree child;
8789 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8790 if (!vect_maybe_update_slp_op_vectype
8791 (child, SLP_TREE_VECTYPE (slp_node)))
8793 if (dump_enabled_p ())
8794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8795 "incompatible vector types for "
8796 "invariants\n");
8797 return false;
8800 /* The recurrence costs the initialization vector and one permute
8801 for each copy. */
8802 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8803 stmt_info, 0, vect_prologue);
8804 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8805 stmt_info, 0, vect_body);
8806 if (dump_enabled_p ())
8807 dump_printf_loc (MSG_NOTE, vect_location,
8808 "vectorizable_recurr: inside_cost = %d, "
8809 "prologue_cost = %d .\n", inside_cost,
8810 prologue_cost);
8812 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8813 return true;
8816 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8817 basic_block bb = gimple_bb (phi);
8818 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8819 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8821 gimple_seq stmts = NULL;
8822 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8823 gsi_insert_seq_on_edge_immediate (pe, stmts);
8825 tree vec_init = build_vector_from_val (vectype, preheader);
8826 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8828 /* Create the vectorized first-order PHI node. */
8829 tree vec_dest = vect_get_new_vect_var (vectype,
8830 vect_simple_var, "vec_recur_");
8831 gphi *new_phi = create_phi_node (vec_dest, bb);
8832 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8834 /* Insert the shuffles for the first-order recurrence autovectorization:
8835 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8836 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8838 /* Insert the required permute after the latch definition. The
8839 second and later operands are tentative and will be updated when we have
8840 vectorized the latch definition. */
8841 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8842 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8843 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8844 gsi_next (&gsi2);
8846 for (unsigned i = 0; i < ncopies; ++i)
8848 vec_dest = make_ssa_name (vectype);
8849 gassign *vperm
8850 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8851 i == 0 ? gimple_phi_result (new_phi) : NULL,
8852 NULL, perm);
8853 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8855 if (slp_node)
8856 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8857 else
8858 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8861 if (!slp_node)
8862 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8863 return true;
8866 /* Return true if VECTYPE represents a vector that requires lowering
8867 by the vector lowering pass. */
8869 bool
8870 vect_emulated_vector_p (tree vectype)
8872 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8873 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8874 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8877 /* Return true if we can emulate CODE on an integer mode representation
8878 of a vector. */
8880 bool
8881 vect_can_vectorize_without_simd_p (tree_code code)
8883 switch (code)
8885 case PLUS_EXPR:
8886 case MINUS_EXPR:
8887 case NEGATE_EXPR:
8888 case BIT_AND_EXPR:
8889 case BIT_IOR_EXPR:
8890 case BIT_XOR_EXPR:
8891 case BIT_NOT_EXPR:
8892 return true;
8894 default:
8895 return false;
8899 /* Likewise, but taking a code_helper. */
8901 bool
8902 vect_can_vectorize_without_simd_p (code_helper code)
8904 return (code.is_tree_code ()
8905 && vect_can_vectorize_without_simd_p (tree_code (code)));
8908 /* Create vector init for vectorized iv. */
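 /* For an IV with initial value X and step S and a vector of four lanes
    this produces
      mult:     [X, X*S, X*S^2, X*S^3]
      shr/shl:  [X, X>>S, X>>2*S, X>>3*S]  (resp. <<)
      neg:      [X, -X, X, -X].  */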
8909 static tree
8910 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8911 tree step_expr, poly_uint64 nunits,
8912 tree vectype,
8913 enum vect_induction_op_type induction_type)
8915 unsigned HOST_WIDE_INT const_nunits;
8916 tree vec_shift, vec_init, new_name;
8917 unsigned i;
8918 tree itype = TREE_TYPE (vectype);
8920 /* iv_loop is the loop to be vectorized.  Create the vector of initial
8921 lane values for the nonlinear IV (S = step_expr, X = init_expr). */
8922 new_name = gimple_convert (stmts, itype, init_expr);
8923 switch (induction_type)
8925 case vect_step_op_shr:
8926 case vect_step_op_shl:
8927 /* Build the initial value from the shift expression. */
8928 vec_init = gimple_build_vector_from_val (stmts,
8929 vectype,
8930 new_name);
8931 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8932 build_zero_cst (itype), step_expr);
8933 vec_init = gimple_build (stmts,
8934 (induction_type == vect_step_op_shr
8935 ? RSHIFT_EXPR : LSHIFT_EXPR),
8936 vectype, vec_init, vec_shift);
8937 break;
8939 case vect_step_op_neg:
8941 vec_init = gimple_build_vector_from_val (stmts,
8942 vectype,
8943 new_name);
8944 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8945 vectype, vec_init);
8946 /* The encoding has 2 interleaved stepped patterns. */
8947 vec_perm_builder sel (nunits, 2, 3);
8948 sel.quick_grow (6);
8949 for (i = 0; i < 3; i++)
8951 sel[2 * i] = i;
8952 sel[2 * i + 1] = i + nunits;
8954 vec_perm_indices indices (sel, 2, nunits);
8955 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8956 fail when vec_init is a const vector.  In that situation the vec_perm is not
8957 really needed. */
8958 tree perm_mask_even
8959 = vect_gen_perm_mask_any (vectype, indices);
8960 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8961 vectype,
8962 vec_init, vec_neg,
8963 perm_mask_even);
8965 break;
8967 case vect_step_op_mul:
8969 /* Use unsigned mult to avoid undefined behavior on signed integer overflow. */
8970 gcc_assert (nunits.is_constant (&const_nunits));
8971 tree utype = unsigned_type_for (itype);
8972 tree uvectype = build_vector_type (utype,
8973 TYPE_VECTOR_SUBPARTS (vectype));
8974 new_name = gimple_convert (stmts, utype, new_name);
8975 vec_init = gimple_build_vector_from_val (stmts,
8976 uvectype,
8977 new_name);
8978 tree_vector_builder elts (uvectype, const_nunits, 1);
8979 tree elt_step = build_one_cst (utype);
8981 elts.quick_push (elt_step);
8982 for (i = 1; i < const_nunits; i++)
8984 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
8985 elt_step = gimple_build (stmts, MULT_EXPR,
8986 utype, elt_step, step_expr);
8987 elts.quick_push (elt_step);
8989 /* Create the multiplier vector [1, step, pow (step, 2), ...,
8990 pow (step, nunits-1)]. */
8991 tree vec_mul = gimple_build_vector (stmts, &elts);
8992 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8993 vec_init, vec_mul);
8994 vec_init = gimple_convert (stmts, vectype, vec_init);
8996 break;
8998 default:
8999 gcc_unreachable ();
9002 return vec_init;
9005 /* Peel init_expr by skip_niter for induction_type. */
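 /* For example, skipping an odd number of iterations of a neg IV flips
    the sign of INIT_EXPR; a mult IV is multiplied by pow (step, skip);
    shift IVs are shifted by step * skip, with an out-of-range shift
    amount folded to the saturated result (0, or >> (prec - 1) for an
    arithmetic right shift).  */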
9006 tree
9007 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9008 tree skip_niters, tree step_expr,
9009 enum vect_induction_op_type induction_type)
9011 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9012 tree type = TREE_TYPE (init_expr);
9013 unsigned prec = TYPE_PRECISION (type);
9014 switch (induction_type)
9016 case vect_step_op_neg:
9017 if (TREE_INT_CST_LOW (skip_niters) % 2)
9018 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9019 /* else no change. */
9020 break;
9022 case vect_step_op_shr:
9023 case vect_step_op_shl:
9024 skip_niters = gimple_convert (stmts, type, skip_niters);
9025 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9026 /* When the shift amount is >= the precision, we need to avoid UB.
9027 In the original loop there is no UB, and according to the semantics,
9028 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
9029 if (!tree_fits_uhwi_p (step_expr)
9030 || tree_to_uhwi (step_expr) >= prec)
9032 if (induction_type == vect_step_op_shl
9033 || TYPE_UNSIGNED (type))
9034 init_expr = build_zero_cst (type);
9035 else
9036 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9037 init_expr,
9038 wide_int_to_tree (type, prec - 1));
9040 else
9041 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9042 ? RSHIFT_EXPR : LSHIFT_EXPR),
9043 type, init_expr, step_expr);
9044 break;
9046 case vect_step_op_mul:
9048 tree utype = unsigned_type_for (type);
9049 init_expr = gimple_convert (stmts, utype, init_expr);
9050 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9051 wide_int begin = wi::to_wide (step_expr);
9052 for (unsigned i = 0; i != skipn - 1; i++)
9053 begin = wi::mul (begin, wi::to_wide (step_expr));
9054 tree mult_expr = wide_int_to_tree (utype, begin);
9055 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9056 init_expr = gimple_convert (stmts, type, init_expr);
9058 break;
9060 default:
9061 gcc_unreachable ();
9064 return init_expr;
9067 /* Create vector step for vectorized iv. */
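 /* The per-vector-iteration step is pow (step, VF) for a mult IV,
    step * VF for the shift IVs, and NULL for a neg IV, which needs
    no update.  */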
9068 static tree
9069 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9070 poly_uint64 vf,
9071 enum vect_induction_op_type induction_type)
9073 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9074 tree new_name = NULL;
9075 /* Step should be pow (step, vf) for mult induction. */
9076 if (induction_type == vect_step_op_mul)
9078 gcc_assert (vf.is_constant ());
9079 wide_int begin = wi::to_wide (step_expr);
9081 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9082 begin = wi::mul (begin, wi::to_wide (step_expr));
9084 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9086 else if (induction_type == vect_step_op_neg)
9087 /* Do nothing. */
9089 else
9090 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9091 expr, step_expr);
9092 return new_name;
9095 static tree
9096 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9097 stmt_vec_info stmt_info,
9098 tree new_name, tree vectype,
9099 enum vect_induction_op_type induction_type)
9101 /* No step is needed for neg induction. */
9102 if (induction_type == vect_step_op_neg)
9103 return NULL;
9105 tree t = unshare_expr (new_name);
9106 gcc_assert (CONSTANT_CLASS_P (new_name)
9107 || TREE_CODE (new_name) == SSA_NAME);
9108 tree new_vec = build_vector_from_val (vectype, t);
9109 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9110 new_vec, vectype, NULL);
9111 return vec_step;
9114 /* Update vectorized iv with vect_step, induc_def is init. */
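 /* A mult IV is advanced by multiplying with VEC_STEP (in the
    corresponding unsigned type to avoid overflow UB), shift IVs by
    shifting by VEC_STEP, and a neg IV is returned unchanged.  */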
9115 static tree
9116 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9117 tree induc_def, tree vec_step,
9118 enum vect_induction_op_type induction_type)
9120 tree vec_def = induc_def;
9121 switch (induction_type)
9123 case vect_step_op_mul:
9125 /* Use unsigned mult to avoid undefined behavior on signed integer overflow. */
9126 tree uvectype
9127 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9128 TYPE_VECTOR_SUBPARTS (vectype));
9129 vec_def = gimple_convert (stmts, uvectype, vec_def);
9130 vec_step = gimple_convert (stmts, uvectype, vec_step);
9131 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9132 vec_def, vec_step);
9133 vec_def = gimple_convert (stmts, vectype, vec_def);
9135 break;
9137 case vect_step_op_shr:
9138 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9139 vec_def, vec_step);
9140 break;
9142 case vect_step_op_shl:
9143 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9144 vec_def, vec_step);
9145 break;
9146 case vect_step_op_neg:
9147 vec_def = induc_def;
9148 /* Do nothing. */
9149 break;
9150 default:
9151 gcc_unreachable ();
9154 return vec_def;
9158 /* Function vectorizable_nonlinear_induction
9160 Check if STMT_INFO performs a nonlinear induction computation that can be
9161 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9162 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9163 basic block.
9164 Return true if STMT_INFO is vectorizable in this way. */
9166 static bool
9167 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9168 stmt_vec_info stmt_info,
9169 gimple **vec_stmt, slp_tree slp_node,
9170 stmt_vector_for_cost *cost_vec)
9172 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9173 unsigned ncopies;
9174 bool nested_in_vect_loop = false;
9175 class loop *iv_loop;
9176 tree vec_def;
9177 edge pe = loop_preheader_edge (loop);
9178 basic_block new_bb;
9179 tree vec_init, vec_step;
9180 tree new_name;
9181 gimple *new_stmt;
9182 gphi *induction_phi;
9183 tree induc_def, vec_dest;
9184 tree init_expr, step_expr;
9185 tree niters_skip;
9186 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9187 unsigned i;
9188 gimple_stmt_iterator si;
9190 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9192 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9193 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9194 enum vect_induction_op_type induction_type
9195 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9197 gcc_assert (induction_type > vect_step_op_add);
9199 if (slp_node)
9200 ncopies = 1;
9201 else
9202 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9203 gcc_assert (ncopies >= 1);
9205 /* FORNOW. Only handle nonlinear induction in the same loop. */
9206 if (nested_in_vect_loop_p (loop, stmt_info))
9208 if (dump_enabled_p ())
9209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9210 "nonlinear induction in nested loop.\n");
9211 return false;
9214 iv_loop = loop;
9215 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9217 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9218 update for each iv and a permutation to generate wanted vector iv. */
9219 if (slp_node)
9221 if (dump_enabled_p ())
9222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9223 "SLP induction not supported for nonlinear"
9224 " induction.\n");
9225 return false;
9228 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9230 if (dump_enabled_p ())
9231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9232 "floating point nonlinear induction vectorization"
9233 " not supported.\n");
9234 return false;
9237 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9238 init_expr = vect_phi_initial_value (phi);
9239 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9240 && TREE_CODE (step_expr) == INTEGER_CST);
9241 /* step_expr should have the same type as init_expr,
9242 i.e. for uint64 a >> 1 the step is an int but a vector<uint64> shift is used. */
9243 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9245 if (TREE_CODE (init_expr) == INTEGER_CST)
9246 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9247 else
9248 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9249 TREE_TYPE (init_expr)));
9251 switch (induction_type)
9253 case vect_step_op_neg:
9254 if (TREE_CODE (init_expr) != INTEGER_CST
9255 && TREE_CODE (init_expr) != REAL_CST)
9257 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9258 if (!directly_supported_p (NEGATE_EXPR, vectype))
9259 return false;
9261 /* The encoding has 2 interleaved stepped patterns. */
9262 vec_perm_builder sel (nunits, 2, 3);
9263 machine_mode mode = TYPE_MODE (vectype);
9264 sel.quick_grow (6);
9265 for (i = 0; i < 3; i++)
9267 sel[i * 2] = i;
9268 sel[i * 2 + 1] = i + nunits;
9270 vec_perm_indices indices (sel, 2, nunits);
9271 if (!can_vec_perm_const_p (mode, mode, indices))
9272 return false;
9274 break;
9276 case vect_step_op_mul:
9278 /* Check for backend support of MULT_EXPR. */
9279 if (!directly_supported_p (MULT_EXPR, vectype))
9280 return false;
9282 /* ?? How to construct the vector step for a variable-length vector:
9283 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
9284 if (!vf.is_constant ())
9285 return false;
9287 break;
9289 case vect_step_op_shr:
9290 /* Check for backend support of RSHIFT_EXPR. */
9291 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9292 return false;
9294 /* Don't shift more than the type precision, to avoid UB. */
9295 if (!tree_fits_uhwi_p (step_expr)
9296 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9297 TYPE_PRECISION (TREE_TYPE (init_expr))))
9298 return false;
9299 break;
9301 case vect_step_op_shl:
9302 /* Check for backend support of LSHIFT_EXPR. */
9303 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9304 return false;
9306 /* Don't shift more than the type precision, to avoid UB. */
9307 if (!tree_fits_uhwi_p (step_expr)
9308 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9309 TYPE_PRECISION (TREE_TYPE (init_expr))))
9310 return false;
9312 break;
9314 default:
9315 gcc_unreachable ();
9318 if (!vec_stmt) /* transformation not required. */
9320 unsigned inside_cost = 0, prologue_cost = 0;
9321 /* loop cost for vec_loop. */
9323 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9324 stmt_info, 0, vect_body);
9326 /* Neg induction doesn't have any inside_cost. */
9328 if (induction_type == vect_step_op_neg)
9329 inside_cost = 0;
9331 /* prologue cost for vec_init and vec_step. */
9332 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9333 stmt_info, 0, vect_prologue);
9335 if (dump_enabled_p ())
9336 dump_printf_loc (MSG_NOTE, vect_location,
9337 "vect_model_induction_cost: inside_cost = %d, "
9338 "prologue_cost = %d. \n", inside_cost,
9339 prologue_cost);
9341 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9342 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9343 return true;
9346 /* Transform. */
9348 /* Compute a vector variable, initialized with the first VF values of
9349 the induction variable. E.g., for an iv with IV_PHI='X' and
9350 evolution S, for a vector of 4 units, we want to compute:
9351 [X, X + S, X + 2*S, X + 3*S]. */
9353 if (dump_enabled_p ())
9354 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9356 pe = loop_preheader_edge (iv_loop);
9357 /* Find the first insertion point in the BB. */
9358 basic_block bb = gimple_bb (phi);
9359 si = gsi_after_labels (bb);
9361 gimple_seq stmts = NULL;
9363 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9364 /* If we are using the loop mask to "peel" for alignment then we need
9365 to adjust the start value here. */
9366 if (niters_skip != NULL_TREE)
9367 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9368 step_expr, induction_type);
9370 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9371 step_expr, nunits, vectype,
9372 induction_type);
9373 if (stmts)
9375 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9376 gcc_assert (!new_bb);
9379 stmts = NULL;
9380 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9381 vf, induction_type);
9382 if (stmts)
9384 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9385 gcc_assert (!new_bb);
9388 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9389 new_name, vectype,
9390 induction_type);
9391 /* Create the following def-use cycle:
9392 loop prolog:
9393 vec_init = ...
9394 vec_step = ...
9395 loop:
9396 vec_iv = PHI <vec_init, vec_loop>
9398 STMT
9400 vec_loop = vec_iv + vec_step; */
9402 /* Create the induction-phi that defines the induction-operand. */
9403 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9404 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9405 induc_def = PHI_RESULT (induction_phi);
9407 /* Create the iv update inside the loop. */
9408 stmts = NULL;
9409 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9410 induc_def, vec_step,
9411 induction_type);
9413 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9414 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9416 /* Set the arguments of the phi node: */
9417 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9418 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9419 UNKNOWN_LOCATION);
9421 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9422 *vec_stmt = induction_phi;
9424 /* If the vectorization factor (VF) is bigger than the number of
9425 elements that we can fit in a vectype (nunits), we have to generate
9426 more than one vector stmt, i.e. we need to "unroll" the
9427 vector stmt by a factor VF/nunits. For more details see documentation
9428 in vectorizable_operation. */
9430 if (ncopies > 1)
9432 stmts = NULL;
9433 /* FORNOW. This restriction should be relaxed. */
9434 gcc_assert (!nested_in_vect_loop);
9436 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9437 nunits, induction_type);
9439 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9440 new_name, vectype,
9441 induction_type);
9442 vec_def = induc_def;
9443 for (i = 1; i < ncopies; i++)
9445 /* vec_i = vec_prev + vec_step. */
9446 stmts = NULL;
9447 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9448 vec_def, vec_step,
9449 induction_type);
9450 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9451 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9452 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9456 if (dump_enabled_p ())
9457 dump_printf_loc (MSG_NOTE, vect_location,
9458 "transform induction: created def-use cycle: %G%G",
9459 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9461 return true;
9464 /* Function vectorizable_induction
9466 Check if STMT_INFO performs an induction computation that can be vectorized.
9467 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9468 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9469 Return true if STMT_INFO is vectorizable in this way. */
9471 bool
9472 vectorizable_induction (loop_vec_info loop_vinfo,
9473 stmt_vec_info stmt_info,
9474 gimple **vec_stmt, slp_tree slp_node,
9475 stmt_vector_for_cost *cost_vec)
9477 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9478 unsigned ncopies;
9479 bool nested_in_vect_loop = false;
9480 class loop *iv_loop;
9481 tree vec_def;
9482 edge pe = loop_preheader_edge (loop);
9483 basic_block new_bb;
9484 tree new_vec, vec_init, vec_step, t;
9485 tree new_name;
9486 gimple *new_stmt;
9487 gphi *induction_phi;
9488 tree induc_def, vec_dest;
9489 tree init_expr, step_expr;
9490 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9491 unsigned i;
9492 tree expr;
9493 gimple_stmt_iterator si;
9494 enum vect_induction_op_type induction_type
9495 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9497 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9498 if (!phi)
9499 return false;
9501 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9502 return false;
9504 /* Make sure it was recognized as induction computation. */
9505 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9506 return false;
9508 /* Handle nonlinear induction in a separate place. */
9509 if (induction_type != vect_step_op_add)
9510 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9511 vec_stmt, slp_node, cost_vec);
9513 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9514 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9516 if (slp_node)
9517 ncopies = 1;
9518 else
9519 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9520 gcc_assert (ncopies >= 1);
9522 /* FORNOW. These restrictions should be relaxed. */
9523 if (nested_in_vect_loop_p (loop, stmt_info))
9525 imm_use_iterator imm_iter;
9526 use_operand_p use_p;
9527 gimple *exit_phi;
9528 edge latch_e;
9529 tree loop_arg;
9531 if (ncopies > 1)
9533 if (dump_enabled_p ())
9534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9535 "multiple types in nested loop.\n");
9536 return false;
9539 exit_phi = NULL;
9540 latch_e = loop_latch_edge (loop->inner);
9541 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9542 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9544 gimple *use_stmt = USE_STMT (use_p);
9545 if (is_gimple_debug (use_stmt))
9546 continue;
9548 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9550 exit_phi = use_stmt;
9551 break;
9554 if (exit_phi)
9556 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9557 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9558 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9560 if (dump_enabled_p ())
9561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9562 "inner-loop induction only used outside "
9563 "of the outer vectorized loop.\n");
9564 return false;
9568 nested_in_vect_loop = true;
9569 iv_loop = loop->inner;
9571 else
9572 iv_loop = loop;
9573 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9575 if (slp_node && !nunits.is_constant ())
9577 /* The current SLP code creates the step value element-by-element. */
9578 if (dump_enabled_p ())
9579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9580 "SLP induction not supported for variable-length"
9581 " vectors.\n");
9582 return false;
9585 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9587 if (dump_enabled_p ())
9588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9589 "floating point induction vectorization disabled\n");
9590 return false;
9593 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9594 gcc_assert (step_expr != NULL_TREE);
9595 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9597 /* Check for backend support of PLUS/MINUS_EXPR. */
9598 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9599 || !directly_supported_p (MINUS_EXPR, step_vectype))
9600 return false;
9602 if (!vec_stmt) /* transformation not required. */
9604 unsigned inside_cost = 0, prologue_cost = 0;
9605 if (slp_node)
9607 /* We eventually need to set a vector type on invariant
9608 arguments. */
9609 unsigned j;
9610 slp_tree child;
9611 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9612 if (!vect_maybe_update_slp_op_vectype
9613 (child, SLP_TREE_VECTYPE (slp_node)))
9615 if (dump_enabled_p ())
9616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9617 "incompatible vector types for "
9618 "invariants\n");
9619 return false;
9621 /* loop cost for vec_loop. */
9622 inside_cost
9623 = record_stmt_cost (cost_vec,
9624 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9625 vector_stmt, stmt_info, 0, vect_body);
9626 /* prologue cost for vec_init (if not nested) and step. */
9627 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9628 scalar_to_vec,
9629 stmt_info, 0, vect_prologue);
9631 else /* if (!slp_node) */
9633 /* loop cost for vec_loop. */
9634 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9635 stmt_info, 0, vect_body);
9636 /* prologue cost for vec_init and vec_step. */
9637 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9638 stmt_info, 0, vect_prologue);
9640 if (dump_enabled_p ())
9641 dump_printf_loc (MSG_NOTE, vect_location,
9642 "vect_model_induction_cost: inside_cost = %d, "
9643 "prologue_cost = %d .\n", inside_cost,
9644 prologue_cost);
9646 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9647 DUMP_VECT_SCOPE ("vectorizable_induction");
9648 return true;
9651 /* Transform. */
9653 /* Compute a vector variable, initialized with the first VF values of
9654 the induction variable. E.g., for an iv with IV_PHI='X' and
9655 evolution S, for a vector of 4 units, we want to compute:
9656 [X, X + S, X + 2*S, X + 3*S]. */
9658 if (dump_enabled_p ())
9659 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9661 pe = loop_preheader_edge (iv_loop);
9662 /* Find the first insertion point in the BB. */
9663 basic_block bb = gimple_bb (phi);
9664 si = gsi_after_labels (bb);
9666 /* For SLP induction we have to generate several IVs as for example
9667 with group size 3 we need
9668 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9669 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9670 if (slp_node)
9672 /* Enforced above. */
9673 unsigned int const_nunits = nunits.to_constant ();
9675 /* The initial values are vectorized, but any lanes > group_size
9676 need adjustment. */
9677 slp_tree init_node
9678 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9680 /* Gather steps. Since we do not vectorize inductions as
9681 cycles we have to reconstruct the step from SCEV data. */
9682 unsigned group_size = SLP_TREE_LANES (slp_node);
9683 tree *steps = XALLOCAVEC (tree, group_size);
9684 tree *inits = XALLOCAVEC (tree, group_size);
9685 stmt_vec_info phi_info;
9686 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9688 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9689 if (!init_node)
9690 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9691 pe->dest_idx);
9694 /* Now generate the IVs. */
9695 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9696 gcc_assert ((const_nunits * nvects) % group_size == 0);
9697 unsigned nivs;
9698 if (nested_in_vect_loop)
9699 nivs = nvects;
9700 else
9702 /* Compute the number of distinct IVs we need. First reduce
9703 group_size if it is a multiple of const_nunits so we get
9704 one IV for a group_size of 4 but const_nunits 2. */
9705 unsigned group_sizep = group_size;
9706 if (group_sizep % const_nunits == 0)
9707 group_sizep = group_sizep / const_nunits;
9708 nivs = least_common_multiple (group_sizep,
9709 const_nunits) / const_nunits;
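 /* E.g. a group_size of 3 with const_nunits 4 needs
    least_common_multiple (3, 4) / 4 == 3 distinct IVs.  */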
9711 tree stept = TREE_TYPE (step_vectype);
9712 tree lupdate_mul = NULL_TREE;
9713 if (!nested_in_vect_loop)
9715 /* The number of iterations covered in one vector iteration. */
9716 unsigned lup_mul = (nvects * const_nunits) / group_size;
9717 lupdate_mul
9718 = build_vector_from_val (step_vectype,
9719 SCALAR_FLOAT_TYPE_P (stept)
9720 ? build_real_from_wide (stept, lup_mul,
9721 UNSIGNED)
9722 : build_int_cstu (stept, lup_mul));
9724 tree peel_mul = NULL_TREE;
9725 gimple_seq init_stmts = NULL;
9726 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9728 if (SCALAR_FLOAT_TYPE_P (stept))
9729 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9730 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9731 else
9732 peel_mul = gimple_convert (&init_stmts, stept,
9733 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9734 peel_mul = gimple_build_vector_from_val (&init_stmts,
9735 step_vectype, peel_mul);
9737 unsigned ivn;
9738 auto_vec<tree> vec_steps;
9739 for (ivn = 0; ivn < nivs; ++ivn)
9741 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9742 tree_vector_builder init_elts (vectype, const_nunits, 1);
9743 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9744 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9746 /* The scalar steps of the IVs. */
9747 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9748 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9749 step_elts.quick_push (elt);
9750 if (!init_node)
9752 /* The scalar inits of the IVs if not vectorized. */
9753 elt = inits[(ivn*const_nunits + eltn) % group_size];
9754 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9755 TREE_TYPE (elt)))
9756 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9757 TREE_TYPE (vectype), elt);
9758 init_elts.quick_push (elt);
9760 /* The number of steps to add to the initial values. */
9761 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9762 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9763 ? build_real_from_wide (stept,
9764 mul_elt, UNSIGNED)
9765 : build_int_cstu (stept, mul_elt));
9767 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9768 vec_steps.safe_push (vec_step);
9769 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9770 if (peel_mul)
9771 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9772 step_mul, peel_mul);
9773 if (!init_node)
9774 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9776 /* Create the induction-phi that defines the induction-operand. */
9777 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9778 "vec_iv_");
9779 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9780 induc_def = PHI_RESULT (induction_phi);
9782 /* Create the iv update inside the loop */
9783 tree up = vec_step;
9784 if (lupdate_mul)
9785 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9786 vec_step, lupdate_mul);
9787 gimple_seq stmts = NULL;
9788 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9789 vec_def = gimple_build (&stmts,
9790 PLUS_EXPR, step_vectype, vec_def, up);
9791 vec_def = gimple_convert (&stmts, vectype, vec_def);
9792 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9793 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9794 UNKNOWN_LOCATION);
9796 if (init_node)
9797 vec_init = vect_get_slp_vect_def (init_node, ivn);
9798 if (!nested_in_vect_loop
9799 && !integer_zerop (step_mul))
9801 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9802 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9803 vec_step, step_mul);
9804 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9805 vec_def, up);
9806 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9809 /* Set the arguments of the phi node: */
9810 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9812 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9814 if (!nested_in_vect_loop)
9816 /* Fill up to the number of vectors we need for the whole group. */
9817 nivs = least_common_multiple (group_size,
9818 const_nunits) / const_nunits;
9819 vec_steps.reserve (nivs-ivn);
9820 for (; ivn < nivs; ++ivn)
9822 SLP_TREE_VEC_STMTS (slp_node)
9823 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9824 vec_steps.quick_push (vec_steps[0]);
9828 /* Re-use IVs when we can. We are generating further vector
9829 stmts by adding VF' * stride to the IVs generated above. */
9830 if (ivn < nvects)
9832 unsigned vfp
9833 = least_common_multiple (group_size, const_nunits) / group_size;
9834 tree lupdate_mul
9835 = build_vector_from_val (step_vectype,
9836 SCALAR_FLOAT_TYPE_P (stept)
9837 ? build_real_from_wide (stept,
9838 vfp, UNSIGNED)
9839 : build_int_cstu (stept, vfp));
9840 for (; ivn < nvects; ++ivn)
9842 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9843 tree def = gimple_get_lhs (iv);
9844 if (ivn < 2*nivs)
9845 vec_steps[ivn - nivs]
9846 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9847 vec_steps[ivn - nivs], lupdate_mul);
9848 gimple_seq stmts = NULL;
9849 def = gimple_convert (&stmts, step_vectype, def);
9850 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9851 def, vec_steps[ivn % nivs]);
9852 def = gimple_convert (&stmts, vectype, def);
9853 if (gimple_code (iv) == GIMPLE_PHI)
9854 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9855 else
9857 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9858 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9860 SLP_TREE_VEC_STMTS (slp_node)
9861 .quick_push (SSA_NAME_DEF_STMT (def));
9865 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9866 gcc_assert (!new_bb);
9868 return true;
9871 init_expr = vect_phi_initial_value (phi);
9873 gimple_seq stmts = NULL;
9874 if (!nested_in_vect_loop)
9876 /* Convert the initial value to the IV update type. */
9877 tree new_type = TREE_TYPE (step_expr);
9878 init_expr = gimple_convert (&stmts, new_type, init_expr);
9880 /* If we are using the loop mask to "peel" for alignment then we need
9881 to adjust the start value here. */
9882 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9883 if (skip_niters != NULL_TREE)
9885 if (FLOAT_TYPE_P (vectype))
9886 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9887 skip_niters);
9888 else
9889 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9890 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9891 skip_niters, step_expr);
9892 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9893 init_expr, skip_step);
9897 if (stmts)
9899 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9900 gcc_assert (!new_bb);
9903 /* Create the vector that holds the initial_value of the induction. */
9904 if (nested_in_vect_loop)
9906 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9907 been created during vectorization of previous stmts. We obtain it
9908 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9909 auto_vec<tree> vec_inits;
9910 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9911 init_expr, &vec_inits);
9912 vec_init = vec_inits[0];
9913 /* If the initial value is not of proper type, convert it. */
9914 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9916 new_stmt
9917 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9918 vect_simple_var,
9919 "vec_iv_"),
9920 VIEW_CONVERT_EXPR,
9921 build1 (VIEW_CONVERT_EXPR, vectype,
9922 vec_init));
9923 vec_init = gimple_assign_lhs (new_stmt);
9924 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9925 new_stmt);
9926 gcc_assert (!new_bb);
9929 else
9931 /* iv_loop is the loop to be vectorized. Create:
9932 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9933 stmts = NULL;
9934 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9936 unsigned HOST_WIDE_INT const_nunits;
9937 if (nunits.is_constant (&const_nunits))
9939 tree_vector_builder elts (step_vectype, const_nunits, 1);
9940 elts.quick_push (new_name);
9941 for (i = 1; i < const_nunits; i++)
9943 /* Create: new_name_i = new_name + step_expr */
9944 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9945 new_name, step_expr);
9946 elts.quick_push (new_name);
9948 /* Create a vector from [new_name_0, new_name_1, ...,
9949 new_name_nunits-1] */
9950 vec_init = gimple_build_vector (&stmts, &elts);
9952 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9953 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9954 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9955 new_name, step_expr);
9956 else
9958 /* Build:
9959 [base, base, base, ...]
9960 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9961 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9962 gcc_assert (flag_associative_math);
9963 tree index = build_index_vector (step_vectype, 0, 1);
9964 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9965 new_name);
9966 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9967 step_expr);
9968 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9969 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9970 vec_init, step_vec);
9971 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9972 vec_init, base_vec);
9974 vec_init = gimple_convert (&stmts, vectype, vec_init);
9976 if (stmts)
9978 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9979 gcc_assert (!new_bb);
9984 /* Create the vector that holds the step of the induction. */
9985 if (nested_in_vect_loop)
9986 /* iv_loop is nested in the loop to be vectorized. Generate:
9987 vec_step = [S, S, S, S] */
9988 new_name = step_expr;
9989 else
9991 /* iv_loop is the loop to be vectorized. Generate:
9992 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9993 gimple_seq seq = NULL;
9994 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9996 expr = build_int_cst (integer_type_node, vf);
9997 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9999 else
10000 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10001 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10002 expr, step_expr);
10003 if (seq)
10005 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10006 gcc_assert (!new_bb);
10010 t = unshare_expr (new_name);
10011 gcc_assert (CONSTANT_CLASS_P (new_name)
10012 || TREE_CODE (new_name) == SSA_NAME);
10013 new_vec = build_vector_from_val (step_vectype, t);
10014 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10015 new_vec, step_vectype, NULL);
10018 /* Create the following def-use cycle:
10019 loop prolog:
10020 vec_init = ...
10021 vec_step = ...
10022 loop:
10023 vec_iv = PHI <vec_init, vec_loop>
10025 STMT
10027 vec_loop = vec_iv + vec_step; */
10029 /* Create the induction-phi that defines the induction-operand. */
10030 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10031 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10032 induc_def = PHI_RESULT (induction_phi);
10034 /* Create the iv update inside the loop */
10035 stmts = NULL;
10036 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10037 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10038 vec_def = gimple_convert (&stmts, vectype, vec_def);
10039 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10040 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10042 /* Set the arguments of the phi node: */
10043 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10044 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10045 UNKNOWN_LOCATION);
10047 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10048 *vec_stmt = induction_phi;
10050 /* In case that vectorization factor (VF) is bigger than the number
10051 of elements that we can fit in a vectype (nunits), we have to generate
10052 more than one vector stmt - i.e - we need to "unroll" the
10053 vector stmt by a factor VF/nunits. For more details see documentation
10054 in vectorizable_operation. */
10056 if (ncopies > 1)
10058 gimple_seq seq = NULL;
10059 /* FORNOW. This restriction should be relaxed. */
10060 gcc_assert (!nested_in_vect_loop);
10062 /* Create the vector that holds the step of the induction. */
10063 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10065 expr = build_int_cst (integer_type_node, nunits);
10066 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10068 else
10069 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10070 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10071 expr, step_expr);
10072 if (seq)
10074 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10075 gcc_assert (!new_bb);
10078 t = unshare_expr (new_name);
10079 gcc_assert (CONSTANT_CLASS_P (new_name)
10080 || TREE_CODE (new_name) == SSA_NAME);
10081 new_vec = build_vector_from_val (step_vectype, t);
10082 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10083 new_vec, step_vectype, NULL);
10085 vec_def = induc_def;
10086 for (i = 1; i < ncopies; i++)
10088 /* vec_i = vec_prev + vec_step */
10089 gimple_seq stmts = NULL;
10090 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10091 vec_def = gimple_build (&stmts,
10092 PLUS_EXPR, step_vectype, vec_def, vec_step);
10093 vec_def = gimple_convert (&stmts, vectype, vec_def);
10095 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10096 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10097 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10101 if (dump_enabled_p ())
10102 dump_printf_loc (MSG_NOTE, vect_location,
10103 "transform induction: created def-use cycle: %G%G",
10104 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10106 return true;
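/* A minimal standalone sketch (not built with this file) of the induction
   expansion performed above, assuming a 4-lane vector, initial value X and
   step S; the lane count and helper names are illustrative, not GCC
   internals.  The scalar IV i = X, X+S, X+2*S, ... becomes a vector IV with
   initial value [X, X+S, X+2*S, X+3*S] and per-vector-iteration step VF*S.  */
#if 0
#include <stdio.h>

#define LANES 4

static void
simulate_vector_iv (int init, int step, int vector_iters)
{
  int vec_iv[LANES];

  /* vec_init = [X, X+S, X+2*S, X+3*S].  */
  for (int l = 0; l < LANES; l++)
    vec_iv[l] = init + l * step;

  /* vec_step = [VF*S, VF*S, VF*S, VF*S]; here VF == LANES.  */
  int vec_step = LANES * step;

  for (int it = 0; it < vector_iters; it++)
    {
      printf ("iter %d:", it);
      for (int l = 0; l < LANES; l++)
        printf (" %d", vec_iv[l]);
      printf ("\n");

      /* Latch update: vec_iv = vec_iv + vec_step.  */
      for (int l = 0; l < LANES; l++)
        vec_iv[l] += vec_step;
    }
}

int
main (void)
{
  /* Lanes cover the scalar values i = 5, 8, 11, ... with step 3.  */
  simulate_vector_iv (5, 3, 3);
  return 0;
}
#endif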
10109 /* Function vectorizable_live_operation.
10111 STMT_INFO computes a value that is used outside the loop. Check if
10112 it can be supported. */
10114 bool
10115 vectorizable_live_operation (vec_info *vinfo,
10116 stmt_vec_info stmt_info,
10117 gimple_stmt_iterator *gsi,
10118 slp_tree slp_node, slp_instance slp_node_instance,
10119 int slp_index, bool vec_stmt_p,
10120 stmt_vector_for_cost *cost_vec)
10122 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10123 imm_use_iterator imm_iter;
10124 tree lhs, lhs_type, bitsize;
10125 tree vectype = (slp_node
10126 ? SLP_TREE_VECTYPE (slp_node)
10127 : STMT_VINFO_VECTYPE (stmt_info));
10128 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10129 int ncopies;
10130 gimple *use_stmt;
10131 auto_vec<tree> vec_oprnds;
10132 int vec_entry = 0;
10133 poly_uint64 vec_index = 0;
10135 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10137 /* If a stmt of a reduction is live, vectorize it via
10138 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10139 validity so just trigger the transform here. */
10140 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10142 if (!vec_stmt_p)
10143 return true;
10144 if (slp_node)
10146 /* For reduction chains the meta-info is attached to
10147 the group leader. */
10148 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10149 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10150 /* For SLP reductions we vectorize the epilogue for
10151 all involved stmts together. */
10152 else if (slp_index != 0)
10153 return true;
10155 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10156 gcc_assert (reduc_info->is_reduc_info);
10157 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10158 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10159 return true;
10160 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10161 slp_node_instance);
10162 return true;
10165 /* If STMT is not relevant and it is a simple assignment and its inputs are
10166 invariant then it can remain in place, unvectorized. The original last
10167 scalar value that it computes will be used. */
10168 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10170 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10171 if (dump_enabled_p ())
10172 dump_printf_loc (MSG_NOTE, vect_location,
10173 "statement is simple and uses invariant. Leaving in "
10174 "place.\n");
10175 return true;
10178 if (slp_node)
10179 ncopies = 1;
10180 else
10181 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10183 if (slp_node)
10185 gcc_assert (slp_index >= 0);
10187 /* Get the last occurrence of the scalar index from the concatenation of
10188 all the slp vectors. Calculate which slp vector it is and the index
10189 within. */
10190 int num_scalar = SLP_TREE_LANES (slp_node);
10191 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10192 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10194 /* Calculate which vector contains the result, and which lane of
10195 that vector we need. */
10196 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10198 if (dump_enabled_p ())
10199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10200 "Cannot determine which vector holds the"
10201 " final result.\n");
10202 return false;
10206 if (!vec_stmt_p)
10208 /* No transformation required. */
10209 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10211 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10212 OPTIMIZE_FOR_SPEED))
10214 if (dump_enabled_p ())
10215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10216 "can't operate on partial vectors "
10217 "because the target doesn't support extract "
10218 "last reduction.\n");
10219 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10221 else if (slp_node)
10223 if (dump_enabled_p ())
10224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10225 "can't operate on partial vectors "
10226 "because an SLP statement is live after "
10227 "the loop.\n");
10228 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10230 else if (ncopies > 1)
10232 if (dump_enabled_p ())
10233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10234 "can't operate on partial vectors "
10235 "because ncopies is greater than 1.\n");
10236 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10238 else
10240 gcc_assert (ncopies == 1 && !slp_node);
10241 vect_record_loop_mask (loop_vinfo,
10242 &LOOP_VINFO_MASKS (loop_vinfo),
10243 1, vectype, NULL);
10246 /* ??? Enable for loop costing as well. */
10247 if (!loop_vinfo)
10248 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10249 0, vect_epilogue);
10250 return true;
10253 /* Use the lhs of the original scalar statement. */
10254 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10255 if (dump_enabled_p ())
10256 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10257 "stmt %G", stmt);
10259 lhs = gimple_get_lhs (stmt);
10260 lhs_type = TREE_TYPE (lhs);
10262 bitsize = vector_element_bits_tree (vectype);
10264 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10265 tree vec_lhs, bitstart;
10266 gimple *vec_stmt;
10267 if (slp_node)
10269 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10271 /* Get the correct slp vectorized stmt. */
10272 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
10273 vec_lhs = gimple_get_lhs (vec_stmt);
10275 /* Get entry to use. */
10276 bitstart = bitsize_int (vec_index);
10277 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10279 else
10281 /* For multiple copies, get the last copy. */
10282 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10283 vec_lhs = gimple_get_lhs (vec_stmt);
10285 /* Get the last lane in the vector. */
10286 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10289 if (loop_vinfo)
10291 /* To ensure the VEC_LHS of lane-extraction stmts satisfies the loop-closed
10292 PHI requirement, insert one PHI node for it. It looks like:
10293 loop;
10295 # lhs' = PHI <lhs>
10297 loop;
10299 # vec_lhs' = PHI <vec_lhs>
10300 new_tree = lane_extract <vec_lhs', ...>;
10301 lhs' = new_tree; */
10303 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10304 basic_block exit_bb = single_exit (loop)->dest;
10305 gcc_assert (single_pred_p (exit_bb));
10307 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10308 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10309 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10311 gimple_seq stmts = NULL;
10312 tree new_tree;
10313 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10315 /* Emit:
10317 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10319 where VEC_LHS is the vectorized live-out result and MASK is
10320 the loop mask for the final iteration. */
10321 gcc_assert (ncopies == 1 && !slp_node);
10322 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10323 tree mask = vect_get_loop_mask (loop_vinfo, gsi,
10324 &LOOP_VINFO_MASKS (loop_vinfo),
10325 1, vectype, 0);
10326 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10327 mask, vec_lhs_phi);
10329 /* Convert the extracted vector element to the scalar type. */
10330 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10332 else
10334 tree bftype = TREE_TYPE (vectype);
10335 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10336 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10337 new_tree = build3 (BIT_FIELD_REF, bftype,
10338 vec_lhs_phi, bitsize, bitstart);
10339 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10340 &stmts, true, NULL_TREE);
10343 if (stmts)
10345 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10346 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10348 /* Remove existing phi from lhs and create one copy from new_tree. */
10349 tree lhs_phi = NULL_TREE;
10350 gimple_stmt_iterator gsi;
10351 for (gsi = gsi_start_phis (exit_bb);
10352 !gsi_end_p (gsi); gsi_next (&gsi))
10354 gimple *phi = gsi_stmt (gsi);
10355 if ((gimple_phi_arg_def (phi, 0) == lhs))
10357 remove_phi_node (&gsi, false);
10358 lhs_phi = gimple_phi_result (phi);
10359 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10360 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10361 break;
10366 /* Replace uses of LHS with the newly computed result. If the use stmt is a
10367 single-argument PHI, just replace all uses of the PHI result. This is
10368 necessary because the LC SSA PHI defining LHS may appear before the newly inserted stmt. */
10369 use_operand_p use_p;
10370 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10371 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10372 && !is_gimple_debug (use_stmt))
10374 if (gimple_code (use_stmt) == GIMPLE_PHI
10375 && gimple_phi_num_args (use_stmt) == 1)
10377 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10379 else
10381 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10382 SET_USE (use_p, new_tree);
10384 update_stmt (use_stmt);
10387 else
10389 /* For basic-block vectorization simply insert the lane-extraction. */
10390 tree bftype = TREE_TYPE (vectype);
10391 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10392 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10393 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10394 vec_lhs, bitsize, bitstart);
10395 gimple_seq stmts = NULL;
10396 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10397 &stmts, true, NULL_TREE);
10398 if (TREE_CODE (new_tree) == SSA_NAME
10399 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10400 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10401 if (is_a <gphi *> (vec_stmt))
10403 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10404 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10406 else
10408 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10409 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10412 /* Replace uses of LHS with the newly computed result. If the use stmt is a
10413 single-argument PHI, just replace all uses of the PHI result. This is
10414 necessary because the LC SSA PHI defining LHS may appear before the newly inserted stmt. */
10415 use_operand_p use_p;
10416 stmt_vec_info use_stmt_info;
10417 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10418 if (!is_gimple_debug (use_stmt)
10419 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10420 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10422 /* ??? This can happen when the live lane ends up being
10423 used in a vector construction code-generated by an
10424 external SLP node (and code-generation for that already
10425 happened). See gcc.dg/vect/bb-slp-47.c.
10426 Doing this is what would happen if that vector CTOR
10427 were not code-generated yet so it is not too bad.
10428 ??? In fact we'd likely want to avoid this situation
10429 in the first place. */
10430 if (TREE_CODE (new_tree) == SSA_NAME
10431 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10432 && gimple_code (use_stmt) != GIMPLE_PHI
10433 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10434 use_stmt))
10436 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10437 gcc_checking_assert (code == SSA_NAME
10438 || code == CONSTRUCTOR
10439 || code == VIEW_CONVERT_EXPR
10440 || CONVERT_EXPR_CODE_P (code));
10441 if (dump_enabled_p ())
10442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10443 "Using original scalar computation for "
10444 "live lane because use preceeds vector "
10445 "def\n");
10446 continue;
10448 /* ??? It can also happen that we end up pulling a def into
10449 a loop where replacing out-of-loop uses would require
10450 a new LC SSA PHI node. Retain the original scalar in
10451 those cases as well. PR98064. */
10452 if (TREE_CODE (new_tree) == SSA_NAME
10453 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10454 && (gimple_bb (use_stmt)->loop_father
10455 != gimple_bb (vec_stmt)->loop_father)
10456 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10457 gimple_bb (use_stmt)->loop_father))
10459 if (dump_enabled_p ())
10460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10461 "Using original scalar computation for "
10462 "live lane because there is an out-of-loop "
10463 "definition for it\n");
10464 continue;
10466 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10467 SET_USE (use_p, new_tree);
10468 update_stmt (use_stmt);
10472 return true;
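/* A minimal standalone sketch (not built with this file) of the live-lane
   extraction above, assuming a 4-lane vector and an iteration count that is
   a multiple of the lane count; all names are illustrative.  The scalar
   value live after the loop corresponds to the last lane of the final
   vector, which the real code extracts with a BIT_FIELD_REF (or with
   EXTRACT_LAST for fully-masked loops).  */
#if 0
#include <stdio.h>

#define LANES 4

int
main (void)
{
  int a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int n = 8;                    /* Assumed to be a multiple of LANES.  */

  /* Scalar loop: x is live (used) after the loop.  */
  int x = 0;
  for (int i = 0; i < n; i++)
    x = a[i] * 2;

  /* Vectorized form: keep a vector of x values and extract the last lane
     of the final vector at the loop exit.  */
  int vec_x[LANES] = { 0 };
  for (int i = 0; i < n; i += LANES)
    for (int l = 0; l < LANES; l++)
      vec_x[l] = a[i + l] * 2;
  int x_vec = vec_x[LANES - 1];

  printf ("scalar %d, vectorized %d\n", x, x_vec);
  return 0;
}
#endif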
10475 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10477 static void
10478 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10480 ssa_op_iter op_iter;
10481 imm_use_iterator imm_iter;
10482 def_operand_p def_p;
10483 gimple *ustmt;
10485 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10487 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10489 basic_block bb;
10491 if (!is_gimple_debug (ustmt))
10492 continue;
10494 bb = gimple_bb (ustmt);
10496 if (!flow_bb_inside_loop_p (loop, bb))
10498 if (gimple_debug_bind_p (ustmt))
10500 if (dump_enabled_p ())
10501 dump_printf_loc (MSG_NOTE, vect_location,
10502 "killing debug use\n");
10504 gimple_debug_bind_reset_value (ustmt);
10505 update_stmt (ustmt);
10507 else
10508 gcc_unreachable ();
10514 /* Given loop represented by LOOP_VINFO, return true if computation of
10515 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10516 otherwise. */
10518 static bool
10519 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10521 /* Constant case. */
10522 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10524 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10525 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10527 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10528 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10529 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10530 return true;
10533 widest_int max;
10534 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10535 /* Check the upper bound of loop niters. */
10536 if (get_max_loop_iterations (loop, &max))
10538 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10539 signop sgn = TYPE_SIGN (type);
10540 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10541 if (max < type_max)
10542 return true;
10544 return false;
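/* A minimal numeric sketch (not built with this file) of the overflow the
   function above checks for, using an 8-bit counter type for illustration:
   NITERS = NITERSM1 + 1 wraps to zero exactly when the latch count NITERSM1
   is the maximum value of its type.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned char nitersm1 = 254;            /* Latch executed 254 times.  */
  unsigned char niters = nitersm1 + 1;     /* 255: no overflow.  */
  printf ("nitersm1 %u -> niters %u (ok %d)\n",
          (unsigned) nitersm1, (unsigned) niters, nitersm1 < niters);

  nitersm1 = 255;                          /* Maximum of the type ...  */
  niters = nitersm1 + 1;                   /* ... so NITERS wraps to 0.  */
  printf ("nitersm1 %u -> niters %u (ok %d)\n",
          (unsigned) nitersm1, (unsigned) niters, nitersm1 < niters);
  return 0;
}
#endif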
10547 /* Return a mask type with half the number of elements as OLD_TYPE,
10548 given that it should have mode NEW_MODE. */
10550 tree
10551 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10553 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10554 return build_truth_vector_type_for_mode (nunits, new_mode);
10557 /* Return a mask type with twice as many elements as OLD_TYPE,
10558 given that it should have mode NEW_MODE. */
10560 tree
10561 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10563 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10564 return build_truth_vector_type_for_mode (nunits, new_mode);
10567 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10568 contain a sequence of NVECTORS masks that each control a vector of type
10569 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10570 these vector masks with the vector version of SCALAR_MASK. */
10572 void
10573 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10574 unsigned int nvectors, tree vectype, tree scalar_mask)
10576 gcc_assert (nvectors != 0);
10578 if (scalar_mask)
10580 scalar_cond_masked_key cond (scalar_mask, nvectors);
10581 loop_vinfo->scalar_cond_masked_set.add (cond);
10584 masks->mask_set.add (std::make_pair (vectype, nvectors));
10587 /* Given a complete set of masks MASKS, extract mask number INDEX
10588 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10589 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10591 See the comment above vec_loop_masks for more details about the mask
10592 arrangement. */
10594 tree
10595 vect_get_loop_mask (loop_vec_info loop_vinfo,
10596 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10597 unsigned int nvectors, tree vectype, unsigned int index)
10599 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10600 == vect_partial_vectors_while_ult)
10602 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10603 tree mask_type = rgm->type;
10605 /* Populate the rgroup's mask array, if this is the first time we've
10606 used it. */
10607 if (rgm->controls.is_empty ())
10609 rgm->controls.safe_grow_cleared (nvectors, true);
10610 for (unsigned int i = 0; i < nvectors; ++i)
10612 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10613 /* Provide a dummy definition until the real one is available. */
10614 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10615 rgm->controls[i] = mask;
10619 tree mask = rgm->controls[index];
10620 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10621 TYPE_VECTOR_SUBPARTS (vectype)))
10623 /* A loop mask for data type X can be reused for data type Y
10624 if X has N times more elements than Y and if Y's elements
10625 are N times bigger than X's. In this case each sequence
10626 of N elements in the loop mask will be all-zero or all-one.
10627 We can then view-convert the mask so that each sequence of
10628 N elements is replaced by a single element. */
10629 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10630 TYPE_VECTOR_SUBPARTS (vectype)));
10631 gimple_seq seq = NULL;
10632 mask_type = truth_type_for (vectype);
10633 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10634 if (seq)
10635 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10637 return mask;
10639 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10640 == vect_partial_vectors_avx512)
10642 /* The number of scalars per iteration and the number of vectors are
10643 both compile-time constants. */
10644 unsigned int nscalars_per_iter
10645 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10646 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10648 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10650 /* The stored nV is dependent on the mask type produced. */
10651 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10652 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10653 == rgm->factor);
10654 nvectors = rgm->factor;
10656 /* Populate the rgroup's mask array, if this is the first time we've
10657 used it. */
10658 if (rgm->controls.is_empty ())
10660 rgm->controls.safe_grow_cleared (nvectors, true);
10661 for (unsigned int i = 0; i < nvectors; ++i)
10663 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10664 /* Provide a dummy definition until the real one is available. */
10665 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10666 rgm->controls[i] = mask;
10669 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10670 TYPE_VECTOR_SUBPARTS (vectype)))
10671 return rgm->controls[index];
10673 /* Split the vector if needed. Since we are dealing with integer mode
10674 masks with AVX512 we can operate on the integer representation and
10675 shift the whole vector. */
10676 unsigned HOST_WIDE_INT factor;
10677 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10678 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10679 gcc_assert (ok);
10680 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10681 tree mask_type = truth_type_for (vectype);
10682 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10683 unsigned vi = index / factor;
10684 unsigned vpart = index % factor;
10685 tree vec = rgm->controls[vi];
10686 gimple_seq seq = NULL;
10687 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10688 lang_hooks.types.type_for_mode
10689 (TYPE_MODE (rgm->type), 1), vec);
10690 /* For integer mode masks simply shift the right bits into position. */
10691 if (vpart != 0)
10692 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10693 build_int_cst (integer_type_node,
10694 (TYPE_VECTOR_SUBPARTS (vectype)
10695 * vpart)));
10696 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10697 (TYPE_MODE (mask_type), 1), vec);
10698 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10699 if (seq)
10700 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10701 return vec;
10703 else
10704 gcc_unreachable ();
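/* A minimal standalone sketch (not built with this file) of the integer-mode
   mask splitting used for the AVX512 style above, assuming one 16-bit
   control mask that covers two 8-lane vectors; the helper name and widths
   are illustrative.  Sub-mask INDEX is obtained by shifting the wide mask
   right by lanes * (INDEX % factor) bits and truncating, mirroring the
   RSHIFT_EXPR plus VIEW_CONVERT_EXPR sequence in the real code.  */
#if 0
#include <stdio.h>
#include <stdint.h>

static uint8_t
extract_submask (uint16_t wide_mask, unsigned lanes_per_vec,
                 unsigned index, unsigned factor)
{
  unsigned vpart = index % factor;   /* Which slice of the wide mask.  */
  return (uint8_t) (wide_mask >> (lanes_per_vec * vpart));
}

int
main (void)
{
  uint16_t wide = 0x0fff;            /* First 12 of 16 lanes active.  */
  printf ("sub 0: 0x%02x\n", extract_submask (wide, 8, 0, 2));   /* 0xff */
  printf ("sub 1: 0x%02x\n", extract_submask (wide, 8, 1, 2));   /* 0x0f */
  return 0;
}
#endif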
10707 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10708 lengths for controlling an operation on VECTYPE. The operation splits
10709 each element of VECTYPE into FACTOR separate subelements, measuring the
10710 length as a number of these subelements. */
10712 void
10713 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10714 unsigned int nvectors, tree vectype, unsigned int factor)
10716 gcc_assert (nvectors != 0);
10717 if (lens->length () < nvectors)
10718 lens->safe_grow_cleared (nvectors, true);
10719 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10721 /* The number of scalars per iteration, the bytes occupied per scalar and
10722 the number of vectors are all compile-time constants. */
10723 unsigned int nscalars_per_iter
10724 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10725 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10727 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10729 /* For now, we only support cases in which all loads and stores fall back
10730 to VnQI or none do. */
10731 gcc_assert (!rgl->max_nscalars_per_iter
10732 || (rgl->factor == 1 && factor == 1)
10733 || (rgl->max_nscalars_per_iter * rgl->factor
10734 == nscalars_per_iter * factor));
10735 rgl->max_nscalars_per_iter = nscalars_per_iter;
10736 rgl->type = vectype;
10737 rgl->factor = factor;
10741 /* Given a complete set of lengths LENS, extract length number INDEX
10742 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10743 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10744 multiplied by the number of elements that should be processed.
10745 Insert any set-up statements before GSI. */
10747 tree
10748 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10749 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10750 unsigned int index, unsigned int factor)
10752 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10753 bool use_bias_adjusted_len =
10754 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10756 /* Populate the rgroup's len array, if this is the first time we've
10757 used it. */
10758 if (rgl->controls.is_empty ())
10760 rgl->controls.safe_grow_cleared (nvectors, true);
10761 for (unsigned int i = 0; i < nvectors; ++i)
10763 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10764 gcc_assert (len_type != NULL_TREE);
10766 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10768 /* Provide a dummy definition until the real one is available. */
10769 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10770 rgl->controls[i] = len;
10772 if (use_bias_adjusted_len)
10774 gcc_assert (i == 0);
10775 tree adjusted_len =
10776 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10777 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10778 rgl->bias_adjusted_ctrl = adjusted_len;
10783 if (use_bias_adjusted_len)
10784 return rgl->bias_adjusted_ctrl;
10786 tree loop_len = rgl->controls[index];
10787 if (rgl->factor == 1 && factor == 1)
10789 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10790 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10791 if (maybe_ne (nunits1, nunits2))
10793 /* A loop len for data type X can be reused for data type Y
10794 if X has N times more elements than Y and if Y's elements
10795 are N times bigger than X's. */
10796 gcc_assert (multiple_p (nunits1, nunits2));
10797 factor = exact_div (nunits1, nunits2).to_constant ();
10798 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10799 gimple_seq seq = NULL;
10800 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10801 build_int_cst (iv_type, factor));
10802 if (seq)
10803 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10806 return loop_len;
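/* A minimal numeric sketch (not built with this file) of the length reuse
   above: a loop length recorded for a vector of 16 single-byte elements can
   serve a vector of 4 four-byte elements by dividing it by the factor
   16 / 4, which is what the division by 'factor' computes; the concrete
   lane counts here are illustrative.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned nunits_bytes = 16;      /* rgl->type: e.g. 16 x 8-bit lanes.  */
  unsigned nunits_words = 4;       /* vectype:   e.g. 4 x 32-bit lanes.  */
  unsigned factor = nunits_bytes / nunits_words;          /* == 4 */

  unsigned len_bytes = 12;         /* Active bytes in an iteration.  */
  unsigned len_words = len_bytes / factor;                /* == 3 */

  printf ("factor %u: byte length %u -> element length %u\n",
          factor, len_bytes, len_words);
  return 0;
}
#endif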
10809 /* Scale profiling counters by estimation for LOOP which is vectorized
10810 by factor VF. */
10812 static void
10813 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10815 edge preheader = loop_preheader_edge (loop);
10816 /* Reduce loop iterations by the vectorization factor. */
10817 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10818 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10820 if (freq_h.nonzero_p ())
10822 profile_probability p;
10824 /* Avoid dropping loop body profile counter to 0 because of zero count
10825 in loop's preheader. */
10826 if (!(freq_e == profile_count::zero ()))
10827 freq_e = freq_e.force_nonzero ();
10828 p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10829 scale_loop_frequencies (loop, p);
10832 edge exit_e = single_exit (loop);
10833 exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10835 edge exit_l = single_pred_edge (loop->latch);
10836 profile_probability prob = exit_l->probability;
10837 exit_l->probability = exit_e->probability.invert ();
10838 if (prob.initialized_p () && exit_l->probability.initialized_p ())
10839 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
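/* A minimal numeric sketch (not built with this file) of the profile scaling
   above, with illustrative counts: a scalar loop entered 100 times with a
   header count of 4000 iterates about 40 times per entry, so with VF == 8
   the vector loop is expected to iterate about 5 times; the body is scaled
   so the header count matches preheader * (5 + 1) and the exit edge gets
   probability 1 / (5 + 1).  */
#if 0
#include <stdio.h>

int
main (void)
{
  double freq_preheader = 100.0;   /* Loop entered 100 times.  */
  double freq_header = 4000.0;     /* Scalar header executed 4000 times.  */
  unsigned vf = 8;

  unsigned new_est_niter = (unsigned) (freq_header / freq_preheader) / vf;

  double scale = freq_preheader * (new_est_niter + 1) / freq_header;
  printf ("estimated vector iterations %u, body scale %.3f, "
          "new header count %.0f\n",
          new_est_niter, scale, freq_header * scale);
  printf ("exit probability %.3f\n", 1.0 / (new_est_niter + 1));
  return 0;
}
#endif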
10842 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10843 latch edge values originally defined by it. */
10845 static void
10846 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10847 stmt_vec_info def_stmt_info)
10849 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10850 if (!def || TREE_CODE (def) != SSA_NAME)
10851 return;
10852 stmt_vec_info phi_info;
10853 imm_use_iterator iter;
10854 use_operand_p use_p;
10855 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10857 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10858 if (!phi)
10859 continue;
10860 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10861 && (phi_info = loop_vinfo->lookup_stmt (phi))
10862 && STMT_VINFO_RELEVANT_P (phi_info)))
10863 continue;
10864 loop_p loop = gimple_bb (phi)->loop_father;
10865 edge e = loop_latch_edge (loop);
10866 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10867 continue;
10869 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10870 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10871 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10873 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10874 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10875 gcc_assert (phi_defs.length () == latch_defs.length ());
10876 for (unsigned i = 0; i < phi_defs.length (); ++i)
10877 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10878 gimple_get_lhs (latch_defs[i]), e,
10879 gimple_phi_arg_location (phi, e->dest_idx));
10881 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10883 /* For first order recurrences we have to update both uses of
10884 the latch definition, the one in the PHI node and the one
10885 in the generated VEC_PERM_EXPR. */
10886 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10887 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10888 gcc_assert (phi_defs.length () == latch_defs.length ());
10889 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10890 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10891 for (unsigned i = 0; i < phi_defs.length (); ++i)
10893 gassign *perm = as_a <gassign *> (phi_defs[i]);
10894 if (i > 0)
10895 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10896 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10897 update_stmt (perm);
10899 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10900 gimple_phi_arg_location (phi, e->dest_idx));
10905 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10906 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10907 stmt_vec_info. */
10909 static bool
10910 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10911 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10913 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10914 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10916 if (dump_enabled_p ())
10917 dump_printf_loc (MSG_NOTE, vect_location,
10918 "------>vectorizing statement: %G", stmt_info->stmt);
10920 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10921 vect_loop_kill_debug_uses (loop, stmt_info);
10923 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10924 && !STMT_VINFO_LIVE_P (stmt_info))
10925 return false;
10927 if (STMT_VINFO_VECTYPE (stmt_info))
10929 poly_uint64 nunits
10930 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10931 if (!STMT_SLP_TYPE (stmt_info)
10932 && maybe_ne (nunits, vf)
10933 && dump_enabled_p ())
10934 /* For SLP, VF is set according to the unrolling factor, not
10935 to the vector size, so this message is not valid for SLP. */
10936 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10939 /* Pure SLP statements have already been vectorized. We still need
10940 to apply loop vectorization to hybrid SLP statements. */
10941 if (PURE_SLP_STMT (stmt_info))
10942 return false;
10944 if (dump_enabled_p ())
10945 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10947 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10948 *seen_store = stmt_info;
10950 return true;
10953 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10954 in the hash_map with its corresponding values. */
10956 static tree
10957 find_in_mapping (tree t, void *context)
10959 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10961 tree *value = mapping->get (t);
10962 return value ? *value : t;
10965 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10966 original loop that has now been vectorized.
10968 The inits of the data_references need to be advanced with the number of
10969 iterations of the main loop. This has been computed in vect_do_peeling and
10970 is stored in parameter ADVANCE. We first restore the data_references
10971 initial offset with the values recorded in ORIG_DRS_INIT.
10973 Since the loop_vec_info of this EPILOGUE was constructed for the original
10974 loop, its stmt_vec_infos all point to the original statements. These need
10975 to be updated to point to their corresponding copies as well as the SSA_NAMES
10976 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10978 The data_reference's connections also need to be updated. Their
10979 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10980 stmt_vec_infos, their statements need to point to their corresponding copy,
10981 if they are gather loads or scatter stores then their reference needs to be
10982 updated to point to its corresponding copy and finally we set
10983 'base_misaligned' to false as we have already peeled for alignment in the
10984 prologue of the main loop. */
10986 static void
10987 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10989 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10990 auto_vec<gimple *> stmt_worklist;
10991 hash_map<tree,tree> mapping;
10992 gimple *orig_stmt, *new_stmt;
10993 gimple_stmt_iterator epilogue_gsi;
10994 gphi_iterator epilogue_phi_gsi;
10995 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10996 basic_block *epilogue_bbs = get_loop_body (epilogue);
10997 unsigned i;
10999 free (LOOP_VINFO_BBS (epilogue_vinfo));
11000 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11002 /* Advance data_reference's with the number of iterations of the previous
11003 loop and its prologue. */
11004 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11007 /* The EPILOGUE loop is a copy of the original loop so they share the same
11008 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11009 point to the copied statements. We also create a mapping of all LHS' in
11010 the original loop and all the LHS' in the EPILOGUE and create worklists to
11011 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11012 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11014 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11015 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11017 new_stmt = epilogue_phi_gsi.phi ();
11019 gcc_assert (gimple_uid (new_stmt) > 0);
11020 stmt_vinfo
11021 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11023 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11024 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11026 mapping.put (gimple_phi_result (orig_stmt),
11027 gimple_phi_result (new_stmt));
11028 /* PHI nodes can not have patterns or related statements. */
11029 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11030 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11033 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11034 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11036 new_stmt = gsi_stmt (epilogue_gsi);
11037 if (is_gimple_debug (new_stmt))
11038 continue;
11040 gcc_assert (gimple_uid (new_stmt) > 0);
11041 stmt_vinfo
11042 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11044 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11045 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11047 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11048 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11050 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11052 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11053 for (gimple_stmt_iterator gsi = gsi_start (seq);
11054 !gsi_end_p (gsi); gsi_next (&gsi))
11055 stmt_worklist.safe_push (gsi_stmt (gsi));
11058 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11059 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11061 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11062 stmt_worklist.safe_push (stmt);
11063 /* Set BB such that the assert in
11064 'get_initial_def_for_reduction' is able to determine that
11065 the BB of the related stmt is inside this loop. */
11066 gimple_set_bb (stmt,
11067 gimple_bb (new_stmt));
11068 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11069 gcc_assert (related_vinfo == NULL
11070 || related_vinfo == stmt_vinfo);
11075 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11076 using the original main loop and thus need to be updated to refer to the
11077 cloned variables used in the epilogue. */
11078 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11080 gimple *stmt = stmt_worklist[i];
11081 tree *new_op;
11083 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11085 tree op = gimple_op (stmt, j);
11086 if ((new_op = mapping.get(op)))
11087 gimple_set_op (stmt, j, *new_op);
11088 else
11090 /* PR92429: The last argument of simplify_replace_tree disables
11091 folding when replacing arguments. This is required as
11092 otherwise you might end up with different statements than the
11093 ones analyzed in vect_loop_analyze, leading to different
11094 vectorization. */
11095 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11096 &find_in_mapping, &mapping, false);
11097 gimple_set_op (stmt, j, op);
11102 struct data_reference *dr;
11103 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11104 FOR_EACH_VEC_ELT (datarefs, i, dr)
11106 orig_stmt = DR_STMT (dr);
11107 gcc_assert (gimple_uid (orig_stmt) > 0);
11108 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11109 /* Data references for gather loads and scatter stores do not use the
11110 updated offset we set using ADVANCE. Instead we have to make sure the
11111 reference in the data references point to the corresponding copy of
11112 the original in the epilogue. */
11113 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11114 == VMAT_GATHER_SCATTER)
11116 DR_REF (dr)
11117 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11118 &find_in_mapping, &mapping);
11119 DR_BASE_ADDRESS (dr)
11120 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11121 &find_in_mapping, &mapping);
11123 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11124 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11125 /* The vector size of the epilogue is smaller than that of the main loop,
11126 so the alignment requirement is the same or lower. This means the dr
11127 will by definition be aligned. */
11128 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11131 epilogue_vinfo->shared->datarefs_copy.release ();
11132 epilogue_vinfo->shared->save_datarefs ();
11135 /* Function vect_transform_loop.
11137 The analysis phase has determined that the loop is vectorizable.
11138 Vectorize the loop - created vectorized stmts to replace the scalar
11139 stmts in the loop, and update the loop exit condition.
11140 Returns scalar epilogue loop if any. */
11142 class loop *
11143 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11145 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11146 class loop *epilogue = NULL;
11147 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11148 int nbbs = loop->num_nodes;
11149 int i;
11150 tree niters_vector = NULL_TREE;
11151 tree step_vector = NULL_TREE;
11152 tree niters_vector_mult_vf = NULL_TREE;
11153 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11154 unsigned int lowest_vf = constant_lower_bound (vf);
11155 gimple *stmt;
11156 bool check_profitability = false;
11157 unsigned int th;
11159 DUMP_VECT_SCOPE ("vec_transform_loop");
11161 loop_vinfo->shared->check_datarefs ();
11163 /* Use the more conservative vectorization threshold. If the number
11164 of iterations is constant assume the cost check has been performed
11165 by our caller. If the threshold makes all loops profitable that
11166 run at least the (estimated) vectorization factor number of times
11167 checking is pointless, too. */
11168 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11169 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11171 if (dump_enabled_p ())
11172 dump_printf_loc (MSG_NOTE, vect_location,
11173 "Profitability threshold is %d loop iterations.\n",
11174 th);
11175 check_profitability = true;
11178 /* Make sure there exists a single-predecessor exit bb. Do this before
11179 versioning. */
11180 edge e = single_exit (loop);
11181 if (! single_pred_p (e->dest))
11183 split_loop_exit_edge (e, true);
11184 if (dump_enabled_p ())
11185 dump_printf (MSG_NOTE, "split exit edge\n");
11188 /* Version the loop first, if required, so the profitability check
11189 comes first. */
11191 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11193 class loop *sloop
11194 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11195 sloop->force_vectorize = false;
11196 check_profitability = false;
11199 /* Make sure there exists a single-predecessor exit bb also on the
11200 scalar loop copy. Do this after versioning but before peeling
11201 so CFG structure is fine for both scalar and if-converted loop
11202 to make slpeel_duplicate_current_defs_from_edges face matched
11203 loop closed PHI nodes on the exit. */
11204 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11206 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11207 if (! single_pred_p (e->dest))
11209 split_loop_exit_edge (e, true);
11210 if (dump_enabled_p ())
11211 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11215 tree niters = vect_build_loop_niters (loop_vinfo);
11216 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11217 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11218 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11219 tree advance;
11220 drs_init_vec orig_drs_init;
11222 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11223 &step_vector, &niters_vector_mult_vf, th,
11224 check_profitability, niters_no_overflow,
11225 &advance);
11227 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11228 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11229 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11230 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11232 if (niters_vector == NULL_TREE)
11234 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11235 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11236 && known_eq (lowest_vf, vf))
11238 niters_vector
11239 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11240 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11241 step_vector = build_one_cst (TREE_TYPE (niters));
11243 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11244 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11245 &step_vector, niters_no_overflow);
11246 else
11247 /* vect_do_peeling subtracted the number of peeled prologue
11248 iterations from LOOP_VINFO_NITERS. */
11249 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11250 &niters_vector, &step_vector,
11251 niters_no_overflow);
11254 /* 1) Make sure the loop header has exactly two entries
11255 2) Make sure we have a preheader basic block. */
11257 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11259 split_edge (loop_preheader_edge (loop));
11261 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11262 /* This will deal with any possible peeling. */
11263 vect_prepare_for_masked_peels (loop_vinfo);
11265 /* Schedule the SLP instances first, then handle loop vectorization
11266 below. */
11267 if (!loop_vinfo->slp_instances.is_empty ())
11269 DUMP_VECT_SCOPE ("scheduling SLP instances");
11270 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11273 /* FORNOW: the vectorizer supports only loops whose body consists
11274 of one basic block (header + empty latch). When the vectorizer
11275 supports more involved loop forms, the order in which the BBs are
11276 traversed will need to be reconsidered. */
11278 for (i = 0; i < nbbs; i++)
11280 basic_block bb = bbs[i];
11281 stmt_vec_info stmt_info;
11283 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11284 gsi_next (&si))
11286 gphi *phi = si.phi ();
11287 if (dump_enabled_p ())
11288 dump_printf_loc (MSG_NOTE, vect_location,
11289 "------>vectorizing phi: %G", (gimple *) phi);
11290 stmt_info = loop_vinfo->lookup_stmt (phi);
11291 if (!stmt_info)
11292 continue;
11294 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11295 vect_loop_kill_debug_uses (loop, stmt_info);
11297 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11298 && !STMT_VINFO_LIVE_P (stmt_info))
11299 continue;
11301 if (STMT_VINFO_VECTYPE (stmt_info)
11302 && (maybe_ne
11303 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11304 && dump_enabled_p ())
11305 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11307 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11308 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11309 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11310 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11311 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11312 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11313 && ! PURE_SLP_STMT (stmt_info))
11315 if (dump_enabled_p ())
11316 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11317 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11321 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11322 gsi_next (&si))
11324 gphi *phi = si.phi ();
11325 stmt_info = loop_vinfo->lookup_stmt (phi);
11326 if (!stmt_info)
11327 continue;
11329 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11330 && !STMT_VINFO_LIVE_P (stmt_info))
11331 continue;
11333 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11334 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11335 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11336 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11337 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11338 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11339 && ! PURE_SLP_STMT (stmt_info))
11340 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11343 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11344 !gsi_end_p (si);)
11346 stmt = gsi_stmt (si);
11347 /* During vectorization remove existing clobber stmts. */
11348 if (gimple_clobber_p (stmt))
11350 unlink_stmt_vdef (stmt);
11351 gsi_remove (&si, true);
11352 release_defs (stmt);
11354 else
11356 /* Ignore vector stmts created in the outer loop. */
11357 stmt_info = loop_vinfo->lookup_stmt (stmt);
11359 /* vector stmts created in the outer-loop during vectorization of
11360 stmts in an inner-loop may not have a stmt_info, and do not
11361 need to be vectorized. */
11362 stmt_vec_info seen_store = NULL;
11363 if (stmt_info)
11365 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11367 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11368 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11369 !gsi_end_p (subsi); gsi_next (&subsi))
11371 stmt_vec_info pat_stmt_info
11372 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11373 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11374 &si, &seen_store);
11376 stmt_vec_info pat_stmt_info
11377 = STMT_VINFO_RELATED_STMT (stmt_info);
11378 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11379 &si, &seen_store))
11380 maybe_set_vectorized_backedge_value (loop_vinfo,
11381 pat_stmt_info);
11383 else
11385 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11386 &seen_store))
11387 maybe_set_vectorized_backedge_value (loop_vinfo,
11388 stmt_info);
11391 gsi_next (&si);
11392 if (seen_store)
11394 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11395 /* Interleaving. The vectorization of the
11396 interleaving chain was completed - free all
11397 the stores in the chain. */
11398 vect_remove_stores (loop_vinfo,
11399 DR_GROUP_FIRST_ELEMENT (seen_store));
11400 else
11401 /* Free the attached stmt_vec_info and remove the stmt. */
11402 loop_vinfo->remove_stmt (stmt_info);
11407 /* Stub out scalar statements that must not survive vectorization.
11408 Doing this here helps with grouped statements, or statements that
11409 are involved in patterns. */
11410 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11411 !gsi_end_p (gsi); gsi_next (&gsi))
11413 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11414 if (!call || !gimple_call_internal_p (call))
11415 continue;
11416 internal_fn ifn = gimple_call_internal_fn (call);
11417 if (ifn == IFN_MASK_LOAD)
11419 tree lhs = gimple_get_lhs (call);
11420 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11422 tree zero = build_zero_cst (TREE_TYPE (lhs));
11423 gimple *new_stmt = gimple_build_assign (lhs, zero);
11424 gsi_replace (&gsi, new_stmt, true);
11427 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11429 tree lhs = gimple_get_lhs (call);
11430 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11432 tree else_arg
11433 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11434 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11435 gsi_replace (&gsi, new_stmt, true);
11439 } /* BBs in loop */
11441 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11442 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11443 if (integer_onep (step_vector))
11444 niters_no_overflow = true;
11445 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11446 niters_vector_mult_vf, !niters_no_overflow);
11448 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11449 scale_profile_for_vect_loop (loop, assumed_vf);
11451 /* True if the final iteration might not handle a full vector's
11452 worth of scalar iterations. */
11453 bool final_iter_may_be_partial
11454 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11455 /* The minimum number of iterations performed by the epilogue. This
11456 is 1 when peeling for gaps because we always need a final scalar
11457 iteration. */
11458 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11459 /* +1 to convert latch counts to loop iteration counts,
11460 -min_epilogue_iters to remove iterations that cannot be performed
11461 by the vector code. */
11462 int bias_for_lowest = 1 - min_epilogue_iters;
11463 int bias_for_assumed = bias_for_lowest;
11464 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11465 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11467 /* When the amount of peeling is known at compile time, the first
11468 iteration will have exactly alignment_npeels active elements.
11469 In the worst case it will have at least one. */
11470 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11471 bias_for_lowest += lowest_vf - min_first_active;
11472 bias_for_assumed += assumed_vf - min_first_active;
11474 /* In these calculations the "- 1" converts loop iteration counts
11475 back to latch counts. */
11476 if (loop->any_upper_bound)
11478 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11479 loop->nb_iterations_upper_bound
11480 = (final_iter_may_be_partial
11481 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11482 lowest_vf) - 1
11483 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11484 lowest_vf) - 1);
11485 if (main_vinfo
11486 /* Both peeling for alignment and peeling for gaps can end up
11487 with the scalar epilogue running for more than VF-1 iterations. */
11488 && !main_vinfo->peeling_for_alignment
11489 && !main_vinfo->peeling_for_gaps)
11491 unsigned int bound;
11492 poly_uint64 main_iters
11493 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11494 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11495 main_iters
11496 = upper_bound (main_iters,
11497 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11498 if (can_div_away_from_zero_p (main_iters,
11499 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11500 &bound))
11501 loop->nb_iterations_upper_bound
11502 = wi::umin ((widest_int) (bound - 1),
11503 loop->nb_iterations_upper_bound);
11506 if (loop->any_likely_upper_bound)
11507 loop->nb_iterations_likely_upper_bound
11508 = (final_iter_may_be_partial
11509 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11510 + bias_for_lowest, lowest_vf) - 1
11511 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11512 + bias_for_lowest, lowest_vf) - 1);
11513 if (loop->any_estimate)
11514 loop->nb_iterations_estimate
11515 = (final_iter_may_be_partial
11516 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11517 assumed_vf) - 1
11518 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11519 assumed_vf) - 1);
11521 if (dump_enabled_p ())
11523 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11525 dump_printf_loc (MSG_NOTE, vect_location,
11526 "LOOP VECTORIZED\n");
11527 if (loop->inner)
11528 dump_printf_loc (MSG_NOTE, vect_location,
11529 "OUTER LOOP VECTORIZED\n");
11530 dump_printf (MSG_NOTE, "\n");
11532 else
11533 dump_printf_loc (MSG_NOTE, vect_location,
11534 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11535 GET_MODE_NAME (loop_vinfo->vector_mode));
11538 /* Loops vectorized with a variable factor won't benefit from
11539 unrolling/peeling. */
11540 if (!vf.is_constant ())
11542 loop->unroll = 1;
11543 if (dump_enabled_p ())
11544 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11545 " variable-length vectorization factor\n");
11547 /* Free SLP instances here because otherwise stmt reference counting
11548 won't work. */
11549 slp_instance instance;
11550 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11551 vect_free_slp_instance (instance);
11552 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11553 /* Clear the safelen field since its value is invalid after vectorization,
11554 as the vectorized loop can have loop-carried dependencies. */
11555 loop->safelen = 0;
11557 if (epilogue)
11559 update_epilogue_loop_vinfo (epilogue, advance);
11561 epilogue->simduid = loop->simduid;
11562 epilogue->force_vectorize = loop->force_vectorize;
11563 epilogue->dont_vectorize = false;
11566 return epilogue;
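/* A minimal numeric sketch (not built with this file) of the iteration-bound
   update in vect_transform_loop, with illustrative numbers: a scalar
   latch-count upper bound of 99 (at most 100 iterations), VF == 8 and no
   epilogue-forcing peeling (min_epilogue_iters == 0, so the bias is 1)
   gives a vector latch bound of floor (100 / 8) - 1 when the final vector
   iteration is full and ceil (100 / 8) - 1 when it may be partial.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned scalar_latch_bound = 99;   /* At most 100 scalar iterations.  */
  unsigned vf = 8;
  unsigned bias = 1;                  /* 1 - min_epilogue_iters.  */

  unsigned full = (scalar_latch_bound + bias) / vf - 1;                /* 11 */
  unsigned partial = (scalar_latch_bound + bias + vf - 1) / vf - 1;    /* 12 */

  printf ("vector latch bound: %u (full last vector), %u (partial)\n",
          full, partial);
  return 0;
}
#endif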
11569 /* The code below is trying to perform simple optimization - revert
11570 if-conversion for masked stores, i.e. if the mask of a store is zero
11571 do not perform it and all stored value producers also if possible.
11572 For example,
11573 for (i=0; i<n; i++)
11574 if (c[i])
11576 p1[i] += 1;
11577 p2[i] = p3[i] +2;
11579 this transformation will produce the following semi-hammock:
11581 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11583 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11584 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11585 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11586 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11587 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11588 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11592 void
11593 optimize_mask_stores (class loop *loop)
11595 basic_block *bbs = get_loop_body (loop);
11596 unsigned nbbs = loop->num_nodes;
11597 unsigned i;
11598 basic_block bb;
11599 class loop *bb_loop;
11600 gimple_stmt_iterator gsi;
11601 gimple *stmt;
11602 auto_vec<gimple *> worklist;
11603 auto_purge_vect_location sentinel;
11605 vect_location = find_loop_location (loop);
11606 /* Pick up all masked stores in the loop, if any. */
11607 for (i = 0; i < nbbs; i++)
11609 bb = bbs[i];
11610 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11611 gsi_next (&gsi))
11613 stmt = gsi_stmt (gsi);
11614 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11615 worklist.safe_push (stmt);
11619 free (bbs);
11620 if (worklist.is_empty ())
11621 return;
11623 /* Loop has masked stores. */
11624 while (!worklist.is_empty ())
11626 gimple *last, *last_store;
11627 edge e, efalse;
11628 tree mask;
11629 basic_block store_bb, join_bb;
11630 gimple_stmt_iterator gsi_to;
11631 tree vdef, new_vdef;
11632 gphi *phi;
11633 tree vectype;
11634 tree zero;
11636 last = worklist.pop ();
11637 mask = gimple_call_arg (last, 2);
11638 bb = gimple_bb (last);
11639 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
11640 to the same loop as if_bb. That loop can differ from LOOP when a
11641 two-level loop nest is vectorized and the mask_store belongs to the
11642 inner loop. */
11643 e = split_block (bb, last);
11644 bb_loop = bb->loop_father;
11645 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11646 join_bb = e->dest;
11647 store_bb = create_empty_bb (bb);
11648 add_bb_to_loop (store_bb, bb_loop);
11649 e->flags = EDGE_TRUE_VALUE;
11650 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11651 /* Put STORE_BB on the likely path. */
11652 efalse->probability = profile_probability::likely ();
11653 store_bb->count = efalse->count ();
11654 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11655 if (dom_info_available_p (CDI_DOMINATORS))
11656 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11657 if (dump_enabled_p ())
11658 dump_printf_loc (MSG_NOTE, vect_location,
11659 "Create new block %d to sink mask stores.",
11660 store_bb->index);
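/* At this point the CFG around the masked store looks like (a sketch; the
   block names are illustrative only):

     bb:        ... ; if (mask == {0,...}) goto join_bb; else goto store_bb;
     store_bb:  empty for now, stores with this mask are sunk here below
     join_bb:   the code that followed the masked store

   so the sunk stores only execute when the mask has at least one active
   element.  */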
11661 /* Create vector comparison with boolean result. */
11662 vectype = TREE_TYPE (mask);
11663 zero = build_zero_cst (vectype);
11664 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11665 gsi = gsi_last_bb (bb);
11666 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11667 /* Create new PHI node for vdef of the last masked store:
11668 .MEM_2 = VDEF <.MEM_1>
11669 will be converted to
11670 .MEM_3 = VDEF <.MEM_1>
11671 and new PHI node will be created in join bb
11672 .MEM_2 = PHI <.MEM_1, .MEM_3>
11674 vdef = gimple_vdef (last);
11675 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11676 gimple_set_vdef (last, new_vdef);
11677 phi = create_phi_node (vdef, join_bb);
11678 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11680 /* Put all masked stores with the same mask to STORE_BB if possible. */
11681 while (true)
11683 gimple_stmt_iterator gsi_from;
11684 gimple *stmt1 = NULL;
11686 /* Move masked store to STORE_BB. */
11687 last_store = last;
11688 gsi = gsi_for_stmt (last);
11689 gsi_from = gsi;
11690 /* Shift GSI to the previous stmt for further traversal. */
11691 gsi_prev (&gsi);
11692 gsi_to = gsi_start_bb (store_bb);
11693 gsi_move_before (&gsi_from, &gsi_to);
11694 /* Set up GSI_TO at the start of the now non-empty block. */
11695 gsi_to = gsi_start_bb (store_bb);
11696 if (dump_enabled_p ())
11697 dump_printf_loc (MSG_NOTE, vect_location,
11698 "Move stmt to created bb\n%G", last);
11699 /* Move all stored value producers if possible. */
11700 while (!gsi_end_p (gsi))
11702 tree lhs;
11703 imm_use_iterator imm_iter;
11704 use_operand_p use_p;
11705 bool res;
11707 /* Skip debug statements. */
11708 if (is_gimple_debug (gsi_stmt (gsi)))
11710 gsi_prev (&gsi);
11711 continue;
11713 stmt1 = gsi_stmt (gsi);
11714 /* Do not consider statements writing to memory or having a
11715 volatile operand. */
11716 if (gimple_vdef (stmt1)
11717 || gimple_has_volatile_ops (stmt1))
11718 break;
11719 gsi_from = gsi;
11720 gsi_prev (&gsi);
11721 lhs = gimple_get_lhs (stmt1);
11722 if (!lhs)
11723 break;
11725 /* LHS of vectorized stmt must be SSA_NAME. */
11726 if (TREE_CODE (lhs) != SSA_NAME)
11727 break;
11729 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11731 /* Remove dead scalar statement. */
11732 if (has_zero_uses (lhs))
11734 gsi_remove (&gsi_from, true);
11735 continue;
11739 /* Check that LHS does not have uses outside of STORE_BB. */
11740 res = true;
11741 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11743 gimple *use_stmt;
11744 use_stmt = USE_STMT (use_p);
11745 if (is_gimple_debug (use_stmt))
11746 continue;
11747 if (gimple_bb (use_stmt) != store_bb)
11749 res = false;
11750 break;
11753 if (!res)
11754 break;
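/* Only sink STMT1 if it reads the same memory state (VUSE) as the store
   already sunk; a different VUSE would indicate an intervening memory
   access between them (STMT1 itself has no vdef, as checked above).  */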
11756 if (gimple_vuse (stmt1)
11757 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11758 break;
11760 /* Can move STMT1 to STORE_BB. */
11761 if (dump_enabled_p ())
11762 dump_printf_loc (MSG_NOTE, vect_location,
11763 "Move stmt to created bb\n%G", stmt1);
11764 gsi_move_before (&gsi_from, &gsi_to);
11765 /* Shift GSI_TO for further insertion. */
11766 gsi_prev (&gsi_to);
11768 /* Put other masked stores with the same mask to STORE_BB. */
11769 if (worklist.is_empty ()
11770 || gimple_call_arg (worklist.last (), 2) != mask
11771 || worklist.last () != stmt1)
11772 break;
11773 last = worklist.pop ();
11775 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11779 /* Decide whether it is possible to use a zero-based induction variable
11780 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11781 the value that the induction variable must be able to hold in order
11782 to ensure that the rgroups eventually have no active vector elements.
11783 Return -1 otherwise. */
11785 widest_int
11786 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11788 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11789 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11790 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11792 /* Calculate the value that the induction variable must be able
11793 to hit in order to ensure that we end the loop with an all-false mask.
11794 This involves adding the maximum number of inactive trailing scalar
11795 iterations. */
11796 widest_int iv_limit = -1;
11797 if (max_loop_iterations (loop, &iv_limit))
11799 if (niters_skip)
11801 /* Add the maximum number of skipped iterations to the
11802 maximum iteration count. */
11803 if (TREE_CODE (niters_skip) == INTEGER_CST)
11804 iv_limit += wi::to_widest (niters_skip);
11805 else
11806 iv_limit += max_vf - 1;
11808 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11809 /* Make a conservatively-correct assumption. */
11810 iv_limit += max_vf - 1;
11812 /* IV_LIMIT is the maximum number of latch iterations, which is also
11813 the maximum in-range IV value. Round this value down to the previous
11814 vector alignment boundary and then add an extra full iteration. */
11815 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11816 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
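/* Worked example (illustrative constants): with a maximum latch count of
   1000, no skipped or peeled iterations and a constant VF of 16,
     iv_limit = (1000 & -16) + 16 = 992 + 16 = 1008
   i.e. round down to the last multiple of the vector alignment and add one
   full vector iteration.  For a variable VF such as 4 + 4*x,
   known_alignment gives the compile-time-known alignment (4 here) while
   max_vf supplies the worst-case addend.  */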
11818 return iv_limit;
11821 /* For the given rgroup_controls RGC, check whether an induction variable
11822 would ever hit a value that produces a set of all-false masks or zero
11823 lengths before wrapping around. Return true if it's possible to wrap
11824 around before hitting the required value, otherwise return false. */
11826 bool
11827 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11829 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11831 if (iv_limit == -1)
11832 return true;
11834 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11835 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11836 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
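/* Worked example (illustrative constants): with iv_limit = 1008,
   max_nscalars_per_iter = 2 and factor = 1, the IV has to count up to
   1008 * 2 = 2016 "items", which needs min_precision (2016, UNSIGNED) = 11
   bits.  A 16-bit unsigned compare type is wide enough, so the check below
   returns false; an 8-bit compare type would make wrap-around possible and
   the function would return true.  */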
11838 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11839 return true;
11841 return false;