gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
97 (not a pointer), and INDIRECT_REFs through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
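
   For illustration only (this example is not part of the original comment),
   the first loop below has the simple consecutive access pattern described
   above, while the second uses a strided access that does not satisfy it:

     for (i=0; i<N; i++)
       s += a[i];          consecutive ARRAY_REF access
     for (i=0; i<N; i++)
       s += a[2*i];        strided access, not a simple consecutive pattern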
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
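
   As a minimal sketch of that check (illustrative only; the real code goes
   through helpers in optabs-tree.cc such as optab_for_tree_code):

     optab op = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!op
         || optab_handler (op, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;       no target support, the stmt cannot be vectorized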
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
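
   As a concrete illustration (not part of the original comment), for 4-byte
   ints on a target with 16-byte vectors the VF is 16/4 = 4, and the
   strip-mined loop corresponds to GNU C source such as:

     typedef int v4si __attribute__ ((vector_size (16)));
     for (i=0; i<N; i+=4)
       *(v4si *) &a[i] = *(v4si *) &b[i] + *(v4si *) &c[i];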
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Nonlinear induction is supported only for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
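/* As an illustrative example (not from the original sources), the kind of
   nonlinear induction recognized here covers loops such as:

     int x = init;
     for (int i = 0; i < n; i++)
       {
         a[i] = x;
         x = x << 1;       classified as vect_step_op_shl with step 1
       }

   and likewise x = -x (vect_step_op_neg, fake step -1) or x = x * C
   (vect_step_op_mul).  */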
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
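/* A typical source-level double reduction, shown for illustration only:

     int sum = 0;
     for (int i = 0; i < n; i++)       outer loop (outer1/outer2 above)
       for (int j = 0; j < m; j++)     inner loop
         sum += a[i][j];

   The outer-loop PHI for sum plays the role of x_1 in the diagram above,
   and the inner-loop PHI the role of x_2.  */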
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
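/* Source-level example of such a recurrence (illustration only):

     int prev = 0;
     for (int i = 0; i < n; i++)
       {
         b[i] = prev + 1;              uses last iteration's value
         prev = a[i];
       }

   The loop-header PHI for prev is a first-order recurrence: each iteration
   reads the value that the previous iteration stored.  */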
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be some subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified; therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as a reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such an inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns, adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns, or if we failed
824 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
825 it as a regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop executes and place the result
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
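/* Illustrative note (not part of the original comment): for a simple
   counted loop whose body executes n times (n >= 1), e.g.

     for (i = 0; i < n; i++) ...

   NUMBER_OF_ITERATIONSM1 is the latch count n - 1 and NUMBER_OF_ITERATIONS
   the header count n, matching the "+ 1" adjustment done below.  */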
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 computation of niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based off the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must determine which exit is the main one and
962 which are auxiliary exits.
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment. Analyze
968 all exits and return one */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1052 /* CHECKME: We want to visit all BBs before their successors (except for
1053 latch blocks, for which this assertion wouldn't hold). In the simple
1054 case of the loop forms we allow, a dfs order of the BBs would be the same
1055 as reversed postorder traversal, so we are safe. */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition: when it is 0, the
1082 loop shouldn't be vectorized; when it is a non-zero constant, it should
1083 be vectorized normally; otherwise the loop is versioned, with vectorization
1084 done if the condition is non-zero at runtime. */
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1133 /* When we release an epilogue vinfo that we do not intend to use,
1134 avoid clearing AUX of the main loop, which should continue to
1135 point to the main loop vinfo since otherwise we'll leak that. */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
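/* Worked example (illustrative): if the scalar loop header runs at most
   1000 times and FACTOR is 4, the product 4000 must be representable as an
   unsigned integer, so the function returns 12 (2^12 = 4096 >= 4000).  */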
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
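/* Conceptually, each mask produced via IFN_WHILE_ULT behaves like the
   following scalar sketch (illustration only, not the generated IL):

     for (j = 0; j < nunits; j++)
       mask[j] = (iv + j < limit);

   where IV counts scalar iterations in the comparison type chosen below and
   LIMIT is derived from the number of iterations.  */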
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce a differently organized rgc_vec and check in a different
1389 way whether we can produce the masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the number of elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1467 ok = false;
1468 break;
1471 /* If iv_type is usable as compare type use that - we can elide the
1472 saturation in that case. */
1473 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1475 tree cmp_vectype
1476 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1477 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1478 rgc.compare_type = cmp_vectype;
1480 if (!rgc.compare_type)
1481 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1483 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1484 if (cmp_bits >= min_ni_width
1485 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1487 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1488 if (!cmp_type)
1489 continue;
1491 /* Check whether we can produce the mask with cmp_type. */
1492 tree cmp_vectype
1493 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1494 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1496 rgc.compare_type = cmp_vectype;
1497 break;
1501 if (!rgc.compare_type)
1503 ok = false;
1504 break;
1507 if (!ok)
1509 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1510 return false;
1513 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1514 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1515 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1516 return true;
1519 /* Check whether we can use vector accesses with length, based on a precision
1520 comparison. So far, to keep it simple, we only allow the case where the
1521 precision of the target-supported length is larger than the precision
1522 required by loop niters. */
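/* Conceptual sketch (illustrative only, not the generated IL): with
   length-based partial vectors the final, partial iteration is handled by
   passing a shorter length to the access, roughly

     len = MIN (remaining_scalar_iters, VF);
     x = .LEN_LOAD (ptr, align, len, bias);

   instead of peeling a scalar epilogue loop.  */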
1524 static bool
1525 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1527 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1528 return false;
1530 machine_mode len_load_mode, len_store_mode;
1531 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1532 .exists (&len_load_mode))
1533 return false;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1535 .exists (&len_store_mode))
1536 return false;
1538 signed char partial_load_bias = internal_len_load_store_bias
1539 (IFN_LEN_LOAD, len_load_mode);
1541 signed char partial_store_bias = internal_len_load_store_bias
1542 (IFN_LEN_STORE, len_store_mode);
1544 gcc_assert (partial_load_bias == partial_store_bias);
1546 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1547 return false;
1549 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1550 len_loads with a length of zero. In order to avoid that we prohibit
1551 more than one loop length here. */
1552 if (partial_load_bias == -1
1553 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1554 return false;
1556 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1558 unsigned int max_nitems_per_iter = 1;
1559 unsigned int i;
1560 rgroup_controls *rgl;
1561 /* Find the maximum number of items per iteration for every rgroup. */
1562 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1564 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1565 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1568 /* Work out how many bits we need to represent the length limit. */
1569 unsigned int min_ni_prec
1570 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1572 /* Now use the maximum of the precisions below for one suitable IV type:
1573 - the IV's natural precision
1574 - the precision needed to hold: the maximum number of scalar
1575 iterations multiplied by the scale factor (min_ni_prec above)
1576 - the Pmode precision
1578 If min_ni_prec is less than the precision of the current niters,
1579 we prefer to still use the niters type. Prefer to use a Pmode or
1580 wider IV to avoid narrow conversions. */
1582 unsigned int ni_prec
1583 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1584 min_ni_prec = MAX (min_ni_prec, ni_prec);
1585 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1587 tree iv_type = NULL_TREE;
1588 opt_scalar_int_mode tmode_iter;
1589 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1591 scalar_mode tmode = tmode_iter.require ();
1592 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1594 /* ??? Do we really want to construct one IV whose precision exceeds
1595 BITS_PER_WORD? */
1596 if (tbits > BITS_PER_WORD)
1597 break;
1599 /* Find the first available standard integral type. */
1600 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1602 iv_type = build_nonstandard_integer_type (tbits, true);
1603 break;
1607 if (!iv_type)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "can't vectorize with length-based partial vectors"
1612 " because there is no suitable iv type.\n");
1613 return false;
1616 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1617 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1618 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1620 return true;
1623 /* Calculate the cost of one scalar iteration of the loop. */
1624 static void
1625 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1629 int nbbs = loop->num_nodes, factor;
1630 int innerloop_iters, i;
1632 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1634 /* Gather costs for statements in the scalar loop. */
1636 /* FORNOW. */
1637 innerloop_iters = 1;
1638 if (loop->inner)
1639 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1641 for (i = 0; i < nbbs; i++)
1643 gimple_stmt_iterator si;
1644 basic_block bb = bbs[i];
1646 if (bb->loop_father == loop->inner)
1647 factor = innerloop_iters;
1648 else
1649 factor = 1;
1651 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1653 gimple *stmt = gsi_stmt (si);
1654 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1656 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1657 continue;
1659 /* Skip stmts that are not vectorized inside the loop. */
1660 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1661 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1662 && (!STMT_VINFO_LIVE_P (vstmt_info)
1663 || !VECTORIZABLE_CYCLE_DEF
1664 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1665 continue;
1667 vect_cost_for_stmt kind;
1668 if (STMT_VINFO_DATA_REF (stmt_info))
1670 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1671 kind = scalar_load;
1672 else
1673 kind = scalar_store;
1675 else if (vect_nop_conversion_p (stmt_info))
1676 continue;
1677 else
1678 kind = scalar_stmt;
1680 /* We are using vect_prologue here to avoid scaling twice
1681 by the inner loop factor. */
1682 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1683 factor, kind, stmt_info, 0, vect_prologue);
1687 /* Now accumulate cost. */
1688 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1689 add_stmt_costs (loop_vinfo->scalar_costs,
1690 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1691 loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 /* Function vect_analyze_loop_form.
1697 Verify that certain CFG restrictions hold, including:
1698 - the loop has a pre-header
1699 - the loop has a single entry and exit
1700 - the loop exit condition is simple enough
1701 - the number of iterations can be analyzed, i.e., a countable loop. The
1702 niter could be analyzed under some assumptions. */
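/* For reference (illustration, not from the original sources), the
   inner-most loop shape accepted below corresponds to source such as:

     i = 0;
     do
       {
         a[i] = b[i] + c[i];
         i++;
       }
     while (i < n);

   i.e. a two-block loop (header plus empty latch) whose exit condition is
   the last statement of the header.  */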
1704 opt_result
1705 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1707 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1709 edge exit_e = vec_init_loop_exit_info (loop);
1710 if (!exit_e)
1711 return opt_result::failure_at (vect_location,
1712 "not vectorized:"
1713 " could not determine main exit from"
1714 " loop with multiple exits.\n");
1715 info->loop_exit = exit_e;
1716 if (dump_enabled_p ())
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "using as main loop exit: %d -> %d [AUX: %p]\n",
1719 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1721 /* Different restrictions apply when we are considering an inner-most loop,
1722 vs. an outer (nested) loop.
1723 (FORNOW. May want to relax some of these restrictions in the future). */
1725 info->inner_loop_cond = NULL;
1726 if (!loop->inner)
1728 /* Inner-most loop. We currently require that the number of BBs is
1729 exactly 2 (the header and latch). Vectorizable inner-most loops
1730 look like this:
1732 (pre-header)
1734 header <--------+
1735 | | |
1736 | +--> latch --+
1738 (exit-bb) */
1740 if (loop->num_nodes != 2)
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized:"
1743 " control flow in loop.\n");
1745 if (empty_block_p (loop->header))
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized: empty loop.\n");
1749 else
1751 class loop *innerloop = loop->inner;
1752 edge entryedge;
1754 /* Nested loop. We currently require that the loop is doubly-nested,
1755 contains a single inner loop, and the number of BBs is exactly 5.
1756 Vectorizable outer-loops look like this:
1758 (pre-header)
1760 header <---+
1762 inner-loop |
1764 tail ------+
1766 (exit-bb)
1768 The inner-loop has the properties expected of inner-most loops
1769 as described above. */
1771 if ((loop->inner)->inner || (loop->inner)->next)
1772 return opt_result::failure_at (vect_location,
1773 "not vectorized:"
1774 " multiple nested loops.\n");
1776 if (loop->num_nodes != 5)
1777 return opt_result::failure_at (vect_location,
1778 "not vectorized:"
1779 " control flow in loop.\n");
1781 entryedge = loop_preheader_edge (innerloop);
1782 if (entryedge->src != loop->header
1783 || !single_exit (innerloop)
1784 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1785 return opt_result::failure_at (vect_location,
1786 "not vectorized:"
1787 " unsupported outerloop form.\n");
1789 /* Analyze the inner-loop. */
1790 vect_loop_form_info inner;
1791 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1792 if (!res)
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1796 "not vectorized: Bad inner loop.\n");
1797 return res;
1800 /* Don't support analyzing niter under assumptions for inner
1801 loop. */
1802 if (!integer_onep (inner.assumptions))
1803 return opt_result::failure_at (vect_location,
1804 "not vectorized: Bad inner loop.\n");
1806 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1807 return opt_result::failure_at (vect_location,
1808 "not vectorized: inner-loop count not"
1809 " invariant.\n");
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Considering outer-loop vectorization.\n");
1814 info->inner_loop_cond = inner.conds[0];
1817 if (!single_exit (loop))
1818 return opt_result::failure_at (vect_location,
1819 "not vectorized: multiple exits.\n");
1820 if (EDGE_COUNT (loop->header->preds) != 2)
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized:"
1823 " too many incoming edges.\n");
1825 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1826 that the loop is represented as a do-while (with a proper if-guard
1827 before the loop if needed), where the loop header contains all the
1828 executable statements, and the latch is empty. */
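/* As a hedged illustration only, that shape corresponds to source like

     if (n > 0)
       do
         {
           a[i] = b[i] + c[i];   // all work in the header block
           i++;
         }
       while (i < n);            // exit test last, latch block empty

   i.e. the exit test is the last thing executed in each iteration.  */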
1829 if (!empty_block_p (loop->latch)
1830 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: latch block not empty.\n");
1834 /* Make sure the exit is not abnormal. */
1835 if (exit_e->flags & EDGE_ABNORMAL)
1836 return opt_result::failure_at (vect_location,
1837 "not vectorized:"
1838 " abnormal loop exit edge.\n");
1840 info->conds
1841 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1842 &info->number_of_iterations,
1843 &info->number_of_iterationsm1);
1845 if (info->conds.is_empty ())
1846 return opt_result::failure_at
1847 (vect_location,
1848 "not vectorized: complicated exit condition.\n");
1850 /* Determine what the primary and alternate exit conds are. */
1851 for (unsigned i = 0; i < info->conds.length (); i++)
1853 gcond *cond = info->conds[i];
1854 if (exit_e->src == gimple_bb (cond))
1855 std::swap (info->conds[0], info->conds[i]);
1858 if (integer_zerop (info->assumptions)
1859 || !info->number_of_iterations
1860 || chrec_contains_undetermined (info->number_of_iterations))
1861 return opt_result::failure_at
1862 (info->conds[0],
1863 "not vectorized: number of iterations cannot be computed.\n");
1865 if (integer_zerop (info->number_of_iterations))
1866 return opt_result::failure_at
1867 (info->conds[0],
1868 "not vectorized: number of iterations = 0.\n");
1870 if (!(tree_fits_shwi_p (info->number_of_iterations)
1871 && tree_to_shwi (info->number_of_iterations) > 0))
1873 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "Symbolic number of iterations is ");
1877 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1878 dump_printf (MSG_NOTE, "\n");
1882 return opt_result::success ();
1885 /* Create a loop_vec_info for LOOP with SHARED and the
1886 vect_analyze_loop_form result. */
1888 loop_vec_info
1889 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1890 const vect_loop_form_info *info,
1891 loop_vec_info main_loop_info)
1893 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1894 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1895 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1896 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1897 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1898 /* Also record the assumptions for versioning. */
1899 if (!integer_onep (info->assumptions) && !main_loop_info)
1900 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1902 for (gcond *cond : info->conds)
1904 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1905 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1908 for (unsigned i = 1; i < info->conds.length (); i ++)
1909 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1910 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1912 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1914 if (info->inner_loop_cond)
1916 stmt_vec_info inner_loop_cond_info
1917 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1918 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1919 /* If we have an estimate on the number of iterations of the inner
1920 loop, use that to limit the scale for costing, otherwise use
1921 --param vect-inner-loop-cost-factor literally. */
1922 widest_int nit;
1923 if (estimated_stmt_executions (loop->inner, &nit))
1924 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1925 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
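/* Hypothetical numbers, for illustration only: with an estimated 4
   executions of the inner loop body and --param
   vect-inner-loop-cost-factor set to 50, the scale used for costing is
   min (4, 50) = 4; without an estimate the parameter value is used as is.  */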
1928 return loop_vinfo;
1933 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1934 statements, update the vectorization factor. */
1936 static void
1937 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1939 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1940 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1941 int nbbs = loop->num_nodes;
1942 poly_uint64 vectorization_factor;
1943 int i;
1945 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1947 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 gcc_assert (known_ne (vectorization_factor, 0U));
1950 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1951 vectorization factor of the loop is the unrolling factor required by
1952 the SLP instances. If that unrolling factor is 1, we say that we
1953 perform pure SLP on the loop - cross-iteration parallelism is not
1954 exploited. */
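/* Illustrative sketch (hedged, not derived from this function): in

     for (i = 0; i < n; i++)
       {
         a[2*i]   = b[2*i]   + 1;
         a[2*i+1] = b[2*i+1] + 2;
       }

   the two statements can form a single SLP group; if nothing else in the
   loop needs loop-based vectorization, the loop is "pure SLP" and the SLP
   unrolling factor alone determines the vectorization factor.  */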
1955 bool only_slp_in_loop = true;
1956 for (i = 0; i < nbbs; i++)
1958 basic_block bb = bbs[i];
1959 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1960 gsi_next (&si))
1962 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1963 if (!stmt_info)
1964 continue;
1965 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1966 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1967 && !PURE_SLP_STMT (stmt_info))
1968 /* STMT needs both SLP and loop-based vectorization. */
1969 only_slp_in_loop = false;
1971 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1972 gsi_next (&si))
1974 if (is_gimple_debug (gsi_stmt (si)))
1975 continue;
1976 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1977 stmt_info = vect_stmt_to_vectorize (stmt_info);
1978 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1979 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1980 && !PURE_SLP_STMT (stmt_info))
1981 /* STMT needs both SLP and loop-based vectorization. */
1982 only_slp_in_loop = false;
1986 if (only_slp_in_loop)
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "Loop contains only SLP stmts\n");
1991 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1993 else
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_NOTE, vect_location,
1997 "Loop contains SLP and non-SLP stmts\n");
1998 /* Both the vectorization factor and unroll factor have the form
1999 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2000 so they must have a common multiple. */
2001 vectorization_factor
2002 = force_common_multiple (vectorization_factor,
2003 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
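/* Hypothetical numbers: if the loop-based vectorization factor is 4 and
   the SLP unrolling factor is 6, the least common multiple 12 is used so
   that both the SLP and non-SLP parts of the loop can be covered.  */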
2006 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2007 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Updating vectorization factor to ");
2011 dump_dec (MSG_NOTE, vectorization_factor);
2012 dump_printf (MSG_NOTE, ".\n");
2016 /* Return true if STMT_INFO describes a double reduction phi and if
2017 the other phi in the reduction is also relevant for vectorization.
2018 This rejects cases such as:
2020 outer1:
2021 x_1 = PHI <x_3(outer2), ...>;
2024 inner:
2025 x_2 = ...;
2028 outer2:
2029 x_3 = PHI <x_2(inner)>;
2031 if nothing in x_2 or elsewhere makes x_1 relevant. */
2033 static bool
2034 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2036 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2037 return false;
2039 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2042 /* Function vect_analyze_loop_operations.
2044 Scan the loop stmts and make sure they are all vectorizable. */
2046 static opt_result
2047 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2049 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2051 int nbbs = loop->num_nodes;
2052 int i;
2053 stmt_vec_info stmt_info;
2054 bool need_to_vectorize = false;
2055 bool ok;
2057 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2059 auto_vec<stmt_info_for_cost> cost_vec;
2061 for (i = 0; i < nbbs; i++)
2063 basic_block bb = bbs[i];
2065 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2066 gsi_next (&si))
2068 gphi *phi = si.phi ();
2069 ok = true;
2071 stmt_info = loop_vinfo->lookup_stmt (phi);
2072 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2074 (gimple *) phi);
2075 if (virtual_operand_p (gimple_phi_result (phi)))
2076 continue;
2078 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2079 (i.e., a phi in the tail of the outer-loop). */
2080 if (! is_loop_header_bb_p (bb))
2082 /* FORNOW: we currently don't support the case that these phis
2083 are not used in the outer loop (unless it is a double reduction,
2084 i.e., this phi is vect_reduction_def), because this case
2085 requires us to actually do something here. */
2086 if (STMT_VINFO_LIVE_P (stmt_info)
2087 && !vect_active_double_reduction_p (stmt_info))
2088 return opt_result::failure_at (phi,
2089 "Unsupported loop-closed phi"
2090 " in outer-loop.\n");
2092 /* If PHI is used in the outer loop, we check that its operand
2093 is defined in the inner loop. */
2094 if (STMT_VINFO_RELEVANT_P (stmt_info))
2096 tree phi_op;
2098 if (gimple_phi_num_args (phi) != 1)
2099 return opt_result::failure_at (phi, "unsupported phi");
2101 phi_op = PHI_ARG_DEF (phi, 0);
2102 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2103 if (!op_def_info)
2104 return opt_result::failure_at (phi, "unsupported phi\n");
2106 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2107 && (STMT_VINFO_RELEVANT (op_def_info)
2108 != vect_used_in_outer_by_reduction))
2109 return opt_result::failure_at (phi, "unsupported phi\n");
2111 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2112 || (STMT_VINFO_DEF_TYPE (stmt_info)
2113 == vect_double_reduction_def))
2114 && !vectorizable_lc_phi (loop_vinfo,
2115 stmt_info, NULL, NULL))
2116 return opt_result::failure_at (phi, "unsupported phi\n");
2119 continue;
2122 gcc_assert (stmt_info);
2124 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2125 || STMT_VINFO_LIVE_P (stmt_info))
2126 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2127 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2128 /* A scalar-dependence cycle that we don't support. */
2129 return opt_result::failure_at (phi,
2130 "not vectorized:"
2131 " scalar dependence cycle.\n");
2133 if (STMT_VINFO_RELEVANT_P (stmt_info))
2135 need_to_vectorize = true;
2136 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2137 && ! PURE_SLP_STMT (stmt_info))
2138 ok = vectorizable_induction (loop_vinfo,
2139 stmt_info, NULL, NULL,
2140 &cost_vec);
2141 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2142 || (STMT_VINFO_DEF_TYPE (stmt_info)
2143 == vect_double_reduction_def)
2144 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2145 && ! PURE_SLP_STMT (stmt_info))
2146 ok = vectorizable_reduction (loop_vinfo,
2147 stmt_info, NULL, NULL, &cost_vec);
2148 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2149 == vect_first_order_recurrence)
2150 && ! PURE_SLP_STMT (stmt_info))
2151 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2152 &cost_vec);
2155 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2156 if (ok
2157 && STMT_VINFO_LIVE_P (stmt_info)
2158 && !PURE_SLP_STMT (stmt_info))
2159 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2160 -1, false, &cost_vec);
2162 if (!ok)
2163 return opt_result::failure_at (phi,
2164 "not vectorized: relevant phi not "
2165 "supported: %G",
2166 static_cast <gimple *> (phi));
2169 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2170 gsi_next (&si))
2172 gimple *stmt = gsi_stmt (si);
2173 if (!gimple_clobber_p (stmt)
2174 && !is_gimple_debug (stmt))
2176 opt_result res
2177 = vect_analyze_stmt (loop_vinfo,
2178 loop_vinfo->lookup_stmt (stmt),
2179 &need_to_vectorize,
2180 NULL, NULL, &cost_vec);
2181 if (!res)
2182 return res;
2185 } /* bbs */
2187 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2189 /* All operations in the loop are either irrelevant (they deal with loop
2190 control, or are dead), or only used outside the loop and can be moved
2191 out of the loop (e.g. invariants, inductions). The loop can be
2192 optimized away by scalar optimizations. We're better off not
2193 touching this loop. */
2194 if (!need_to_vectorize)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "All the computation can be taken out of the loop.\n");
2199 return opt_result::failure_at
2200 (vect_location,
2201 "not vectorized: redundant loop. no profit to vectorize.\n");
2204 return opt_result::success ();
2207 /* Return true if we know that the iteration count is smaller than the
2208 vectorization factor. Return false if it isn't, or if we can't be sure
2209 either way. */
2211 static bool
2212 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2214 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2216 HOST_WIDE_INT max_niter;
2217 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2218 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2219 else
2220 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2222 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2223 return true;
2225 return false;
2228 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2229 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2230 definitely no, or -1 if it's worth retrying. */
2232 static int
2233 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2234 unsigned *suggested_unroll_factor)
2236 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2239 /* Only loops that can handle partially-populated vectors can have iteration
2240 counts less than the vectorization factor. */
2241 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2242 && vect_known_niters_smaller_than_vf (loop_vinfo))
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "not vectorized: iteration count smaller than "
2247 "vectorization factor.\n");
2248 return 0;
2251 /* If we know the number of iterations we can do better: for the
2252 epilogue we can also decide whether the main loop leaves us
2253 with enough iterations, preferring a smaller vector epilogue that is
2254 then also possibly used for the case where we skip the vector loop. */
2255 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2257 widest_int scalar_niters
2258 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2259 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2261 loop_vec_info orig_loop_vinfo
2262 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2263 unsigned lowest_vf
2264 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2265 int prolog_peeling = 0;
2266 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2267 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2268 if (prolog_peeling >= 0
2269 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2270 lowest_vf))
2272 unsigned gap
2273 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2274 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2275 % lowest_vf + gap);
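/* Worked example with made-up numbers: with scalar_niters = 23,
   prolog_peeling = 3, gap = 0 and a main-loop VF of 8, the main loop
   covers 16 iterations and the epilogue is left with
   (23 - 0 - 3) % 8 + 0 = 4 scalar iterations.  */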
2278 /* Reject vectorizing for a single scalar iteration, even if
2279 we could in principle implement that using partial vectors. */
2280 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2281 if (scalar_niters <= peeling_gap + 1)
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "not vectorized: loop only has a single "
2286 "scalar iteration.\n");
2287 return 0;
2290 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2292 /* Check that the loop processes at least one full vector. */
2293 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2294 if (known_lt (scalar_niters, vf))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "loop does not have enough iterations "
2299 "to support vectorization.\n");
2300 return 0;
2303 /* If we need to peel an extra epilogue iteration to handle data
2304 accesses with gaps, check that there are enough scalar iterations
2305 available.
2307 The check above is redundant with this one when peeling for gaps,
2308 but the distinction is useful for diagnostics. */
2309 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2310 && known_le (scalar_niters, vf))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "loop does not have enough iterations "
2315 "to support peeling for gaps.\n");
2316 return 0;
2321 /* If using the "very cheap" model, reject cases in which we'd keep
2322 a copy of the scalar code (even if we might be able to vectorize it). */
2323 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2324 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2325 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "some scalar iterations would need to be peeled\n");
2331 return 0;
2334 int min_profitable_iters, min_profitable_estimate;
2335 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2336 &min_profitable_estimate,
2337 suggested_unroll_factor);
2339 if (min_profitable_iters < 0)
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "not vectorized: vectorization not profitable.\n");
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vector version will never be "
2347 "profitable.\n");
2348 return -1;
2351 int min_scalar_loop_bound = (param_min_vect_loop_bound
2352 * assumed_vf);
2354 /* Use the cost model only if it is more conservative than user specified
2355 threshold. */
2356 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2357 min_profitable_iters);
2359 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
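/* Hypothetical numbers: with --param min-vect-loop-bound=2, an assumed VF
   of 4 and min_profitable_iters = 10, the recorded threshold is
   MAX (2 * 4, 10) = 10 iterations.  */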
2361 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2362 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2364 if (dump_enabled_p ())
2365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2366 "not vectorized: vectorization not profitable.\n");
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "not vectorized: iteration count smaller than user "
2370 "specified loop bound parameter or minimum profitable "
2371 "iterations (whichever is more conservative).\n");
2372 return 0;
2375 /* The static profitability threshold min_profitable_estimate includes
2376 the cost of having to check at runtime whether the scalar loop
2377 should be used instead. If it turns out that we don't need or want
2378 such a check, the threshold we should use for the static estimate
2379 is simply the point at which the vector loop becomes more profitable
2380 than the scalar loop. */
2381 if (min_profitable_estimate > min_profitable_iters
2382 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2383 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2384 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2385 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2387 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2389 " choice between the scalar and vector loops\n");
2390 min_profitable_estimate = min_profitable_iters;
2393 /* If the vector loop needs multiple iterations to be beneficial then
2394 things are probably too close to call, and the conservative thing
2395 would be to stick with the scalar code. */
2396 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2397 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "one iteration of the vector loop would be"
2402 " more expensive than the equivalent number of"
2403 " iterations of the scalar loop\n");
2404 return 0;
2407 HOST_WIDE_INT estimated_niter;
2409 /* If we are vectorizing an epilogue then we know the maximum number of
2410 scalar iterations it will cover is at least one lower than the
2411 vectorization factor of the main loop. */
2412 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2413 estimated_niter
2414 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2415 else
2417 estimated_niter = estimated_stmt_executions_int (loop);
2418 if (estimated_niter == -1)
2419 estimated_niter = likely_max_stmt_executions_int (loop);
2421 if (estimated_niter != -1
2422 && ((unsigned HOST_WIDE_INT) estimated_niter
2423 < MAX (th, (unsigned) min_profitable_estimate)))
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "not vectorized: estimated iteration count too "
2428 "small.\n");
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "not vectorized: estimated iteration count smaller "
2432 "than specified loop bound parameter or minimum "
2433 "profitable iterations (whichever is more "
2434 "conservative).\n");
2435 return -1;
2438 return 1;
2441 static opt_result
2442 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2443 vec<data_reference_p> *datarefs,
2444 unsigned int *n_stmts)
2446 *n_stmts = 0;
2447 for (unsigned i = 0; i < loop->num_nodes; i++)
2448 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2449 !gsi_end_p (gsi); gsi_next (&gsi))
2451 gimple *stmt = gsi_stmt (gsi);
2452 if (is_gimple_debug (stmt))
2453 continue;
2454 ++(*n_stmts);
2455 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2456 NULL, 0);
2457 if (!res)
2459 if (is_gimple_call (stmt) && loop->safelen)
2461 tree fndecl = gimple_call_fndecl (stmt), op;
2462 if (fndecl == NULL_TREE
2463 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2465 fndecl = gimple_call_arg (stmt, 0);
2466 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2467 fndecl = TREE_OPERAND (fndecl, 0);
2468 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2470 if (fndecl != NULL_TREE)
2472 cgraph_node *node = cgraph_node::get (fndecl);
2473 if (node != NULL && node->simd_clones != NULL)
2475 unsigned int j, n = gimple_call_num_args (stmt);
2476 for (j = 0; j < n; j++)
2478 op = gimple_call_arg (stmt, j);
2479 if (DECL_P (op)
2480 || (REFERENCE_CLASS_P (op)
2481 && get_base_address (op)))
2482 break;
2484 op = gimple_call_lhs (stmt);
2485 /* Ignore #pragma omp declare simd functions
2486 if they don't have data references in the
2487 call stmt itself. */
2488 if (j == n
2489 && !(op
2490 && (DECL_P (op)
2491 || (REFERENCE_CLASS_P (op)
2492 && get_base_address (op)))))
2493 continue;
2497 return res;
2499 /* If dependence analysis will give up due to the limit on the
2500 number of datarefs stop here and fail fatally. */
2501 if (datarefs->length ()
2502 > (unsigned)param_loop_max_datarefs_for_datadeps)
2503 return opt_result::failure_at (stmt, "exceeded param "
2504 "loop-max-datarefs-for-datadeps\n");
2506 return opt_result::success ();
2509 /* Look for SLP-only access groups and turn each individual access into its own
2510 group. */
2511 static void
2512 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2514 unsigned int i;
2515 struct data_reference *dr;
2517 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2519 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2520 FOR_EACH_VEC_ELT (datarefs, i, dr)
2522 gcc_assert (DR_REF (dr));
2523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2525 /* Check if the load is a part of an interleaving chain. */
2526 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2528 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2529 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2530 unsigned int group_size = DR_GROUP_SIZE (first_element);
2532 /* Check if SLP-only groups. */
2533 if (!STMT_SLP_TYPE (stmt_info)
2534 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2536 /* Dissolve the group. */
2537 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2539 stmt_vec_info vinfo = first_element;
2540 while (vinfo)
2542 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2543 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2544 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2545 DR_GROUP_SIZE (vinfo) = 1;
2546 if (STMT_VINFO_STRIDED_P (first_element)
2547 /* We cannot handle stores with gaps. */
2548 || DR_IS_WRITE (dr_info->dr))
2550 STMT_VINFO_STRIDED_P (vinfo) = true;
2551 DR_GROUP_GAP (vinfo) = 0;
2553 else
2554 DR_GROUP_GAP (vinfo) = group_size - 1;
2555 /* Duplicate and adjust alignment info, it needs to
2556 be present on each group leader, see dr_misalignment. */
2557 if (vinfo != first_element)
2559 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2560 dr_info2->target_alignment = dr_info->target_alignment;
2561 int misalignment = dr_info->misalignment;
2562 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2564 HOST_WIDE_INT diff
2565 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2566 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2567 unsigned HOST_WIDE_INT align_c
2568 = dr_info->target_alignment.to_constant ();
2569 misalignment = (misalignment + diff) % align_c;
2571 dr_info2->misalignment = misalignment;
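/* Worked example (made-up numbers): if the group leader has
   misalignment 8 against a target alignment of 16 and this element's
   DR_INIT is 4 bytes further on, its misalignment becomes
   (8 + 4) % 16 = 12.  */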
2573 vinfo = next;
2580 /* Determine if operating on full vectors for LOOP_VINFO might leave
2581 some scalar iterations still to do. If so, decide how we should
2582 handle those scalar iterations. The possibilities are:
2584 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2585 In this case:
2587 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2588 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2589 LOOP_VINFO_PEELING_FOR_NITER == false
2591 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2592 to handle the remaining scalar iterations. In this case:
2594 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2595 LOOP_VINFO_PEELING_FOR_NITER == true
2597 There are two choices:
2599 (2a) Consider vectorizing the epilogue loop at the same VF as the
2600 main loop, but using partial vectors instead of full vectors.
2601 In this case:
2603 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2605 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2606 In this case:
2608 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2611 opt_result
2612 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2614 /* Determine whether there would be any scalar iterations left over. */
2615 bool need_peeling_or_partial_vectors_p
2616 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2618 /* Decide whether to vectorize the loop with partial vectors. */
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2621 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2622 && need_peeling_or_partial_vectors_p)
2624 /* For partial-vector-usage=1, try to push the handling of partial
2625 vectors to the epilogue, with the main loop continuing to operate
2626 on full vectors.
2628 If we are unrolling we also do not want to use partial vectors. This
2629 is to avoid the overhead of generating multiple masks and also to
2630 avoid having to execute entire iterations of FALSE masked instructions
2631 when dealing with one or fewer full iterations.
2633 ??? We could then end up failing to use partial vectors if we
2634 decide to peel iterations into a prologue, and if the main loop
2635 then ends up processing fewer than VF iterations. */
2636 if ((param_vect_partial_vector_usage == 1
2637 || loop_vinfo->suggested_unroll_factor > 1)
2638 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2639 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2641 else
2642 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "operating on %s vectors%s.\n",
2648 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2649 ? "partial" : "full",
2650 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651 ? " for epilogue loop" : "");
2653 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2654 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2655 && need_peeling_or_partial_vectors_p);
2657 return opt_result::success ();
2660 /* Function vect_analyze_loop_2.
2662 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2663 analyses record information in some members of LOOP_VINFO. FATAL
2664 indicates whether some analysis hit a fatal error. If the non-NULL
2665 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be
2666 filled with a worked-out suggested unroll factor, while a NULL pointer
2667 indicates that the suggested unroll factor is being applied.
2668 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the
2669 suggested unroll factor was worked out. */
2670 static opt_result
2671 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2672 unsigned *suggested_unroll_factor,
2673 bool& slp_done_for_suggested_uf)
2675 opt_result ok = opt_result::success ();
2676 int res;
2677 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2678 poly_uint64 min_vf = 2;
2679 loop_vec_info orig_loop_vinfo = NULL;
2681 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2682 loop_vec_info of the first vectorized loop. */
2683 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2684 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2685 else
2686 orig_loop_vinfo = loop_vinfo;
2687 gcc_assert (orig_loop_vinfo);
2689 /* The first group of checks is independent of the vector size. */
2690 fatal = true;
2692 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2693 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2694 return opt_result::failure_at (vect_location,
2695 "not vectorized: simd if(0)\n");
2697 /* Find all data references in the loop (which correspond to vdefs/vuses)
2698 and analyze their evolution in the loop. */
2700 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2702 /* Gather the data references and count stmts in the loop. */
2703 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2705 opt_result res
2706 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2707 &LOOP_VINFO_DATAREFS (loop_vinfo),
2708 &LOOP_VINFO_N_STMTS (loop_vinfo));
2709 if (!res)
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2713 "not vectorized: loop contains function "
2714 "calls or data references that cannot "
2715 "be analyzed\n");
2716 return res;
2718 loop_vinfo->shared->save_datarefs ();
2720 else
2721 loop_vinfo->shared->check_datarefs ();
2723 /* Analyze the data references and also adjust the minimal
2724 vectorization factor according to the loads and stores. */
2726 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2727 if (!ok)
2729 if (dump_enabled_p ())
2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2731 "bad data references.\n");
2732 return ok;
2735 /* Check if we are applying unroll factor now. */
2736 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2737 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2739 /* If the SLP decision was false when the suggested unroll factor was
2740 worked out, and we are now applying that suggested unroll factor, we
2741 can simply skip all SLP-related analyses this time. */
2742 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2744 /* Classify all cross-iteration scalar data-flow cycles.
2745 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2746 vect_analyze_scalar_cycles (loop_vinfo, slp);
2748 vect_pattern_recog (loop_vinfo);
2750 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2752 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2753 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2755 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2756 if (!ok)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data access.\n");
2761 return ok;
2764 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2766 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2767 if (!ok)
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2771 "unexpected pattern.\n");
2772 return ok;
2775 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is not considered fatal. */
2776 fatal = false;
2778 /* Analyze data dependences between the data-refs in the loop
2779 and adjust the maximum vectorization factor according to
2780 the dependences.
2781 FORNOW: fail at the first data dependence that we encounter. */
2783 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2784 if (!ok)
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788 "bad data dependence.\n");
2789 return ok;
2791 if (max_vf != MAX_VECTORIZATION_FACTOR
2792 && maybe_lt (max_vf, min_vf))
2793 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2794 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2796 ok = vect_determine_vectorization_factor (loop_vinfo);
2797 if (!ok)
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801 "can't determine vectorization factor.\n");
2802 return ok;
2804 if (max_vf != MAX_VECTORIZATION_FACTOR
2805 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2806 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2808 /* Compute the scalar iteration cost. */
2809 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2811 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2813 if (slp)
2815 /* Check the SLP opportunities in the loop, analyze and build
2816 SLP trees. */
2817 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2818 if (!ok)
2819 return ok;
2821 /* If there are any SLP instances mark them as pure_slp. */
2822 slp = vect_make_slp_decision (loop_vinfo);
2823 if (slp)
2825 /* Find stmts that need to be both vectorized and SLPed. */
2826 vect_detect_hybrid_slp (loop_vinfo);
2828 /* Update the vectorization factor based on the SLP decision. */
2829 vect_update_vf_for_slp (loop_vinfo);
2831 /* Optimize the SLP graph with the vectorization factor fixed. */
2832 vect_optimize_slp (loop_vinfo);
2834 /* Gather the loads reachable from the SLP graph entries. */
2835 vect_gather_slp_loads (loop_vinfo);
2839 bool saved_can_use_partial_vectors_p
2840 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2842 /* We don't expect to have to roll back to anything other than an empty
2843 set of rgroups. */
2844 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2846 /* This is the point where we can re-start analysis with SLP forced off. */
2847 start_over:
2849 /* Apply the suggested unrolling factor; this was determined by the backend
2850 during finish_cost the first time we ran the analysis for this
2851 vector mode. */
2852 if (applying_suggested_uf)
2853 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2855 /* Now the vectorization factor is final. */
2856 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857 gcc_assert (known_ne (vectorization_factor, 0U));
2859 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2861 dump_printf_loc (MSG_NOTE, vect_location,
2862 "vectorization_factor = ");
2863 dump_dec (MSG_NOTE, vectorization_factor);
2864 dump_printf (MSG_NOTE, ", niters = %wd\n",
2865 LOOP_VINFO_INT_NITERS (loop_vinfo));
2868 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2870 /* Analyze the alignment of the data-refs in the loop.
2871 Fail if a data reference is found that cannot be vectorized. */
2873 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2874 if (!ok)
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "bad data alignment.\n");
2879 return ok;
2882 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2883 It is important to call pruning after vect_analyze_data_ref_accesses,
2884 since we use grouping information gathered by interleaving analysis. */
2885 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2886 if (!ok)
2887 return ok;
2889 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2890 vectorization, since we do not want to add extra peeling or
2891 add versioning for alignment. */
2892 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2893 /* This pass will decide on using loop versioning and/or loop peeling in
2894 order to enhance the alignment of data references in the loop. */
2895 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2896 if (!ok)
2897 return ok;
2899 if (slp)
2901 /* Analyze operations in the SLP instances. Note this may
2902 remove unsupported SLP instances which makes the above
2903 SLP kind detection invalid. */
2904 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2905 vect_slp_analyze_operations (loop_vinfo);
2906 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2908 ok = opt_result::failure_at (vect_location,
2909 "unsupported SLP instances\n");
2910 goto again;
2913 /* Check whether any load in ALL SLP instances is possibly permuted. */
2914 slp_tree load_node, slp_root;
2915 unsigned i, x;
2916 slp_instance instance;
2917 bool can_use_lanes = true;
2918 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2920 slp_root = SLP_INSTANCE_TREE (instance);
2921 int group_size = SLP_TREE_LANES (slp_root);
2922 tree vectype = SLP_TREE_VECTYPE (slp_root);
2923 bool loads_permuted = false;
2924 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2926 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2927 continue;
2928 unsigned j;
2929 stmt_vec_info load_info;
2930 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2931 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2933 loads_permuted = true;
2934 break;
2938 /* If the loads and stores can be handled with load/store-lane
2939 instructions record it and move on to the next instance. */
2940 if (loads_permuted
2941 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2942 && vect_store_lanes_supported (vectype, group_size, false)
2943 != IFN_LAST)
2945 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2946 if (STMT_VINFO_GROUPED_ACCESS
2947 (SLP_TREE_REPRESENTATIVE (load_node)))
2949 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2950 (SLP_TREE_REPRESENTATIVE (load_node));
2951 /* Use SLP for strided accesses (or if we can't
2952 load-lanes). */
2953 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2954 || vect_load_lanes_supported
2955 (STMT_VINFO_VECTYPE (stmt_vinfo),
2956 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2957 break;
2960 can_use_lanes
2961 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2963 if (can_use_lanes && dump_enabled_p ())
2964 dump_printf_loc (MSG_NOTE, vect_location,
2965 "SLP instance %p can use load/store-lanes\n",
2966 (void *) instance);
2968 else
2970 can_use_lanes = false;
2971 break;
2975 /* If all SLP instances can use load/store-lanes abort SLP and try again
2976 with SLP disabled. */
2977 if (can_use_lanes)
2979 ok = opt_result::failure_at (vect_location,
2980 "Built SLP cancelled: can use "
2981 "load/store-lanes\n");
2982 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2984 "Built SLP cancelled: all SLP instances support "
2985 "load/store-lanes\n");
2986 goto again;
2990 /* Dissolve SLP-only groups. */
2991 vect_dissolve_slp_only_groups (loop_vinfo);
2993 /* Scan all the remaining operations in the loop that are not subject
2994 to SLP and make sure they are vectorizable. */
2995 ok = vect_analyze_loop_operations (loop_vinfo);
2996 if (!ok)
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3000 "bad operation or unsupported loop bound.\n");
3001 return ok;
3004 /* For now, we don't expect to mix both masking and length approaches for one
3005 loop; disable the use of partial vectors if both are recorded. */
3006 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3007 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3008 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "can't vectorize a loop with partial vectors"
3013 " because we don't expect to mix different"
3014 " approaches with partial vectors for the"
3015 " same loop.\n");
3016 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3019 /* If we still have the option of using partial vectors,
3020 check whether we can generate the necessary loop controls. */
3021 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3023 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3025 if (!vect_verify_full_masking (loop_vinfo)
3026 && !vect_verify_full_masking_avx512 (loop_vinfo))
3027 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3029 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3030 if (!vect_verify_loop_lens (loop_vinfo))
3031 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3034 /* If we're vectorizing a loop that uses length "controls" and
3035 can iterate more than once, we apply the decrementing IV approach
3036 to the loop control. */
3037 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3038 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3039 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3040 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3041 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3042 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3043 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3045 /* If a loop uses length controls and has a decrementing loop control IV,
3046 we will normally pass that IV through a MIN_EXPR to calculate the
3047 basis for the length controls. E.g. in a loop that processes one
3048 element per scalar iteration, the number of elements would be
3049 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3051 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3052 step, since only the final iteration of the vector loop can have
3053 inactive lanes.
3055 However, some targets have a dedicated instruction for calculating the
3056 preferred length, given the total number of elements that still need to
3057 be processed. This is encapsulated in the SELECT_VL internal function.
3059 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3060 to determine the basis for the length controls. However, unlike the
3061 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3062 lanes inactive in any iteration of the vector loop, not just the last
3063 iteration. This SELECT_VL approach therefore requires us to use pointer
3064 IVs with variable steps.
3066 Once we've decided how many elements should be processed by one
3067 iteration of the vector loop, we need to populate the rgroup controls.
3068 If a loop has multiple rgroups, we need to make sure that those rgroups
3069 "line up" (that is, they must be consistent about which elements are
3070 active and which aren't). This is done by vect_adjust_loop_lens_control.
3072 In principle, it would be possible to use vect_adjust_loop_lens_control
3073 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3074 However:
3076 (1) In practice, it only makes sense to use SELECT_VL when a vector
3077 operation will be controlled directly by the result. It is not
3078 worth using SELECT_VL if it would only be the input to other
3079 calculations.
3081 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3082 pointer IV will need N updates by a variable amount (N-1 updates
3083 within the iteration and 1 update to move to the next iteration).
3085 Because of this, we prefer to use the MIN_EXPR approach whenever there
3086 is more than one length control.
3088 In addition, SELECT_VL always operates to a granularity of 1 unit.
3089 If we wanted to use it to control an SLP operation on N consecutive
3090 elements, we would need to make the SELECT_VL inputs measure scalar
3091 iterations (rather than elements) and then multiply the SELECT_VL
3092 result by N. But using SELECT_VL this way is inefficient because
3093 of (1) above.
3095 In addition, we don't apply SELECT_VL on a single rgroup when both (1)
3096 and (2) are satisfied:
3098 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3099 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3101 Since SELECT_VL (with its variable step) would make SCEV analysis fail,
3102 we would then miss the benefits of subsequent unroll optimizations.
3103 We prefer using the MIN_EXPR approach in this situation. */
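/* A hedged sketch of the two styles described above (simplified names,
   not actual generated GIMPLE):

     // MIN_EXPR style: invariant pointer step; only the final
     // iteration can have inactive lanes.
     len = MIN_EXPR <remaining, VF>;
     ... length-controlled ops on LEN elements ...
     ptr += VF;
     remaining -= VF;

     // SELECT_VL style: the target chooses the preferred length, so
     // pointer IVs must step by the variable LEN.
     len = SELECT_VL (remaining, VF);
     ... length-controlled ops on LEN elements ...
     ptr += len;
     remaining -= len;
*/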
3104 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3106 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3107 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3108 OPTIMIZE_FOR_SPEED)
3109 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3110 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3111 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3112 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3113 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3116 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3117 assuming that the loop will be used as a main loop. We will redo
3118 this analysis later if we instead decide to use the loop as an
3119 epilogue loop. */
3120 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3121 if (!ok)
3122 return ok;
3124 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3125 to be able to handle fewer than VF scalars, or needs to have a lower VF
3126 than the main loop. */
3127 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3128 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3130 poly_uint64 unscaled_vf
3131 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3132 orig_loop_vinfo->suggested_unroll_factor);
3133 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3134 return opt_result::failure_at (vect_location,
3135 "Vectorization factor too high for"
3136 " epilogue loop.\n");
3139 /* Check the costings of the loop make vectorizing worthwhile. */
3140 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3141 if (res < 0)
3143 ok = opt_result::failure_at (vect_location,
3144 "Loop costings may not be worthwhile.\n");
3145 goto again;
3147 if (!res)
3148 return opt_result::failure_at (vect_location,
3149 "Loop costings not worthwhile.\n");
3151 /* If an epilogue loop is required make sure we can create one. */
3152 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3153 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3155 if (dump_enabled_p ())
3156 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3157 if (!vect_can_advance_ivs_p (loop_vinfo)
3158 || !slpeel_can_duplicate_loop_p (loop,
3159 LOOP_VINFO_IV_EXIT (loop_vinfo),
3160 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3162 ok = opt_result::failure_at (vect_location,
3163 "not vectorized: can't create required "
3164 "epilog loop\n");
3165 goto again;
3169 /* During peeling, we need to check if number of loop iterations is
3170 enough for both peeled prolog loop and vector loop. This check
3171 can be merged along with threshold check of loop versioning, so
3172 increase threshold for this case if necessary.
3174 If we are analyzing an epilogue we still want to check what its
3175 versioning threshold would be. If we decide to vectorize the epilogues we
3176 will want to use the lowest versioning threshold of all epilogues and main
3177 loop. This will enable us to enter a vectorized epilogue even when
3178 versioning the loop. We can't simply check whether the epilogue requires
3179 versioning though since we may have skipped some versioning checks when
3180 analyzing the epilogue. For instance, checks for alias versioning will be
3181 skipped when dealing with epilogues as we assume we already checked them
3182 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3183 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3185 poly_uint64 niters_th = 0;
3186 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3188 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3190 /* Niters for peeled prolog loop. */
3191 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3193 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3194 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3195 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3197 else
3198 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3201 /* Niters for at least one iteration of vectorized loop. */
3202 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3203 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3204 /* One additional iteration because of peeling for gap. */
3205 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3206 niters_th += 1;
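/* Hypothetical example: peeling 3 prologue iterations for alignment, a
   vectorization factor of 8 and peeling for gaps gives
   niters_th = 3 + 8 + 1 = 12 before being merged with the cost-model
   threshold below.  */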
3208 /* Use the same condition as vect_transform_loop to decide when to use
3209 the cost to determine a versioning threshold. */
3210 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3211 && ordered_p (th, niters_th))
3212 niters_th = ordered_max (poly_uint64 (th), niters_th);
3214 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3217 gcc_assert (known_eq (vectorization_factor,
3218 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3220 slp_done_for_suggested_uf = slp;
3222 /* Ok to vectorize! */
3223 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3224 return opt_result::success ();
3226 again:
3227 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3228 gcc_assert (!ok);
3230 /* Try again with SLP forced off but if we didn't do any SLP there is
3231 no point in re-trying. */
3232 if (!slp)
3233 return ok;
3235 /* If the SLP decision was true when the suggested unroll factor was
3236 worked out, and we are now applying that suggested unroll factor, we
3237 don't need to re-try any more. */
3238 if (applying_suggested_uf && slp_done_for_suggested_uf)
3239 return ok;
3241 /* If there are reduction chains re-trying will fail anyway. */
3242 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3243 return ok;
3245 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3246 via interleaving or lane instructions. */
3247 slp_instance instance;
3248 slp_tree node;
3249 unsigned i, j;
3250 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3252 stmt_vec_info vinfo;
3253 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3254 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3255 continue;
3256 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3257 unsigned int size = DR_GROUP_SIZE (vinfo);
3258 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3259 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3260 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3261 && ! vect_grouped_store_supported (vectype, size))
3262 return opt_result::failure_at (vinfo->stmt,
3263 "unsupported grouped store\n");
3264 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3266 vinfo = SLP_TREE_REPRESENTATIVE (node);
3267 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3269 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3270 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3271 size = DR_GROUP_SIZE (vinfo);
3272 vectype = STMT_VINFO_VECTYPE (vinfo);
3273 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3274 && ! vect_grouped_load_supported (vectype, single_element_p,
3275 size))
3276 return opt_result::failure_at (vinfo->stmt,
3277 "unsupported grouped load\n");
3282 if (dump_enabled_p ())
3283 dump_printf_loc (MSG_NOTE, vect_location,
3284 "re-trying with SLP disabled\n");
3286 /* Roll back state appropriately. No SLP this time. */
3287 slp = false;
3288 /* Restore vectorization factor as it were without SLP. */
3289 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3290 /* Free the SLP instances. */
3291 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3292 vect_free_slp_instance (instance);
3293 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3294 /* Reset SLP type to loop_vect on all stmts. */
3295 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3297 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3298 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3299 !gsi_end_p (si); gsi_next (&si))
3301 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3302 STMT_SLP_TYPE (stmt_info) = loop_vect;
3303 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3304 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3306 /* vectorizable_reduction adjusts reduction stmt def-types,
3307 restore them to that of the PHI. */
3308 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3309 = STMT_VINFO_DEF_TYPE (stmt_info);
3310 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3311 (STMT_VINFO_REDUC_DEF (stmt_info)))
3312 = STMT_VINFO_DEF_TYPE (stmt_info);
3315 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3316 !gsi_end_p (si); gsi_next (&si))
3318 if (is_gimple_debug (gsi_stmt (si)))
3319 continue;
3320 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3321 STMT_SLP_TYPE (stmt_info) = loop_vect;
3322 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3324 stmt_vec_info pattern_stmt_info
3325 = STMT_VINFO_RELATED_STMT (stmt_info);
3326 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3327 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3329 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3330 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3331 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3332 !gsi_end_p (pi); gsi_next (&pi))
3333 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3334 = loop_vect;
3338 /* Free optimized alias test DDRS. */
3339 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3340 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3341 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3342 /* Reset target cost data. */
3343 delete loop_vinfo->vector_costs;
3344 loop_vinfo->vector_costs = nullptr;
3345 /* Reset accumulated rgroup information. */
3346 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3347 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3348 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3349 /* Reset assorted flags. */
3350 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3351 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3352 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3353 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3354 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3355 = saved_can_use_partial_vectors_p;
3357 goto start_over;
3360 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3361 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3362 OLD_LOOP_VINFO is better unless something specifically indicates
3363 otherwise.
3365 Note that this deliberately isn't a partial order. */
3367 static bool
3368 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3369 loop_vec_info old_loop_vinfo)
3371 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3372 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3374 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3375 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3377 /* Always prefer a VF of loop->simdlen over any other VF. */
3378 if (loop->simdlen)
3380 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3381 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3382 if (new_simdlen_p != old_simdlen_p)
3383 return new_simdlen_p;
3386 const auto *old_costs = old_loop_vinfo->vector_costs;
3387 const auto *new_costs = new_loop_vinfo->vector_costs;
3388 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3389 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3391 return new_costs->better_main_loop_than_p (old_costs);
3394 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3395 true if we should. */
3397 static bool
3398 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3399 loop_vec_info old_loop_vinfo)
3401 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3402 return false;
3404 if (dump_enabled_p ())
3405 dump_printf_loc (MSG_NOTE, vect_location,
3406 "***** Preferring vector mode %s to vector mode %s\n",
3407 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3408 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3409 return true;
3412 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3413 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3414 MODE_I to the next mode useful to analyze.
3415 Return the loop_vinfo on success and wrapped null on failure. */
3417 static opt_loop_vec_info
3418 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3419 const vect_loop_form_info *loop_form_info,
3420 loop_vec_info main_loop_vinfo,
3421 const vector_modes &vector_modes, unsigned &mode_i,
3422 machine_mode &autodetected_vector_mode,
3423 bool &fatal)
3425 loop_vec_info loop_vinfo
3426 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3428 machine_mode vector_mode = vector_modes[mode_i];
3429 loop_vinfo->vector_mode = vector_mode;
3430 unsigned int suggested_unroll_factor = 1;
3431 bool slp_done_for_suggested_uf = false;
3433 /* Run the main analysis. */
3434 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3435 &suggested_unroll_factor,
3436 slp_done_for_suggested_uf);
3437 if (dump_enabled_p ())
3438 dump_printf_loc (MSG_NOTE, vect_location,
3439 "***** Analysis %s with vector mode %s\n",
3440 res ? "succeeded" : "failed",
3441 GET_MODE_NAME (loop_vinfo->vector_mode));
3443 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_NOTE, vect_location,
3447 "***** Re-trying analysis for unrolling"
3448 " with unroll factor %d and slp %s.\n",
3449 suggested_unroll_factor,
3450 slp_done_for_suggested_uf ? "on" : "off");
3451 loop_vec_info unroll_vinfo
3452 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3453 unroll_vinfo->vector_mode = vector_mode;
3454 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3455 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3456 slp_done_for_suggested_uf);
3457 if (new_res)
3459 delete loop_vinfo;
3460 loop_vinfo = unroll_vinfo;
3462 else
3463 delete unroll_vinfo;
3466 /* Remember the autodetected vector mode. */
3467 if (vector_mode == VOIDmode)
3468 autodetected_vector_mode = loop_vinfo->vector_mode;
3470 /* Advance mode_i, first skipping modes that would result in the
3471 same analysis result. */
3472 while (mode_i + 1 < vector_modes.length ()
3473 && vect_chooses_same_modes_p (loop_vinfo,
3474 vector_modes[mode_i + 1]))
3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_NOTE, vect_location,
3478 "***** The result for vector mode %s would"
3479 " be the same\n",
3480 GET_MODE_NAME (vector_modes[mode_i + 1]));
3481 mode_i += 1;
3483 if (mode_i + 1 < vector_modes.length ()
3484 && VECTOR_MODE_P (autodetected_vector_mode)
3485 && (related_vector_mode (vector_modes[mode_i + 1],
3486 GET_MODE_INNER (autodetected_vector_mode))
3487 == autodetected_vector_mode)
3488 && (related_vector_mode (autodetected_vector_mode,
3489 GET_MODE_INNER (vector_modes[mode_i + 1]))
3490 == vector_modes[mode_i + 1]))
3492 if (dump_enabled_p ())
3493 dump_printf_loc (MSG_NOTE, vect_location,
3494 "***** Skipping vector mode %s, which would"
3495 " repeat the analysis for %s\n",
3496 GET_MODE_NAME (vector_modes[mode_i + 1]),
3497 GET_MODE_NAME (autodetected_vector_mode));
3498 mode_i += 1;
3500 mode_i++;
3502 if (!res)
3504 delete loop_vinfo;
3505 if (fatal)
3506 gcc_checking_assert (main_loop_vinfo == NULL);
3507 return opt_loop_vec_info::propagate_failure (res);
3510 return opt_loop_vec_info::success (loop_vinfo);
3513 /* Function vect_analyze_loop.
3515 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3516 for it. The different analyses will record information in the
3517 loop_vec_info struct. */
3518 opt_loop_vec_info
3519 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3521 DUMP_VECT_SCOPE ("analyze_loop_nest");
3523 if (loop_outer (loop)
3524 && loop_vec_info_for_loop (loop_outer (loop))
3525 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3526 return opt_loop_vec_info::failure_at (vect_location,
3527 "outer-loop already vectorized.\n");
3529 if (!find_loop_nest (loop, &shared->loop_nest))
3530 return opt_loop_vec_info::failure_at
3531 (vect_location,
3532 "not vectorized: loop nest containing two or more consecutive inner"
3533 " loops cannot be vectorized\n");
3535 /* Analyze the loop form. */
3536 vect_loop_form_info loop_form_info;
3537 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3538 if (!res)
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "bad loop form.\n");
3543 return opt_loop_vec_info::propagate_failure (res);
3545 if (!integer_onep (loop_form_info.assumptions))
3547 /* We consider to vectorize this loop by versioning it under
3548 some assumptions. In order to do this, we need to clear
3549 existing information computed by scev and niter analyzer. */
3550 scev_reset_htab ();
3551 free_numbers_of_iterations_estimates (loop);
3552 /* Also set flag for this loop so that following scev and niter
3553 analysis are done under the assumptions. */
3554 loop_constraint_set (loop, LOOP_C_FINITE);
3557 auto_vector_modes vector_modes;
3558 /* Autodetect first vector size we try. */
3559 vector_modes.safe_push (VOIDmode);
3560 unsigned int autovec_flags
3561 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3562 loop->simdlen != 0);
3563 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3564 && !unlimited_cost_model (loop));
3565 machine_mode autodetected_vector_mode = VOIDmode;
3566 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3567 unsigned int mode_i = 0;
3568 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3570 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3571 a mode has not been analyzed. */
3572 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3573 for (unsigned i = 0; i < vector_modes.length (); ++i)
3574 cached_vf_per_mode.safe_push (0);
3576 /* First determine the main loop vectorization mode, either the first
3577 one that works, starting with auto-detecting the vector mode and then
3578 following the targets order of preference, or the one with the
3579 lowest cost if pick_lowest_cost_p. */
3580 while (1)
3582 bool fatal;
3583 unsigned int last_mode_i = mode_i;
3584 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3585 failed. */
3586 cached_vf_per_mode[last_mode_i] = -1;
3587 opt_loop_vec_info loop_vinfo
3588 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3589 NULL, vector_modes, mode_i,
3590 autodetected_vector_mode, fatal);
3591 if (fatal)
3592 break;
3594 if (loop_vinfo)
3596 /* Analysis has been successful, so update the VF value. The
3597 VF should always be a multiple of unroll_factor and we want to
3598 capture the original VF here. */
3599 cached_vf_per_mode[last_mode_i]
3600 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3601 loop_vinfo->suggested_unroll_factor);
3602 /* Once we hit the desired simdlen for the first time,
3603 discard any previous attempts. */
3604 if (simdlen
3605 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3607 delete first_loop_vinfo;
3608 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3609 simdlen = 0;
3611 else if (pick_lowest_cost_p
3612 && first_loop_vinfo
3613 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3615 /* Pick loop_vinfo over first_loop_vinfo. */
3616 delete first_loop_vinfo;
3617 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3619 if (first_loop_vinfo == NULL)
3620 first_loop_vinfo = loop_vinfo;
3621 else
3623 delete loop_vinfo;
3624 loop_vinfo = opt_loop_vec_info::success (NULL);
3627 /* Commit to first_loop_vinfo if we have no reason to try
3628 alternatives. */
3629 if (!simdlen && !pick_lowest_cost_p)
3630 break;
3632 if (mode_i == vector_modes.length ()
3633 || autodetected_vector_mode == VOIDmode)
3634 break;
3636 /* Try the next biggest vector size. */
3637 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_NOTE, vect_location,
3639 "***** Re-trying analysis with vector mode %s\n",
3640 GET_MODE_NAME (vector_modes[mode_i]));
3642 if (!first_loop_vinfo)
3643 return opt_loop_vec_info::propagate_failure (res);
3645 if (dump_enabled_p ())
3646 dump_printf_loc (MSG_NOTE, vect_location,
3647 "***** Choosing vector mode %s\n",
3648 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3650 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3651 enabled, SIMDUID is not set, it is the innermost loop and we have
3652 either already found the loop's SIMDLEN or there was no SIMDLEN to
3653 begin with.
3654 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3655 bool vect_epilogues = (!simdlen
3656 && loop->inner == NULL
3657 && param_vect_epilogues_nomask
3658 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3659 && !loop->simduid);
3660 if (!vect_epilogues)
3661 return first_loop_vinfo;
3663 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3664 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3666 /* For epilogues start the analysis from the first mode. The motivation
3667 behind starting from the beginning comes from cases where the VECTOR_MODES
3668 array may contain length-agnostic and length-specific modes. Their
3669 ordering is not guaranteed, so we could end up picking a mode for the main
3670 loop that is after the epilogue's optimal mode. */
3671 vector_modes[0] = autodetected_vector_mode;
3672 mode_i = 0;
3674 bool supports_partial_vectors =
3675 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3676 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3678 while (1)
3680 /* If the target does not support partial vectors we can shorten the
3681 number of modes to analyze for the epilogue as we know we can't pick a
3682 mode that would lead to a VF at least as big as the
3683 FIRST_VINFO_VF. */
3684 if (!supports_partial_vectors
3685 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3687 mode_i++;
3688 if (mode_i == vector_modes.length ())
3689 break;
3690 continue;
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 "***** Re-trying epilogue analysis with vector "
3696 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3698 bool fatal;
3699 opt_loop_vec_info loop_vinfo
3700 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3701 first_loop_vinfo,
3702 vector_modes, mode_i,
3703 autodetected_vector_mode, fatal);
3704 if (fatal)
3705 break;
3707 if (loop_vinfo)
3709 if (pick_lowest_cost_p)
3711 /* Keep trying to roll back vectorization attempts while the
3712 loop_vec_infos they produced were worse than this one. */
3713 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3714 while (!vinfos.is_empty ()
3715 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3717 gcc_assert (vect_epilogues);
3718 delete vinfos.pop ();
3721 /* For now only allow one epilogue loop. */
3722 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3724 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3725 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3726 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3727 || maybe_ne (lowest_th, 0U));
3728 /* Keep track of the known smallest versioning
3729 threshold. */
3730 if (ordered_p (lowest_th, th))
3731 lowest_th = ordered_min (lowest_th, th);
3733 else
3735 delete loop_vinfo;
3736 loop_vinfo = opt_loop_vec_info::success (NULL);
3739 /* For now only allow one epilogue loop, but allow
3740 pick_lowest_cost_p to replace it, so commit to the
3741 first epilogue if we have no reason to try alternatives. */
3742 if (!pick_lowest_cost_p)
3743 break;
3746 if (mode_i == vector_modes.length ())
3747 break;
3751 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3753 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3754 if (dump_enabled_p ())
3755 dump_printf_loc (MSG_NOTE, vect_location,
3756 "***** Choosing epilogue vector mode %s\n",
3757 GET_MODE_NAME
3758 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3761 return first_loop_vinfo;
3764 /* Return true if there is an in-order reduction function for CODE, storing
3765 it in *REDUC_FN if so. */
3767 static bool
3768 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3770 /* We support MINUS_EXPR by negating the operand. This also preserves an
3771 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3772 (-0.0) = -0.0. */
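  /* For illustration: a fold-left (in-order) reduction with initial value
     INIT over elements v0, v1, v2, ... computes (((INIT + v0) + v1) + v2) ...
     in the original scalar order, and a loop such as

	 double s = init;
	 for (int i = 0; i < n; i++)
	   s -= a[i];

     is handled by the same IFN_FOLD_LEFT_PLUS applied to negated elements.  */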
3773 if (code == PLUS_EXPR || code == MINUS_EXPR)
3775 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3776 return true;
3778 return false;
3781 /* Function reduction_fn_for_scalar_code
3783 Input:
3784 CODE - tree_code of a reduction operation.
3786 Output:
3787 REDUC_FN - the corresponding internal function to be used to reduce the
3788 vector of partial results into a single scalar result, or IFN_LAST
3789 if the operation is a supported reduction operation, but does not have
3790 such an internal function.
3792 Return FALSE if CODE currently cannot be vectorized as reduction. */
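/* For example, a scalar max reduction such as

     int m = a[0];
     for (int i = 1; i < n; i++)
       m = a[i] > m ? a[i] : m;

   maps to IFN_REDUC_MAX for the final cross-lane step, whereas MULT_EXPR and
   MINUS_EXPR reductions are still supported but get IFN_LAST, so their
   partial results are combined by generic epilogue code instead.  */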
3794 bool
3795 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3797 if (code.is_tree_code ())
3798 switch (tree_code (code))
3800 case MAX_EXPR:
3801 *reduc_fn = IFN_REDUC_MAX;
3802 return true;
3804 case MIN_EXPR:
3805 *reduc_fn = IFN_REDUC_MIN;
3806 return true;
3808 case PLUS_EXPR:
3809 *reduc_fn = IFN_REDUC_PLUS;
3810 return true;
3812 case BIT_AND_EXPR:
3813 *reduc_fn = IFN_REDUC_AND;
3814 return true;
3816 case BIT_IOR_EXPR:
3817 *reduc_fn = IFN_REDUC_IOR;
3818 return true;
3820 case BIT_XOR_EXPR:
3821 *reduc_fn = IFN_REDUC_XOR;
3822 return true;
3824 case MULT_EXPR:
3825 case MINUS_EXPR:
3826 *reduc_fn = IFN_LAST;
3827 return true;
3829 default:
3830 return false;
3832 else
3833 switch (combined_fn (code))
3835 CASE_CFN_FMAX:
3836 *reduc_fn = IFN_REDUC_FMAX;
3837 return true;
3839 CASE_CFN_FMIN:
3840 *reduc_fn = IFN_REDUC_FMIN;
3841 return true;
3843 default:
3844 return false;
3848 /* If there is a neutral value X such that a reduction would not be affected
3849 by the introduction of additional X elements, return that X, otherwise
3850 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3851 of the scalar elements. If the reduction has just a single initial value
3852 then INITIAL_VALUE is that value, otherwise it is null.
3853 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3854 In that case no signed zero is returned. */
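/* For example, the neutral element is 0 for PLUS_EXPR (-0.0 when used
   mid-reduction on a type honoring signed zeros), 1 for MULT_EXPR, all-ones
   for BIT_AND_EXPR, 0 for BIT_IOR_EXPR and BIT_XOR_EXPR, while for MIN_EXPR,
   MAX_EXPR and fmin/fmax only the initial value itself is safe to use.  */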
3856 tree
3857 neutral_op_for_reduction (tree scalar_type, code_helper code,
3858 tree initial_value, bool as_initial)
3860 if (code.is_tree_code ())
3861 switch (tree_code (code))
3863 case DOT_PROD_EXPR:
3864 case SAD_EXPR:
3865 case MINUS_EXPR:
3866 case BIT_IOR_EXPR:
3867 case BIT_XOR_EXPR:
3868 return build_zero_cst (scalar_type);
3869 case WIDEN_SUM_EXPR:
3870 case PLUS_EXPR:
3871 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3872 return build_real (scalar_type, dconstm0);
3873 else
3874 return build_zero_cst (scalar_type);
3876 case MULT_EXPR:
3877 return build_one_cst (scalar_type);
3879 case BIT_AND_EXPR:
3880 return build_all_ones_cst (scalar_type);
3882 case MAX_EXPR:
3883 case MIN_EXPR:
3884 return initial_value;
3886 default:
3887 return NULL_TREE;
3889 else
3890 switch (combined_fn (code))
3892 CASE_CFN_FMIN:
3893 CASE_CFN_FMAX:
3894 return initial_value;
3896 default:
3897 return NULL_TREE;
3901 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3902 STMT is printed with a message MSG. */
3904 static void
3905 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3907 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3910 /* Return true if we need an in-order reduction for operation CODE
3911 on type TYPE, i.e. if the scalar evaluation order of the reduction
3912 must be preserved. */
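/* For instance, a floating-point accumulation such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   needs a fold-left reduction unless -fassociative-math permits
   reassociation, while float min/max and fmin/fmax reductions never do, and
   an integer addition only does when its overflow could trap.  */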
3914 bool
3915 needs_fold_left_reduction_p (tree type, code_helper code)
3917 /* CHECKME: check for !flag_finite_math_only too? */
3918 if (SCALAR_FLOAT_TYPE_P (type))
3920 if (code.is_tree_code ())
3921 switch (tree_code (code))
3923 case MIN_EXPR:
3924 case MAX_EXPR:
3925 return false;
3927 default:
3928 return !flag_associative_math;
3930 else
3931 switch (combined_fn (code))
3933 CASE_CFN_FMIN:
3934 CASE_CFN_FMAX:
3935 return false;
3937 default:
3938 return !flag_associative_math;
3942 if (INTEGRAL_TYPE_P (type))
3943 return (!code.is_tree_code ()
3944 || !operation_no_trapping_overflow (type, tree_code (code)));
3946 if (SAT_FIXED_POINT_TYPE_P (type))
3947 return true;
3949 return false;
3952 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3953 has a handled computation expression. Store the main reduction
3954 operation in *CODE. */
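/* For illustration, with hypothetical SSA names, given

     sum_1 = PHI <sum_0 (preheader), sum_3 (latch)>
     sum_2 = sum_1 + a[i];
     sum_3 = sum_2 + b[i];

   the recorded path leads from the latch value sum_3 through sum_2 back to
   the PHI result sum_1 and *CODE is set to PLUS_EXPR.  */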
3956 static bool
3957 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3958 tree loop_arg, code_helper *code,
3959 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3961 auto_bitmap visited;
3962 tree lookfor = PHI_RESULT (phi);
3963 ssa_op_iter curri;
3964 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3965 while (USE_FROM_PTR (curr) != loop_arg)
3966 curr = op_iter_next_use (&curri);
3967 curri.i = curri.numops;
3970 path.safe_push (std::make_pair (curri, curr));
3971 tree use = USE_FROM_PTR (curr);
3972 if (use == lookfor)
3973 break;
3974 gimple *def = SSA_NAME_DEF_STMT (use);
3975 if (gimple_nop_p (def)
3976 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3978 pop:
3981 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3982 curri = x.first;
3983 curr = x.second;
3985 curr = op_iter_next_use (&curri);
3986 /* Skip already visited or non-SSA operands (from iterating
3987 over PHI args). */
3988 while (curr != NULL_USE_OPERAND_P
3989 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3990 || ! bitmap_set_bit (visited,
3991 SSA_NAME_VERSION
3992 (USE_FROM_PTR (curr)))));
3994 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3995 if (curr == NULL_USE_OPERAND_P)
3996 break;
3998 else
4000 if (gimple_code (def) == GIMPLE_PHI)
4001 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4002 else
4003 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4004 while (curr != NULL_USE_OPERAND_P
4005 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4006 || ! bitmap_set_bit (visited,
4007 SSA_NAME_VERSION
4008 (USE_FROM_PTR (curr)))))
4009 curr = op_iter_next_use (&curri);
4010 if (curr == NULL_USE_OPERAND_P)
4011 goto pop;
4014 while (1);
4015 if (dump_file && (dump_flags & TDF_DETAILS))
4017 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4018 unsigned i;
4019 std::pair<ssa_op_iter, use_operand_p> *x;
4020 FOR_EACH_VEC_ELT (path, i, x)
4021 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4022 dump_printf (MSG_NOTE, "\n");
4025 /* Check whether the reduction path detected is valid. */
4026 bool fail = path.length () == 0;
4027 bool neg = false;
4028 int sign = -1;
4029 *code = ERROR_MARK;
4030 for (unsigned i = 1; i < path.length (); ++i)
4032 gimple *use_stmt = USE_STMT (path[i].second);
4033 gimple_match_op op;
4034 if (!gimple_extract_op (use_stmt, &op))
4036 fail = true;
4037 break;
4039 unsigned int opi = op.num_ops;
4040 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4042 /* The following makes sure we can compute the operand index
4043 easily plus it mostly disallows chaining via COND_EXPR condition
4044 operands. */
4045 for (opi = 0; opi < op.num_ops; ++opi)
4046 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4047 break;
4049 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4051 for (opi = 0; opi < op.num_ops; ++opi)
4052 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4053 break;
4055 if (opi == op.num_ops)
4057 fail = true;
4058 break;
4060 op.code = canonicalize_code (op.code, op.type);
4061 if (op.code == MINUS_EXPR)
4063 op.code = PLUS_EXPR;
4064 /* Track whether we negate the reduction value each iteration. */
4065 if (op.ops[1] == op.ops[opi])
4066 neg = ! neg;
4068 if (CONVERT_EXPR_CODE_P (op.code)
4069 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4071 else if (*code == ERROR_MARK)
4073 *code = op.code;
4074 sign = TYPE_SIGN (op.type);
4076 else if (op.code != *code)
4078 fail = true;
4079 break;
4081 else if ((op.code == MIN_EXPR
4082 || op.code == MAX_EXPR)
4083 && sign != TYPE_SIGN (op.type))
4085 fail = true;
4086 break;
4088 /* Check there's only a single stmt the op is used on. For the
4089 non-value-changing tail and the last stmt, allow out-of-loop uses.
4090 ??? We could relax this and handle arbitrary live stmts by
4091 forcing a scalar epilogue for example. */
4092 imm_use_iterator imm_iter;
4093 use_operand_p use_p;
4094 gimple *op_use_stmt;
4095 unsigned cnt = 0;
4096 bool cond_fn_p = op.code.is_internal_fn ()
4097 && (conditional_internal_fn_code (internal_fn (op.code))
4098 != ERROR_MARK);
4100 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4102 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4103 op1 twice (once as definition, once as else) in the same operation.
4104 Allow this. */
4105 if (cond_fn_p)
4107 gcall *call = dyn_cast<gcall *> (use_stmt);
4108 unsigned else_pos
4109 = internal_fn_else_index (internal_fn (op.code));
4111 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4113 if (j == else_pos)
4114 continue;
4115 if (gimple_call_arg (call, j) == op.ops[opi])
4116 cnt++;
4119 else if (!is_gimple_debug (op_use_stmt)
4120 && (*code != ERROR_MARK
4121 || flow_bb_inside_loop_p (loop,
4122 gimple_bb (op_use_stmt))))
4123 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4124 cnt++;
4127 if (cnt != 1)
4129 fail = true;
4130 break;
4133 return ! fail && ! neg && *code != ERROR_MARK;
4136 bool
4137 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4138 tree loop_arg, enum tree_code code)
4140 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4141 code_helper code_;
4142 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4143 && code_ == code);
4148 /* Function vect_is_simple_reduction
4150 (1) Detect a cross-iteration def-use cycle that represents a simple
4151 reduction computation. We look for the following pattern:
4153 loop_header:
4154 a1 = phi < a0, a2 >
4155 a3 = ...
4156 a2 = operation (a3, a1)
4160 a3 = ...
4161 loop_header:
4162 a1 = phi < a0, a2 >
4163 a2 = operation (a3, a1)
4165 such that:
4166 1. operation is commutative and associative and it is safe to
4167 change the order of the computation
4168 2. no uses for a2 in the loop (a2 is used out of the loop)
4169 3. no uses of a1 in the loop besides the reduction operation
4170 4. no uses of a1 outside the loop.
4172 Conditions 1,4 are tested here.
4173 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4175 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4176 nested cycles.
4178 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4179 reductions:
4181 a1 = phi < a0, a2 >
4182 inner loop (def of a3)
4183 a2 = phi < a3 >
4185 (4) Detect condition expressions, i.e.:
4186 for (int i = 0; i < N; i++)
4187 if (a[i] < val)
4188 ret_val = a[i];
4192 static stmt_vec_info
4193 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4194 bool *double_reduc, bool *reduc_chain_p, bool slp)
4196 gphi *phi = as_a <gphi *> (phi_info->stmt);
4197 gimple *phi_use_stmt = NULL;
4198 imm_use_iterator imm_iter;
4199 use_operand_p use_p;
4201 *double_reduc = false;
4202 *reduc_chain_p = false;
4203 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4205 tree phi_name = PHI_RESULT (phi);
4206 /* ??? If there are no uses of the PHI result the inner loop reduction
4207 won't be detected as possibly double-reduction by vectorizable_reduction
4208 because that tries to walk the PHI arg from the preheader edge which
4209 can be constant. See PR60382. */
4210 if (has_zero_uses (phi_name))
4211 return NULL;
4212 class loop *loop = (gimple_bb (phi))->loop_father;
4213 unsigned nphi_def_loop_uses = 0;
4214 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4216 gimple *use_stmt = USE_STMT (use_p);
4217 if (is_gimple_debug (use_stmt))
4218 continue;
4220 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4222 if (dump_enabled_p ())
4223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4224 "intermediate value used outside loop.\n");
4226 return NULL;
4229 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4230 op1 twice (once as definition, once as else) in the same operation.
4231 Only count it as one. */
4232 if (use_stmt != phi_use_stmt)
4234 nphi_def_loop_uses++;
4235 phi_use_stmt = use_stmt;
4239 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4240 if (TREE_CODE (latch_def) != SSA_NAME)
4242 if (dump_enabled_p ())
4243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4244 "reduction: not ssa_name: %T\n", latch_def);
4245 return NULL;
4248 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4249 if (!def_stmt_info
4250 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4251 return NULL;
4253 bool nested_in_vect_loop
4254 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4255 unsigned nlatch_def_loop_uses = 0;
4256 auto_vec<gphi *, 3> lcphis;
4257 bool inner_loop_of_double_reduc = false;
4258 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4260 gimple *use_stmt = USE_STMT (use_p);
4261 if (is_gimple_debug (use_stmt))
4262 continue;
4263 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4264 nlatch_def_loop_uses++;
4265 else
4267 /* We can have more than one loop-closed PHI. */
4268 lcphis.safe_push (as_a <gphi *> (use_stmt));
4269 if (nested_in_vect_loop
4270 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4271 == vect_double_reduction_def))
4272 inner_loop_of_double_reduc = true;
4276 /* If we are vectorizing an inner reduction we are executing that
4277 in the original order only if we are not dealing with a
4278 double reduction. */
4279 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4281 if (dump_enabled_p ())
4282 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4283 "detected nested cycle: ");
4284 return def_stmt_info;
4287 /* When the inner loop of a double reduction ends up with more than
4288 one loop-closed PHI we have failed to classify alternate such
4289 PHIs as double reduction, leading to wrong code. See PR103237. */
4290 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4292 if (dump_enabled_p ())
4293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4294 "unhandle double reduction\n");
4295 return NULL;
4298 /* If this isn't a nested cycle or if the nested cycle reduction value
4299 is used outside of the inner loop we cannot handle uses of the reduction
4300 value. */
4301 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4303 if (dump_enabled_p ())
4304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4305 "reduction used in loop.\n");
4306 return NULL;
4309 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4310 defined in the inner loop. */
4311 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4313 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4314 if (gimple_phi_num_args (def_stmt) != 1
4315 || TREE_CODE (op1) != SSA_NAME)
4317 if (dump_enabled_p ())
4318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4319 "unsupported phi node definition.\n");
4321 return NULL;
4324 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4325 and the latch definition op1. */
4326 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4327 if (gimple_bb (def1)
4328 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4329 && loop->inner
4330 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4331 && (is_gimple_assign (def1) || is_gimple_call (def1))
4332 && is_a <gphi *> (phi_use_stmt)
4333 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4334 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4335 loop_latch_edge (loop->inner))))
4337 if (dump_enabled_p ())
4338 report_vect_op (MSG_NOTE, def_stmt,
4339 "detected double reduction: ");
4341 *double_reduc = true;
4342 return def_stmt_info;
4345 return NULL;
4348 /* Look for the expression computing latch_def from the loop PHI result. */
4349 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4350 code_helper code;
4351 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4352 path))
4354 STMT_VINFO_REDUC_CODE (phi_info) = code;
4355 if (code == COND_EXPR && !nested_in_vect_loop)
4356 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4358 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4359 reduction chain for which the additional restriction is that
4360 all operations in the chain are the same. */
4361 auto_vec<stmt_vec_info, 8> reduc_chain;
4362 unsigned i;
4363 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4364 for (i = path.length () - 1; i >= 1; --i)
4366 gimple *stmt = USE_STMT (path[i].second);
4367 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4368 gimple_match_op op;
4369 if (!gimple_extract_op (stmt, &op))
4370 gcc_unreachable ();
4371 if (gassign *assign = dyn_cast<gassign *> (stmt))
4372 STMT_VINFO_REDUC_IDX (stmt_info)
4373 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4374 else
4376 gcall *call = as_a<gcall *> (stmt);
4377 STMT_VINFO_REDUC_IDX (stmt_info)
4378 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4380 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4381 && (i == 1 || i == path.length () - 1));
4382 if ((op.code != code && !leading_conversion)
4383 /* We can only handle the final value in epilogue
4384 generation for reduction chains. */
4385 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4386 is_slp_reduc = false;
4387 /* For reduction chains we support trailing/leading
4388 conversions. We do not store those in the actual chain. */
4389 if (leading_conversion)
4390 continue;
4391 reduc_chain.safe_push (stmt_info);
4393 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4395 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4397 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4398 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4400 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4401 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4403 /* Save the chain for further analysis in SLP detection. */
4404 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4405 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4407 *reduc_chain_p = true;
4408 if (dump_enabled_p ())
4409 dump_printf_loc (MSG_NOTE, vect_location,
4410 "reduction: detected reduction chain\n");
4412 else if (dump_enabled_p ())
4413 dump_printf_loc (MSG_NOTE, vect_location,
4414 "reduction: detected reduction\n");
4416 return def_stmt_info;
4419 if (dump_enabled_p ())
4420 dump_printf_loc (MSG_NOTE, vect_location,
4421 "reduction: unknown pattern\n");
4423 return NULL;
4426 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4427 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4428 or -1 if not known. */
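/* For example, with an assumed VF of 8, a known iteration count of 100 and
   PEEL_ITERS_PROLOGUE == 3 the epilogue peels (100 - 3) % 8 == 1 iteration,
   whereas with an unknown iteration count the estimate is simply VF / 2.  */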
4430 static int
4431 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4433 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4434 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4436 if (dump_enabled_p ())
4437 dump_printf_loc (MSG_NOTE, vect_location,
4438 "cost model: epilogue peel iters set to vf/2 "
4439 "because loop iterations are unknown .\n");
4440 return assumed_vf / 2;
4442 else
4444 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4445 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4446 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4447 /* If we need to peel for gaps but no epilogue peeling would otherwise
4448 be required, we have to peel VF iterations. */
4449 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4450 peel_iters_epilogue = assumed_vf;
4451 return peel_iters_epilogue;
4455 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4457 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4458 int *peel_iters_epilogue,
4459 stmt_vector_for_cost *scalar_cost_vec,
4460 stmt_vector_for_cost *prologue_cost_vec,
4461 stmt_vector_for_cost *epilogue_cost_vec)
4463 int retval = 0;
4465 *peel_iters_epilogue
4466 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4468 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4470 /* If peeled iterations are known but number of scalar loop
4471 iterations are unknown, count a taken branch per peeled loop. */
4472 if (peel_iters_prologue > 0)
4473 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4474 vect_prologue);
4475 if (*peel_iters_epilogue > 0)
4476 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4477 vect_epilogue);
4480 stmt_info_for_cost *si;
4481 int j;
4482 if (peel_iters_prologue)
4483 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4484 retval += record_stmt_cost (prologue_cost_vec,
4485 si->count * peel_iters_prologue,
4486 si->kind, si->stmt_info, si->misalign,
4487 vect_prologue);
4488 if (*peel_iters_epilogue)
4489 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4490 retval += record_stmt_cost (epilogue_cost_vec,
4491 si->count * *peel_iters_epilogue,
4492 si->kind, si->stmt_info, si->misalign,
4493 vect_epilogue);
4495 return retval;
4498 /* Function vect_estimate_min_profitable_iters
4500 Return the number of iterations required for the vector version of the
4501 loop to be profitable relative to the cost of the scalar version of the
4502 loop.
4504 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4505 of iterations for vectorization. -1 value means loop vectorization
4506 is not profitable. This returned value may be used for dynamic
4507 profitability check.
4509 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4510 for static check against estimated number of iterations. */
4512 static void
4513 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4514 int *ret_min_profitable_niters,
4515 int *ret_min_profitable_estimate,
4516 unsigned *suggested_unroll_factor)
4518 int min_profitable_iters;
4519 int min_profitable_estimate;
4520 int peel_iters_prologue;
4521 int peel_iters_epilogue;
4522 unsigned vec_inside_cost = 0;
4523 int vec_outside_cost = 0;
4524 unsigned vec_prologue_cost = 0;
4525 unsigned vec_epilogue_cost = 0;
4526 int scalar_single_iter_cost = 0;
4527 int scalar_outside_cost = 0;
4528 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4529 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4530 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4532 /* Cost model disabled. */
4533 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4535 if (dump_enabled_p ())
4536 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4537 *ret_min_profitable_niters = 0;
4538 *ret_min_profitable_estimate = 0;
4539 return;
4542 /* Requires loop versioning tests to handle misalignment. */
4543 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4545 /* FIXME: Make cost depend on complexity of individual check. */
4546 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4547 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4548 if (dump_enabled_p ())
4549 dump_printf (MSG_NOTE,
4550 "cost model: Adding cost of checks for loop "
4551 "versioning to treat misalignment.\n");
4554 /* Requires loop versioning with alias checks. */
4555 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4557 /* FIXME: Make cost depend on complexity of individual check. */
4558 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4559 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4560 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4561 if (len)
4562 /* Count LEN - 1 ANDs and LEN comparisons. */
4563 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4564 scalar_stmt, vect_prologue);
4565 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4566 if (len)
4568 /* Count LEN - 1 ANDs and LEN comparisons. */
4569 unsigned int nstmts = len * 2 - 1;
4570 /* +1 for each bias that needs adding. */
4571 for (unsigned int i = 0; i < len; ++i)
4572 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4573 nstmts += 1;
4574 (void) add_stmt_cost (target_cost_data, nstmts,
4575 scalar_stmt, vect_prologue);
4577 if (dump_enabled_p ())
4578 dump_printf (MSG_NOTE,
4579 "cost model: Adding cost of checks for loop "
4580 "versioning aliasing.\n");
4583 /* Requires loop versioning with niter checks. */
4584 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4586 /* FIXME: Make cost depend on complexity of individual check. */
4587 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4588 NULL, NULL, NULL_TREE, 0, vect_prologue);
4589 if (dump_enabled_p ())
4590 dump_printf (MSG_NOTE,
4591 "cost model: Adding cost of checks for loop "
4592 "versioning niters.\n");
4595 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4596 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4597 vect_prologue);
4599 /* Count statements in scalar loop. Using this as scalar cost for a single
4600 iteration for now.
4602 TODO: Add outer loop support.
4604 TODO: Consider assigning different costs to different scalar
4605 statements. */
4607 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4609 /* Add additional cost for the peeled instructions in prologue and epilogue
4610 loop. (For fully-masked loops there will be no peeling.)
4612 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4613 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4615 TODO: Build an expression that represents peel_iters for prologue and
4616 epilogue to be used in a run-time test. */
4618 bool prologue_need_br_taken_cost = false;
4619 bool prologue_need_br_not_taken_cost = false;
4621 /* Calculate peel_iters_prologue. */
4622 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4623 peel_iters_prologue = 0;
4624 else if (npeel < 0)
4626 peel_iters_prologue = assumed_vf / 2;
4627 if (dump_enabled_p ())
4628 dump_printf (MSG_NOTE, "cost model: "
4629 "prologue peel iters set to vf/2.\n");
4631 /* If peeled iterations are unknown, count a taken branch and a not taken
4632 branch per peeled loop. Even if scalar loop iterations are known,
4633 vector iterations are not known since peeled prologue iterations are
4634 not known. Hence guards remain the same. */
4635 prologue_need_br_taken_cost = true;
4636 prologue_need_br_not_taken_cost = true;
4638 else
4640 peel_iters_prologue = npeel;
4641 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4642 /* If peeled iterations are known but number of scalar loop
4643 iterations are unknown, count a taken branch per peeled loop. */
4644 prologue_need_br_taken_cost = true;
4647 bool epilogue_need_br_taken_cost = false;
4648 bool epilogue_need_br_not_taken_cost = false;
4650 /* Calculate peel_iters_epilogue. */
4651 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4652 /* We need to peel exactly one iteration for gaps. */
4653 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4654 else if (npeel < 0)
4656 /* If peeling for alignment is unknown, the loop bound of the main loop
4657 becomes unknown. */
4658 peel_iters_epilogue = assumed_vf / 2;
4659 if (dump_enabled_p ())
4660 dump_printf (MSG_NOTE, "cost model: "
4661 "epilogue peel iters set to vf/2 because "
4662 "peeling for alignment is unknown.\n");
4664 /* See the same reason above in peel_iters_prologue calculation. */
4665 epilogue_need_br_taken_cost = true;
4666 epilogue_need_br_not_taken_cost = true;
4668 else
4670 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4671 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4672 /* If peeled iterations are known but number of scalar loop
4673 iterations are unknown, count a taken branch per peeled loop. */
4674 epilogue_need_br_taken_cost = true;
4677 stmt_info_for_cost *si;
4678 int j;
4679 /* Add costs associated with peel_iters_prologue. */
4680 if (peel_iters_prologue)
4681 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4683 (void) add_stmt_cost (target_cost_data,
4684 si->count * peel_iters_prologue, si->kind,
4685 si->stmt_info, si->node, si->vectype,
4686 si->misalign, vect_prologue);
4689 /* Add costs associated with peel_iters_epilogue. */
4690 if (peel_iters_epilogue)
4691 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4693 (void) add_stmt_cost (target_cost_data,
4694 si->count * peel_iters_epilogue, si->kind,
4695 si->stmt_info, si->node, si->vectype,
4696 si->misalign, vect_epilogue);
4699 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4701 if (prologue_need_br_taken_cost)
4702 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4703 vect_prologue);
4705 if (prologue_need_br_not_taken_cost)
4706 (void) add_stmt_cost (target_cost_data, 1,
4707 cond_branch_not_taken, vect_prologue);
4709 if (epilogue_need_br_taken_cost)
4710 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4711 vect_epilogue);
4713 if (epilogue_need_br_not_taken_cost)
4714 (void) add_stmt_cost (target_cost_data, 1,
4715 cond_branch_not_taken, vect_epilogue);
4717 /* Take care of special costs for rgroup controls of partial vectors. */
4718 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4719 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4720 == vect_partial_vectors_avx512))
4722 /* Calculate how many masks we need to generate. */
4723 unsigned int num_masks = 0;
4724 bool need_saturation = false;
4725 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4726 if (rgm.type)
4728 unsigned nvectors = rgm.factor;
4729 num_masks += nvectors;
4730 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4731 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4732 need_saturation = true;
4735 /* ??? The target isn't able to identify the costs below as
4736 producing masks so it cannot penalize cases where we'd run
4737 out of mask registers for example. */
4739 /* ??? We are also failing to account for smaller vector masks
4740 we generate by splitting larger masks in vect_get_loop_mask. */
4742 /* In the worst case, we need to generate each mask in the prologue
4743 and in the loop body. We need one splat per group and one
4744 compare per mask.
4746 Sometimes the prologue mask will fold to a constant,
4747 so the actual prologue cost might be smaller. However, it's
4748 simpler and safer to use the worst-case cost; if this ends up
4749 being the tie-breaker between vectorizing or not, then it's
4750 probably better not to vectorize. */
4751 (void) add_stmt_cost (target_cost_data,
4752 num_masks
4753 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4754 vector_stmt, NULL, NULL, NULL_TREE, 0,
4755 vect_prologue);
4756 (void) add_stmt_cost (target_cost_data,
4757 num_masks
4758 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4759 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4761 /* When we need saturation we need it both in the prologue and
4762 the epilogue. */
4763 if (need_saturation)
4765 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4766 NULL, NULL, NULL_TREE, 0, vect_prologue);
4767 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4768 NULL, NULL, NULL_TREE, 0, vect_body);
4771 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4772 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4773 == vect_partial_vectors_while_ult))
4775 /* Calculate how many masks we need to generate. */
4776 unsigned int num_masks = 0;
4777 rgroup_controls *rgm;
4778 unsigned int num_vectors_m1;
4779 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4780 num_vectors_m1, rgm)
4781 if (rgm->type)
4782 num_masks += num_vectors_m1 + 1;
4783 gcc_assert (num_masks > 0);
4785 /* In the worst case, we need to generate each mask in the prologue
4786 and in the loop body. One of the loop body mask instructions
4787 replaces the comparison in the scalar loop, and since we don't
4788 count the scalar comparison against the scalar body, we shouldn't
4789 count that vector instruction against the vector body either.
4791 Sometimes we can use unpacks instead of generating prologue
4792 masks and sometimes the prologue mask will fold to a constant,
4793 so the actual prologue cost might be smaller. However, it's
4794 simpler and safer to use the worst-case cost; if this ends up
4795 being the tie-breaker between vectorizing or not, then it's
4796 probably better not to vectorize. */
4797 (void) add_stmt_cost (target_cost_data, num_masks,
4798 vector_stmt, NULL, NULL, NULL_TREE, 0,
4799 vect_prologue);
4800 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4801 vector_stmt, NULL, NULL, NULL_TREE, 0,
4802 vect_body);
4804 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4806 /* Referring to the functions vect_set_loop_condition_partial_vectors
4807 and vect_set_loop_controls_directly, we need to generate each
4808 length in the prologue and in the loop body if required. Although
4809 there are some possible optimizations, we consider the worst case
4810 here. */
4812 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4813 signed char partial_load_store_bias
4814 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4815 bool need_iterate_p
4816 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4817 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4819 /* Calculate how many statements to be added. */
4820 unsigned int prologue_stmts = 0;
4821 unsigned int body_stmts = 0;
4823 rgroup_controls *rgc;
4824 unsigned int num_vectors_m1;
4825 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4826 if (rgc->type)
4828 /* May need one SHIFT for nitems_total computation. */
4829 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4830 if (nitems != 1 && !niters_known_p)
4831 prologue_stmts += 1;
4833 /* May need one MAX and one MINUS for wrap around. */
4834 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4835 prologue_stmts += 2;
4837 /* Need one MAX and one MINUS for each batch limit except for
4838 the 1st one. */
4839 prologue_stmts += num_vectors_m1 * 2;
4841 unsigned int num_vectors = num_vectors_m1 + 1;
4843 /* Need to set up lengths in prologue, only one MIN required
4844 for each since start index is zero. */
4845 prologue_stmts += num_vectors;
4847 /* If we have a non-zero partial load bias, we need one PLUS
4848 to adjust the load length. */
4849 if (partial_load_store_bias != 0)
4850 body_stmts += 1;
4852 /* Each may need two MINs and one MINUS to update lengths in body
4853 for next iteration. */
4854 if (need_iterate_p)
4855 body_stmts += 3 * num_vectors;
4858 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4859 scalar_stmt, vect_prologue);
4860 (void) add_stmt_cost (target_cost_data, body_stmts,
4861 scalar_stmt, vect_body);
4864 /* FORNOW: The scalar outside cost is incremented in one of the
4865 following ways:
4867 1. The vectorizer checks for alignment and aliasing and generates
4868 a condition that allows dynamic vectorization. A cost model
4869 check is ANDED with the versioning condition. Hence scalar code
4870 path now has the added cost of the versioning check.
4872 if (cost > th & versioning_check)
4873 jmp to vector code
4875 Hence run-time scalar is incremented by not-taken branch cost.
4877 2. The vectorizer then checks if a prologue is required. If the
4878 cost model check was not done before during versioning, it has to
4879 be done before the prologue check.
4881 if (cost <= th)
4882 prologue = scalar_iters
4883 if (prologue == 0)
4884 jmp to vector code
4885 else
4886 execute prologue
4887 if (prologue == num_iters)
4888 go to exit
4890 Hence the run-time scalar cost is incremented by a taken branch,
4891 plus a not-taken branch, plus a taken branch cost.
4893 3. The vectorizer then checks if an epilogue is required. If the
4894 cost model check was not done before during prologue check, it
4895 has to be done with the epilogue check.
4897 if (prologue == 0)
4898 jmp to vector code
4899 else
4900 execute prologue
4901 if (prologue == num_iters)
4902 go to exit
4903 vector code:
4904 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4905 jmp to epilogue
4907 Hence the run-time scalar cost should be incremented by 2 taken
4908 branches.
4910 TODO: The back end may reorder the BBS's differently and reverse
4911 conditions/branch directions. Change the estimates below to
4912 something more reasonable. */
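  /* Concretely, the code below charges the scalar path
     cond_branch_not_taken for case 1 (versioning), two cond_branch_taken
     plus one cond_branch_not_taken for case 2 (cost check folded into the
     prologue guard), and two cond_branch_taken for case 3 (cost check
     folded into the epilogue guard).  */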
4914 /* If the number of iterations is known and we do not do versioning, we can
4915 decide whether to vectorize at compile time. Hence the scalar version
4916 does not carry cost model guard costs. */
4917 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4918 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4920 /* Cost model check occurs at versioning. */
4921 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4922 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4923 else
4925 /* Cost model check occurs at prologue generation. */
4926 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4927 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4928 + vect_get_stmt_cost (cond_branch_not_taken);
4929 /* Cost model check occurs at epilogue generation. */
4930 else
4931 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4935 /* Complete the target-specific cost calculations. */
4936 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4937 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4938 suggested_unroll_factor);
4940 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4941 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4942 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4943 *suggested_unroll_factor,
4944 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4946 if (dump_enabled_p ())
4947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4948 "can't unroll as unrolled vectorization factor larger"
4949 " than maximum vectorization factor: "
4950 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4951 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4952 *suggested_unroll_factor = 1;
4955 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4957 if (dump_enabled_p ())
4959 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4960 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4961 vec_inside_cost);
4962 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4963 vec_prologue_cost);
4964 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4965 vec_epilogue_cost);
4966 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4967 scalar_single_iter_cost);
4968 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4969 scalar_outside_cost);
4970 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4971 vec_outside_cost);
4972 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4973 peel_iters_prologue);
4974 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4975 peel_iters_epilogue);
4978 /* Calculate number of iterations required to make the vector version
4979 profitable, relative to the loop bodies only. The following condition
4980 must hold true:
4981 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4982 where
4983 SIC = scalar iteration cost, VIC = vector iteration cost,
4984 VOC = vector outside cost, VF = vectorization factor,
4985 NPEEL = prologue iterations + epilogue iterations,
4986 SOC = scalar outside cost for run time cost model check. */
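  /* As a purely illustrative instance with made-up costs: SIC = 1, VIC = 4,
     VF = 8, NPEEL = 2, VOC = 20 and SOC = 4 turn the condition into
       niters + 4 > 4 * ((niters - 2) / 8) + 20
     which, treating the division as exact, first holds for niters >= 31, so
     roughly 31 scalar iterations are needed before vectorization pays off
     under these numbers.  */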
4988 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4989 - vec_inside_cost);
4990 if (saving_per_viter <= 0)
4992 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4993 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4994 "vectorization did not happen for a simd loop");
4996 if (dump_enabled_p ())
4997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4998 "cost model: the vector iteration cost = %d "
4999 "divided by the scalar iteration cost = %d "
5000 "is greater or equal to the vectorization factor = %d"
5001 ".\n",
5002 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5003 *ret_min_profitable_niters = -1;
5004 *ret_min_profitable_estimate = -1;
5005 return;
5008 /* ??? The "if" arm is written to handle all cases; see below for what
5009 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5010 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5012 /* Rewriting the condition above in terms of the number of
5013 vector iterations (vniters) rather than the number of
5014 scalar iterations (niters) gives:
5016 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5018 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5020 For integer N, X and Y when X > 0:
5022 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5023 int outside_overhead = (vec_outside_cost
5024 - scalar_single_iter_cost * peel_iters_prologue
5025 - scalar_single_iter_cost * peel_iters_epilogue
5026 - scalar_outside_cost);
5027 /* We're only interested in cases that require at least one
5028 vector iteration. */
5029 int min_vec_niters = 1;
5030 if (outside_overhead > 0)
5031 min_vec_niters = outside_overhead / saving_per_viter + 1;
5033 if (dump_enabled_p ())
5034 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
5035 min_vec_niters);
5037 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5039 /* Now that we know the minimum number of vector iterations,
5040 find the minimum niters for which the scalar cost is larger:
5042 SIC * niters > VIC * vniters + VOC - SOC
5044 We know that the minimum niters is no more than
5045 vniters * VF + NPEEL, but it might be (and often is) less
5046 than that if a partial vector iteration is cheaper than the
5047 equivalent scalar code. */
5048 int threshold = (vec_inside_cost * min_vec_niters
5049 + vec_outside_cost
5050 - scalar_outside_cost);
5051 if (threshold <= 0)
5052 min_profitable_iters = 1;
5053 else
5054 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5056 else
5057 /* Convert the number of vector iterations into a number of
5058 scalar iterations. */
5059 min_profitable_iters = (min_vec_niters * assumed_vf
5060 + peel_iters_prologue
5061 + peel_iters_epilogue);
5063 else
5065 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5066 * assumed_vf
5067 - vec_inside_cost * peel_iters_prologue
5068 - vec_inside_cost * peel_iters_epilogue);
5069 if (min_profitable_iters <= 0)
5070 min_profitable_iters = 0;
5071 else
5073 min_profitable_iters /= saving_per_viter;
5075 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5076 <= (((int) vec_inside_cost * min_profitable_iters)
5077 + (((int) vec_outside_cost - scalar_outside_cost)
5078 * assumed_vf)))
5079 min_profitable_iters++;
5083 if (dump_enabled_p ())
5084 dump_printf (MSG_NOTE,
5085 " Calculated minimum iters for profitability: %d\n",
5086 min_profitable_iters);
5088 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5089 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5090 /* We want the vectorized loop to execute at least once. */
5091 min_profitable_iters = assumed_vf + peel_iters_prologue;
5092 else if (min_profitable_iters < peel_iters_prologue)
5093 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5094 vectorized loop executes at least once. */
5095 min_profitable_iters = peel_iters_prologue;
5097 if (dump_enabled_p ())
5098 dump_printf_loc (MSG_NOTE, vect_location,
5099 " Runtime profitability threshold = %d\n",
5100 min_profitable_iters);
5102 *ret_min_profitable_niters = min_profitable_iters;
5104 /* Calculate number of iterations required to make the vector version
5105 profitable, relative to the loop bodies only.
5107 The cost of the non-vectorized variant is SIC * niters; the vector variant
5108 must beat it at the expected loop trip count, i.e. the following condition must hold true:
5109 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5111 if (vec_outside_cost <= 0)
5112 min_profitable_estimate = 0;
5113 /* ??? This "else if" arm is written to handle all cases; see below for
5114 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5115 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5117 /* This is a repeat of the code above, but with + SOC rather
5118 than - SOC. */
5119 int outside_overhead = (vec_outside_cost
5120 - scalar_single_iter_cost * peel_iters_prologue
5121 - scalar_single_iter_cost * peel_iters_epilogue
5122 + scalar_outside_cost);
5123 int min_vec_niters = 1;
5124 if (outside_overhead > 0)
5125 min_vec_niters = outside_overhead / saving_per_viter + 1;
5127 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5129 int threshold = (vec_inside_cost * min_vec_niters
5130 + vec_outside_cost
5131 + scalar_outside_cost);
5132 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5134 else
5135 min_profitable_estimate = (min_vec_niters * assumed_vf
5136 + peel_iters_prologue
5137 + peel_iters_epilogue);
5139 else
5141 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5142 * assumed_vf
5143 - vec_inside_cost * peel_iters_prologue
5144 - vec_inside_cost * peel_iters_epilogue)
5145 / ((scalar_single_iter_cost * assumed_vf)
5146 - vec_inside_cost);
5148 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5149 if (dump_enabled_p ())
5150 dump_printf_loc (MSG_NOTE, vect_location,
5151 " Static estimate profitability threshold = %d\n",
5152 min_profitable_estimate);
5154 *ret_min_profitable_estimate = min_profitable_estimate;
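/* Editorial illustration (not part of GCC): a minimal, brute-force sketch of
   the profitability inequality documented above,
       SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC,
   which the code above solves in closed form.  All names below are
   hypothetical, and the integer division mirrors the comment rather than the
   exact GCC computation.  */

static int
sketch_min_profitable_niters (int sic, int vic, int voc, int soc,
                              int vf, int npeel, int max_niters)
{
  for (int niters = npeel; niters <= max_niters; ++niters)
    {
      int vniters = (niters - npeel) / vf;        /* full vector iterations */
      long scalar_cost = (long) sic * niters;
      long vector_cost = (long) vic * vniters + voc + soc;
      if (scalar_cost > vector_cost)
        return niters;                            /* first profitable trip count */
    }
  return -1;                                      /* never profitable up to the bound */
}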
5157 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5158 vector elements (not bits) for a vector with NELT elements. */
5159 static void
5160 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5161 vec_perm_builder *sel)
5163 /* The encoding is a single stepped pattern. Any wrap-around is handled
5164 by vec_perm_indices. */
5165 sel->new_vector (nelt, 1, 3);
5166 for (unsigned int i = 0; i < 3; i++)
5167 sel->quick_push (i + offset);
5170 /* Checks whether the target supports whole-vector shifts for vectors of mode
5171 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5172 it supports vec_perm_const with masks for all necessary shift amounts. */
5173 static bool
5174 have_whole_vector_shift (machine_mode mode)
5176 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5177 return true;
5179 /* Variable-length vectors should be handled via the optab. */
5180 unsigned int nelt;
5181 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5182 return false;
5184 vec_perm_builder sel;
5185 vec_perm_indices indices;
5186 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5188 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5189 indices.new_vector (sel, 2, nelt);
5190 if (!can_vec_perm_const_p (mode, mode, indices, false))
5191 return false;
5193 return true;
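/* Editorial illustration (not part of GCC): a scalar model of the
   whole-vector shift that the permute mask above encodes.  Selecting element
   I+OFFSET, and taking zero once the index runs past the end, matches how the
   mask is later used with a zero second operand in the shift-based reduction
   epilogue.  Hypothetical, simplified names.  */

static void
sketch_vec_shr (const int *src, int *dst, unsigned int nelt,
                unsigned int offset)
{
  for (unsigned int i = 0; i < nelt; ++i)
    dst[i] = (i + offset < nelt) ? src[i + offset] : 0;
}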
5196 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5197 multiplication operands have differing signs and (b) we intend
5198 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5199 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5201 static bool
5202 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5203 stmt_vec_info stmt_info)
5205 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5206 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5207 return false;
5209 tree rhs1 = gimple_assign_rhs1 (assign);
5210 tree rhs2 = gimple_assign_rhs2 (assign);
5211 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5212 return false;
5214 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5215 gcc_assert (reduc_info->is_reduc_info);
5216 return !directly_supported_p (DOT_PROD_EXPR,
5217 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5218 optab_vector_mixed_sign);
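/* Editorial illustration (not part of GCC): the scalar form of a mixed-sign
   DOT_PROD_EXPR reduction, i.e. one multiplication operand signed and the
   other unsigned.  When the target only provides same-sign variants, the
   vectorizer emulates this with signed DOT_PROD_EXPRs (see
   vect_emulate_mixed_dot_prod).  Hypothetical names.  */

static int
sketch_mixed_sign_dot_prod (const signed char *a, const unsigned char *b,
                            int n)
{
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += (int) a[i] * (int) b[i];     /* widening multiply-accumulate */
  return sum;
}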
5221 /* TODO: There is a close dependency between vect_model_*_cost and the
5222 vectorizable_* functions. Design this better to avoid maintenance issues. */
5224 /* Function vect_model_reduction_cost.
5226 Models cost for a reduction operation, including the vector ops
5227 generated within the strip-mine loop in some cases, the initial
5228 definition before the loop, and the epilogue code that must be generated. */
5230 static void
5231 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5232 stmt_vec_info stmt_info, internal_fn reduc_fn,
5233 vect_reduction_type reduction_type,
5234 int ncopies, stmt_vector_for_cost *cost_vec)
5236 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5237 tree vectype;
5238 machine_mode mode;
5239 class loop *loop = NULL;
5241 if (loop_vinfo)
5242 loop = LOOP_VINFO_LOOP (loop_vinfo);
5244 /* Condition reductions generate two reductions in the loop. */
5245 if (reduction_type == COND_REDUCTION)
5246 ncopies *= 2;
5248 vectype = STMT_VINFO_VECTYPE (stmt_info);
5249 mode = TYPE_MODE (vectype);
5250 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5252 gimple_match_op op;
5253 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5254 gcc_unreachable ();
5256 bool emulated_mixed_dot_prod
5257 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5258 if (reduction_type == EXTRACT_LAST_REDUCTION)
5259 /* No extra instructions are needed in the prologue. The loop body
5260 operations are costed in vectorizable_condition. */
5261 inside_cost = 0;
5262 else if (reduction_type == FOLD_LEFT_REDUCTION)
5264 /* No extra instructions needed in the prologue. */
5265 prologue_cost = 0;
5267 if (reduc_fn != IFN_LAST)
5268 /* Count one reduction-like operation per vector. */
5269 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5270 stmt_info, 0, vect_body);
5271 else
5273 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5274 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5275 inside_cost = record_stmt_cost (cost_vec, nelements,
5276 vec_to_scalar, stmt_info, 0,
5277 vect_body);
5278 inside_cost += record_stmt_cost (cost_vec, nelements,
5279 scalar_stmt, stmt_info, 0,
5280 vect_body);
5283 else
5285 /* Add in the cost of the initial definitions. */
5286 int prologue_stmts;
5287 if (reduction_type == COND_REDUCTION)
5288 /* For cond reductions we have four vectors: initial index, step,
5289 initial result of the data reduction, initial value of the index
5290 reduction. */
5291 prologue_stmts = 4;
5292 else if (emulated_mixed_dot_prod)
5293 /* We need the initial reduction value and two invariants:
5294 one that contains the minimum signed value and one that
5295 contains half of its negative. */
5296 prologue_stmts = 3;
5297 else
5298 prologue_stmts = 1;
5299 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5300 scalar_to_vec, stmt_info, 0,
5301 vect_prologue);
5304 /* Determine cost of epilogue code.
5306 We have a reduction operator that will reduce the vector in one statement.
5307 Also requires scalar extract. */
5309 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5311 if (reduc_fn != IFN_LAST)
5313 if (reduction_type == COND_REDUCTION)
5315 /* An EQ stmt and a COND_EXPR stmt. */
5316 epilogue_cost += record_stmt_cost (cost_vec, 2,
5317 vector_stmt, stmt_info, 0,
5318 vect_epilogue);
5319 /* Reduction of the max index and a reduction of the found
5320 values. */
5321 epilogue_cost += record_stmt_cost (cost_vec, 2,
5322 vec_to_scalar, stmt_info, 0,
5323 vect_epilogue);
5324 /* A broadcast of the max value. */
5325 epilogue_cost += record_stmt_cost (cost_vec, 1,
5326 scalar_to_vec, stmt_info, 0,
5327 vect_epilogue);
5329 else
5331 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5332 stmt_info, 0, vect_epilogue);
5333 epilogue_cost += record_stmt_cost (cost_vec, 1,
5334 vec_to_scalar, stmt_info, 0,
5335 vect_epilogue);
5338 else if (reduction_type == COND_REDUCTION)
5340 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5341 /* Extraction of scalar elements. */
5342 epilogue_cost += record_stmt_cost (cost_vec,
5343 2 * estimated_nunits,
5344 vec_to_scalar, stmt_info, 0,
5345 vect_epilogue);
5346 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5347 epilogue_cost += record_stmt_cost (cost_vec,
5348 2 * estimated_nunits - 3,
5349 scalar_stmt, stmt_info, 0,
5350 vect_epilogue);
5352 else if (reduction_type == EXTRACT_LAST_REDUCTION
5353 || reduction_type == FOLD_LEFT_REDUCTION)
5354 /* No extra instructions are needed in the epilogue. */
5356 else
5358 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5359 tree bitsize = TYPE_SIZE (op.type);
5360 int element_bitsize = tree_to_uhwi (bitsize);
5361 int nelements = vec_size_in_bits / element_bitsize;
5363 if (op.code == COND_EXPR)
5364 op.code = MAX_EXPR;
5366 /* We have a whole vector shift available. */
5367 if (VECTOR_MODE_P (mode)
5368 && directly_supported_p (op.code, vectype)
5369 && have_whole_vector_shift (mode))
5371 /* Final reduction via vector shifts and the reduction operator.
5372 Also requires scalar extract. */
5373 epilogue_cost += record_stmt_cost (cost_vec,
5374 exact_log2 (nelements) * 2,
5375 vector_stmt, stmt_info, 0,
5376 vect_epilogue);
5377 epilogue_cost += record_stmt_cost (cost_vec, 1,
5378 vec_to_scalar, stmt_info, 0,
5379 vect_epilogue);
5381 else
5382 /* Use extracts and reduction op for final reduction. For N
5383 elements, we have N extracts and N-1 reduction ops. */
5384 epilogue_cost += record_stmt_cost (cost_vec,
5385 nelements + nelements - 1,
5386 vector_stmt, stmt_info, 0,
5387 vect_epilogue);
5391 if (dump_enabled_p ())
5392 dump_printf (MSG_NOTE,
5393 "vect_model_reduction_cost: inside_cost = %d, "
5394 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5395 prologue_cost, epilogue_cost);
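/* Editorial illustration (not part of GCC): the three cost buckets that the
   function above fills in.  A reduction contributes statements to the
   prologue (initial vector definitions before the loop), to the loop body,
   and to the epilogue (the final cross-lane reduction and scalar extract);
   the overall cost is simply their sum.  Hypothetical, simplified type.  */

struct sketch_reduction_cost
{
  int prologue_cost;    /* initial definitions before the loop */
  int inside_cost;      /* per-copy vector ops inside the loop */
  int epilogue_cost;    /* final reduction and scalar extraction */

  int total () const
  { return prologue_cost + inside_cost + epilogue_cost; }
};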
5398 /* SEQ is a sequence of instructions that initialize the reduction
5399 described by REDUC_INFO. Emit them in the appropriate place. */
5401 static void
5402 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5403 stmt_vec_info reduc_info, gimple *seq)
5405 if (reduc_info->reused_accumulator)
5407 /* When reusing an accumulator from the main loop, we only need
5408 initialization instructions if the main loop can be skipped.
5409 In that case, emit the initialization instructions at the end
5410 of the guard block that does the skip. */
5411 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5412 gcc_assert (skip_edge);
5413 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5414 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5416 else
5418 /* The normal case: emit the initialization instructions on the
5419 preheader edge. */
5420 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5421 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5425 /* Function get_initial_def_for_reduction
5427 Input:
5428 REDUC_INFO - the info_for_reduction
5429 INIT_VAL - the initial value of the reduction variable
5430 NEUTRAL_OP - a value that has no effect on the reduction, as per
5431 neutral_op_for_reduction
5433 Output:
5434 Return a vector variable, initialized according to the operation that
5435 STMT_VINFO performs. This vector will be used as the initial value
5436 of the vector of partial results.
5438 The value we need is a vector in which element 0 has value INIT_VAL
5439 and every other element has value NEUTRAL_OP. */
5441 static tree
5442 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5443 stmt_vec_info reduc_info,
5444 tree init_val, tree neutral_op)
5446 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5447 tree scalar_type = TREE_TYPE (init_val);
5448 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5449 tree init_def;
5450 gimple_seq stmts = NULL;
5452 gcc_assert (vectype);
5454 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5455 || SCALAR_FLOAT_TYPE_P (scalar_type));
5457 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5458 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5460 if (operand_equal_p (init_val, neutral_op))
5462 /* If both elements are equal then the vector described above is
5463 just a splat. */
5464 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5465 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5467 else
5469 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5470 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5471 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5473 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5474 element 0. */
5475 init_def = gimple_build_vector_from_val (&stmts, vectype,
5476 neutral_op);
5477 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5478 vectype, init_def, init_val);
5480 else
5482 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5483 tree_vector_builder elts (vectype, 1, 2);
5484 elts.quick_push (init_val);
5485 elts.quick_push (neutral_op);
5486 init_def = gimple_build_vector (&stmts, &elts);
5490 if (stmts)
5491 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5492 return init_def;
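/* Editorial illustration (not part of GCC): the fixed-width form of the
   vector built above -- element 0 carries INIT_VAL and every other lane
   carries NEUTRAL_OP (e.g. {init, 0, 0, 0} for a sum reduction).
   Hypothetical names; the real code also handles variable-length vectors
   via CFN_VEC_SHL_INSERT.  */

static void
sketch_initial_def_for_reduction (int *vec, unsigned int nunits,
                                  int init_val, int neutral_op)
{
  vec[0] = init_val;
  for (unsigned int i = 1; i < nunits; ++i)
    vec[i] = neutral_op;
}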
5495 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5496 which performs a reduction involving GROUP_SIZE scalar statements.
5497 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5498 is nonnull, introducing extra elements of that value will not change the
5499 result. */
5501 static void
5502 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5503 stmt_vec_info reduc_info,
5504 vec<tree> *vec_oprnds,
5505 unsigned int number_of_vectors,
5506 unsigned int group_size, tree neutral_op)
5508 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5509 unsigned HOST_WIDE_INT nunits;
5510 unsigned j, number_of_places_left_in_vector;
5511 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5512 unsigned int i;
5514 gcc_assert (group_size == initial_values.length () || neutral_op);
5516 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5517 created vectors. It is greater than 1 if unrolling is performed.
5519 For example, we have two scalar operands, s1 and s2 (e.g., group of
5520 strided accesses of size two), while NUNITS is four (i.e., four scalars
5521 of this type can be packed in a vector). The output vector will contain
5522 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5523 will be 2).
5525 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5526 vectors containing the operands.
5528 For example, NUNITS is four as before, and the group size is 8
5529 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5530 {s5, s6, s7, s8}. */
5532 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5533 nunits = group_size;
5535 number_of_places_left_in_vector = nunits;
5536 bool constant_p = true;
5537 tree_vector_builder elts (vector_type, nunits, 1);
5538 elts.quick_grow (nunits);
5539 gimple_seq ctor_seq = NULL;
5540 for (j = 0; j < nunits * number_of_vectors; ++j)
5542 tree op;
5543 i = j % group_size;
5545 /* Get the def before the loop. In reduction chain we have only
5546 one initial value. Else we have as many as PHIs in the group. */
5547 if (i >= initial_values.length () || (j > i && neutral_op))
5548 op = neutral_op;
5549 else
5550 op = initial_values[i];
5552 /* Create 'vect_ = {op0,op1,...,opn}'. */
5553 number_of_places_left_in_vector--;
5554 elts[nunits - number_of_places_left_in_vector - 1] = op;
5555 if (!CONSTANT_CLASS_P (op))
5556 constant_p = false;
5558 if (number_of_places_left_in_vector == 0)
5560 tree init;
5561 if (constant_p && !neutral_op
5562 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5563 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5564 /* Build the vector directly from ELTS. */
5565 init = gimple_build_vector (&ctor_seq, &elts);
5566 else if (neutral_op)
5568 /* Build a vector of the neutral value and shift the
5569 other elements into place. */
5570 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5571 neutral_op);
5572 int k = nunits;
5573 while (k > 0 && elts[k - 1] == neutral_op)
5574 k -= 1;
5575 while (k > 0)
5577 k -= 1;
5578 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5579 vector_type, init, elts[k]);
5582 else
5584 /* First time round, duplicate ELTS to fill the
5585 required number of vectors. */
5586 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5587 elts, number_of_vectors, *vec_oprnds);
5588 break;
5590 vec_oprnds->quick_push (init);
5592 number_of_places_left_in_vector = nunits;
5593 elts.new_vector (vector_type, nunits, 1);
5594 elts.quick_grow (nunits);
5595 constant_p = true;
5598 if (ctor_seq != NULL)
5599 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
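/* Editorial illustration (not part of GCC): how the loop above distributes a
   group's initial values across vectors when no neutral value is available --
   the scalars are repeated round-robin, so {s1, s2} with NUNITS == 4 becomes
   {s1, s2, s1, s2}, and a group of 8 with NUNITS == 4 fills two vectors.
   When a neutral value exists, only the first copy of each scalar is kept and
   later repeats take the neutral value instead.  Hypothetical names; OUT is
   assumed to hold NUNITS * NUMBER_OF_VECTORS lanes.  */

static void
sketch_pack_group_initial_values (const int *scalars, unsigned int group_size,
                                  int *out, unsigned int nunits,
                                  unsigned int number_of_vectors)
{
  for (unsigned int j = 0; j < nunits * number_of_vectors; ++j)
    out[j] = scalars[j % group_size];   /* round-robin repetition */
}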
5602 /* For a statement STMT_INFO taking part in a reduction operation return
5603 the stmt_vec_info the meta information is stored on. */
5605 stmt_vec_info
5606 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5608 stmt_info = vect_orig_stmt (stmt_info);
5609 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5610 if (!is_a <gphi *> (stmt_info->stmt)
5611 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5612 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5613 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5614 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5616 if (gimple_phi_num_args (phi) == 1)
5617 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5619 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5621 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5622 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5623 stmt_info = info;
5625 return stmt_info;
5628 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5629 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5630 return false. */
5632 static bool
5633 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5634 stmt_vec_info reduc_info)
5636 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5637 if (!main_loop_vinfo)
5638 return false;
5640 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5641 return false;
5643 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5644 auto_vec<tree, 16> main_loop_results (num_phis);
5645 auto_vec<tree, 16> initial_values (num_phis);
5646 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5648 /* The epilogue loop can be entered either from the main loop or
5649 from an earlier guard block. */
5650 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5651 for (tree incoming_value : reduc_info->reduc_initial_values)
5653 /* Look for:
5655 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5656 INITIAL_VALUE(guard block)>. */
5657 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5659 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5660 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5662 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5663 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5665 main_loop_results.quick_push (from_main_loop);
5666 initial_values.quick_push (from_skip);
5669 else
5670 /* The main loop dominates the epilogue loop. */
5671 main_loop_results.splice (reduc_info->reduc_initial_values);
5673 /* See if the main loop has the kind of accumulator we need. */
5674 vect_reusable_accumulator *accumulator
5675 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5676 if (!accumulator
5677 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5678 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5679 accumulator->reduc_info->reduc_scalar_results.begin ()))
5680 return false;
5682 /* Handle the case where we can reduce wider vectors to narrower ones. */
5683 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5684 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5685 unsigned HOST_WIDE_INT m;
5686 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5687 TYPE_VECTOR_SUBPARTS (vectype), &m))
5688 return false;
5689 /* Check the intermediate vector types and operations are available. */
5690 tree prev_vectype = old_vectype;
5691 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5692 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5694 intermediate_nunits = exact_div (intermediate_nunits, 2);
5695 tree intermediate_vectype = get_related_vectype_for_scalar_type
5696 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5697 if (!intermediate_vectype
5698 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5699 intermediate_vectype)
5700 || !can_vec_extract (TYPE_MODE (prev_vectype),
5701 TYPE_MODE (intermediate_vectype)))
5702 return false;
5703 prev_vectype = intermediate_vectype;
5706 /* Non-SLP reductions might apply an adjustment after the reduction
5707 operation, in order to simplify the initialization of the accumulator.
5708 If the epilogue loop carries on from where the main loop left off,
5709 it should apply the same adjustment to the final reduction result.
5711 If the epilogue loop can also be entered directly (rather than via
5712 the main loop), we need to be able to handle that case in the same way,
5713 with the same adjustment. (In principle we could add a PHI node
5714 to select the correct adjustment, but in practice that shouldn't be
5715 necessary.) */
5716 tree main_adjustment
5717 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5718 if (loop_vinfo->main_loop_edge && main_adjustment)
5720 gcc_assert (num_phis == 1);
5721 tree initial_value = initial_values[0];
5722 /* Check that we can use INITIAL_VALUE as the adjustment and
5723 initialize the accumulator with a neutral value instead. */
5724 if (!operand_equal_p (initial_value, main_adjustment))
5725 return false;
5726 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5727 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5728 code, initial_value);
5730 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5731 reduc_info->reduc_initial_values.truncate (0);
5732 reduc_info->reduc_initial_values.splice (initial_values);
5733 reduc_info->reused_accumulator = accumulator;
5734 return true;
5737 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5738 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5740 static tree
5741 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5742 gimple_seq *seq)
5744 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5745 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5746 tree stype = TREE_TYPE (vectype);
5747 tree new_temp = vec_def;
5748 while (nunits > nunits1)
5750 nunits /= 2;
5751 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5752 stype, nunits);
5753 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5755 /* The target has to make sure we support lowpart/highpart
5756 extraction, either via direct vector extract or through
5757 an integer mode punning. */
5758 tree dst1, dst2;
5759 gimple *epilog_stmt;
5760 if (convert_optab_handler (vec_extract_optab,
5761 TYPE_MODE (TREE_TYPE (new_temp)),
5762 TYPE_MODE (vectype1))
5763 != CODE_FOR_nothing)
5765 /* Extract sub-vectors directly once vec_extract becomes
5766 a conversion optab. */
5767 dst1 = make_ssa_name (vectype1);
5768 epilog_stmt
5769 = gimple_build_assign (dst1, BIT_FIELD_REF,
5770 build3 (BIT_FIELD_REF, vectype1,
5771 new_temp, TYPE_SIZE (vectype1),
5772 bitsize_int (0)));
5773 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5774 dst2 = make_ssa_name (vectype1);
5775 epilog_stmt
5776 = gimple_build_assign (dst2, BIT_FIELD_REF,
5777 build3 (BIT_FIELD_REF, vectype1,
5778 new_temp, TYPE_SIZE (vectype1),
5779 bitsize_int (bitsize)));
5780 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5782 else
5784 /* Extract via punning to appropriately sized integer mode
5785 vector. */
5786 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5787 tree etype = build_vector_type (eltype, 2);
5788 gcc_assert (convert_optab_handler (vec_extract_optab,
5789 TYPE_MODE (etype),
5790 TYPE_MODE (eltype))
5791 != CODE_FOR_nothing);
5792 tree tem = make_ssa_name (etype);
5793 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5794 build1 (VIEW_CONVERT_EXPR,
5795 etype, new_temp));
5796 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5797 new_temp = tem;
5798 tem = make_ssa_name (eltype);
5799 epilog_stmt
5800 = gimple_build_assign (tem, BIT_FIELD_REF,
5801 build3 (BIT_FIELD_REF, eltype,
5802 new_temp, TYPE_SIZE (eltype),
5803 bitsize_int (0)));
5804 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5805 dst1 = make_ssa_name (vectype1);
5806 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5807 build1 (VIEW_CONVERT_EXPR,
5808 vectype1, tem));
5809 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5810 tem = make_ssa_name (eltype);
5811 epilog_stmt
5812 = gimple_build_assign (tem, BIT_FIELD_REF,
5813 build3 (BIT_FIELD_REF, eltype,
5814 new_temp, TYPE_SIZE (eltype),
5815 bitsize_int (bitsize)));
5816 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5817 dst2 = make_ssa_name (vectype1);
5818 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5819 build1 (VIEW_CONVERT_EXPR,
5820 vectype1, tem));
5821 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5824 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5827 return new_temp;
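/* Editorial illustration (not part of GCC): the halving scheme used above,
   shown for a plain integer addition.  Each step extracts the low and high
   halves of the current vector and combines them lane-wise with the
   reduction operation until the requested width is reached.  Hypothetical
   names; VEC is modified in place and NUNITS is assumed to be a power of
   two multiple of TARGET_NUNITS.  */

static void
sketch_partial_epilog_add (int *vec, unsigned int nunits,
                           unsigned int target_nunits)
{
  while (nunits > target_nunits)
    {
      nunits /= 2;
      for (unsigned int i = 0; i < nunits; ++i)
        vec[i] = vec[i] + vec[i + nunits];      /* low half op high half */
    }
}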
5830 /* Function vect_create_epilog_for_reduction
5832 Create code at the loop-epilog to finalize the result of a reduction
5833 computation.
5835 STMT_INFO is the scalar reduction stmt that is being vectorized.
5836 SLP_NODE is an SLP node containing a group of reduction statements. The
5837 first one in this group is STMT_INFO.
5838 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5839 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5840 (counting from 0)
5842 This function:
5843 1. Completes the reduction def-use cycles.
5844 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5845 by calling the function specified by REDUC_FN if available, or by
5846 other means (whole-vector shifts or a scalar loop).
5847 The function also creates a new phi node at the loop exit to preserve
5848 loop-closed form, as illustrated below.
5850 The flow at the entry to this function:
5852 loop:
5853 vec_def = phi <vec_init, null> # REDUCTION_PHI
5854 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5855 s_loop = scalar_stmt # (scalar) STMT_INFO
5856 loop_exit:
5857 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5858 use <s_out0>
5859 use <s_out0>
5861 The above is transformed by this function into:
5863 loop:
5864 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5865 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5866 s_loop = scalar_stmt # (scalar) STMT_INFO
5867 loop_exit:
5868 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5869 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5870 v_out2 = reduce <v_out1>
5871 s_out3 = extract_field <v_out2, 0>
5872 s_out4 = adjust_result <s_out3>
5873 use <s_out4>
5874 use <s_out4>
5877 static void
5878 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5879 stmt_vec_info stmt_info,
5880 slp_tree slp_node,
5881 slp_instance slp_node_instance)
5883 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5884 gcc_assert (reduc_info->is_reduc_info);
5885 /* For double reductions we need to get at the inner loop reduction
5886 stmt which has the meta info attached. Our stmt_info is that of the
5887 loop-closed PHI of the inner loop which we remember as
5888 def for the reduction PHI generation. */
5889 bool double_reduc = false;
5890 stmt_vec_info rdef_info = stmt_info;
5891 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5893 gcc_assert (!slp_node);
5894 double_reduc = true;
5895 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5896 (stmt_info->stmt, 0));
5897 stmt_info = vect_stmt_to_vectorize (stmt_info);
5899 gphi *reduc_def_stmt
5900 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5901 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5902 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5903 tree vectype;
5904 machine_mode mode;
5905 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5906 basic_block exit_bb;
5907 tree scalar_dest;
5908 tree scalar_type;
5909 gimple *new_phi = NULL, *phi = NULL;
5910 gimple_stmt_iterator exit_gsi;
5911 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5912 gimple *epilog_stmt = NULL;
5913 gimple *exit_phi;
5914 tree bitsize;
5915 tree def;
5916 tree orig_name, scalar_result;
5917 imm_use_iterator imm_iter, phi_imm_iter;
5918 use_operand_p use_p, phi_use_p;
5919 gimple *use_stmt;
5920 auto_vec<tree> reduc_inputs;
5921 int j, i;
5922 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5923 unsigned int group_size = 1, k;
5924 auto_vec<gimple *> phis;
5925 /* SLP reduction without reduction chain, e.g.,
5926 # a1 = phi <a2, a0>
5927 # b1 = phi <b2, b0>
5928 a2 = operation (a1)
5929 b2 = operation (b1) */
5930 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5931 bool direct_slp_reduc;
5932 tree induction_index = NULL_TREE;
5934 if (slp_node)
5935 group_size = SLP_TREE_LANES (slp_node);
5937 if (nested_in_vect_loop_p (loop, stmt_info))
5939 outer_loop = loop;
5940 loop = loop->inner;
5941 gcc_assert (!slp_node && double_reduc);
5944 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5945 gcc_assert (vectype);
5946 mode = TYPE_MODE (vectype);
5948 tree induc_val = NULL_TREE;
5949 tree adjustment_def = NULL;
5950 if (slp_node)
5952 else
5954 /* Optimize: for induction condition reduction, if we can't use zero
5955 for induc_val, use initial_def. */
5956 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5957 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5958 else if (double_reduc)
5960 else
5961 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5964 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5965 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5966 if (slp_reduc)
5967 /* All statements produce live-out values. */
5968 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5969 else if (slp_node)
5971 /* The last statement in the reduction chain produces the live-out
5972 value. Note SLP optimization can shuffle scalar stmts to
5973 optimize permutations so we have to search for the last stmt. */
5974 for (k = 0; k < group_size; ++k)
5975 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5977 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5978 break;
5982 unsigned vec_num;
5983 int ncopies;
5984 if (slp_node)
5986 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5987 ncopies = 1;
5989 else
5991 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5992 vec_num = 1;
5993 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5996 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5997 which is updated with the current index of the loop for every match of
5998 the original loop's cond_expr (VEC_STMT). This results in a vector
5999 containing the last time the condition passed for that vector lane.
6000 The first match will be a 1 to allow 0 to be used for non-matching
6001 indexes. If there are no matches at all then the vector will be all
6002 zeroes.
6004 PR92772: This algorithm is broken for architectures that support
6005 masked vectors, but do not provide fold_extract_last. */
6006 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6008 auto_vec<std::pair<tree, bool>, 2> ccompares;
6009 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6010 cond_info = vect_stmt_to_vectorize (cond_info);
6011 while (cond_info != reduc_info)
6013 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6015 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6016 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6017 ccompares.safe_push
6018 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6019 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6021 cond_info
6022 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6023 1 + STMT_VINFO_REDUC_IDX
6024 (cond_info)));
6025 cond_info = vect_stmt_to_vectorize (cond_info);
6027 gcc_assert (ccompares.length () != 0);
6029 tree indx_before_incr, indx_after_incr;
6030 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6031 int scalar_precision
6032 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6033 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6034 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6035 (TYPE_MODE (vectype), cr_index_scalar_type,
6036 TYPE_VECTOR_SUBPARTS (vectype));
6038 /* First we create a simple vector induction variable which starts
6039 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6040 vector size (STEP). */
6042 /* Create a {1,2,3,...} vector. */
6043 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6045 /* Create a vector of the step value. */
6046 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6047 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6049 /* Create an induction variable. */
6050 gimple_stmt_iterator incr_gsi;
6051 bool insert_after;
6052 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6053 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6054 insert_after, &indx_before_incr, &indx_after_incr);
6056 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6057 filled with zeros (VEC_ZERO). */
6059 /* Create a vector of 0s. */
6060 tree zero = build_zero_cst (cr_index_scalar_type);
6061 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6063 /* Create a vector phi node. */
6064 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6065 new_phi = create_phi_node (new_phi_tree, loop->header);
6066 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6067 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6069 /* Now take the condition from the loops original cond_exprs
6070 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6071 every match uses values from the induction variable
6072 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6073 (NEW_PHI_TREE).
6074 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6075 the new cond_expr (INDEX_COND_EXPR). */
6076 gimple_seq stmts = NULL;
6077 for (int i = ccompares.length () - 1; i != -1; --i)
6079 tree ccompare = ccompares[i].first;
6080 if (ccompares[i].second)
6081 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6082 cr_index_vector_type,
6083 ccompare,
6084 indx_before_incr, new_phi_tree);
6085 else
6086 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6087 cr_index_vector_type,
6088 ccompare,
6089 new_phi_tree, indx_before_incr);
6091 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6093 /* Update the phi with the vec cond. */
6094 induction_index = new_phi_tree;
6095 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6096 loop_latch_edge (loop), UNKNOWN_LOCATION);
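/* Editorial illustration (not part of GCC): the per-lane meaning of the
   induction index built above.  Each lane remembers the (1-based) loop index
   of the last iteration in which that lane's condition matched; lanes that
   never match stay 0.  Hypothetical names; THRESHOLD stands in for the
   original loop's cond_expr, and this models a single vector lane.  */

static unsigned int
sketch_last_match_index (const int *values, unsigned int n, int threshold)
{
  unsigned int last_match = 0;          /* 0 == "never matched" */
  for (unsigned int i = 0; i < n; ++i)
    if (values[i] > threshold)          /* stand-in for the condition */
      last_match = i + 1;               /* recorded indices start at 1 */
  return last_match;
}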
6099 /* 2. Create epilog code.
6100 The reduction epilog code operates across the elements of the vector
6101 of partial results computed by the vectorized loop.
6102 The reduction epilog code consists of:
6104 step 1: compute the scalar result in a vector (v_out2)
6105 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6106 step 3: adjust the scalar result (s_out3) if needed.
6108 Step 1 can be accomplished using one of the following three schemes:
6109 (scheme 1) using reduc_fn, if available.
6110 (scheme 2) using whole-vector shifts, if available.
6111 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6112 combined.
6114 The overall epilog code looks like this:
6116 s_out0 = phi <s_loop> # original EXIT_PHI
6117 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6118 v_out2 = reduce <v_out1> # step 1
6119 s_out3 = extract_field <v_out2, 0> # step 2
6120 s_out4 = adjust_result <s_out3> # step 3
6122 (step 3 is optional, and steps 1 and 2 may be combined).
6123 Lastly, the uses of s_out0 are replaced by s_out4. */
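/* Editorial illustration (not part of GCC): the three epilogue steps listed
   above for a simple sum reduction -- reduce the vector of partial results,
   extract the scalar, then apply the optional adjustment.  Hypothetical
   names.  */

static int
sketch_reduction_epilogue (const int *partials, unsigned int nunits,
                           int adjustment)
{
  int result = 0;
  for (unsigned int i = 0; i < nunits; ++i)     /* steps 1+2: reduce, extract */
    result += partials[i];
  return result + adjustment;                   /* step 3: adjust_result */
}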
6126 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6127 v_out1 = phi <VECT_DEF>
6128 Store them in NEW_PHIS. */
6129 if (double_reduc)
6130 loop = outer_loop;
6131 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6132 exit_gsi = gsi_after_labels (exit_bb);
6133 reduc_inputs.create (slp_node ? vec_num : ncopies);
6134 for (unsigned i = 0; i < vec_num; i++)
6136 gimple_seq stmts = NULL;
6137 if (slp_node)
6138 def = vect_get_slp_vect_def (slp_node, i);
6139 else
6140 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6141 for (j = 0; j < ncopies; j++)
6143 tree new_def = copy_ssa_name (def);
6144 phi = create_phi_node (new_def, exit_bb);
6145 if (j)
6146 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6147 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6148 new_def = gimple_convert (&stmts, vectype, new_def);
6149 reduc_inputs.quick_push (new_def);
6151 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6154 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6155 (i.e. when reduc_fn is not available) and in the final adjustment
6156 code (if needed). Also get the original scalar reduction variable as
6157 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6158 represents a reduction pattern), the tree-code and scalar-def are
6159 taken from the original stmt that the pattern-stmt (STMT) replaces.
6160 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6161 are taken from STMT. */
6163 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6164 if (orig_stmt_info != stmt_info)
6166 /* Reduction pattern */
6167 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6168 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6171 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6172 scalar_type = TREE_TYPE (scalar_dest);
6173 scalar_results.truncate (0);
6174 scalar_results.reserve_exact (group_size);
6175 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6176 bitsize = TYPE_SIZE (scalar_type);
6178 /* True if we should implement SLP_REDUC using native reduction operations
6179 instead of scalar operations. */
6180 direct_slp_reduc = (reduc_fn != IFN_LAST
6181 && slp_reduc
6182 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6184 /* In case of reduction chain, e.g.,
6185 # a1 = phi <a3, a0>
6186 a2 = operation (a1)
6187 a3 = operation (a2),
6189 we may end up with more than one vector result. Here we reduce them
6190 to one vector.
6192 The same is true for a SLP reduction, e.g.,
6193 # a1 = phi <a2, a0>
6194 # b1 = phi <b2, b0>
6195 a2 = operation (a1)
6196 b2 = operation (b1),
6198 where we can end up with more than one vector as well. We can
6199 easily accumulate vectors when the number of vector elements is
6200 a multiple of the SLP group size.
6202 The same is true if we couldn't use a single defuse cycle. */
6203 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6204 || direct_slp_reduc
6205 || (slp_reduc
6206 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6207 || ncopies > 1)
6209 gimple_seq stmts = NULL;
6210 tree single_input = reduc_inputs[0];
6211 for (k = 1; k < reduc_inputs.length (); k++)
6212 single_input = gimple_build (&stmts, code, vectype,
6213 single_input, reduc_inputs[k]);
6214 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6216 reduc_inputs.truncate (0);
6217 reduc_inputs.safe_push (single_input);
6220 tree orig_reduc_input = reduc_inputs[0];
6222 /* If this loop is an epilogue loop that can be skipped after the
6223 main loop, we can only share a reduction operation between the
6224 main loop and the epilogue if we put it at the target of the
6225 skip edge.
6227 We can still reuse accumulators if this check fails. Doing so has
6228 the minor(?) benefit of making the epilogue loop's scalar result
6229 independent of the main loop's scalar result. */
6230 bool unify_with_main_loop_p = false;
6231 if (reduc_info->reused_accumulator
6232 && loop_vinfo->skip_this_loop_edge
6233 && single_succ_p (exit_bb)
6234 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6236 unify_with_main_loop_p = true;
6238 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6239 reduc_inputs[0] = make_ssa_name (vectype);
6240 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6241 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6242 UNKNOWN_LOCATION);
6243 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6244 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6245 exit_gsi = gsi_after_labels (reduc_block);
6248 /* Shouldn't be used beyond this point. */
6249 exit_bb = nullptr;
6251 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6252 && reduc_fn != IFN_LAST)
6254 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6255 various data values where the condition matched and another vector
6256 (INDUCTION_INDEX) containing all the indexes of those matches. We
6257 need to extract the last matching index (which will be the index with
6258 highest value) and use this to index into the data vector.
6259 For the case where there were no matches, the data vector will contain
6260 all default values and the index vector will be all zeros. */
6262 /* Get various versions of the type of the vector of indexes. */
6263 tree index_vec_type = TREE_TYPE (induction_index);
6264 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6265 tree index_scalar_type = TREE_TYPE (index_vec_type);
6266 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6268 /* Get an unsigned integer version of the type of the data vector. */
6269 int scalar_precision
6270 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6271 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6272 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6273 vectype);
6275 /* First we need to create a vector (ZERO_VEC) of zeros and another
6276 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6277 can create using a MAX reduction and then expanding.
6278 In the case where the loop never made any matches, the max index will
6279 be zero. */
6281 /* Vector of {0, 0, 0,...}. */
6282 tree zero_vec = build_zero_cst (vectype);
6284 /* Find maximum value from the vector of found indexes. */
6285 tree max_index = make_ssa_name (index_scalar_type);
6286 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6287 1, induction_index);
6288 gimple_call_set_lhs (max_index_stmt, max_index);
6289 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6291 /* Vector of {max_index, max_index, max_index,...}. */
6292 tree max_index_vec = make_ssa_name (index_vec_type);
6293 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6294 max_index);
6295 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6296 max_index_vec_rhs);
6297 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6299 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6300 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6301 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6302 otherwise. Only one value should match, resulting in a vector
6303 (VEC_COND) with one data value and the rest zeros.
6304 In the case where the loop never made any matches, every index will
6305 match, resulting in a vector with all data values (which will all be
6306 the default value). */
6308 /* Compare the max index vector to the vector of found indexes to find
6309 the position of the max value. */
6310 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6311 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6312 induction_index,
6313 max_index_vec);
6314 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6316 /* Use the compare to choose either values from the data vector or
6317 zero. */
6318 tree vec_cond = make_ssa_name (vectype);
6319 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6320 vec_compare,
6321 reduc_inputs[0],
6322 zero_vec);
6323 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6325 /* Finally we need to extract the data value from the vector (VEC_COND)
6326 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6327 reduction, but because this doesn't exist, we can use a MAX reduction
6328 instead. The data value might be signed or a float so we need to cast
6329 it first.
6330 In the case where the loop never made any matches, the data values are
6331 all identical, and so will reduce down correctly. */
6333 /* Make the matched data values unsigned. */
6334 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6335 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6336 vec_cond);
6337 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6338 VIEW_CONVERT_EXPR,
6339 vec_cond_cast_rhs);
6340 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6342 /* Reduce down to a scalar value. */
6343 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6344 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6345 1, vec_cond_cast);
6346 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6347 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6349 /* Convert the reduced value back to the result type and set as the
6350 result. */
6351 gimple_seq stmts = NULL;
6352 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6353 data_reduc);
6354 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6355 scalar_results.safe_push (new_temp);
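/* Editorial illustration (not part of GCC): a scalar model of the scheme
   above.  The lane whose recorded index equals the maximum index holds the
   wanted data value; all other lanes are zeroed and a MAX reduction then
   extracts it.  If no lane ever matched, every index is zero, every lane
   still holds the default value, and the result degenerates to that default.
   Hypothetical names; values are taken as unsigned here so that MAX over the
   zeroed lanes is safe, mirroring the VIEW_CONVERT to unsigned above.  */

static unsigned int
sketch_cond_reduction_extract (const unsigned int *data,
                               const unsigned int *index, unsigned int nunits)
{
  unsigned int max_index = 0;
  for (unsigned int i = 0; i < nunits; ++i)
    if (index[i] > max_index)
      max_index = index[i];

  unsigned int result = 0;
  for (unsigned int i = 0; i < nunits; ++i)
    {
      unsigned int candidate = (index[i] == max_index) ? data[i] : 0;
      if (candidate > result)
        result = candidate;
    }
  return result;
}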
6357 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6358 && reduc_fn == IFN_LAST)
6360 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6361 idx = 0;
6362 idx_val = induction_index[0];
6363 val = data_reduc[0];
6364 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6365 if (induction_index[i] > idx_val)
6366 val = data_reduc[i], idx_val = induction_index[i];
6367 return val; */
6369 tree data_eltype = TREE_TYPE (vectype);
6370 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6371 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6372 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6373 /* Enforced by vectorizable_reduction, which ensures we have target
6374 support before allowing a conditional reduction on variable-length
6375 vectors. */
6376 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6377 tree idx_val = NULL_TREE, val = NULL_TREE;
6378 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6380 tree old_idx_val = idx_val;
6381 tree old_val = val;
6382 idx_val = make_ssa_name (idx_eltype);
6383 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6384 build3 (BIT_FIELD_REF, idx_eltype,
6385 induction_index,
6386 bitsize_int (el_size),
6387 bitsize_int (off)));
6388 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6389 val = make_ssa_name (data_eltype);
6390 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6391 build3 (BIT_FIELD_REF,
6392 data_eltype,
6393 reduc_inputs[0],
6394 bitsize_int (el_size),
6395 bitsize_int (off)));
6396 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6397 if (off != 0)
6399 tree new_idx_val = idx_val;
6400 if (off != v_size - el_size)
6402 new_idx_val = make_ssa_name (idx_eltype);
6403 epilog_stmt = gimple_build_assign (new_idx_val,
6404 MAX_EXPR, idx_val,
6405 old_idx_val);
6406 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6408 tree cond = make_ssa_name (boolean_type_node);
6409 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6410 idx_val, old_idx_val);
6411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6412 tree new_val = make_ssa_name (data_eltype);
6413 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6414 cond, val, old_val);
6415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6416 idx_val = new_idx_val;
6417 val = new_val;
6420 /* Convert the reduced value back to the result type and set as the
6421 result. */
6422 gimple_seq stmts = NULL;
6423 val = gimple_convert (&stmts, scalar_type, val);
6424 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6425 scalar_results.safe_push (val);
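/* Editorial illustration (not part of GCC): a directly runnable version of
   the pseudocode in the comment above -- a linear scan that keeps the data
   value belonging to the largest recorded index.  Hypothetical names;
   NELTS is assumed to be at least 1.  */

static int
sketch_cond_reduction_scan (const int *data_reduc,
                            const unsigned int *induction_index,
                            unsigned int nelts)
{
  unsigned int idx_val = induction_index[0];
  int val = data_reduc[0];
  for (unsigned int i = 1; i < nelts; ++i)
    if (induction_index[i] > idx_val)
      {
        idx_val = induction_index[i];
        val = data_reduc[i];
      }
  return val;
}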
6428 /* 2.3 Create the reduction code, using one of the three schemes described
6429 above. In SLP we simply need to extract all the elements from the
6430 vector (without reducing them), so we use scalar shifts. */
6431 else if (reduc_fn != IFN_LAST && !slp_reduc)
6433 tree tmp;
6434 tree vec_elem_type;
6436 /* Case 1: Create:
6437 v_out2 = reduc_expr <v_out1> */
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_NOTE, vect_location,
6441 "Reduce using direct vector reduction.\n");
6443 gimple_seq stmts = NULL;
6444 vec_elem_type = TREE_TYPE (vectype);
6445 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6446 vec_elem_type, reduc_inputs[0]);
6447 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6448 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6450 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6451 && induc_val)
6453 /* Earlier we set the initial value to be a vector of induc_val
6454 values. Check the result, and if it is induc_val then replace it
6455 with the original initial value, unless induc_val is
6456 the same as initial_def already. */
6457 tree zcompare = make_ssa_name (boolean_type_node);
6458 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6459 new_temp, induc_val);
6460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461 tree initial_def = reduc_info->reduc_initial_values[0];
6462 tmp = make_ssa_name (new_scalar_dest);
6463 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6464 initial_def, new_temp);
6465 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6466 new_temp = tmp;
6469 scalar_results.safe_push (new_temp);
6471 else if (direct_slp_reduc)
6473 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6474 with the elements for other SLP statements replaced with the
6475 neutral value. We can then do a normal reduction on each vector. */
6477 /* Enforced by vectorizable_reduction. */
6478 gcc_assert (reduc_inputs.length () == 1);
6479 gcc_assert (pow2p_hwi (group_size));
6481 gimple_seq seq = NULL;
6483 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6484 and the same element size as VECTYPE. */
6485 tree index = build_index_vector (vectype, 0, 1);
6486 tree index_type = TREE_TYPE (index);
6487 tree index_elt_type = TREE_TYPE (index_type);
6488 tree mask_type = truth_type_for (index_type);
6490 /* Create a vector that, for each element, identifies which of
6491 the REDUC_GROUP_SIZE results should use it. */
6492 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6493 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6494 build_vector_from_val (index_type, index_mask));
6496 /* Get a neutral vector value. This is simply a splat of the neutral
6497 scalar value if we have one, otherwise the initial scalar value
6498 is itself a neutral value. */
6499 tree vector_identity = NULL_TREE;
6500 tree neutral_op = NULL_TREE;
6501 if (slp_node)
6503 tree initial_value = NULL_TREE;
6504 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6505 initial_value = reduc_info->reduc_initial_values[0];
6506 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6507 initial_value, false);
6509 if (neutral_op)
6510 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6511 neutral_op);
6512 for (unsigned int i = 0; i < group_size; ++i)
6514 /* If there's no universal neutral value, we can use the
6515 initial scalar value from the original PHI. This is used
6516 for MIN and MAX reduction, for example. */
6517 if (!neutral_op)
6519 tree scalar_value = reduc_info->reduc_initial_values[i];
6520 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6521 scalar_value);
6522 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6523 scalar_value);
6526 /* Calculate the equivalent of:
6528 sel[j] = (index[j] == i);
6530 which selects the elements of REDUC_INPUTS[0] that should
6531 be included in the result. */
6532 tree compare_val = build_int_cst (index_elt_type, i);
6533 compare_val = build_vector_from_val (index_type, compare_val);
6534 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6535 index, compare_val);
6537 /* Calculate the equivalent of:
6539 vec = sel ? reduc_inputs[0] : vector_identity;
6541 VEC is now suitable for a full vector reduction. */
6542 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6543 sel, reduc_inputs[0], vector_identity);
6545 /* Do the reduction and convert it to the appropriate type. */
6546 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6547 TREE_TYPE (vectype), vec);
6548 scalar = gimple_convert (&seq, scalar_type, scalar);
6549 scalar_results.safe_push (scalar);
6551 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
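/* Editorial illustration (not part of GCC): a scalar model of the direct SLP
   reduction above for a sum.  For each of the GROUP_SIZE results we keep only
   the lanes whose position modulo the group size selects them (the real code
   masks an index vector with GROUP_SIZE - 1, which is equivalent because the
   group size is a power of two) and replace the rest with the neutral value
   before a full reduction.  Hypothetical names; RESULTS holds GROUP_SIZE
   entries and NEUTRAL_OP is 0 for a sum.  */

static void
sketch_direct_slp_reduc_sum (const int *lanes, unsigned int nunits,
                             unsigned int group_size, int neutral_op,
                             int *results)
{
  for (unsigned int i = 0; i < group_size; ++i)
    {
      int acc = 0;
      for (unsigned int j = 0; j < nunits; ++j)
        acc += (j % group_size == i) ? lanes[j] : neutral_op;
      results[i] = acc;
    }
}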
6553 else
6555 bool reduce_with_shift;
6556 tree vec_temp;
6558 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6560 /* See if the target wants to do the final (shift) reduction
6561 in a vector mode of smaller size and first reduce upper/lower
6562 halves against each other. */
6563 enum machine_mode mode1 = mode;
6564 tree stype = TREE_TYPE (vectype);
6565 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6566 unsigned nunits1 = nunits;
6567 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6568 && reduc_inputs.length () == 1)
6570 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6571 /* For SLP reductions we have to make sure lanes match up, but
6572 since we're doing individual element final reduction reducing
6573 vector width here is even more important.
6574 ??? We could also separate lanes with permutes; for the common
6575 case of a power-of-two group size, odd/even extracts would work.
6576 if (slp_reduc && nunits != nunits1)
6578 nunits1 = least_common_multiple (nunits1, group_size);
6579 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6582 if (!slp_reduc
6583 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6584 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6586 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6587 stype, nunits1);
6588 reduce_with_shift = have_whole_vector_shift (mode1);
6589 if (!VECTOR_MODE_P (mode1)
6590 || !directly_supported_p (code, vectype1))
6591 reduce_with_shift = false;
6593 /* First reduce the vector to the desired vector size we should
6594 do shift reduction on by combining upper and lower halves. */
6595 gimple_seq stmts = NULL;
6596 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6597 code, &stmts);
6598 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6599 reduc_inputs[0] = new_temp;
6601 if (reduce_with_shift && !slp_reduc)
6603 int element_bitsize = tree_to_uhwi (bitsize);
6604 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6605 for variable-length vectors and also requires direct target support
6606 for loop reductions. */
6607 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6608 int nelements = vec_size_in_bits / element_bitsize;
6609 vec_perm_builder sel;
6610 vec_perm_indices indices;
6612 int elt_offset;
6614 tree zero_vec = build_zero_cst (vectype1);
6615 /* Case 2: Create:
6616 for (offset = nelements/2; offset >= 1; offset/=2)
6618 Create: va' = vec_shift <va, offset>
6619 Create: va = vop <va, va'>
6620 } */
6622 tree rhs;
6624 if (dump_enabled_p ())
6625 dump_printf_loc (MSG_NOTE, vect_location,
6626 "Reduce using vector shifts\n");
6628 gimple_seq stmts = NULL;
6629 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6630 for (elt_offset = nelements / 2;
6631 elt_offset >= 1;
6632 elt_offset /= 2)
6634 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6635 indices.new_vector (sel, 2, nelements);
6636 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6637 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6638 new_temp, zero_vec, mask);
6639 new_temp = gimple_build (&stmts, code,
6640 vectype1, new_name, new_temp);
6642 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6644 /* 2.4 Extract the final scalar result. Create:
6645 s_out3 = extract_field <v_out2, bitpos> */
6647 if (dump_enabled_p ())
6648 dump_printf_loc (MSG_NOTE, vect_location,
6649 "extract scalar result\n");
6651 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6652 bitsize, bitsize_zero_node);
6653 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6654 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6655 gimple_assign_set_lhs (epilog_stmt, new_temp);
6656 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6657 scalar_results.safe_push (new_temp);
6659 else
6661 /* Case 3: Create:
6662 s = extract_field <v_out2, 0>
6663 for (offset = element_size;
6664 offset < vector_size;
6665 offset += element_size)
6667 Create: s' = extract_field <v_out2, offset>
6668 Create: s = op <s, s'> // For non SLP cases
6669 } */
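/* Illustrative sketch (not part of the original sources): for a V4SI
   vector v_out2 the open-coded variant above amounts to

     s = v_out2[0];
     s = op (s, v_out2[1]);
     s = op (s, v_out2[2]);
     s = op (s, v_out2[3]);

   with the element reads emitted as BIT_FIELD_REFs in the loop below;
   for SLP only the extracts are kept and no combining op is emitted.  */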
6671 if (dump_enabled_p ())
6672 dump_printf_loc (MSG_NOTE, vect_location,
6673 "Reduce using scalar code.\n");
6675 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6676 int element_bitsize = tree_to_uhwi (bitsize);
6677 tree compute_type = TREE_TYPE (vectype);
6678 gimple_seq stmts = NULL;
6679 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6681 int bit_offset;
6682 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6683 vec_temp, bitsize, bitsize_zero_node);
6685 /* In SLP we don't need to apply the reduction operation, so we just
6686 collect the s' values in SCALAR_RESULTS. */
6687 if (slp_reduc)
6688 scalar_results.safe_push (new_temp);
6690 for (bit_offset = element_bitsize;
6691 bit_offset < vec_size_in_bits;
6692 bit_offset += element_bitsize)
6694 tree bitpos = bitsize_int (bit_offset);
6695 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6696 compute_type, vec_temp,
6697 bitsize, bitpos);
6698 if (slp_reduc)
6700 /* In SLP we don't need to apply the reduction operation, so
6701 we just collect the s' values in SCALAR_RESULTS. */
6702 new_temp = new_name;
6703 scalar_results.safe_push (new_name);
6705 else
6706 new_temp = gimple_build (&stmts, code, compute_type,
6707 new_name, new_temp);
6711 /* The only case where we need to reduce scalar results in SLP is
6712 unrolling. If the size of SCALAR_RESULTS is greater than
6713 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6714 REDUC_GROUP_SIZE. */
6715 if (slp_reduc)
6717 tree res, first_res, new_res;
6719 /* Reduce multiple scalar results in case of SLP unrolling. */
6720 for (j = group_size; scalar_results.iterate (j, &res);
6721 j++)
6723 first_res = scalar_results[j % group_size];
6724 new_res = gimple_build (&stmts, code, compute_type,
6725 first_res, res);
6726 scalar_results[j % group_size] = new_res;
6728 scalar_results.truncate (group_size);
6729 for (k = 0; k < group_size; k++)
6730 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6731 scalar_results[k]);
6733 else
6735 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6736 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6737 scalar_results.safe_push (new_temp);
6740 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6743 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6744 && induc_val)
6746 /* Earlier we set the initial value to be a vector of induc_val
6747 values. Check the result and if it is induc_val then replace
6748 it with the original initial value, unless induc_val is
6749 the same as initial_def already. */
6750 tree zcompare = make_ssa_name (boolean_type_node);
6751 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6752 induc_val);
6753 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6754 tree initial_def = reduc_info->reduc_initial_values[0];
6755 tree tmp = make_ssa_name (new_scalar_dest);
6756 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6757 initial_def, new_temp);
6758 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6759 scalar_results[0] = tmp;
6763 /* 2.5 Adjust the final result by the initial value of the reduction
6764 variable. (When such adjustment is not needed, then
6765 'adjustment_def' is zero). For example, if code is PLUS we create:
6766 new_temp = loop_exit_def + adjustment_def */
6768 if (adjustment_def)
6770 gcc_assert (!slp_reduc);
6771 gimple_seq stmts = NULL;
6772 if (double_reduc)
6774 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6775 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6776 new_temp = gimple_build (&stmts, code, vectype,
6777 reduc_inputs[0], adjustment_def);
6779 else
6781 new_temp = scalar_results[0];
6782 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6783 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6784 adjustment_def);
6785 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6786 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6787 new_temp, adjustment_def);
6788 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6791 epilog_stmt = gimple_seq_last_stmt (stmts);
6792 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6793 scalar_results[0] = new_temp;
6796 /* Record this operation if it could be reused by the epilogue loop. */
6797 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6798 && reduc_inputs.length () == 1)
6799 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6800 { orig_reduc_input, reduc_info });
6802 if (double_reduc)
6803 loop = outer_loop;
6805 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6806 phis with new adjusted scalar results, i.e., replace use <s_out0>
6807 with use <s_out4>.
6809 Transform:
6810 loop_exit:
6811 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6812 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6813 v_out2 = reduce <v_out1>
6814 s_out3 = extract_field <v_out2, 0>
6815 s_out4 = adjust_result <s_out3>
6816 use <s_out0>
6817 use <s_out0>
6819 into:
6821 loop_exit:
6822 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6823 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6824 v_out2 = reduce <v_out1>
6825 s_out3 = extract_field <v_out2, 0>
6826 s_out4 = adjust_result <s_out3>
6827 use <s_out4>
6828 use <s_out4> */
6830 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6831 for (k = 0; k < live_out_stmts.size (); k++)
6833 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6834 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6836 phis.create (3);
6837 /* Find the loop-closed-use at the loop exit of the original scalar
6838 result. (The reduction result is expected to have two immediate uses,
6839 one at the latch block, and one at the loop exit). For double
6840 reductions we are looking for exit phis of the outer loop. */
6841 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6843 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6845 if (!is_gimple_debug (USE_STMT (use_p)))
6846 phis.safe_push (USE_STMT (use_p));
6848 else
6850 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6852 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6854 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6856 if (!flow_bb_inside_loop_p (loop,
6857 gimple_bb (USE_STMT (phi_use_p)))
6858 && !is_gimple_debug (USE_STMT (phi_use_p)))
6859 phis.safe_push (USE_STMT (phi_use_p));
6865 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6867 /* Replace the uses: */
6868 orig_name = PHI_RESULT (exit_phi);
6870 /* Look for a single use at the target of the skip edge. */
6871 if (unify_with_main_loop_p)
6873 use_operand_p use_p;
6874 gimple *user;
6875 if (!single_imm_use (orig_name, &use_p, &user))
6876 gcc_unreachable ();
6877 orig_name = gimple_get_lhs (user);
6880 scalar_result = scalar_results[k];
6881 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6883 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6884 SET_USE (use_p, scalar_result);
6885 update_stmt (use_stmt);
6889 phis.release ();
6893 /* Return a vector of type VECTYPE that is equal to the vector select
6894 operation "MASK ? VEC : IDENTITY". Insert the select statements
6895 before GSI. */
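/* For example (illustrative only, not taken from the sources): with
   MASK = { -1, -1, 0, 0 }, VEC = { a, b, c, d } and
   IDENTITY = { 0, 0, 0, 0 } the returned SSA name holds { a, b, 0, 0 },
   so inactive lanes do not disturb a subsequent fold-left addition.  */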
6897 static tree
6898 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6899 tree vec, tree identity)
6901 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6902 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6903 mask, vec, identity);
6904 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6905 return cond;
6908 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6909 order, starting with LHS. Insert the extraction statements before GSI and
6910 associate the new scalar SSA names with variable SCALAR_DEST.
6911 Return the SSA name for the result. */
6913 static tree
6914 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6915 tree_code code, tree lhs, tree vector_rhs)
6917 tree vectype = TREE_TYPE (vector_rhs);
6918 tree scalar_type = TREE_TYPE (vectype);
6919 tree bitsize = TYPE_SIZE (scalar_type);
6920 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6921 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6923 for (unsigned HOST_WIDE_INT bit_offset = 0;
6924 bit_offset < vec_size_in_bits;
6925 bit_offset += element_bitsize)
6927 tree bitpos = bitsize_int (bit_offset);
6928 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6929 bitsize, bitpos);
6931 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6932 rhs = make_ssa_name (scalar_dest, stmt);
6933 gimple_assign_set_lhs (stmt, rhs);
6934 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6936 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6937 tree new_name = make_ssa_name (scalar_dest, stmt);
6938 gimple_assign_set_lhs (stmt, new_name);
6939 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6940 lhs = new_name;
6942 return lhs;
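/* Illustrative sketch (not part of the original code): for a 4-element
   VECTOR_RHS and PLUS_EXPR this expands to the strictly ordered chain

     lhs1 = lhs  + rhs[0];
     lhs2 = lhs1 + rhs[1];
     lhs3 = lhs2 + rhs[2];
     lhs4 = lhs3 + rhs[3];

   and returns lhs4, preserving the scalar evaluation order that an
   in-order (fold-left) reduction requires.  */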
6945 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6946 type of the vector input. */
6948 static internal_fn
6949 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6951 internal_fn mask_reduc_fn;
6952 internal_fn mask_len_reduc_fn;
6954 switch (reduc_fn)
6956 case IFN_FOLD_LEFT_PLUS:
6957 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6958 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6959 break;
6961 default:
6962 return IFN_LAST;
6965 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6966 OPTIMIZE_FOR_SPEED))
6967 return mask_reduc_fn;
6968 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6969 OPTIMIZE_FOR_SPEED))
6970 return mask_len_reduc_fn;
6971 return IFN_LAST;
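/* Illustrative note (an assumption based on the calls built further down,
   not on documentation in this file): when the masked variant is chosen,
   vectorize_fold_left_reduction emits roughly

     sum_1 = IFN_MASK_FOLD_LEFT_PLUS (sum_0, vec, loop_mask);

   and for the length-based variant

     sum_1 = IFN_MASK_LEN_FOLD_LEFT_PLUS (sum_0, vec, loop_mask, len, bias);

   so inactive or out-of-range lanes do not contribute to the result.  */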
6974 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6975 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6976 statement. CODE is the operation performed by STMT_INFO and OPS are
6977 its scalar operands. REDUC_INDEX is the index of the operand in
6978 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6979 implements in-order reduction, or IFN_LAST if we should open-code it.
6980 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6981 that should be used to control the operation in a fully-masked loop. */
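/* A minimal source-level example (illustrative only): without
   -ffast-math the loop

     double s = init;
     for (int i = 0; i < n; ++i)
       s += a[i];

   must preserve the scalar order of the additions, so each vector
   iteration performs an in-order FOLD_LEFT_PLUS of the current vector
   of 'a' elements into the scalar accumulator instead of keeping a
   vector of partial sums.  */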
6983 static bool
6984 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6985 stmt_vec_info stmt_info,
6986 gimple_stmt_iterator *gsi,
6987 gimple **vec_stmt, slp_tree slp_node,
6988 gimple *reduc_def_stmt,
6989 code_helper code, internal_fn reduc_fn,
6990 tree *ops, int num_ops, tree vectype_in,
6991 int reduc_index, vec_loop_masks *masks,
6992 vec_loop_lens *lens)
6994 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6995 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6996 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6998 int ncopies;
6999 if (slp_node)
7000 ncopies = 1;
7001 else
7002 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7004 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7005 gcc_assert (ncopies == 1);
7007 bool is_cond_op = false;
7008 if (!code.is_tree_code ())
7010 code = conditional_internal_fn_code (internal_fn (code));
7011 gcc_assert (code != ERROR_MARK);
7012 is_cond_op = true;
7015 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7017 if (slp_node)
7019 if (is_cond_op)
7021 if (dump_enabled_p ())
7022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7023 "fold-left reduction on SLP not supported.\n");
7024 return false;
7027 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7028 TYPE_VECTOR_SUBPARTS (vectype_in)));
7031 /* The operands either come from a binary operation or an IFN_COND operation.
7032 The former is a gimple assign with binary rhs and the latter is a
7033 gimple call with four arguments. */
7034 gcc_assert (num_ops == 2 || num_ops == 4);
7035 tree op0, opmask;
7036 if (!is_cond_op)
7037 op0 = ops[1 - reduc_index];
7038 else
7040 op0 = ops[2];
7041 opmask = ops[0];
7042 gcc_assert (!slp_node);
7045 int group_size = 1;
7046 stmt_vec_info scalar_dest_def_info;
7047 auto_vec<tree> vec_oprnds0, vec_opmask;
7048 if (slp_node)
7050 auto_vec<vec<tree> > vec_defs (2);
7051 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7052 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7053 vec_defs[0].release ();
7054 vec_defs[1].release ();
7055 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7056 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7058 else
7060 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7061 op0, &vec_oprnds0);
7062 scalar_dest_def_info = stmt_info;
7064 /* For an IFN_COND_OP we also need the vector mask operand. */
7065 if (is_cond_op)
7066 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7067 opmask, &vec_opmask);
7070 gimple *sdef = scalar_dest_def_info->stmt;
7071 tree scalar_dest = gimple_get_lhs (sdef);
7072 tree scalar_type = TREE_TYPE (scalar_dest);
7073 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7075 int vec_num = vec_oprnds0.length ();
7076 gcc_assert (vec_num == 1 || slp_node);
7077 tree vec_elem_type = TREE_TYPE (vectype_out);
7078 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7080 tree vector_identity = NULL_TREE;
7081 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7083 vector_identity = build_zero_cst (vectype_out);
7084 if (!HONOR_SIGNED_ZEROS (vectype_out))
7086 else
7088 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7089 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7090 vector_identity);
7094 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7095 int i;
7096 tree def0;
7097 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7099 gimple *new_stmt;
7100 tree mask = NULL_TREE;
7101 tree len = NULL_TREE;
7102 tree bias = NULL_TREE;
7103 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7104 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7105 else if (is_cond_op)
7106 mask = vec_opmask[0];
7107 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7109 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7110 i, 1);
7111 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7112 bias = build_int_cst (intQI_type_node, biasval);
7113 if (!is_cond_op)
7114 mask = build_minus_one_cst (truth_type_for (vectype_in));
7117 /* Handle MINUS by adding the negative. */
7118 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7120 tree negated = make_ssa_name (vectype_out);
7121 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7122 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7123 def0 = negated;
7126 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7127 && mask && mask_reduc_fn == IFN_LAST)
7128 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7129 vector_identity);
7131 /* On the first iteration the input is simply the scalar phi
7132 result, and for subsequent iterations it is the output of
7133 the preceding operation. */
7134 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7136 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7137 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7138 def0, mask, len, bias);
7139 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7140 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7141 def0, mask);
7142 else
7143 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7144 def0);
7145 /* For chained SLP reductions the output of the previous reduction
7146 operation serves as the input of the next. For the final statement
7147 the output cannot be a temporary - we reuse the original
7148 scalar destination of the last statement. */
7149 if (i != vec_num - 1)
7151 gimple_set_lhs (new_stmt, scalar_dest_var);
7152 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7153 gimple_set_lhs (new_stmt, reduc_var);
7156 else
7158 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7159 tree_code (code), reduc_var, def0);
7160 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7161 /* Remove the statement, so that we can use the same code paths
7162 as for statements that we've just created. */
7163 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7164 gsi_remove (&tmp_gsi, true);
7167 if (i == vec_num - 1)
7169 gimple_set_lhs (new_stmt, scalar_dest);
7170 vect_finish_replace_stmt (loop_vinfo,
7171 scalar_dest_def_info,
7172 new_stmt);
7174 else
7175 vect_finish_stmt_generation (loop_vinfo,
7176 scalar_dest_def_info,
7177 new_stmt, gsi);
7179 if (slp_node)
7180 slp_node->push_vec_def (new_stmt);
7181 else
7183 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7184 *vec_stmt = new_stmt;
7188 return true;
7191 /* Function is_nonwrapping_integer_induction.
7193 Check whether STMT_VINFO (which is part of loop LOOP) is an integer
7194 induction that only increments and cannot overflow. */
7196 static bool
7197 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7199 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7200 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7201 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7202 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7203 widest_int ni, max_loop_value, lhs_max;
7204 wi::overflow_type overflow = wi::OVF_NONE;
7206 /* Make sure the loop is integer based. */
7207 if (TREE_CODE (base) != INTEGER_CST
7208 || TREE_CODE (step) != INTEGER_CST)
7209 return false;
7211 /* Check that the max size of the loop will not wrap. */
7213 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7214 return true;
7216 if (! max_stmt_executions (loop, &ni))
7217 return false;
7219 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7220 &overflow);
7221 if (overflow)
7222 return false;
7224 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7225 TYPE_SIGN (lhs_type), &overflow);
7226 if (overflow)
7227 return false;
7229 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7230 <= TYPE_PRECISION (lhs_type));
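/* Worked example (illustrative, not from the sources): for an unsigned
   8-bit induction with base 0 and step 4 in a loop executing at most
   100 iterations, the final value can reach 0 + 4 * 100 = 400, which
   needs 9 bits of precision and therefore wraps in 8 bits, so the
   function returns false.  With step 1 the maximum is 100, which fits
   in 8 bits, and the function returns true.  */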
7233 /* Check if masking can be supported by inserting a conditional expression.
7234 CODE is the code for the operation. COND_FN is the conditional internal
7235 function, if it exists. VECTYPE_IN is the type of the vector input. */
7236 static bool
7237 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7238 tree vectype_in)
7240 if (cond_fn != IFN_LAST
7241 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7242 OPTIMIZE_FOR_SPEED))
7243 return false;
7245 if (code.is_tree_code ())
7246 switch (tree_code (code))
7248 case DOT_PROD_EXPR:
7249 case SAD_EXPR:
7250 return true;
7252 default:
7253 break;
7255 return false;
7258 /* Insert a conditional expression to enable masked vectorization. CODE is the
7259 code for the operation. VOP is the array of operands. MASK is the loop
7260 mask. GSI is a statement iterator used to place the new conditional
7261 expression. */
7262 static void
7263 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7264 gimple_stmt_iterator *gsi)
7266 switch (tree_code (code))
7268 case DOT_PROD_EXPR:
7270 tree vectype = TREE_TYPE (vop[1]);
7271 tree zero = build_zero_cst (vectype);
7272 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7273 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7274 mask, vop[1], zero);
7275 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7276 vop[1] = masked_op1;
7277 break;
7280 case SAD_EXPR:
7282 tree vectype = TREE_TYPE (vop[1]);
7283 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7284 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7285 mask, vop[1], vop[0]);
7286 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7287 vop[1] = masked_op1;
7288 break;
7291 default:
7292 gcc_unreachable ();
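/* Design note with an illustrative sketch (not from the original code):
   for DOT_PROD_EXPR the select above gives

     masked_op1 = mask ? op1 : 0;
     acc += masked_op1 * op2;          // inactive lanes add 0

   while for SAD_EXPR it gives

     masked_op1 = mask ? op1 : op0;
     acc += abs (op0 - masked_op1);    // inactive lanes add |op0 - op0| = 0

   so in both cases masked-out lanes leave the accumulator unchanged.  */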
7296 /* Function vectorizable_reduction.
7298 Check if STMT_INFO performs a reduction operation that can be vectorized.
7299 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7300 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7301 Return true if STMT_INFO is vectorizable in this way.
7303 This function also handles reduction idioms (patterns) that have been
7304 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7305 may be of this form:
7306 X = pattern_expr (arg0, arg1, ..., X)
7307 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7308 sequence that had been detected and replaced by the pattern-stmt
7309 (STMT_INFO).
7311 This function also handles reduction of condition expressions, for example:
7312 for (int i = 0; i < N; i++)
7313 if (a[i] < value)
7314 last = a[i];
7315 This is handled by vectorising the loop and creating an additional vector
7316 containing the loop indexes for which "a[i] < value" was true. In the
7317 function epilogue this is reduced to a single max value and then used to
7318 index into the vector of results.
7320 In some cases of reduction patterns, the type of the reduction variable X is
7321 different than the type of the other arguments of STMT_INFO.
7322 In such cases, the vectype that is used when transforming STMT_INFO into
7323 a vector stmt is different than the vectype that is used to determine the
7324 vectorization factor, because it consists of a different number of elements
7325 than the actual number of elements that are being operated upon in parallel.
7327 For example, consider an accumulation of shorts into an int accumulator.
7328 On some targets it's possible to vectorize this pattern operating on 8
7329 shorts at a time (hence, the vectype for purposes of determining the
7330 vectorization factor should be V8HI); on the other hand, the vectype that
7331 is used to create the vector form is actually V4SI (the type of the result).
7333 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7334 indicates what is the actual level of parallelism (V8HI in the example), so
7335 that the right vectorization factor would be derived. This vectype
7336 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7337 be used to create the vectorized stmt. The right vectype for the vectorized
7338 stmt is obtained from the type of the result X:
7339 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7341 This means that, contrary to "regular" reductions (or "regular" stmts in
7342 general), the following equation:
7343 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7344 does *NOT* necessarily hold for reduction patterns. */
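/* A short source-level illustration of the widening case discussed above
   (not taken from the original comment):

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   Here the vectorization factor is derived from V8HI (eight shorts per
   vector) while the vectorized statement itself produces V4SI values,
   assuming 128-bit vectors.  */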
7346 bool
7347 vectorizable_reduction (loop_vec_info loop_vinfo,
7348 stmt_vec_info stmt_info, slp_tree slp_node,
7349 slp_instance slp_node_instance,
7350 stmt_vector_for_cost *cost_vec)
7352 tree vectype_in = NULL_TREE;
7353 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7354 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7355 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7356 stmt_vec_info cond_stmt_vinfo = NULL;
7357 int i;
7358 int ncopies;
7359 bool single_defuse_cycle = false;
7360 bool nested_cycle = false;
7361 bool double_reduc = false;
7362 int vec_num;
7363 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7364 tree cond_reduc_val = NULL_TREE;
7366 /* Make sure it was already recognized as a reduction computation. */
7367 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7368 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7369 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7370 return false;
7372 /* The stmt we store reduction analysis meta on. */
7373 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7374 reduc_info->is_reduc_info = true;
7376 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7378 if (is_a <gphi *> (stmt_info->stmt))
7380 if (slp_node)
7382 /* We eventually need to set a vector type on invariant
7383 arguments. */
7384 unsigned j;
7385 slp_tree child;
7386 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7387 if (!vect_maybe_update_slp_op_vectype
7388 (child, SLP_TREE_VECTYPE (slp_node)))
7390 if (dump_enabled_p ())
7391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392 "incompatible vector types for "
7393 "invariants\n");
7394 return false;
7397 /* Analysis for double-reduction is done on the outer
7398 loop PHI; nested cycles have no further restrictions. */
7399 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7401 else
7402 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7403 return true;
7406 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7407 stmt_vec_info phi_info = stmt_info;
7408 if (!is_a <gphi *> (stmt_info->stmt))
7410 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7411 return true;
7413 if (slp_node)
7415 slp_node_instance->reduc_phis = slp_node;
7416 /* ??? We're leaving slp_node to point to the PHIs, we only
7417 need it to get at the number of vector stmts which wasn't
7418 yet initialized for the instance root. */
7420 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7422 use_operand_p use_p;
7423 gimple *use_stmt;
7424 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7425 &use_p, &use_stmt);
7426 gcc_assert (res);
7427 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7430 /* PHIs should not participate in patterns. */
7431 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7432 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7434 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
7435 and compute the reduction chain length. Discover the real
7436 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7437 tree reduc_def
7438 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7439 loop_latch_edge
7440 (gimple_bb (reduc_def_phi)->loop_father));
7441 unsigned reduc_chain_length = 0;
7442 bool only_slp_reduc_chain = true;
7443 stmt_info = NULL;
7444 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7445 while (reduc_def != PHI_RESULT (reduc_def_phi))
7447 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7448 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7449 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7451 if (dump_enabled_p ())
7452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7453 "reduction chain broken by patterns.\n");
7454 return false;
7456 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7457 only_slp_reduc_chain = false;
7458 /* For epilogue generation live members of the chain need
7459 to point back to the PHI via their original stmt for
7460 info_for_reduction to work. For SLP we need to look at
7461 all lanes here - even though we will only vectorize from
7462 the SLP node with live lane zero, the other live lanes also
7463 need to be identified as part of a reduction to be able
7464 to skip code generation for them. */
7465 if (slp_for_stmt_info)
7467 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7468 if (STMT_VINFO_LIVE_P (s))
7469 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7471 else if (STMT_VINFO_LIVE_P (vdef))
7472 STMT_VINFO_REDUC_DEF (def) = phi_info;
7473 gimple_match_op op;
7474 if (!gimple_extract_op (vdef->stmt, &op))
7476 if (dump_enabled_p ())
7477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7478 "reduction chain includes unsupported"
7479 " statement type.\n");
7480 return false;
7482 if (CONVERT_EXPR_CODE_P (op.code))
7484 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7486 if (dump_enabled_p ())
7487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7488 "conversion in the reduction chain.\n");
7489 return false;
7492 else if (!stmt_info)
7493 /* First non-conversion stmt. */
7494 stmt_info = vdef;
7495 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7496 reduc_chain_length++;
7497 if (!stmt_info && slp_node)
7498 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7500 /* PHIs should not participate in patterns. */
7501 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7503 if (nested_in_vect_loop_p (loop, stmt_info))
7505 loop = loop->inner;
7506 nested_cycle = true;
7509 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7510 element. */
7511 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7513 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7514 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7516 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7517 gcc_assert (slp_node
7518 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7520 /* 1. Is vectorizable reduction? */
7521 /* Not supportable if the reduction variable is used in the loop, unless
7522 it's a reduction chain. */
7523 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7524 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7525 return false;
7527 /* Reductions that are not used even in an enclosing outer-loop
7528 are expected to be "live" (used out of the loop). */
7529 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7530 && !STMT_VINFO_LIVE_P (stmt_info))
7531 return false;
7533 /* 2. Has this been recognized as a reduction pattern?
7535 Check if STMT represents a pattern that has been recognized
7536 in earlier analysis stages. For stmts that represent a pattern,
7537 the STMT_VINFO_RELATED_STMT field records the last stmt in
7538 the original sequence that constitutes the pattern. */
7540 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7541 if (orig_stmt_info)
7543 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7544 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7547 /* 3. Check the operands of the operation. The first operands are defined
7548 inside the loop body. The last operand is the reduction variable,
7549 which is defined by the loop-header-phi. */
7551 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7552 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7553 gimple_match_op op;
7554 if (!gimple_extract_op (stmt_info->stmt, &op))
7555 gcc_unreachable ();
7556 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7557 || op.code == WIDEN_SUM_EXPR
7558 || op.code == SAD_EXPR);
7560 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7561 && !SCALAR_FLOAT_TYPE_P (op.type))
7562 return false;
7564 /* Do not try to vectorize bit-precision reductions. */
7565 if (!type_has_mode_precision_p (op.type))
7566 return false;
7568 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7569 which means their only use may be in the lane-reducing operation. */
7570 if (lane_reduc_code_p
7571 && reduc_chain_length != 1
7572 && !only_slp_reduc_chain)
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 "lane-reducing reduction with extra stmts.\n");
7577 return false;
7580 /* All uses but the last are expected to be defined in the loop.
7581 The last use is the reduction variable. In case of nested cycle this
7582 assumption is not true: we use reduc_index to record the index of the
7583 reduction variable. */
7584 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7585 /* We need to skip an extra operand for COND_EXPRs with embedded
7586 comparison. */
7587 unsigned opno_adjust = 0;
7588 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7589 opno_adjust = 1;
7590 for (i = 0; i < (int) op.num_ops; i++)
7592 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7593 if (i == 0 && op.code == COND_EXPR)
7594 continue;
7596 stmt_vec_info def_stmt_info;
7597 enum vect_def_type dt;
7598 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7599 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7600 &vectype_op[i], &def_stmt_info))
7602 if (dump_enabled_p ())
7603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 "use not simple.\n");
7605 return false;
7607 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7608 continue;
7610 /* For an IFN_COND_OP we might hit the reduction definition operand
7611 twice (once as definition, once as else). */
7612 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7613 continue;
7615 /* There should be only one cycle def in the stmt, the one
7616 leading to reduc_def. */
7617 if (VECTORIZABLE_CYCLE_DEF (dt))
7618 return false;
7620 if (!vectype_op[i])
7621 vectype_op[i]
7622 = get_vectype_for_scalar_type (loop_vinfo,
7623 TREE_TYPE (op.ops[i]), slp_op[i]);
7625 /* To properly compute ncopies we are interested in the widest
7626 non-reduction input type in case we're looking at a widening
7627 accumulation that we later handle in vect_transform_reduction. */
7628 if (lane_reduc_code_p
7629 && vectype_op[i]
7630 && (!vectype_in
7631 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7632 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7633 vectype_in = vectype_op[i];
7635 if (op.code == COND_EXPR)
7637 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7638 if (dt == vect_constant_def)
7640 cond_reduc_dt = dt;
7641 cond_reduc_val = op.ops[i];
7643 if (dt == vect_induction_def
7644 && def_stmt_info
7645 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7647 cond_reduc_dt = dt;
7648 cond_stmt_vinfo = def_stmt_info;
7652 if (!vectype_in)
7653 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7654 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7656 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7657 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7658 /* If we have a condition reduction, see if we can simplify it further. */
7659 if (v_reduc_type == COND_REDUCTION)
7661 if (slp_node)
7662 return false;
7664 /* If the reduction value is used in the condition itself, fail. */
7665 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7669 "condition depends on previous iteration\n");
7670 return false;
7673 if (reduc_chain_length == 1
7674 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7675 OPTIMIZE_FOR_SPEED)
7676 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7677 vectype_in,
7678 OPTIMIZE_FOR_SPEED)))
7680 if (dump_enabled_p ())
7681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7682 "optimizing condition reduction with"
7683 " FOLD_EXTRACT_LAST.\n");
7684 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7686 else if (cond_reduc_dt == vect_induction_def)
7688 tree base
7689 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7690 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7692 gcc_assert (TREE_CODE (base) == INTEGER_CST
7693 && TREE_CODE (step) == INTEGER_CST);
7694 cond_reduc_val = NULL_TREE;
7695 enum tree_code cond_reduc_op_code = ERROR_MARK;
7696 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7697 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7699 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7700 above base; punt if base is the minimum value of the type for
7701 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7702 else if (tree_int_cst_sgn (step) == -1)
7704 cond_reduc_op_code = MIN_EXPR;
7705 if (tree_int_cst_sgn (base) == -1)
7706 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7707 else if (tree_int_cst_lt (base,
7708 TYPE_MAX_VALUE (TREE_TYPE (base))))
7709 cond_reduc_val
7710 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7712 else
7714 cond_reduc_op_code = MAX_EXPR;
7715 if (tree_int_cst_sgn (base) == 1)
7716 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7717 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7718 base))
7719 cond_reduc_val
7720 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7722 if (cond_reduc_val)
7724 if (dump_enabled_p ())
7725 dump_printf_loc (MSG_NOTE, vect_location,
7726 "condition expression based on "
7727 "integer induction.\n");
7728 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7729 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7730 = cond_reduc_val;
7731 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7734 else if (cond_reduc_dt == vect_constant_def)
7736 enum vect_def_type cond_initial_dt;
7737 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7738 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7739 if (cond_initial_dt == vect_constant_def
7740 && types_compatible_p (TREE_TYPE (cond_initial_val),
7741 TREE_TYPE (cond_reduc_val)))
7743 tree e = fold_binary (LE_EXPR, boolean_type_node,
7744 cond_initial_val, cond_reduc_val);
7745 if (e && (integer_onep (e) || integer_zerop (e)))
7747 if (dump_enabled_p ())
7748 dump_printf_loc (MSG_NOTE, vect_location,
7749 "condition expression based on "
7750 "compile time constant.\n");
7751 /* Record reduction code at analysis stage. */
7752 STMT_VINFO_REDUC_CODE (reduc_info)
7753 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7754 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7760 if (STMT_VINFO_LIVE_P (phi_info))
7761 return false;
7763 if (slp_node)
7764 ncopies = 1;
7765 else
7766 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7768 gcc_assert (ncopies >= 1);
7770 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7772 if (nested_cycle)
7774 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7775 == vect_double_reduction_def);
7776 double_reduc = true;
7779 /* 4.2. Check support for the epilog operation.
7781 If STMT represents a reduction pattern, then the type of the
7782 reduction variable may be different than the type of the rest
7783 of the arguments. For example, consider the case of accumulation
7784 of shorts into an int accumulator; The original code:
7785 S1: int_a = (int) short_a;
7786 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7788 was replaced with:
7789 STMT: int_acc = widen_sum <short_a, int_acc>
7791 This means that:
7792 1. The tree-code that is used to create the vector operation in the
7793 epilog code (that reduces the partial results) is not the
7794 tree-code of STMT, but is rather the tree-code of the original
7795 stmt from the pattern that STMT is replacing. I.e, in the example
7796 above we want to use 'widen_sum' in the loop, but 'plus' in the
7797 epilog.
7798 2. The type (mode) we use to check available target support
7799 for the vector operation to be created in the *epilog*, is
7800 determined by the type of the reduction variable (in the example
7801 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7802 However the type (mode) we use to check available target support
7803 for the vector operation to be created *inside the loop*, is
7804 determined by the type of the other arguments to STMT (in the
7805 example we'd check this: optab_handler (widen_sum_optab,
7806 vect_short_mode)).
7808 This is contrary to "regular" reductions, in which the types of all
7809 the arguments are the same as the type of the reduction variable.
7810 For "regular" reductions we can therefore use the same vector type
7811 (and also the same tree-code) when generating the epilog code and
7812 when generating the code inside the loop. */
7814 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7816 /* If-conversion might already have created a conditional operation like
7817 IFN_COND_ADD. Use the internal code for the following checks. */
7818 if (orig_code.is_internal_fn ())
7820 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7821 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7824 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7826 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7827 if (reduction_type == TREE_CODE_REDUCTION)
7829 /* Check whether it's ok to change the order of the computation.
7830 Generally, when vectorizing a reduction we change the order of the
7831 computation. This may change the behavior of the program in some
7832 cases, so we need to check that this is ok. One exception is when
7833 vectorizing an outer-loop: the inner-loop is executed sequentially,
7834 and therefore vectorizing reductions in the inner-loop during
7835 outer-loop vectorization is safe. Likewise when we are vectorizing
7836 a series of reductions using SLP and the VF is one, the reductions
7837 are performed in scalar order. */
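/* For instance (illustrative only): with float addition,
   (a + b) + c need not equal a + (b + c), so reassociating a float sum
   into vector partial sums changes the rounding unless fast-math style
   flags permit it; that is what needs_fold_left_reduction_p checks below.  */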
7838 if (slp_node
7839 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7840 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7842 else if (needs_fold_left_reduction_p (op.type, orig_code))
7844 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7845 is not directly used in stmt. */
7846 if (!only_slp_reduc_chain
7847 && reduc_chain_length != 1)
7849 if (dump_enabled_p ())
7850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7851 "in-order reduction chain without SLP.\n");
7852 return false;
7854 STMT_VINFO_REDUC_TYPE (reduc_info)
7855 = reduction_type = FOLD_LEFT_REDUCTION;
7857 else if (!commutative_binary_op_p (orig_code, op.type)
7858 || !associative_binary_op_p (orig_code, op.type))
7860 if (dump_enabled_p ())
7861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7862 "reduction: not commutative/associative\n");
7863 return false;
7867 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7868 && ncopies > 1)
7870 if (dump_enabled_p ())
7871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7872 "multiple types in double reduction or condition "
7873 "reduction or fold-left reduction.\n");
7874 return false;
7877 internal_fn reduc_fn = IFN_LAST;
7878 if (reduction_type == TREE_CODE_REDUCTION
7879 || reduction_type == FOLD_LEFT_REDUCTION
7880 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7881 || reduction_type == CONST_COND_REDUCTION)
7883 if (reduction_type == FOLD_LEFT_REDUCTION
7884 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7885 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7887 if (reduc_fn != IFN_LAST
7888 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7889 OPTIMIZE_FOR_SPEED))
7891 if (dump_enabled_p ())
7892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7893 "reduc op not supported by target.\n");
7895 reduc_fn = IFN_LAST;
7898 else
7900 if (!nested_cycle || double_reduc)
7902 if (dump_enabled_p ())
7903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7904 "no reduc code for scalar code.\n");
7906 return false;
7910 else if (reduction_type == COND_REDUCTION)
7912 int scalar_precision
7913 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7914 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7915 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7916 vectype_out);
7918 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7919 OPTIMIZE_FOR_SPEED))
7920 reduc_fn = IFN_REDUC_MAX;
7922 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7924 if (reduction_type != EXTRACT_LAST_REDUCTION
7925 && (!nested_cycle || double_reduc)
7926 && reduc_fn == IFN_LAST
7927 && !nunits_out.is_constant ())
7929 if (dump_enabled_p ())
7930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7931 "missing target support for reduction on"
7932 " variable-length vectors.\n");
7933 return false;
7936 /* For SLP reductions, see if there is a neutral value we can use. */
7937 tree neutral_op = NULL_TREE;
7938 if (slp_node)
7940 tree initial_value = NULL_TREE;
7941 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7942 initial_value = vect_phi_initial_value (reduc_def_phi);
7943 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7944 orig_code, initial_value);
7947 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7949 /* We can't support in-order reductions of code such as this:
7951 for (int i = 0; i < n1; ++i)
7952 for (int j = 0; j < n2; ++j)
7953 l += a[j];
7955 since GCC effectively transforms the loop when vectorizing:
7957 for (int i = 0; i < n1 / VF; ++i)
7958 for (int j = 0; j < n2; ++j)
7959 for (int k = 0; k < VF; ++k)
7960 l += a[j];
7962 which is a reassociation of the original operation. */
7963 if (dump_enabled_p ())
7964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7965 "in-order double reduction not supported.\n");
7967 return false;
7970 if (reduction_type == FOLD_LEFT_REDUCTION
7971 && slp_node
7972 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7974 /* We cannot use in-order reductions in this case because there is
7975 an implicit reassociation of the operations involved. */
7976 if (dump_enabled_p ())
7977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7978 "in-order unchained SLP reductions not supported.\n");
7979 return false;
7982 /* For double reductions, and for SLP reductions with a neutral value,
7983 we construct a variable-length initial vector by loading a vector
7984 full of the neutral value and then shift-and-inserting the start
7985 values into the low-numbered elements. */
7986 if ((double_reduc || neutral_op)
7987 && !nunits_out.is_constant ()
7988 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7989 vectype_out, OPTIMIZE_FOR_SPEED))
7991 if (dump_enabled_p ())
7992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7993 "reduction on variable-length vectors requires"
7994 " target support for a vector-shift-and-insert"
7995 " operation.\n");
7996 return false;
7999 /* Check extra constraints for variable-length unchained SLP reductions. */
8000 if (slp_node
8001 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8002 && !nunits_out.is_constant ())
8004 /* We checked above that we could build the initial vector when
8005 there's a neutral element value. Check here for the case in
8006 which each SLP statement has its own initial value and in which
8007 that value needs to be repeated for every instance of the
8008 statement within the initial vector. */
8009 unsigned int group_size = SLP_TREE_LANES (slp_node);
8010 if (!neutral_op
8011 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8012 TREE_TYPE (vectype_out)))
8014 if (dump_enabled_p ())
8015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8016 "unsupported form of SLP reduction for"
8017 " variable-length vectors: cannot build"
8018 " initial vector.\n");
8019 return false;
8021 /* The epilogue code relies on the number of elements being a multiple
8022 of the group size. The duplicate-and-interleave approach to setting
8023 up the initial vector does too. */
8024 if (!multiple_p (nunits_out, group_size))
8026 if (dump_enabled_p ())
8027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8028 "unsupported form of SLP reduction for"
8029 " variable-length vectors: the vector size"
8030 " is not a multiple of the number of results.\n");
8031 return false;
8035 if (reduction_type == COND_REDUCTION)
8037 widest_int ni;
8039 if (! max_loop_iterations (loop, &ni))
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_NOTE, vect_location,
8043 "loop count not known, cannot create cond "
8044 "reduction.\n");
8045 return false;
8047 /* Convert backedges to iterations. */
8048 ni += 1;
8050 /* The additional index will be the same type as the condition. Check
8051 that the loop can fit into this less one (because we'll use up the
8052 zero slot for when there are no matches). */
8053 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8054 if (wi::geu_p (ni, wi::to_widest (max_index)))
8056 if (dump_enabled_p ())
8057 dump_printf_loc (MSG_NOTE, vect_location,
8058 "loop size is greater than data size.\n");
8059 return false;
8063 /* In case the vectorization factor (VF) is bigger than the number
8064 of elements that we can fit in a vectype (nunits), we have to generate
8065 more than one vector stmt, i.e. we need to "unroll" the
8066 vector stmt by a factor VF/nunits. For more details see documentation
8067 in vectorizable_operation. */
8069 /* If the reduction is used in an outer loop we need to generate
8070 VF intermediate results, like so (e.g. for ncopies=2):
8071 r0 = phi (init, r0)
8072 r1 = phi (init, r1)
8073 r0 = x0 + r0;
8074 r1 = x1 + r1;
8075 (i.e. we generate VF results in 2 registers).
8076 In this case we have a separate def-use cycle for each copy, and therefore
8077 for each copy we get the vector def for the reduction variable from the
8078 respective phi node created for this copy.
8080 Otherwise (the reduction is unused in the loop nest), we can combine
8081 together intermediate results, like so (e.g. for ncopies=2):
8082 r = phi (init, r)
8083 r = x0 + r;
8084 r = x1 + r;
8085 (i.e. we generate VF/2 results in a single register).
8086 In this case for each copy we get the vector def for the reduction variable
8087 from the vectorized reduction operation generated in the previous iteration.
8089 This only works when we see both the reduction PHI and its only consumer
8090 in vectorizable_reduction and there are no intermediate stmts
8091 participating. When unrolling we want each unrolled iteration to have its
8092 own reduction accumulator since one of the main goals of unrolling a
8093 reduction is to reduce the aggregate loop-carried latency. */
8094 if (ncopies > 1
8095 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8096 && reduc_chain_length == 1
8097 && loop_vinfo->suggested_unroll_factor == 1)
8098 single_defuse_cycle = true;
8100 if (single_defuse_cycle || lane_reduc_code_p)
8102 gcc_assert (op.code != COND_EXPR);
8104 /* 4. Supportable by target? */
8105 bool ok = true;
8107 /* 4.1. check support for the operation in the loop
8109 This isn't necessary for the lane reduction codes, since they
8110 can only be produced by pattern matching, and it's up to the
8111 pattern matcher to test for support. The main reason for
8112 specifically skipping this step is to avoid rechecking whether
8113 mixed-sign dot-products can be implemented using signed
8114 dot-products. */
8115 machine_mode vec_mode = TYPE_MODE (vectype_in);
8116 if (!lane_reduc_code_p
8117 && !directly_supported_p (op.code, vectype_in, optab_vector))
8119 if (dump_enabled_p ())
8120 dump_printf (MSG_NOTE, "op not supported by target.\n");
8121 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8122 || !vect_can_vectorize_without_simd_p (op.code))
8123 ok = false;
8124 else
8125 if (dump_enabled_p ())
8126 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8129 if (vect_emulated_vector_p (vectype_in)
8130 && !vect_can_vectorize_without_simd_p (op.code))
8132 if (dump_enabled_p ())
8133 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8134 return false;
8137 /* lane-reducing operations have to go through vect_transform_reduction.
8138 For the other cases try without the single cycle optimization. */
8139 if (!ok)
8141 if (lane_reduc_code_p)
8142 return false;
8143 else
8144 single_defuse_cycle = false;
8147 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8149 /* If the reduction stmt is one of the patterns that have lane
8150 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
8151 if ((ncopies > 1 && ! single_defuse_cycle)
8152 && lane_reduc_code_p)
8154 if (dump_enabled_p ())
8155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8156 "multi def-use cycle not possible for lane-reducing "
8157 "reduction operation\n");
8158 return false;
8161 if (slp_node
8162 && !(!single_defuse_cycle
8163 && !lane_reduc_code_p
8164 && reduction_type != FOLD_LEFT_REDUCTION))
8165 for (i = 0; i < (int) op.num_ops; i++)
8166 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8168 if (dump_enabled_p ())
8169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8170 "incompatible vector types for invariants\n");
8171 return false;
8174 if (slp_node)
8175 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8176 else
8177 vec_num = 1;
8179 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8180 reduction_type, ncopies, cost_vec);
8181 /* Cost the reduction op inside the loop if transformed via
8182 vect_transform_reduction. Otherwise this is costed by the
8183 separate vectorizable_* routines. */
8184 if (single_defuse_cycle || lane_reduc_code_p)
8186 int factor = 1;
8187 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8188 /* Three dot-products and a subtraction. */
8189 factor = 4;
8190 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8191 stmt_info, 0, vect_body);
8194 if (dump_enabled_p ()
8195 && reduction_type == FOLD_LEFT_REDUCTION)
8196 dump_printf_loc (MSG_NOTE, vect_location,
8197 "using an in-order (fold-left) reduction.\n");
8198 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8199 /* All reductions except single defuse-cycle optimized, lane-reducing
8200 and fold-left ones go through their own vectorizable_* routines. */
8201 if (!single_defuse_cycle
8202 && !lane_reduc_code_p
8203 && reduction_type != FOLD_LEFT_REDUCTION)
8205 stmt_vec_info tem
8206 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8207 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8209 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8210 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8212 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8213 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8215 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8217 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8218 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8219 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8221 if (reduction_type != FOLD_LEFT_REDUCTION
8222 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8223 && (cond_fn == IFN_LAST
8224 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8225 OPTIMIZE_FOR_SPEED)))
8227 if (dump_enabled_p ())
8228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8229 "can't operate on partial vectors because"
8230 " no conditional operation is available.\n");
8231 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8233 else if (reduction_type == FOLD_LEFT_REDUCTION
8234 && reduc_fn == IFN_LAST
8235 && !expand_vec_cond_expr_p (vectype_in,
8236 truth_type_for (vectype_in),
8237 SSA_NAME))
8239 if (dump_enabled_p ())
8240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8241 "can't operate on partial vectors because"
8242 " no conditional operation is available.\n");
8243 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8245 else if (reduction_type == FOLD_LEFT_REDUCTION
8246 && internal_fn_mask_index (reduc_fn) == -1
8247 && FLOAT_TYPE_P (vectype_in)
8248 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8250 if (dump_enabled_p ())
8251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8252 "can't operate on partial vectors because"
8253 " signed zeros cannot be preserved.\n");
8254 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8256 else
8258 internal_fn mask_reduc_fn
8259 = get_masked_reduction_fn (reduc_fn, vectype_in);
8261 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8262 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8263 vectype_in, 1);
8264 else
8265 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8266 vectype_in, NULL);
8269 return true;
8272 /* STMT_INFO is a dot-product reduction whose multiplication operands
8273 have different signs. Emit a sequence to emulate the operation
8274 using a series of signed DOT_PROD_EXPRs and return the last
8275 statement generated. VEC_DEST is the result of the vector operation
8276 and VOP lists its inputs. */
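/* As an illustrative sketch (assuming 8-bit inputs; an assumed example, not
   part of the description above): a mixed-sign dot-product such as

     int sum = 0;
     unsigned char x[N];
     signed char y[N];
     for (int i = 0; i < N; i++)
       sum += x[i] * y[i];

   is emulated with signed-only DOT_PROD_EXPRs using the identity
   x * y == (x - 128) * y + 64 * y + 64 * y; e.g. for x == 200, y == -3:
   72 * -3 + 64 * -3 + 64 * -3 == -216 - 192 - 192 == -600 == 200 * -3.  */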
8278 static gassign *
8279 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8280 gimple_stmt_iterator *gsi, tree vec_dest,
8281 tree vop[3])
8283 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8284 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8285 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8286 gimple *new_stmt;
8288 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8289 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8290 std::swap (vop[0], vop[1]);
8292 /* Convert all inputs to signed types. */
8293 for (int i = 0; i < 3; ++i)
8294 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8296 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8297 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8298 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8299 vop[i] = tmp;
8302 /* In the comments below we assume 8-bit inputs for simplicity,
8303 but the approach works for any full integer type. */
8305 /* Create a vector of -128. */
8306 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8307 tree min_narrow = build_vector_from_val (narrow_vectype,
8308 min_narrow_elttype);
8310 /* Create a vector of 64. */
8311 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8312 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8313 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8315 /* Emit: SUB_RES = VOP[0] - 128. */
8316 tree sub_res = make_ssa_name (narrow_vectype);
8317 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8318 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8320 /* Emit:
8322 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8323 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8324 STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8326 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
8327 Doing the two 64 * y steps first allows more time to compute x. */
8328 tree stage1 = make_ssa_name (wide_vectype);
8329 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8330 vop[1], half_narrow, vop[2]);
8331 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8333 tree stage2 = make_ssa_name (wide_vectype);
8334 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8335 vop[1], half_narrow, stage1);
8336 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8338 tree stage3 = make_ssa_name (wide_vectype);
8339 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8340 sub_res, vop[1], stage2);
8341 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8343 /* Convert STAGE3 to the reduction type. */
8344 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8347 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8348 value. */
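/* For illustration (a simplified sketch; masking and SLP are not shown):
   for a single-defuse-cycle sum reduction

     sum_1 = PHI <init, sum_2>
     sum_2 = sum_1 + a[i];          <-- STMT_INFO

   this emits the vector statement for the backedge value, e.g.
   vec_sum_2 = vec_sum_1 + vec_a; reducing the vector accumulator to a
   scalar is done separately by the epilogue code
   (vect_create_epilog_for_reduction).  */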
8350 bool
8351 vect_transform_reduction (loop_vec_info loop_vinfo,
8352 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8353 gimple **vec_stmt, slp_tree slp_node)
8355 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8356 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8357 int i;
8358 int ncopies;
8359 int vec_num;
8361 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8362 gcc_assert (reduc_info->is_reduc_info);
8364 if (nested_in_vect_loop_p (loop, stmt_info))
8366 loop = loop->inner;
8367 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8370 gimple_match_op op;
8371 if (!gimple_extract_op (stmt_info->stmt, &op))
8372 gcc_unreachable ();
8374 /* All uses but the last are expected to be defined in the loop.
8375 The last use is the reduction variable. In case of nested cycle this
8376 assumption is not true: we use reduc_index to record the index of the
8377 reduction variable. */
8378 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8379 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8380 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8381 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8383 if (slp_node)
8385 ncopies = 1;
8386 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8388 else
8390 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8391 vec_num = 1;
8394 code_helper code = canonicalize_code (op.code, op.type);
8395 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8397 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8398 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8399 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8401 /* Transform. */
8402 tree new_temp = NULL_TREE;
8403 auto_vec<tree> vec_oprnds0;
8404 auto_vec<tree> vec_oprnds1;
8405 auto_vec<tree> vec_oprnds2;
8406 tree def0;
8408 if (dump_enabled_p ())
8409 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8411 /* FORNOW: Multiple types are not supported for condition. */
8412 if (code == COND_EXPR)
8413 gcc_assert (ncopies == 1);
8415 /* A binary COND_OP reduction must have the same definition and else
8416 value. */
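/* E.g. (an assumed if-converted form, for illustration only):
   sum_2 = .COND_ADD (mask, sum_1, a[i], sum_1), where the accumulator
   op.ops[1] and the else value op.ops[3] are the same SSA name.  */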
8417 bool cond_fn_p = code.is_internal_fn ()
8418 && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8419 if (cond_fn_p)
8421 gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8422 || code == IFN_COND_MUL || code == IFN_COND_AND
8423 || code == IFN_COND_IOR || code == IFN_COND_XOR);
8424 gcc_assert (op.num_ops == 4 && (op.ops[1] == op.ops[3]));
8427 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8429 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8430 if (reduction_type == FOLD_LEFT_REDUCTION)
8432 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8433 gcc_assert (code.is_tree_code () || cond_fn_p);
8434 return vectorize_fold_left_reduction
8435 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8436 code, reduc_fn, op.ops, op.num_ops, vectype_in,
8437 reduc_index, masks, lens);
8440 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8441 gcc_assert (single_defuse_cycle
8442 || code == DOT_PROD_EXPR
8443 || code == WIDEN_SUM_EXPR
8444 || code == SAD_EXPR);
8446 /* Create the destination vector */
8447 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8448 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8450 /* Get NCOPIES vector definitions for all operands except the reduction
8451 definition. */
8452 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8453 single_defuse_cycle && reduc_index == 0
8454 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8455 single_defuse_cycle && reduc_index == 1
8456 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8457 op.num_ops == 4
8458 || (op.num_ops == 3
8459 && !(single_defuse_cycle && reduc_index == 2))
8460 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8462 /* For single def-use cycles get one copy of the vectorized reduction
8463 definition. */
8464 if (single_defuse_cycle)
8466 gcc_assert (!slp_node);
8467 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8468 op.ops[reduc_index],
8469 reduc_index == 0 ? &vec_oprnds0
8470 : (reduc_index == 1 ? &vec_oprnds1
8471 : &vec_oprnds2));
8474 bool emulated_mixed_dot_prod
8475 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8476 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8478 gimple *new_stmt;
8479 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8480 if (masked_loop_p && !mask_by_cond_expr)
8482 /* No conditional ifns have been defined for dot-product yet. */
8483 gcc_assert (code != DOT_PROD_EXPR);
8485 /* Make sure that the reduction accumulator is vop[0]. */
8486 if (reduc_index == 1)
8488 gcc_assert (commutative_binary_op_p (code, op.type));
8489 std::swap (vop[0], vop[1]);
8491 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8492 vec_num * ncopies, vectype_in, i);
8493 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8494 vop[0], vop[1], vop[0]);
8495 new_temp = make_ssa_name (vec_dest, call);
8496 gimple_call_set_lhs (call, new_temp);
8497 gimple_call_set_nothrow (call, true);
8498 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8499 new_stmt = call;
8501 else
8503 if (op.num_ops >= 3)
8504 vop[2] = vec_oprnds2[i];
8506 if (masked_loop_p && mask_by_cond_expr)
8508 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8509 vec_num * ncopies, vectype_in, i);
8510 build_vect_cond_expr (code, vop, mask, gsi);
8513 if (emulated_mixed_dot_prod)
8514 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8515 vec_dest, vop);
8517 else if (code.is_internal_fn () && !cond_fn_p)
8518 new_stmt = gimple_build_call_internal (internal_fn (code),
8519 op.num_ops,
8520 vop[0], vop[1], vop[2]);
8521 else if (code.is_internal_fn () && cond_fn_p)
8522 new_stmt = gimple_build_call_internal (internal_fn (code),
8523 op.num_ops,
8524 vop[0], vop[1], vop[2],
8525 vop[1]);
8526 else
8527 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8528 vop[0], vop[1], vop[2]);
8529 new_temp = make_ssa_name (vec_dest, new_stmt);
8530 gimple_set_lhs (new_stmt, new_temp);
8531 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8534 if (slp_node)
8535 slp_node->push_vec_def (new_stmt);
8536 else if (single_defuse_cycle
8537 && i < ncopies - 1)
8539 if (reduc_index == 0)
8540 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8541 else if (reduc_index == 1)
8542 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8543 else if (reduc_index == 2)
8544 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8546 else
8547 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8550 if (!slp_node)
8551 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8553 return true;
8556 /* Transform phase of a cycle PHI. */
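/* As an illustrative example (a sketch; the special cases are handled
   below): for a sum reduction with scalar initial value init_val and
   four lanes, the vector PHI can be seeded with the neutral element
   { 0, 0, 0, 0 } while init_val is recorded as
   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT and added back after the
   epilogue reduction.  */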
8558 bool
8559 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8560 stmt_vec_info stmt_info, gimple **vec_stmt,
8561 slp_tree slp_node, slp_instance slp_node_instance)
8563 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8564 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8565 int i;
8566 int ncopies;
8567 int j;
8568 bool nested_cycle = false;
8569 int vec_num;
8571 if (nested_in_vect_loop_p (loop, stmt_info))
8573 loop = loop->inner;
8574 nested_cycle = true;
8577 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8578 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8579 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8580 gcc_assert (reduc_info->is_reduc_info);
8582 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8583 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8584 /* Leave the scalar phi in place. */
8585 return true;
8587 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8588 /* For a nested cycle we do not fill the above. */
8589 if (!vectype_in)
8590 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8591 gcc_assert (vectype_in);
8593 if (slp_node)
8595 /* The size vect_schedule_slp_instance computes is off for us. */
8596 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8597 * SLP_TREE_LANES (slp_node), vectype_in);
8598 ncopies = 1;
8600 else
8602 vec_num = 1;
8603 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8606 /* Check whether we should use a single PHI node and accumulate
8607 vectors to one before the backedge. */
8608 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8609 ncopies = 1;
8611 /* Create the destination vector */
8612 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8613 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8614 vectype_out);
8616 /* Get the loop-entry arguments. */
8617 tree vec_initial_def = NULL_TREE;
8618 auto_vec<tree> vec_initial_defs;
8619 if (slp_node)
8621 vec_initial_defs.reserve (vec_num);
8622 if (nested_cycle)
8624 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8625 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8626 &vec_initial_defs);
8628 else
8630 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8631 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8632 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8634 unsigned int num_phis = stmts.length ();
8635 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8636 num_phis = 1;
8637 initial_values.reserve (num_phis);
8638 for (unsigned int i = 0; i < num_phis; ++i)
8640 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8641 initial_values.quick_push (vect_phi_initial_value (this_phi));
8643 if (vec_num == 1)
8644 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8645 if (!initial_values.is_empty ())
8647 tree initial_value
8648 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8649 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8650 tree neutral_op
8651 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8652 code, initial_value);
8653 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8654 &vec_initial_defs, vec_num,
8655 stmts.length (), neutral_op);
8659 else
8661 /* Get at the scalar def before the loop, that defines the initial
8662 value of the reduction variable. */
8663 tree initial_def = vect_phi_initial_value (phi);
8664 reduc_info->reduc_initial_values.safe_push (initial_def);
8665 /* Optimize: if for REDUC_MAX initial_def is smaller than the base value
8666 induc_val and we can't use zero for induc_val, use initial_def instead.
8667 Similarly for REDUC_MIN when initial_def is larger than the base. */
8668 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8670 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8671 if (TREE_CODE (initial_def) == INTEGER_CST
8672 && !integer_zerop (induc_val)
8673 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8674 && tree_int_cst_lt (initial_def, induc_val))
8675 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8676 && tree_int_cst_lt (induc_val, initial_def))))
8678 induc_val = initial_def;
8679 /* Communicate to the epilogue generation that we used the
8680 initial_def. */
8681 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8683 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8685 else if (nested_cycle)
8687 /* Do not use an adjustment def as that case is not supported
8688 correctly if ncopies is not one. */
8689 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8690 ncopies, initial_def,
8691 &vec_initial_defs);
8693 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8694 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8695 /* Fill the initial vector with the initial scalar value. */
8696 vec_initial_def
8697 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8698 initial_def, initial_def);
8699 else
8701 if (ncopies == 1)
8702 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8703 if (!reduc_info->reduc_initial_values.is_empty ())
8705 initial_def = reduc_info->reduc_initial_values[0];
8706 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8707 tree neutral_op
8708 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8709 code, initial_def);
8710 gcc_assert (neutral_op);
8711 /* Try to simplify the vector initialization by applying an
8712 adjustment after the reduction has been performed. */
8713 if (!reduc_info->reused_accumulator
8714 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8715 && !operand_equal_p (neutral_op, initial_def))
8717 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8718 = initial_def;
8719 initial_def = neutral_op;
8721 vec_initial_def
8722 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8723 initial_def, neutral_op);
8728 if (vec_initial_def)
8730 vec_initial_defs.create (ncopies);
8731 for (i = 0; i < ncopies; ++i)
8732 vec_initial_defs.quick_push (vec_initial_def);
8735 if (auto *accumulator = reduc_info->reused_accumulator)
8737 tree def = accumulator->reduc_input;
8738 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8740 unsigned int nreduc;
8741 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8742 (TREE_TYPE (def)),
8743 TYPE_VECTOR_SUBPARTS (vectype_out),
8744 &nreduc);
8745 gcc_assert (res);
8746 gimple_seq stmts = NULL;
8747 /* Reduce the single vector to a smaller one. */
8748 if (nreduc != 1)
8750 /* Perform the reduction in the appropriate type. */
8751 tree rvectype = vectype_out;
8752 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8753 TREE_TYPE (TREE_TYPE (def))))
8754 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8755 TYPE_VECTOR_SUBPARTS
8756 (vectype_out));
8757 def = vect_create_partial_epilog (def, rvectype,
8758 STMT_VINFO_REDUC_CODE
8759 (reduc_info),
8760 &stmts);
8762 /* The epilogue loop might use a different vector mode, like
8763 VNx2DI vs. V2DI. */
8764 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8766 tree reduc_type = build_vector_type_for_mode
8767 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8768 def = gimple_convert (&stmts, reduc_type, def);
8770 /* Adjust the input so we pick up the partially reduced value
8771 for the skip edge in vect_create_epilog_for_reduction. */
8772 accumulator->reduc_input = def;
8773 /* And the reduction could be carried out using a different sign. */
8774 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8775 def = gimple_convert (&stmts, vectype_out, def);
8776 if (loop_vinfo->main_loop_edge)
8778 /* While we'd like to insert on the edge, doing so would split
8779 blocks and disturb bookkeeping, and we will eventually need
8780 the value on the skip edge as well. Rely on sinking to fix up
8781 the optimal placement and insert in the predecessor. */
8782 gimple_stmt_iterator gsi
8783 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8784 /* Insert before a cond that eventually skips the
8785 epilogue. */
8786 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8787 gsi_prev (&gsi);
8788 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8790 else
8791 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8792 stmts);
8794 if (loop_vinfo->main_loop_edge)
8795 vec_initial_defs[0]
8796 = vect_get_main_loop_result (loop_vinfo, def,
8797 vec_initial_defs[0]);
8798 else
8799 vec_initial_defs.safe_push (def);
8802 /* Generate the reduction PHIs upfront. */
8803 for (i = 0; i < vec_num; i++)
8805 tree vec_init_def = vec_initial_defs[i];
8806 for (j = 0; j < ncopies; j++)
8808 /* Create the reduction-phi that defines the reduction
8809 operand. */
8810 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8812 /* Set the loop-entry arg of the reduction-phi. */
8813 if (j != 0 && nested_cycle)
8814 vec_init_def = vec_initial_defs[j];
8815 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8816 UNKNOWN_LOCATION);
8818 /* The loop-latch arg is set in epilogue processing. */
8820 if (slp_node)
8821 slp_node->push_vec_def (new_phi);
8822 else
8824 if (j == 0)
8825 *vec_stmt = new_phi;
8826 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8831 return true;
8834 /* Vectorizes LC PHIs. */
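/* For illustration: a loop-closed PHI such as x_2 = PHI <x_1(loop)>
   in the single-predecessor block after the loop is replaced by one
   single-argument vector PHI per vector def, e.g.
   vect_x_2 = PHI <vect_x_1(loop)>.  */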
8836 bool
8837 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8838 stmt_vec_info stmt_info, gimple **vec_stmt,
8839 slp_tree slp_node)
8841 if (!loop_vinfo
8842 || !is_a <gphi *> (stmt_info->stmt)
8843 || gimple_phi_num_args (stmt_info->stmt) != 1)
8844 return false;
8846 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8847 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8848 return false;
8850 if (!vec_stmt) /* transformation not required. */
8852 /* Deal with copies from externs or constants that are disguised as
8853 loop-closed PHI nodes (PR97886). */
8854 if (slp_node
8855 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8856 SLP_TREE_VECTYPE (slp_node)))
8858 if (dump_enabled_p ())
8859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8860 "incompatible vector types for invariants\n");
8861 return false;
8863 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8864 return true;
8867 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8868 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8869 basic_block bb = gimple_bb (stmt_info->stmt);
8870 edge e = single_pred_edge (bb);
8871 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8872 auto_vec<tree> vec_oprnds;
8873 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8874 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8875 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8876 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8878 /* Create the vectorized LC PHI node. */
8879 gphi *new_phi = create_phi_node (vec_dest, bb);
8880 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8881 if (slp_node)
8882 slp_node->push_vec_def (new_phi);
8883 else
8884 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8886 if (!slp_node)
8887 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8889 return true;
8892 /* Vectorizes PHIs. */
8894 bool
8895 vectorizable_phi (vec_info *,
8896 stmt_vec_info stmt_info, gimple **vec_stmt,
8897 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8899 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8900 return false;
8902 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8903 return false;
8905 tree vectype = SLP_TREE_VECTYPE (slp_node);
8907 if (!vec_stmt) /* transformation not required. */
8909 slp_tree child;
8910 unsigned i;
8911 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8912 if (!child)
8914 if (dump_enabled_p ())
8915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8916 "PHI node with unvectorized backedge def\n");
8917 return false;
8919 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8921 if (dump_enabled_p ())
8922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8923 "incompatible vector types for invariants\n");
8924 return false;
8926 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8927 && !useless_type_conversion_p (vectype,
8928 SLP_TREE_VECTYPE (child)))
8930 /* With bools we can have mask and non-mask precision vectors
8931 or different non-mask precisions. While pattern recognition is
8932 supposed to guarantee consistency here, bugs in it can cause
8933 mismatches (PR103489 and PR103800 for example).
8934 Deal with them here instead of ICEing later. */
8935 if (dump_enabled_p ())
8936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8937 "incompatible vector type setup from "
8938 "bool pattern detection\n");
8939 return false;
8942 /* For single-argument PHIs assume coalescing which means zero cost
8943 for the scalar and the vector PHIs. This avoids artificially
8944 favoring the vector path (but may pessimize it in some cases). */
8945 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8946 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8947 vector_stmt, stmt_info, vectype, 0, vect_body);
8948 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8949 return true;
8952 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8953 basic_block bb = gimple_bb (stmt_info->stmt);
8954 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8955 auto_vec<gphi *> new_phis;
8956 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8958 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8960 /* Skip not yet vectorized defs. */
8961 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8962 && SLP_TREE_VEC_DEFS (child).is_empty ())
8963 continue;
8965 auto_vec<tree> vec_oprnds;
8966 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8967 if (!new_phis.exists ())
8969 new_phis.create (vec_oprnds.length ());
8970 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8972 /* Create the vectorized PHI node. */
8973 new_phis.quick_push (create_phi_node (vec_dest, bb));
8974 slp_node->push_vec_def (new_phis[j]);
8977 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8978 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8979 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8981 /* We should have at least one already vectorized child. */
8982 gcc_assert (new_phis.exists ());
8984 return true;
8987 /* Vectorizes first-order recurrences. An overview of the transformation
8988 is described below. Suppose we have the following loop.
8990 int t = 0;
8991 for (int i = 0; i < n; ++i)
8993 b[i] = a[i] - t;
8994 t = a[i];
8997 There is a first-order recurrence on 't'. For this loop, the scalar IR
8998 looks (simplified) like:
9000 scalar.preheader:
9001 init = 0;
9003 scalar.body:
9004 i = PHI <0(scalar.preheader), i+1(scalar.body)>
9005 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
9006 _1 = a[i]
9007 b[i] = _1 - _2
9008 if (i < n) goto scalar.body
9010 In this example, _2 is a recurrence because its value depends on the
9011 previous iteration. We vectorize this as (VF = 4)
9013 vector.preheader:
9014 vect_init = vect_cst(..., ..., ..., 0)
9016 vector.body
9017 i = PHI <0(vector.preheader), i+4(vector.body)>
9018 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9019 vect_2 = a[i, i+1, i+2, i+3];
9020 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9021 b[i, i+1, i+2, i+3] = vect_2 - vect_3
9022 if (..) goto vector.body
9024 In this function, vectorizable_recurr, we code generate both the
9025 vector PHI node and the permute since those together compute the
9026 vectorized value of the scalar PHI. We do not yet have the
9027 backedge value to fill in there nor into the vec_perm. Those
9028 are filled in maybe_set_vectorized_backedge_value and
9029 vect_schedule_scc.
9031 TODO: Since the scalar loop does not have a use of the recurrence
9032 outside of the loop the natural way to implement peeling via
9033 vectorizing the live value doesn't work. For now peeling of loops
9034 with a recurrence is not implemented. For SLP the supported cases
9035 are restricted to those requiring a single vector recurrence PHI. */
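/* An illustrative sketch of the permutation (assuming V4SI and an SLP
   group of two lanes, so DIST == 2): the selector is
   { nunits - 2, nunits - 1, nunits, nunits + 1 } == { 2, 3, 4, 5 },
   i.e. the last two lanes of the previous vector followed by the first
   two lanes of the current one; the check below requires
   2 * DIST <= nunits so that a single vector can make progress.  */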
9037 bool
9038 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9039 gimple **vec_stmt, slp_tree slp_node,
9040 stmt_vector_for_cost *cost_vec)
9042 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9043 return false;
9045 gphi *phi = as_a<gphi *> (stmt_info->stmt);
9047 /* So far we only support first-order recurrence auto-vectorization. */
9048 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9049 return false;
9051 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9052 unsigned ncopies;
9053 if (slp_node)
9054 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9055 else
9056 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9057 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9058 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9059 /* We need to be able to make progress with a single vector. */
9060 if (maybe_gt (dist * 2, nunits))
9062 if (dump_enabled_p ())
9063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9064 "first order recurrence exceeds half of "
9065 "a vector\n");
9066 return false;
9069 /* First-order recurrence autovectorization needs to handle permutation
9070 with indices = [nunits-1, nunits, nunits+1, ...]. */
9071 vec_perm_builder sel (nunits, 1, 3);
9072 for (int i = 0; i < 3; ++i)
9073 sel.quick_push (nunits - dist + i);
9074 vec_perm_indices indices (sel, 2, nunits);
9076 if (!vec_stmt) /* transformation not required. */
9078 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9079 indices))
9080 return false;
9082 if (slp_node)
9084 /* We eventually need to set a vector type on invariant
9085 arguments. */
9086 unsigned j;
9087 slp_tree child;
9088 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9089 if (!vect_maybe_update_slp_op_vectype
9090 (child, SLP_TREE_VECTYPE (slp_node)))
9092 if (dump_enabled_p ())
9093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9094 "incompatible vector types for "
9095 "invariants\n");
9096 return false;
9099 /* The recurrence costs the initialization vector and one permute
9100 for each copy. */
9101 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9102 stmt_info, 0, vect_prologue);
9103 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9104 stmt_info, 0, vect_body);
9105 if (dump_enabled_p ())
9106 dump_printf_loc (MSG_NOTE, vect_location,
9107 "vectorizable_recurr: inside_cost = %d, "
9108 "prologue_cost = %d .\n", inside_cost,
9109 prologue_cost);
9111 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9112 return true;
9115 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9116 basic_block bb = gimple_bb (phi);
9117 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9118 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9120 gimple_seq stmts = NULL;
9121 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9122 gsi_insert_seq_on_edge_immediate (pe, stmts);
9124 tree vec_init = build_vector_from_val (vectype, preheader);
9125 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9127 /* Create the vectorized first-order PHI node. */
9128 tree vec_dest = vect_get_new_vect_var (vectype,
9129 vect_simple_var, "vec_recur_");
9130 gphi *new_phi = create_phi_node (vec_dest, bb);
9131 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9133 /* Insert the shuffles for the first-order recurrence autovectorization:
9134 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9135 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9137 /* Insert the required permute after the latch definition. The
9138 second and later operands are tentative and will be updated when we have
9139 vectorized the latch definition. */
9140 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9141 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9142 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9143 gsi_next (&gsi2);
9145 for (unsigned i = 0; i < ncopies; ++i)
9147 vec_dest = make_ssa_name (vectype);
9148 gassign *vperm
9149 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9150 i == 0 ? gimple_phi_result (new_phi) : NULL,
9151 NULL, perm);
9152 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9154 if (slp_node)
9155 slp_node->push_vec_def (vperm);
9156 else
9157 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9160 if (!slp_node)
9161 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9162 return true;
9165 /* Return true if VECTYPE represents a vector that requires lowering
9166 by the vector lowering pass. */
9168 bool
9169 vect_emulated_vector_p (tree vectype)
9171 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9172 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9173 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9176 /* Return true if we can emulate CODE on an integer mode representation
9177 of a vector. */
9179 bool
9180 vect_can_vectorize_without_simd_p (tree_code code)
9182 switch (code)
9184 case PLUS_EXPR:
9185 case MINUS_EXPR:
9186 case NEGATE_EXPR:
9187 case BIT_AND_EXPR:
9188 case BIT_IOR_EXPR:
9189 case BIT_XOR_EXPR:
9190 case BIT_NOT_EXPR:
9191 return true;
9193 default:
9194 return false;
9198 /* Likewise, but taking a code_helper. */
9200 bool
9201 vect_can_vectorize_without_simd_p (code_helper code)
9203 return (code.is_tree_code ()
9204 && vect_can_vectorize_without_simd_p (tree_code (code)));
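/* For example (illustrative): even without a V4QI vector mode, a bitwise
   AND of four packed chars can be carried out on a plain 32-bit integer
   word; the codes listed above are the ones the vectorizer allows for
   such emulated vector types.  */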
9207 /* Create vector init for vectorized iv. */
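/* For illustration (assuming four lanes, initial value X and scalar
   step S), the vectors built below are:
     vect_step_op_shr:  [ X, X >> S, X >> 2*S, X >> 3*S ]
     vect_step_op_shl:  [ X, X << S, X << 2*S, X << 3*S ]
     vect_step_op_neg:  [ X, -X, X, -X ]
     vect_step_op_mul:  [ X, X*S, X*S*S, X*S*S*S ] (in unsigned arithmetic).  */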
9208 static tree
9209 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9210 tree step_expr, poly_uint64 nunits,
9211 tree vectype,
9212 enum vect_induction_op_type induction_type)
9214 unsigned HOST_WIDE_INT const_nunits;
9215 tree vec_shift, vec_init, new_name;
9216 unsigned i;
9217 tree itype = TREE_TYPE (vectype);
9219 /* iv_loop is the loop to be vectorized. Create:
9220 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9221 new_name = gimple_convert (stmts, itype, init_expr);
9222 switch (induction_type)
9224 case vect_step_op_shr:
9225 case vect_step_op_shl:
9226 /* Build the initial value by shifting the splatted init by the series [0, S, 2*S, ...]. */
9227 vec_init = gimple_build_vector_from_val (stmts,
9228 vectype,
9229 new_name);
9230 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9231 build_zero_cst (itype), step_expr);
9232 vec_init = gimple_build (stmts,
9233 (induction_type == vect_step_op_shr
9234 ? RSHIFT_EXPR : LSHIFT_EXPR),
9235 vectype, vec_init, vec_shift);
9236 break;
9238 case vect_step_op_neg:
9240 vec_init = gimple_build_vector_from_val (stmts,
9241 vectype,
9242 new_name);
9243 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9244 vectype, vec_init);
9245 /* The encoding has 2 interleaved stepped patterns. */
9246 vec_perm_builder sel (nunits, 2, 3);
9247 sel.quick_grow (6);
9248 for (i = 0; i < 3; i++)
9250 sel[2 * i] = i;
9251 sel[2 * i + 1] = i + nunits;
9253 vec_perm_indices indices (sel, 2, nunits);
9254 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9255 fail when vec_init is a const vector. In that situation the vec_perm is
9256 not really needed. */
9257 tree perm_mask_even
9258 = vect_gen_perm_mask_any (vectype, indices);
9259 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9260 vectype,
9261 vec_init, vec_neg,
9262 perm_mask_even);
9264 break;
9266 case vect_step_op_mul:
9268 /* Use an unsigned multiplication to avoid UB from signed integer overflow. */
9269 gcc_assert (nunits.is_constant (&const_nunits));
9270 tree utype = unsigned_type_for (itype);
9271 tree uvectype = build_vector_type (utype,
9272 TYPE_VECTOR_SUBPARTS (vectype));
9273 new_name = gimple_convert (stmts, utype, new_name);
9274 vec_init = gimple_build_vector_from_val (stmts,
9275 uvectype,
9276 new_name);
9277 tree_vector_builder elts (uvectype, const_nunits, 1);
9278 tree elt_step = build_one_cst (utype);
9280 elts.quick_push (elt_step);
9281 for (i = 1; i < const_nunits; i++)
9283 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step, i). */
9284 elt_step = gimple_build (stmts, MULT_EXPR,
9285 utype, elt_step, step_expr);
9286 elts.quick_push (elt_step);
9288 /* Create the vector of step powers [1, step, pow (step, 2), ...,
9289 pow (step, nunits-1)]. */
9290 tree vec_mul = gimple_build_vector (stmts, &elts);
9291 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9292 vec_init, vec_mul);
9293 vec_init = gimple_convert (stmts, vectype, vec_init);
9295 break;
9297 default:
9298 gcc_unreachable ();
9301 return vec_init;
9304 /* Peel init_expr by skip_niter for induction_type. */
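/* For illustration (assuming the loop is peeled by skip_niters == K):
     vect_step_op_neg:      init' = (K & 1) ? -init : init
     vect_step_op_shr/shl:  init' = init >> (S * K) or init << (S * K),
                            with the overlarge-shift case special-cased below
     vect_step_op_mul:      init' = init * pow (S, K) mod 2^prec,
                            computed via mpz_powm.  */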
9305 tree
9306 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9307 tree skip_niters, tree step_expr,
9308 enum vect_induction_op_type induction_type)
9310 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9311 tree type = TREE_TYPE (init_expr);
9312 unsigned prec = TYPE_PRECISION (type);
9313 switch (induction_type)
9315 case vect_step_op_neg:
9316 if (TREE_INT_CST_LOW (skip_niters) % 2)
9317 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9318 /* else no change. */
9319 break;
9321 case vect_step_op_shr:
9322 case vect_step_op_shl:
9323 skip_niters = gimple_convert (stmts, type, skip_niters);
9324 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9325 /* When the shift amount is >= the precision we need to avoid UB.
9326 The original loop has no UB, and according to the semantics the peeled
9327 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
9328 if (!tree_fits_uhwi_p (step_expr)
9329 || tree_to_uhwi (step_expr) >= prec)
9331 if (induction_type == vect_step_op_shl
9332 || TYPE_UNSIGNED (type))
9333 init_expr = build_zero_cst (type);
9334 else
9335 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9336 init_expr,
9337 wide_int_to_tree (type, prec - 1));
9339 else
9340 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9341 ? RSHIFT_EXPR : LSHIFT_EXPR),
9342 type, init_expr, step_expr);
9343 break;
9345 case vect_step_op_mul:
9347 tree utype = unsigned_type_for (type);
9348 init_expr = gimple_convert (stmts, utype, init_expr);
9349 wide_int skipn = wi::to_wide (skip_niters);
9350 wide_int begin = wi::to_wide (step_expr);
9351 auto_mpz base, exp, mod, res;
9352 wi::to_mpz (begin, base, TYPE_SIGN (type));
9353 wi::to_mpz (skipn, exp, UNSIGNED);
9354 mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9355 mpz_powm (res, base, exp, mod);
9356 begin = wi::from_mpz (type, res, TYPE_SIGN (type));
9357 tree mult_expr = wide_int_to_tree (utype, begin);
9358 init_expr = gimple_build (stmts, MULT_EXPR, utype,
9359 init_expr, mult_expr);
9360 init_expr = gimple_convert (stmts, type, init_expr);
9362 break;
9364 default:
9365 gcc_unreachable ();
9368 return init_expr;
9371 /* Create vector step for vectorized iv. */
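/* For illustration (scalar step S, vectorization factor VF):
     vect_step_op_mul:      pow (S, VF), e.g. S == 3, VF == 4 gives 81
     vect_step_op_shr/shl:  S * VF
     vect_step_op_neg:      no step is needed.  */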
9372 static tree
9373 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9374 poly_uint64 vf,
9375 enum vect_induction_op_type induction_type)
9377 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9378 tree new_name = NULL;
9379 /* Step should be pow (step, vf) for mult induction. */
9380 if (induction_type == vect_step_op_mul)
9382 gcc_assert (vf.is_constant ());
9383 wide_int begin = wi::to_wide (step_expr);
9385 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9386 begin = wi::mul (begin, wi::to_wide (step_expr));
9388 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9390 else if (induction_type == vect_step_op_neg)
9391 /* Do nothing. */
9393 else
9394 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9395 expr, step_expr);
9396 return new_name;
9399 static tree
9400 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9401 stmt_vec_info stmt_info,
9402 tree new_name, tree vectype,
9403 enum vect_induction_op_type induction_type)
9405 /* No step is needed for neg induction. */
9406 if (induction_type == vect_step_op_neg)
9407 return NULL;
9409 tree t = unshare_expr (new_name);
9410 gcc_assert (CONSTANT_CLASS_P (new_name)
9411 || TREE_CODE (new_name) == SSA_NAME);
9412 tree new_vec = build_vector_from_val (vectype, t);
9413 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9414 new_vec, vectype, NULL);
9415 return vec_step;
9418 /* Update the vectorized iv by applying vec_step to induc_def, the current iv value. */
9419 static tree
9420 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9421 tree induc_def, tree vec_step,
9422 enum vect_induction_op_type induction_type)
9424 tree vec_def = induc_def;
9425 switch (induction_type)
9427 case vect_step_op_mul:
9429 /* Use an unsigned multiplication to avoid UB from signed integer overflow. */
9430 tree uvectype
9431 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9432 TYPE_VECTOR_SUBPARTS (vectype));
9433 vec_def = gimple_convert (stmts, uvectype, vec_def);
9434 vec_step = gimple_convert (stmts, uvectype, vec_step);
9435 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9436 vec_def, vec_step);
9437 vec_def = gimple_convert (stmts, vectype, vec_def);
9439 break;
9441 case vect_step_op_shr:
9442 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9443 vec_def, vec_step);
9444 break;
9446 case vect_step_op_shl:
9447 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9448 vec_def, vec_step);
9449 break;
9450 case vect_step_op_neg:
9451 vec_def = induc_def;
9452 /* Do nothing. */
9453 break;
9454 default:
9455 gcc_unreachable ();
9458 return vec_def;
9462 /* Function vectorizable_nonlinear_induction
9464 Check if STMT_INFO performs a nonlinear induction computation that can be
9465 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9466 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9467 basic block.
9468 Return true if STMT_INFO is vectorizable in this way. */
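/* An illustrative example (an assumed source loop, for exposition only):

     unsigned int x = x0;
     for (i = 0; i < n; i++)
       {
         a[i] = x;
         x *= 3;    <-- vect_step_op_mul; likewise x >>= 1, x <<= 1, x = -x
       }

   With four lanes the IV is seeded with [ x0, x0*3, x0*9, x0*27 ] and
   each vector iteration multiplies it by the splat of 81 == pow (3, 4).  */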
9470 static bool
9471 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9472 stmt_vec_info stmt_info,
9473 gimple **vec_stmt, slp_tree slp_node,
9474 stmt_vector_for_cost *cost_vec)
9476 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9477 unsigned ncopies;
9478 bool nested_in_vect_loop = false;
9479 class loop *iv_loop;
9480 tree vec_def;
9481 edge pe = loop_preheader_edge (loop);
9482 basic_block new_bb;
9483 tree vec_init, vec_step;
9484 tree new_name;
9485 gimple *new_stmt;
9486 gphi *induction_phi;
9487 tree induc_def, vec_dest;
9488 tree init_expr, step_expr;
9489 tree niters_skip;
9490 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9491 unsigned i;
9492 gimple_stmt_iterator si;
9494 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9496 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9497 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9498 enum vect_induction_op_type induction_type
9499 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9501 gcc_assert (induction_type > vect_step_op_add);
9503 if (slp_node)
9504 ncopies = 1;
9505 else
9506 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9507 gcc_assert (ncopies >= 1);
9509 /* FORNOW. Only handle nonlinear induction in the same loop. */
9510 if (nested_in_vect_loop_p (loop, stmt_info))
9512 if (dump_enabled_p ())
9513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9514 "nonlinear induction in nested loop.\n");
9515 return false;
9518 iv_loop = loop;
9519 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9521 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
9522 update for each iv and a permutation to generate the wanted vector iv. */
9523 if (slp_node)
9525 if (dump_enabled_p ())
9526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9527 "SLP induction not supported for nonlinear"
9528 " induction.\n");
9529 return false;
9532 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9534 if (dump_enabled_p ())
9535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9536 "floating point nonlinear induction vectorization"
9537 " not supported.\n");
9538 return false;
9541 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9542 init_expr = vect_phi_initial_value (phi);
9543 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9544 && TREE_CODE (step_expr) == INTEGER_CST);
9545 /* step_expr should have the same type as init_expr, e.g. for uint64 a >> 1
9546 the step is int, but the vector<uint64> shift is used. */
9547 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9549 if (TREE_CODE (init_expr) == INTEGER_CST)
9550 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9551 else
9552 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9553 TREE_TYPE (init_expr)));
9555 switch (induction_type)
9557 case vect_step_op_neg:
9558 if (TREE_CODE (init_expr) != INTEGER_CST
9559 && TREE_CODE (init_expr) != REAL_CST)
9561 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9562 if (!directly_supported_p (NEGATE_EXPR, vectype))
9563 return false;
9565 /* The encoding has 2 interleaved stepped patterns. */
9566 vec_perm_builder sel (nunits, 2, 3);
9567 machine_mode mode = TYPE_MODE (vectype);
9568 sel.quick_grow (6);
9569 for (i = 0; i < 3; i++)
9571 sel[i * 2] = i;
9572 sel[i * 2 + 1] = i + nunits;
9574 vec_perm_indices indices (sel, 2, nunits);
9575 if (!can_vec_perm_const_p (mode, mode, indices))
9576 return false;
9578 break;
9580 case vect_step_op_mul:
9582 /* Check for backend support of MULT_EXPR. */
9583 if (!directly_supported_p (MULT_EXPR, vectype))
9584 return false;
9586 /* ?? How to construct the vector step for variable-length vectors:
9587 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9588 if (!vf.is_constant ())
9589 return false;
9591 break;
9593 case vect_step_op_shr:
9594 /* Check for backend support of RSHIFT_EXPR. */
9595 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9596 return false;
9598 /* Don't shift more than the type precision, to avoid UB. */
9599 if (!tree_fits_uhwi_p (step_expr)
9600 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9601 TYPE_PRECISION (TREE_TYPE (init_expr))))
9602 return false;
9603 break;
9605 case vect_step_op_shl:
9606 /* Check for backend support of LSHIFT_EXPR. */
9607 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9608 return false;
9610 /* Don't shift more than the type precision, to avoid UB. */
9611 if (!tree_fits_uhwi_p (step_expr)
9612 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9613 TYPE_PRECISION (TREE_TYPE (init_expr))))
9614 return false;
9616 break;
9618 default:
9619 gcc_unreachable ();
9622 if (!vec_stmt) /* transformation not required. */
9624 unsigned inside_cost = 0, prologue_cost = 0;
9625 /* loop cost for vec_loop. */
9627 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9628 stmt_info, 0, vect_body);
9630 /* Neg induction doesn't have any inside_cost. */
9632 if (induction_type == vect_step_op_neg)
9633 inside_cost = 0;
9635 /* prologue cost for vec_init and vec_step. */
9636 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9637 stmt_info, 0, vect_prologue);
9639 if (dump_enabled_p ())
9640 dump_printf_loc (MSG_NOTE, vect_location,
9641 "vect_model_induction_cost: inside_cost = %d, "
9642 "prologue_cost = %d. \n", inside_cost,
9643 prologue_cost);
9645 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9646 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9647 return true;
9650 /* Transform. */
9652 /* Compute a vector variable, initialized with the first VF values of
9653 the induction variable. E.g., for an iv with IV_PHI='X' and
9654 evolution S, for a vector of 4 units, we want to compute:
9655 [X, X + S, X + 2*S, X + 3*S]. */
9657 if (dump_enabled_p ())
9658 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9660 pe = loop_preheader_edge (iv_loop);
9661 /* Find the first insertion point in the BB. */
9662 basic_block bb = gimple_bb (phi);
9663 si = gsi_after_labels (bb);
9665 gimple_seq stmts = NULL;
9667 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9668 /* If we are using the loop mask to "peel" for alignment then we need
9669 to adjust the start value here. */
9670 if (niters_skip != NULL_TREE)
9671 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9672 step_expr, induction_type);
9674 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9675 step_expr, nunits, vectype,
9676 induction_type);
9677 if (stmts)
9679 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9680 gcc_assert (!new_bb);
9683 stmts = NULL;
9684 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9685 vf, induction_type);
9686 if (stmts)
9688 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9689 gcc_assert (!new_bb);
9692 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9693 new_name, vectype,
9694 induction_type);
9695 /* Create the following def-use cycle:
9696 loop prolog:
9697 vec_init = ...
9698 vec_step = ...
9699 loop:
9700 vec_iv = PHI <vec_init, vec_loop>
9702 STMT
9704 vec_loop = vec_iv + vec_step; */
9706 /* Create the induction-phi that defines the induction-operand. */
9707 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9708 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9709 induc_def = PHI_RESULT (induction_phi);
9711 /* Create the iv update inside the loop. */
9712 stmts = NULL;
9713 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9714 induc_def, vec_step,
9715 induction_type);
9717 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9718 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9720 /* Set the arguments of the phi node: */
9721 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9722 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9723 UNKNOWN_LOCATION);
9725 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9726 *vec_stmt = induction_phi;
9728 /* In case the vectorization factor (VF) is bigger than the number
9729 of elements that we can fit in a vectype (nunits), we have to generate
9730 more than one vector stmt - i.e. - we need to "unroll" the
9731 vector stmt by a factor VF/nunits. For more details see documentation
9732 in vectorizable_operation. */
9734 if (ncopies > 1)
9736 stmts = NULL;
9737 /* FORNOW. This restriction should be relaxed. */
9738 gcc_assert (!nested_in_vect_loop);
9740 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9741 nunits, induction_type);
9743 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9744 new_name, vectype,
9745 induction_type);
9746 vec_def = induc_def;
9747 for (i = 1; i < ncopies; i++)
9749 /* vec_i = vec_prev + vec_step. */
9750 stmts = NULL;
9751 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9752 vec_def, vec_step,
9753 induction_type);
9754 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9755 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9756 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9760 if (dump_enabled_p ())
9761 dump_printf_loc (MSG_NOTE, vect_location,
9762 "transform induction: created def-use cycle: %G%G",
9763 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9765 return true;
9768 /* Function vectorizable_induction
9770 Check if STMT_INFO performs an induction computation that can be vectorized.
9771 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9772 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9773 Return true if STMT_INFO is vectorizable in this way. */
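/* For illustration (a linear IV with start X, step S and four lanes;
   a sketch of the generated form only):

     vect_init = [ X, X + S, X + 2*S, X + 3*S ]
     loop:
       vec_iv   = PHI <vect_init (preheader), vec_next (latch)>
       ...
       vec_next = vec_iv + [ 4*S, 4*S, 4*S, 4*S ]  */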
9775 bool
9776 vectorizable_induction (loop_vec_info loop_vinfo,
9777 stmt_vec_info stmt_info,
9778 gimple **vec_stmt, slp_tree slp_node,
9779 stmt_vector_for_cost *cost_vec)
9781 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9782 unsigned ncopies;
9783 bool nested_in_vect_loop = false;
9784 class loop *iv_loop;
9785 tree vec_def;
9786 edge pe = loop_preheader_edge (loop);
9787 basic_block new_bb;
9788 tree new_vec, vec_init, vec_step, t;
9789 tree new_name;
9790 gimple *new_stmt;
9791 gphi *induction_phi;
9792 tree induc_def, vec_dest;
9793 tree init_expr, step_expr;
9794 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9795 unsigned i;
9796 tree expr;
9797 gimple_stmt_iterator si;
9798 enum vect_induction_op_type induction_type
9799 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9801 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9802 if (!phi)
9803 return false;
9805 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9806 return false;
9808 /* Make sure it was recognized as an induction computation. */
9809 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9810 return false;
9812 /* Handle nonlinear induction in a separate place. */
9813 if (induction_type != vect_step_op_add)
9814 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9815 vec_stmt, slp_node, cost_vec);
9817 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9818 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9820 if (slp_node)
9821 ncopies = 1;
9822 else
9823 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9824 gcc_assert (ncopies >= 1);
9826 /* FORNOW. These restrictions should be relaxed. */
9827 if (nested_in_vect_loop_p (loop, stmt_info))
9829 imm_use_iterator imm_iter;
9830 use_operand_p use_p;
9831 gimple *exit_phi;
9832 edge latch_e;
9833 tree loop_arg;
9835 if (ncopies > 1)
9837 if (dump_enabled_p ())
9838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9839 "multiple types in nested loop.\n");
9840 return false;
9843 exit_phi = NULL;
9844 latch_e = loop_latch_edge (loop->inner);
9845 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9846 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9848 gimple *use_stmt = USE_STMT (use_p);
9849 if (is_gimple_debug (use_stmt))
9850 continue;
9852 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9854 exit_phi = use_stmt;
9855 break;
9858 if (exit_phi)
9860 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9861 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9862 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9864 if (dump_enabled_p ())
9865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9866 "inner-loop induction only used outside "
9867 "of the outer vectorized loop.\n");
9868 return false;
9872 nested_in_vect_loop = true;
9873 iv_loop = loop->inner;
9875 else
9876 iv_loop = loop;
9877 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9879 if (slp_node && !nunits.is_constant ())
9881 /* The current SLP code creates the step value element-by-element. */
9882 if (dump_enabled_p ())
9883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9884 "SLP induction not supported for variable-length"
9885 " vectors.\n");
9886 return false;
9889 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9891 if (dump_enabled_p ())
9892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9893 "floating point induction vectorization disabled\n");
9894 return false;
9897 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9898 gcc_assert (step_expr != NULL_TREE);
9899 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9901 /* Check for backend support of PLUS/MINUS_EXPR. */
9902 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9903 || !directly_supported_p (MINUS_EXPR, step_vectype))
9904 return false;
9906 if (!vec_stmt) /* transformation not required. */
9908 unsigned inside_cost = 0, prologue_cost = 0;
9909 if (slp_node)
9911 /* We eventually need to set a vector type on invariant
9912 arguments. */
9913 unsigned j;
9914 slp_tree child;
9915 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9916 if (!vect_maybe_update_slp_op_vectype
9917 (child, SLP_TREE_VECTYPE (slp_node)))
9919 if (dump_enabled_p ())
9920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9921 "incompatible vector types for "
9922 "invariants\n");
9923 return false;
9925 /* loop cost for vec_loop. */
9926 inside_cost
9927 = record_stmt_cost (cost_vec,
9928 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9929 vector_stmt, stmt_info, 0, vect_body);
9930 /* prologue cost for vec_init (if not nested) and step. */
9931 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9932 scalar_to_vec,
9933 stmt_info, 0, vect_prologue);
9935 else /* if (!slp_node) */
9937 /* loop cost for vec_loop. */
9938 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9939 stmt_info, 0, vect_body);
9940 /* prologue cost for vec_init and vec_step. */
9941 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9942 stmt_info, 0, vect_prologue);
9944 if (dump_enabled_p ())
9945 dump_printf_loc (MSG_NOTE, vect_location,
9946 "vect_model_induction_cost: inside_cost = %d, "
9947 "prologue_cost = %d .\n", inside_cost,
9948 prologue_cost);
9950 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9951 DUMP_VECT_SCOPE ("vectorizable_induction");
9952 return true;
9955 /* Transform. */
9957 /* Compute a vector variable, initialized with the first VF values of
9958 the induction variable. E.g., for an iv with IV_PHI='X' and
9959 evolution S, for a vector of 4 units, we want to compute:
9960 [X, X + S, X + 2*S, X + 3*S]. */
9962 if (dump_enabled_p ())
9963 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9965 pe = loop_preheader_edge (iv_loop);
9966 /* Find the first insertion point in the BB. */
9967 basic_block bb = gimple_bb (phi);
9968 si = gsi_after_labels (bb);
9970 /* For SLP induction we have to generate several IVs as for example
9971 with group size 3 we need
9972 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9973 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9974 if (slp_node)
9976 /* Enforced above. */
9977 unsigned int const_nunits = nunits.to_constant ();
9979 /* The initial values are vectorized, but any lanes > group_size
9980 need adjustment. */
9981 slp_tree init_node
9982 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9984 /* Gather steps. Since we do not vectorize inductions as
9985 cycles we have to reconstruct the step from SCEV data. */
9986 unsigned group_size = SLP_TREE_LANES (slp_node);
9987 tree *steps = XALLOCAVEC (tree, group_size);
9988 tree *inits = XALLOCAVEC (tree, group_size);
9989 stmt_vec_info phi_info;
9990 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9992 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9993 if (!init_node)
9994 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9995 pe->dest_idx);
9998 /* Now generate the IVs. */
9999 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10000 gcc_assert ((const_nunits * nvects) % group_size == 0);
10001 unsigned nivs;
10002 if (nested_in_vect_loop)
10003 nivs = nvects;
10004 else
10006 /* Compute the number of distinct IVs we need. First reduce
10007 group_size if it is a multiple of const_nunits so we get
10008 one IV for a group_size of 4 but const_nunits 2. */
10009 unsigned group_sizep = group_size;
10010 if (group_sizep % const_nunits == 0)
10011 group_sizep = group_sizep / const_nunits;
10012 nivs = least_common_multiple (group_sizep,
10013 const_nunits) / const_nunits;
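/* For illustration: group_size == 3 with const_nunits == 4 gives
   nivs = lcm (3, 4) / 4 == 3 distinct vector IVs, matching the three
   vectors in the group-size-3 example above.  */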
10015 tree stept = TREE_TYPE (step_vectype);
10016 tree lupdate_mul = NULL_TREE;
10017 if (!nested_in_vect_loop)
10019 /* The number of iterations covered in one vector iteration. */
10020 unsigned lup_mul = (nvects * const_nunits) / group_size;
10021 lupdate_mul
10022 = build_vector_from_val (step_vectype,
10023 SCALAR_FLOAT_TYPE_P (stept)
10024 ? build_real_from_wide (stept, lup_mul,
10025 UNSIGNED)
10026 : build_int_cstu (stept, lup_mul));
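/* For instance, with nvects 3, const_nunits 4 and group_size 3 each vector
   iteration covers (3 * 4) / 3 = 4 scalar iterations, so every lane of each
   IV advances by four times its scalar step per vector iteration.  */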
10028 tree peel_mul = NULL_TREE;
10029 gimple_seq init_stmts = NULL;
10030 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10032 if (SCALAR_FLOAT_TYPE_P (stept))
10033 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10034 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10035 else
10036 peel_mul = gimple_convert (&init_stmts, stept,
10037 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10038 peel_mul = gimple_build_vector_from_val (&init_stmts,
10039 step_vectype, peel_mul);
10041 unsigned ivn;
10042 auto_vec<tree> vec_steps;
10043 for (ivn = 0; ivn < nivs; ++ivn)
10045 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10046 tree_vector_builder init_elts (vectype, const_nunits, 1);
10047 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10048 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10050 /* The scalar steps of the IVs. */
10051 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10052 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10053 step_elts.quick_push (elt);
10054 if (!init_node)
10056 /* The scalar inits of the IVs if not vectorized. */
10057 elt = inits[(ivn*const_nunits + eltn) % group_size];
10058 if (!useless_type_conversion_p (TREE_TYPE (vectype),
10059 TREE_TYPE (elt)))
10060 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10061 TREE_TYPE (vectype), elt);
10062 init_elts.quick_push (elt);
10064 /* The number of steps to add to the initial values. */
10065 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10066 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10067 ? build_real_from_wide (stept,
10068 mul_elt, UNSIGNED)
10069 : build_int_cstu (stept, mul_elt));
10071 vec_step = gimple_build_vector (&init_stmts, &step_elts);
10072 vec_steps.safe_push (vec_step);
10073 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10074 if (peel_mul)
10075 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10076 step_mul, peel_mul);
10077 if (!init_node)
10078 vec_init = gimple_build_vector (&init_stmts, &init_elts);
10080 /* Create the induction-phi that defines the induction-operand. */
10081 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10082 "vec_iv_");
10083 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10084 induc_def = PHI_RESULT (induction_phi);
10086 /* Create the iv update inside the loop.  */
10087 tree up = vec_step;
10088 if (lupdate_mul)
10089 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10090 vec_step, lupdate_mul);
10091 gimple_seq stmts = NULL;
10092 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10093 vec_def = gimple_build (&stmts,
10094 PLUS_EXPR, step_vectype, vec_def, up);
10095 vec_def = gimple_convert (&stmts, vectype, vec_def);
10096 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10097 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10098 UNKNOWN_LOCATION);
10100 if (init_node)
10101 vec_init = vect_get_slp_vect_def (init_node, ivn);
10102 if (!nested_in_vect_loop
10103 && !integer_zerop (step_mul))
10105 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10106 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10107 vec_step, step_mul);
10108 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10109 vec_def, up);
10110 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10113 /* Set the arguments of the phi node: */
10114 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10116 slp_node->push_vec_def (induction_phi);
10118 if (!nested_in_vect_loop)
10120 /* Fill up to the number of vectors we need for the whole group. */
10121 nivs = least_common_multiple (group_size,
10122 const_nunits) / const_nunits;
10123 vec_steps.reserve (nivs-ivn);
10124 for (; ivn < nivs; ++ivn)
10126 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10127 vec_steps.quick_push (vec_steps[0]);
10131 /* Re-use IVs when we can. We are generating further vector
10132 stmts by adding VF' * stride to the IVs generated above. */
10133 if (ivn < nvects)
10135 unsigned vfp
10136 = least_common_multiple (group_size, const_nunits) / group_size;
10137 tree lupdate_mul
10138 = build_vector_from_val (step_vectype,
10139 SCALAR_FLOAT_TYPE_P (stept)
10140 ? build_real_from_wide (stept,
10141 vfp, UNSIGNED)
10142 : build_int_cstu (stept, vfp));
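/* Continuing the group_size 3, const_nunits 4 example, vfp = lcm (3, 4) / 3
   = 4, so each reused vector is the one generated nivs positions earlier
   with four times the per-lane scalar step added to it.  */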
10143 for (; ivn < nvects; ++ivn)
10145 gimple *iv
10146 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10147 tree def = gimple_get_lhs (iv);
10148 if (ivn < 2*nivs)
10149 vec_steps[ivn - nivs]
10150 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10151 vec_steps[ivn - nivs], lupdate_mul);
10152 gimple_seq stmts = NULL;
10153 def = gimple_convert (&stmts, step_vectype, def);
10154 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10155 def, vec_steps[ivn % nivs]);
10156 def = gimple_convert (&stmts, vectype, def);
10157 if (gimple_code (iv) == GIMPLE_PHI)
10158 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10159 else
10161 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10162 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10164 slp_node->push_vec_def (def);
10168 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10169 gcc_assert (!new_bb);
10171 return true;
10174 init_expr = vect_phi_initial_value (phi);
10176 gimple_seq stmts = NULL;
10177 if (!nested_in_vect_loop)
10179 /* Convert the initial value to the IV update type. */
10180 tree new_type = TREE_TYPE (step_expr);
10181 init_expr = gimple_convert (&stmts, new_type, init_expr);
10183 /* If we are using the loop mask to "peel" for alignment then we need
10184 to adjust the start value here. */
10185 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10186 if (skip_niters != NULL_TREE)
10188 if (FLOAT_TYPE_P (vectype))
10189 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10190 skip_niters);
10191 else
10192 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10193 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10194 skip_niters, step_expr);
10195 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10196 init_expr, skip_step);
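/* The loop mask skips the first SKIP_NITERS iterations, so start the IV
   SKIP_NITERS steps early and the first unmasked lane still sees the
   original initial value; e.g. with SKIP_NITERS 2 and step S the adjusted
   start is INIT - 2 * S and lane 2 of the first vector is again INIT.  */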
10200 if (stmts)
10202 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10203 gcc_assert (!new_bb);
10206 /* Create the vector that holds the initial_value of the induction. */
10207 if (nested_in_vect_loop)
10209 /* iv_loop is nested in the loop to be vectorized. init_expr has already
10210 been created during vectorization of previous stmts. We obtain it
10211 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10212 auto_vec<tree> vec_inits;
10213 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10214 init_expr, &vec_inits);
10215 vec_init = vec_inits[0];
10216 /* If the initial value is not of proper type, convert it. */
10217 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10219 new_stmt
10220 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10221 vect_simple_var,
10222 "vec_iv_"),
10223 VIEW_CONVERT_EXPR,
10224 build1 (VIEW_CONVERT_EXPR, vectype,
10225 vec_init));
10226 vec_init = gimple_assign_lhs (new_stmt);
10227 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10228 new_stmt);
10229 gcc_assert (!new_bb);
10232 else
10234 /* iv_loop is the loop to be vectorized. Create:
10235 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10236 stmts = NULL;
10237 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10239 unsigned HOST_WIDE_INT const_nunits;
10240 if (nunits.is_constant (&const_nunits))
10242 tree_vector_builder elts (step_vectype, const_nunits, 1);
10243 elts.quick_push (new_name);
10244 for (i = 1; i < const_nunits; i++)
10246 /* Create: new_name_i = new_name + step_expr */
10247 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10248 new_name, step_expr);
10249 elts.quick_push (new_name);
10251 /* Create a vector from [new_name_0, new_name_1, ...,
10252 new_name_nunits-1] */
10253 vec_init = gimple_build_vector (&stmts, &elts);
10255 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10256 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10257 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10258 new_name, step_expr);
10259 else
10261 /* Build:
10262 [base, base, base, ...]
10263 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10264 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10265 gcc_assert (flag_associative_math);
10266 tree index = build_index_vector (step_vectype, 0, 1);
10267 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10268 new_name);
10269 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10270 step_expr);
10271 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10272 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10273 vec_init, step_vec);
10274 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10275 vec_init, base_vec);
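/* For instance, with base 1.0, step 0.5 and four lanes this yields
   [1.0, 1.5, 2.0, 2.5].  */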
10277 vec_init = gimple_convert (&stmts, vectype, vec_init);
10279 if (stmts)
10281 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10282 gcc_assert (!new_bb);
10287 /* Create the vector that holds the step of the induction. */
10288 if (nested_in_vect_loop)
10289 /* iv_loop is nested in the loop to be vectorized. Generate:
10290 vec_step = [S, S, S, S] */
10291 new_name = step_expr;
10292 else
10294 /* iv_loop is the loop to be vectorized. Generate:
10295 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10296 gimple_seq seq = NULL;
10297 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10299 expr = build_int_cst (integer_type_node, vf);
10300 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10302 else
10303 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10304 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10305 expr, step_expr);
10306 if (seq)
10308 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10309 gcc_assert (!new_bb);
10313 t = unshare_expr (new_name);
10314 gcc_assert (CONSTANT_CLASS_P (new_name)
10315 || TREE_CODE (new_name) == SSA_NAME);
10316 new_vec = build_vector_from_val (step_vectype, t);
10317 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10318 new_vec, step_vectype, NULL);
10321 /* Create the following def-use cycle:
10322 loop prolog:
10323 vec_init = ...
10324 vec_step = ...
10325 loop:
10326 vec_iv = PHI <vec_init, vec_loop>
10328 STMT
10330 vec_loop = vec_iv + vec_step; */
10332 /* Create the induction-phi that defines the induction-operand. */
10333 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10334 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10335 induc_def = PHI_RESULT (induction_phi);
10337 /* Create the iv update inside the loop.  */
10338 stmts = NULL;
10339 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10340 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10341 vec_def = gimple_convert (&stmts, vectype, vec_def);
10342 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10343 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10345 /* Set the arguments of the phi node: */
10346 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10347 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10348 UNKNOWN_LOCATION);
10350 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10351 *vec_stmt = induction_phi;
10353 /* In case the vectorization factor (VF) is bigger than the number
10354 of elements that we can fit in a vectype (nunits), we have to generate
10355 more than one vector stmt, i.e., we need to "unroll" the
10356 vector stmt by a factor of VF/nunits. For more details see the
10357 documentation in vectorizable_operation. */
10359 if (ncopies > 1)
10361 gimple_seq seq = NULL;
10362 /* FORNOW. This restriction should be relaxed. */
10363 gcc_assert (!nested_in_vect_loop);
10365 /* Create the vector that holds the step of the induction. */
10366 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10368 expr = build_int_cst (integer_type_node, nunits);
10369 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10371 else
10372 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10373 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10374 expr, step_expr);
10375 if (seq)
10377 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10378 gcc_assert (!new_bb);
10381 t = unshare_expr (new_name);
10382 gcc_assert (CONSTANT_CLASS_P (new_name)
10383 || TREE_CODE (new_name) == SSA_NAME);
10384 new_vec = build_vector_from_val (step_vectype, t);
10385 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10386 new_vec, step_vectype, NULL);
10388 vec_def = induc_def;
10389 for (i = 1; i < ncopies + 1; i++)
10391 /* vec_i = vec_prev + vec_step */
10392 gimple_seq stmts = NULL;
10393 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10394 vec_def = gimple_build (&stmts,
10395 PLUS_EXPR, step_vectype, vec_def, vec_step);
10396 vec_def = gimple_convert (&stmts, vectype, vec_def);
10398 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10399 if (i < ncopies)
10401 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10402 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10404 else
10406 /* vec_1 = vec_iv + (VF/n * S)
10407 vec_2 = vec_1 + (VF/n * S)
10409 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10411 vec_n is used as vec_loop to save the large step register and
10412 related operations. */
10413 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10414 UNKNOWN_LOCATION);
10419 if (dump_enabled_p ())
10420 dump_printf_loc (MSG_NOTE, vect_location,
10421 "transform induction: created def-use cycle: %G%G",
10422 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10424 return true;
10427 /* Function vectorizable_live_operation.
10429 STMT_INFO computes a value that is used outside the loop. Check if
10430 it can be supported. */
10432 bool
10433 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10434 slp_tree slp_node, slp_instance slp_node_instance,
10435 int slp_index, bool vec_stmt_p,
10436 stmt_vector_for_cost *cost_vec)
10438 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10439 imm_use_iterator imm_iter;
10440 tree lhs, lhs_type, bitsize;
10441 tree vectype = (slp_node
10442 ? SLP_TREE_VECTYPE (slp_node)
10443 : STMT_VINFO_VECTYPE (stmt_info));
10444 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10445 int ncopies;
10446 gimple *use_stmt;
10447 auto_vec<tree> vec_oprnds;
10448 int vec_entry = 0;
10449 poly_uint64 vec_index = 0;
10451 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10453 /* If a stmt of a reduction is live, vectorize it via
10454 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10455 validity so just trigger the transform here. */
10456 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10458 if (!vec_stmt_p)
10459 return true;
10460 if (slp_node)
10462 /* For reduction chains the meta-info is attached to
10463 the group leader. */
10464 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10465 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10466 /* For SLP reductions we vectorize the epilogue for
10467 all involved stmts together. */
10468 else if (slp_index != 0)
10469 return true;
10471 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10472 gcc_assert (reduc_info->is_reduc_info);
10473 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10474 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10475 return true;
10476 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10477 slp_node_instance);
10478 return true;
10481 /* If STMT is not relevant and it is a simple assignment and its inputs are
10482 invariant then it can remain in place, unvectorized. The original last
10483 scalar value that it computes will be used. */
10484 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10486 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10487 if (dump_enabled_p ())
10488 dump_printf_loc (MSG_NOTE, vect_location,
10489 "statement is simple and uses invariant. Leaving in "
10490 "place.\n");
10491 return true;
10494 if (slp_node)
10495 ncopies = 1;
10496 else
10497 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10499 if (slp_node)
10501 gcc_assert (slp_index >= 0);
10503 /* Get the last occurrence of the scalar index from the concatenation of
10504 all the slp vectors. Calculate which slp vector it is and the index
10505 within. */
10506 int num_scalar = SLP_TREE_LANES (slp_node);
10507 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10508 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
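/* For instance, with 2 SLP lanes, a single vector statement and nunits 4,
   pos = 1 * 4 - 2 + slp_index, so the last occurrences of lanes 0 and 1
   sit in elements 2 and 3 of vector 0.  */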
10510 /* Calculate which vector contains the result, and which lane of
10511 that vector we need. */
10512 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10514 if (dump_enabled_p ())
10515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10516 "Cannot determine which vector holds the"
10517 " final result.\n");
10518 return false;
10522 if (!vec_stmt_p)
10524 /* No transformation required. */
10525 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10527 if (slp_node)
10529 if (dump_enabled_p ())
10530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10531 "can't operate on partial vectors "
10532 "because an SLP statement is live after "
10533 "the loop.\n");
10534 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10536 else if (ncopies > 1)
10538 if (dump_enabled_p ())
10539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10540 "can't operate on partial vectors "
10541 "because ncopies is greater than 1.\n");
10542 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10544 else
10546 gcc_assert (ncopies == 1 && !slp_node);
10547 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10548 OPTIMIZE_FOR_SPEED))
10549 vect_record_loop_mask (loop_vinfo,
10550 &LOOP_VINFO_MASKS (loop_vinfo),
10551 1, vectype, NULL);
10552 else if (can_vec_extract_var_idx_p (
10553 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10554 vect_record_loop_len (loop_vinfo,
10555 &LOOP_VINFO_LENS (loop_vinfo),
10556 1, vectype, 1);
10557 else
10559 if (dump_enabled_p ())
10560 dump_printf_loc (
10561 MSG_MISSED_OPTIMIZATION, vect_location,
10562 "can't operate on partial vectors "
10563 "because the target doesn't support extract "
10564 "last reduction.\n");
10565 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10569 /* ??? Enable for loop costing as well. */
10570 if (!loop_vinfo)
10571 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10572 0, vect_epilogue);
10573 return true;
10576 /* Use the lhs of the original scalar statement. */
10577 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10578 if (dump_enabled_p ())
10579 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10580 "stmt %G", stmt);
10582 lhs = gimple_get_lhs (stmt);
10583 lhs_type = TREE_TYPE (lhs);
10585 bitsize = vector_element_bits_tree (vectype);
10587 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10588 tree vec_lhs, bitstart;
10589 gimple *vec_stmt;
10590 if (slp_node)
10592 gcc_assert (!loop_vinfo
10593 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10594 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10596 /* Get the correct slp vectorized stmt. */
10597 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10598 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10600 /* Get entry to use. */
10601 bitstart = bitsize_int (vec_index);
10602 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10604 else
10606 /* For multiple copies, get the last copy. */
10607 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10608 vec_lhs = gimple_get_lhs (vec_stmt);
10610 /* Get the last lane in the vector. */
10611 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10614 if (loop_vinfo)
10616 /* To ensure that VEC_LHS used by the lane extraction stmts satisfies the
10617 loop-closed PHI requirement, insert one PHI node for it. It looks like:
10618 loop;
10620 # lhs' = PHI <lhs>
10622 loop;
10624 # vec_lhs' = PHI <vec_lhs>
10625 new_tree = lane_extract <vec_lhs', ...>;
10626 lhs' = new_tree; */
10628 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10629 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10630 gcc_assert (single_pred_p (exit_bb));
10632 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10633 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10634 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10636 gimple_seq stmts = NULL;
10637 tree new_tree;
10638 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10640 /* Emit:
10642 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10644 where VEC_LHS is the vectorized live-out result and LEN is
10645 the loop length for the final iteration. */
10646 gcc_assert (ncopies == 1 && !slp_node);
10647 gimple_seq tem = NULL;
10648 gimple_stmt_iterator gsi = gsi_last (tem);
10649 tree len
10650 = vect_get_loop_len (loop_vinfo, &gsi,
10651 &LOOP_VINFO_LENS (loop_vinfo),
10652 1, vectype, 0, 0);
10654 /* BIAS - 1. */
10655 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10656 tree bias_minus_one
10657 = int_const_binop (MINUS_EXPR,
10658 build_int_cst (TREE_TYPE (len), biasval),
10659 build_one_cst (TREE_TYPE (len)));
10661 /* LAST_INDEX = LEN + (BIAS - 1). */
10662 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10663 len, bias_minus_one);
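/* For instance, with a zero bias and a final LEN of 5, LAST_INDEX is
   5 + (0 - 1) = 4, i.e. the last of the five active lanes.  */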
10665 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10666 tree scalar_res
10667 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10668 vec_lhs_phi, last_index);
10670 /* Convert the extracted vector element to the scalar type. */
10671 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10673 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10675 /* Emit:
10677 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10679 where VEC_LHS is the vectorized live-out result and MASK is
10680 the loop mask for the final iteration. */
10681 gcc_assert (ncopies == 1 && !slp_node);
10682 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10683 gimple_seq tem = NULL;
10684 gimple_stmt_iterator gsi = gsi_last (tem);
10685 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10686 &LOOP_VINFO_MASKS (loop_vinfo),
10687 1, vectype, 0);
10688 gimple_seq_add_seq (&stmts, tem);
10689 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10690 mask, vec_lhs_phi);
10692 /* Convert the extracted vector element to the scalar type. */
10693 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10695 else
10697 tree bftype = TREE_TYPE (vectype);
10698 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10699 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10700 new_tree = build3 (BIT_FIELD_REF, bftype,
10701 vec_lhs_phi, bitsize, bitstart);
10702 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10703 &stmts, true, NULL_TREE);
10706 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10707 if (stmts)
10708 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10710 /* Remove existing phis that copy from lhs and create copies
10711 from new_tree. */
10712 gimple_stmt_iterator gsi;
10713 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
10715 gimple *phi = gsi_stmt (gsi);
10716 if (gimple_phi_arg_def (phi, 0) == lhs)
10718 remove_phi_node (&gsi, false);
10719 tree lhs_phi = gimple_phi_result (phi);
10720 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10721 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10723 else
10724 gsi_next (&gsi);
10727 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
10728 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10729 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10731 else
10733 /* For basic-block vectorization simply insert the lane-extraction. */
10734 tree bftype = TREE_TYPE (vectype);
10735 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10736 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10737 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10738 vec_lhs, bitsize, bitstart);
10739 gimple_seq stmts = NULL;
10740 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10741 &stmts, true, NULL_TREE);
10742 if (TREE_CODE (new_tree) == SSA_NAME
10743 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10744 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10745 if (is_a <gphi *> (vec_stmt))
10747 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10748 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10750 else
10752 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10753 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10756 /* Replace uses of lhs with the newly computed result. If the use stmt is a
10757 single-arg PHI, just replace all uses of the PHI result. This is necessary
10758 because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
10759 use_operand_p use_p;
10760 stmt_vec_info use_stmt_info;
10761 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10762 if (!is_gimple_debug (use_stmt)
10763 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10764 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10766 /* ??? This can happen when the live lane ends up being
10767 rooted in a vector construction code-generated by an
10768 external SLP node (and code-generation for that already
10769 happened). See gcc.dg/vect/bb-slp-47.c.
10770 Doing this is what would happen if that vector CTOR
10771 were not code-generated yet so it is not too bad.
10772 ??? In fact we'd likely want to avoid this situation
10773 in the first place. */
10774 if (TREE_CODE (new_tree) == SSA_NAME
10775 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10776 && gimple_code (use_stmt) != GIMPLE_PHI
10777 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10778 use_stmt))
10780 if (dump_enabled_p ())
10781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10782 "Using original scalar computation for "
10783 "live lane because use preceeds vector "
10784 "def\n");
10785 continue;
10787 /* ??? It can also happen that we end up pulling a def into
10788 a loop where replacing out-of-loop uses would require
10789 a new LC SSA PHI node. Retain the original scalar in
10790 those cases as well. PR98064. */
10791 if (TREE_CODE (new_tree) == SSA_NAME
10792 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10793 && (gimple_bb (use_stmt)->loop_father
10794 != gimple_bb (vec_stmt)->loop_father)
10795 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10796 gimple_bb (use_stmt)->loop_father))
10798 if (dump_enabled_p ())
10799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10800 "Using original scalar computation for "
10801 "live lane because there is an out-of-loop "
10802 "definition for it\n");
10803 continue;
10805 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10806 SET_USE (use_p, new_tree);
10807 update_stmt (use_stmt);
10811 return true;
10814 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10816 static void
10817 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10819 ssa_op_iter op_iter;
10820 imm_use_iterator imm_iter;
10821 def_operand_p def_p;
10822 gimple *ustmt;
10824 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10826 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10828 basic_block bb;
10830 if (!is_gimple_debug (ustmt))
10831 continue;
10833 bb = gimple_bb (ustmt);
10835 if (!flow_bb_inside_loop_p (loop, bb))
10837 if (gimple_debug_bind_p (ustmt))
10839 if (dump_enabled_p ())
10840 dump_printf_loc (MSG_NOTE, vect_location,
10841 "killing debug use\n");
10843 gimple_debug_bind_reset_value (ustmt);
10844 update_stmt (ustmt);
10846 else
10847 gcc_unreachable ();
10853 /* Given loop represented by LOOP_VINFO, return true if computation of
10854 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10855 otherwise. */
10857 static bool
10858 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10860 /* Constant case. */
10861 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10863 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10864 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10866 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10867 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10868 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10869 return true;
10872 widest_int max;
10873 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10874 /* Check the upper bound of loop niters. */
10875 if (get_max_loop_iterations (loop, &max))
10877 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10878 signop sgn = TYPE_SIGN (type);
10879 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10880 if (max < type_max)
10881 return true;
10883 return false;
10886 /* Return a mask type with half the number of elements as OLD_TYPE,
10887 given that it should have mode NEW_MODE. */
10889 tree
10890 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10892 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10893 return build_truth_vector_type_for_mode (nunits, new_mode);
10896 /* Return a mask type with twice as many elements as OLD_TYPE,
10897 given that it should have mode NEW_MODE. */
10899 tree
10900 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10902 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10903 return build_truth_vector_type_for_mode (nunits, new_mode);
10906 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10907 contain a sequence of NVECTORS masks that each control a vector of type
10908 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10909 these vector masks with the vector version of SCALAR_MASK. */
10911 void
10912 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10913 unsigned int nvectors, tree vectype, tree scalar_mask)
10915 gcc_assert (nvectors != 0);
10917 if (scalar_mask)
10919 scalar_cond_masked_key cond (scalar_mask, nvectors);
10920 loop_vinfo->scalar_cond_masked_set.add (cond);
10923 masks->mask_set.add (std::make_pair (vectype, nvectors));
10926 /* Given a complete set of masks MASKS, extract mask number INDEX
10927 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10928 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10930 See the comment above vec_loop_masks for more details about the mask
10931 arrangement. */
10933 tree
10934 vect_get_loop_mask (loop_vec_info loop_vinfo,
10935 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10936 unsigned int nvectors, tree vectype, unsigned int index)
10938 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10939 == vect_partial_vectors_while_ult)
10941 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10942 tree mask_type = rgm->type;
10944 /* Populate the rgroup's mask array, if this is the first time we've
10945 used it. */
10946 if (rgm->controls.is_empty ())
10948 rgm->controls.safe_grow_cleared (nvectors, true);
10949 for (unsigned int i = 0; i < nvectors; ++i)
10951 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10952 /* Provide a dummy definition until the real one is available. */
10953 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10954 rgm->controls[i] = mask;
10958 tree mask = rgm->controls[index];
10959 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10960 TYPE_VECTOR_SUBPARTS (vectype)))
10962 /* A loop mask for data type X can be reused for data type Y
10963 if X has N times more elements than Y and if Y's elements
10964 are N times bigger than X's. In this case each sequence
10965 of N elements in the loop mask will be all-zero or all-one.
10966 We can then view-convert the mask so that each sequence of
10967 N elements is replaced by a single element. */
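/* For instance, a mask built for vectors of 8 half-size elements can be
   reused for vectors of 4 double-size elements: N is 2, every pair of mask
   elements is known to be all-zero or all-one, and the view-convert below
   turns each such pair into the single element the 4-element mask type
   needs.  */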
10968 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10969 TYPE_VECTOR_SUBPARTS (vectype)));
10970 gimple_seq seq = NULL;
10971 mask_type = truth_type_for (vectype);
10972 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10973 if (seq)
10974 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10976 return mask;
10978 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10979 == vect_partial_vectors_avx512)
10981 /* The number of scalars per iteration and the number of vectors are
10982 both compile-time constants. */
10983 unsigned int nscalars_per_iter
10984 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10985 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10987 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10989 /* The stored nV is dependent on the mask type produced. */
10990 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10991 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10992 == rgm->factor);
10993 nvectors = rgm->factor;
10995 /* Populate the rgroup's mask array, if this is the first time we've
10996 used it. */
10997 if (rgm->controls.is_empty ())
10999 rgm->controls.safe_grow_cleared (nvectors, true);
11000 for (unsigned int i = 0; i < nvectors; ++i)
11002 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11003 /* Provide a dummy definition until the real one is available. */
11004 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11005 rgm->controls[i] = mask;
11008 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11009 TYPE_VECTOR_SUBPARTS (vectype)))
11010 return rgm->controls[index];
11012 /* Split the vector if needed. Since we are dealing with integer mode
11013 masks with AVX512 we can operate on the integer representation,
11014 performing the shift on the whole vector. */
11015 unsigned HOST_WIDE_INT factor;
11016 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11017 TYPE_VECTOR_SUBPARTS (vectype), &factor);
11018 gcc_assert (ok);
11019 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11020 tree mask_type = truth_type_for (vectype);
11021 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11022 unsigned vi = index / factor;
11023 unsigned vpart = index % factor;
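/* For instance, with a factor of 2 an INDEX of 3 maps to control vector
   1 (3 / 2) and its upper half (3 % 2 == 1); the shift below moves that
   half into the low bits before the result is view-converted to the
   narrower mask type.  */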
11024 tree vec = rgm->controls[vi];
11025 gimple_seq seq = NULL;
11026 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11027 lang_hooks.types.type_for_mode
11028 (TYPE_MODE (rgm->type), 1), vec);
11029 /* For integer mode masks simply shift the right bits into position. */
11030 if (vpart != 0)
11031 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11032 build_int_cst (integer_type_node,
11033 (TYPE_VECTOR_SUBPARTS (vectype)
11034 * vpart)));
11035 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11036 (TYPE_MODE (mask_type), 1), vec);
11037 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11038 if (seq)
11039 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11040 return vec;
11042 else
11043 gcc_unreachable ();
11046 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11047 lengths for controlling an operation on VECTYPE. The operation splits
11048 each element of VECTYPE into FACTOR separate subelements, measuring the
11049 length as a number of these subelements. */
11051 void
11052 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11053 unsigned int nvectors, tree vectype, unsigned int factor)
11055 gcc_assert (nvectors != 0);
11056 if (lens->length () < nvectors)
11057 lens->safe_grow_cleared (nvectors, true);
11058 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11060 /* The number of scalars per iteration, the bytes occupied per scalar and
11061 the number of vectors are all compile-time constants. */
11062 unsigned int nscalars_per_iter
11063 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11064 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
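/* For instance, two vectors of 8 elements with a vectorization factor of 8
   give nscalars_per_iter = (2 * 8) / 8 = 2.  */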
11066 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11068 /* For now, we only support cases in which all loads and stores fall back
11069 to VnQI or none do. */
11070 gcc_assert (!rgl->max_nscalars_per_iter
11071 || (rgl->factor == 1 && factor == 1)
11072 || (rgl->max_nscalars_per_iter * rgl->factor
11073 == nscalars_per_iter * factor));
11074 rgl->max_nscalars_per_iter = nscalars_per_iter;
11075 rgl->type = vectype;
11076 rgl->factor = factor;
11080 /* Given a complete set of lengths LENS, extract length number INDEX
11081 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11082 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
11083 multiplied by the number of elements that should be processed.
11084 Insert any set-up statements before GSI. */
11086 tree
11087 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11088 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11089 unsigned int index, unsigned int factor)
11091 rgroup_controls *rgl = &(*lens)[nvectors - 1];
11092 bool use_bias_adjusted_len =
11093 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11095 /* Populate the rgroup's len array, if this is the first time we've
11096 used it. */
11097 if (rgl->controls.is_empty ())
11099 rgl->controls.safe_grow_cleared (nvectors, true);
11100 for (unsigned int i = 0; i < nvectors; ++i)
11102 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11103 gcc_assert (len_type != NULL_TREE);
11105 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11107 /* Provide a dummy definition until the real one is available. */
11108 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11109 rgl->controls[i] = len;
11111 if (use_bias_adjusted_len)
11113 gcc_assert (i == 0);
11114 tree adjusted_len =
11115 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11116 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11117 rgl->bias_adjusted_ctrl = adjusted_len;
11122 if (use_bias_adjusted_len)
11123 return rgl->bias_adjusted_ctrl;
11125 tree loop_len = rgl->controls[index];
11126 if (rgl->factor == 1 && factor == 1)
11128 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11129 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11130 if (maybe_ne (nunits1, nunits2))
11132 /* A loop len for data type X can be reused for data type Y
11133 if X has N times more elements than Y and if Y's elements
11134 are N times bigger than X's. */
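/* For instance, a length tracked for vectors of 16 single-byte elements is
   divided by 2 below before it controls vectors of 8 two-byte elements.  */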
11135 gcc_assert (multiple_p (nunits1, nunits2));
11136 factor = exact_div (nunits1, nunits2).to_constant ();
11137 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11138 gimple_seq seq = NULL;
11139 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11140 build_int_cst (iv_type, factor));
11141 if (seq)
11142 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11145 return loop_len;
11148 /* Scale profiling counters by estimation for LOOP which is vectorized
11149 by factor VF.
11150 If FLAT is true, the loop we started with had an unrealistically flat
11151 profile. */
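/* For instance, a loop entered 100 times that iterated 400 times in total
   and is vectorized with VF 4 should now iterate about 100 times, so the
   body counts are scaled down by 4 and the exit probability is raised
   accordingly.  */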
11153 static void
11154 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11156 /* For flat profiles do not scale down proportionally by VF and only
11157 cap by known iteration count bounds. */
11158 if (flat)
11160 if (dump_file && (dump_flags & TDF_DETAILS))
11161 fprintf (dump_file,
11162 "Vectorized loop profile seems flat; not scaling iteration "
11163 "count down by the vectorization factor %i\n", vf);
11164 scale_loop_profile (loop, profile_probability::always (),
11165 get_likely_max_loop_iterations_int (loop));
11166 return;
11168 /* The loop body executes VF times fewer and the exit count increases VF times. */
11169 profile_count entry_count = loop_preheader_edge (loop)->count ();
11171 /* If we have an unreliable loop profile, avoid dropping the entry
11172 count below the header count. This can happen since loops
11173 may have unrealistically low trip counts. */
11174 while (vf > 1
11175 && loop->header->count > entry_count
11176 && loop->header->count < entry_count * vf)
11178 if (dump_file && (dump_flags & TDF_DETAILS))
11179 fprintf (dump_file,
11180 "Vectorization factor %i seems too large for profile "
11181 "prevoiusly believed to be consistent; reducing.\n", vf);
11182 vf /= 2;
11185 if (entry_count.nonzero_p ())
11186 set_edge_probability_and_rescale_others
11187 (exit_e,
11188 entry_count.probability_in (loop->header->count / vf));
11189 /* Avoid producing a very large exit probability when we do not have
11190 a sensible profile. */
11191 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11192 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11193 loop->latch->count = single_pred_edge (loop->latch)->count ();
11195 scale_loop_profile (loop, profile_probability::always () / vf,
11196 get_likely_max_loop_iterations_int (loop));
11199 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11200 latch edge values originally defined by it. */
11202 static void
11203 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11204 stmt_vec_info def_stmt_info)
11206 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11207 if (!def || TREE_CODE (def) != SSA_NAME)
11208 return;
11209 stmt_vec_info phi_info;
11210 imm_use_iterator iter;
11211 use_operand_p use_p;
11212 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11214 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11215 if (!phi)
11216 continue;
11217 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11218 && (phi_info = loop_vinfo->lookup_stmt (phi))
11219 && STMT_VINFO_RELEVANT_P (phi_info)))
11220 continue;
11221 loop_p loop = gimple_bb (phi)->loop_father;
11222 edge e = loop_latch_edge (loop);
11223 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11224 continue;
11226 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11227 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11228 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11230 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11231 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11232 gcc_assert (phi_defs.length () == latch_defs.length ());
11233 for (unsigned i = 0; i < phi_defs.length (); ++i)
11234 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11235 gimple_get_lhs (latch_defs[i]), e,
11236 gimple_phi_arg_location (phi, e->dest_idx));
11238 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11240 /* For first order recurrences we have to update both uses of
11241 the latch definition, the one in the PHI node and the one
11242 in the generated VEC_PERM_EXPR. */
11243 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11244 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11245 gcc_assert (phi_defs.length () == latch_defs.length ());
11246 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11247 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11248 for (unsigned i = 0; i < phi_defs.length (); ++i)
11250 gassign *perm = as_a <gassign *> (phi_defs[i]);
11251 if (i > 0)
11252 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11253 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11254 update_stmt (perm);
11256 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11257 gimple_phi_arg_location (phi, e->dest_idx));
11262 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11263 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11264 stmt_vec_info. */
11266 static bool
11267 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11268 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11270 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11271 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11273 if (dump_enabled_p ())
11274 dump_printf_loc (MSG_NOTE, vect_location,
11275 "------>vectorizing statement: %G", stmt_info->stmt);
11277 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11278 vect_loop_kill_debug_uses (loop, stmt_info);
11280 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11281 && !STMT_VINFO_LIVE_P (stmt_info))
11282 return false;
11284 if (STMT_VINFO_VECTYPE (stmt_info))
11286 poly_uint64 nunits
11287 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11288 if (!STMT_SLP_TYPE (stmt_info)
11289 && maybe_ne (nunits, vf)
11290 && dump_enabled_p ())
11291 /* For SLP VF is set according to unrolling factor, and not
11292 to vector size, hence for SLP this print is not valid. */
11293 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11296 /* Pure SLP statements have already been vectorized. We still need
11297 to apply loop vectorization to hybrid SLP statements. */
11298 if (PURE_SLP_STMT (stmt_info))
11299 return false;
11301 if (dump_enabled_p ())
11302 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11304 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11305 *seen_store = stmt_info;
11307 return true;
11310 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11311 in the hash_map with their corresponding values. */
11313 static tree
11314 find_in_mapping (tree t, void *context)
11316 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11318 tree *value = mapping->get (t);
11319 return value ? *value : t;
11322 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11323 original loop that has now been vectorized.
11325 The inits of the data_references need to be advanced by the number of
11326 iterations of the main loop. This has been computed in vect_do_peeling and
11327 is stored in the parameter ADVANCE. We first restore the data_references'
11328 initial offsets with the values recorded in ORIG_DRS_INIT.
11330 Since the loop_vec_info of this EPILOGUE was constructed for the original
11331 loop, its stmt_vec_infos all point to the original statements. These need
11332 to be updated to point to their corresponding copies as well as the SSA_NAMES
11333 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11335 The data_reference's connections also need to be updated. Their
11336 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11337 stmt_vec_infos, their statements need to point to their corresponding copy,
11338 if they are gather loads or scatter stores then their reference needs to be
11339 updated to point to its corresponding copy and finally we set
11340 'base_misaligned' to false as we have already peeled for alignment in the
11341 prologue of the main loop. */
11343 static void
11344 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11346 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11347 auto_vec<gimple *> stmt_worklist;
11348 hash_map<tree,tree> mapping;
11349 gimple *orig_stmt, *new_stmt;
11350 gimple_stmt_iterator epilogue_gsi;
11351 gphi_iterator epilogue_phi_gsi;
11352 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11353 basic_block *epilogue_bbs = get_loop_body (epilogue);
11354 unsigned i;
11356 free (LOOP_VINFO_BBS (epilogue_vinfo));
11357 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11359 /* Advance the data_references by the number of iterations of the previous
11360 loop and its prologue. */
11361 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11364 /* The EPILOGUE loop is a copy of the original loop so they share the same
11365 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11366 point to the copied statements. We also create a mapping of all LHSs in
11367 the original loop and all the LHSs in the EPILOGUE and create worklists to
11368 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11369 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11371 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11372 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11374 new_stmt = epilogue_phi_gsi.phi ();
11376 gcc_assert (gimple_uid (new_stmt) > 0);
11377 stmt_vinfo
11378 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11380 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11381 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11383 mapping.put (gimple_phi_result (orig_stmt),
11384 gimple_phi_result (new_stmt));
11385 /* PHI nodes can not have patterns or related statements. */
11386 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11387 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11390 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11391 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11393 new_stmt = gsi_stmt (epilogue_gsi);
11394 if (is_gimple_debug (new_stmt))
11395 continue;
11397 gcc_assert (gimple_uid (new_stmt) > 0);
11398 stmt_vinfo
11399 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11401 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11402 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11404 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11405 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11407 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11409 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11410 for (gimple_stmt_iterator gsi = gsi_start (seq);
11411 !gsi_end_p (gsi); gsi_next (&gsi))
11412 stmt_worklist.safe_push (gsi_stmt (gsi));
11415 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11416 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11418 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11419 stmt_worklist.safe_push (stmt);
11420 /* Set BB such that the assert in
11421 'get_initial_def_for_reduction' is able to determine that
11422 the BB of the related stmt is inside this loop. */
11423 gimple_set_bb (stmt,
11424 gimple_bb (new_stmt));
11425 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11426 gcc_assert (related_vinfo == NULL
11427 || related_vinfo == stmt_vinfo);
11432 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11433 using the original main loop and thus need to be updated to refer to the
11434 cloned variables used in the epilogue. */
11435 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11437 gimple *stmt = stmt_worklist[i];
11438 tree *new_op;
11440 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11442 tree op = gimple_op (stmt, j);
11443 if ((new_op = mapping.get(op)))
11444 gimple_set_op (stmt, j, *new_op);
11445 else
11447 /* PR92429: The last argument of simplify_replace_tree disables
11448 folding when replacing arguments. This is required as
11449 otherwise you might end up with different statements than the
11450 ones analyzed in vect_loop_analyze, leading to different
11451 vectorization. */
11452 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11453 &find_in_mapping, &mapping, false);
11454 gimple_set_op (stmt, j, op);
11459 struct data_reference *dr;
11460 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11461 FOR_EACH_VEC_ELT (datarefs, i, dr)
11463 orig_stmt = DR_STMT (dr);
11464 gcc_assert (gimple_uid (orig_stmt) > 0);
11465 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11466 /* Data references for gather loads and scatter stores do not use the
11467 updated offset we set using ADVANCE. Instead we have to make sure the
11468 references in the data references point to the corresponding copies of
11469 the originals in the epilogue. Make sure to update both
11470 gather/scatters recognized by dataref analysis and also other
11471 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11472 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11473 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11474 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11476 DR_REF (dr)
11477 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11478 &find_in_mapping, &mapping);
11479 DR_BASE_ADDRESS (dr)
11480 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11481 &find_in_mapping, &mapping);
11483 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11484 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11485 /* The vector size of the epilogue is smaller than that of the main loop
11486 so the required alignment is either the same or lower. This means the
11487 DR will by definition be aligned. */
11488 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11491 epilogue_vinfo->shared->datarefs_copy.release ();
11492 epilogue_vinfo->shared->save_datarefs ();
11495 /* Function vect_transform_loop.
11497 The analysis phase has determined that the loop is vectorizable.
11498 Vectorize the loop - create vectorized stmts to replace the scalar
11499 stmts in the loop, and update the loop exit condition.
11500 Returns the scalar epilogue loop, if any. */
11502 class loop *
11503 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11505 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11506 class loop *epilogue = NULL;
11507 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11508 int nbbs = loop->num_nodes;
11509 int i;
11510 tree niters_vector = NULL_TREE;
11511 tree step_vector = NULL_TREE;
11512 tree niters_vector_mult_vf = NULL_TREE;
11513 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11514 unsigned int lowest_vf = constant_lower_bound (vf);
11515 gimple *stmt;
11516 bool check_profitability = false;
11517 unsigned int th;
11518 bool flat = maybe_flat_loop_profile (loop);
11520 DUMP_VECT_SCOPE ("vec_transform_loop");
11522 loop_vinfo->shared->check_datarefs ();
11524 /* Use the more conservative vectorization threshold. If the number
11525 of iterations is constant assume the cost check has been performed
11526 by our caller. If the threshold makes all loops profitable that
11527 run at least the (estimated) vectorization factor number of times
11528 checking is pointless, too. */
11529 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11530 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11532 if (dump_enabled_p ())
11533 dump_printf_loc (MSG_NOTE, vect_location,
11534 "Profitability threshold is %d loop iterations.\n",
11535 th);
11536 check_profitability = true;
11539 /* Make sure there exists a single-predecessor exit bb. Do this before
11540 versioning. */
11541 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11542 if (! single_pred_p (e->dest))
11544 split_loop_exit_edge (e, true);
11545 if (dump_enabled_p ())
11546 dump_printf (MSG_NOTE, "split exit edge\n");
11549 /* Version the loop first, if required, so the profitability check
11550 comes first. */
11552 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11554 class loop *sloop
11555 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11556 sloop->force_vectorize = false;
11557 check_profitability = false;
11560 /* Make sure there exists a single-predecessor exit bb also on the
11561 scalar loop copy. Do this after versioning but before peeling
11562 so the CFG structure is fine for both the scalar and the if-converted
11563 loop, and slpeel_duplicate_current_defs_from_edges faces matched
11564 loop-closed PHI nodes on the exit. */
11565 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11567 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11568 if (! single_pred_p (e->dest))
11570 split_loop_exit_edge (e, true);
11571 if (dump_enabled_p ())
11572 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11576 tree niters = vect_build_loop_niters (loop_vinfo);
11577 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11578 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11579 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11580 tree advance;
11581 drs_init_vec orig_drs_init;
11583 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11584 &step_vector, &niters_vector_mult_vf, th,
11585 check_profitability, niters_no_overflow,
11586 &advance);
11587 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11588 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11590 /* Ifcvt duplicates the loop preheader and loop body and produces a basic
11591 block after the loop exit. We need to scale all of that. */
11592 basic_block preheader
11593 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11594 preheader->count
11595 = preheader->count.apply_probability
11596 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11597 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11598 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11599 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11600 = preheader->count;
11603 if (niters_vector == NULL_TREE)
11605 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11606 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11607 && known_eq (lowest_vf, vf))
11609 niters_vector
11610 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11611 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11612 step_vector = build_one_cst (TREE_TYPE (niters));
11614 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11615 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11616 &step_vector, niters_no_overflow);
11617 else
11618 /* vect_do_peeling subtracted the number of peeled prologue
11619 iterations from LOOP_VINFO_NITERS. */
11620 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11621 &niters_vector, &step_vector,
11622 niters_no_overflow);
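/* Worked example for the constant-trip-count branch above (hypothetical
   numbers): with NITERS = 17, lowest_vf = vf = 4 and full vectors,
   niters_vector = 17 / 4 = 4 and step_vector = 1, i.e. the vector loop
   runs 4 times and the remaining 17 - 4 * 4 = 1 scalar iteration is
   left for the epilogue.  */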
11625 /* 1) Make sure the loop header has exactly two entries
11626 2) Make sure we have a preheader basic block. */
11628 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11630 split_edge (loop_preheader_edge (loop));
11632 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11633 /* This will deal with any possible peeling. */
11634 vect_prepare_for_masked_peels (loop_vinfo);
11636 /* Schedule the SLP instances first, then handle loop vectorization
11637 below. */
11638 if (!loop_vinfo->slp_instances.is_empty ())
11640 DUMP_VECT_SCOPE ("scheduling SLP instances");
11641 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11644 /* FORNOW: the vectorizer supports only loops whose body consists
11645 of one basic block (header + empty latch).  When the vectorizer
11646 supports more involved loop forms, the order in which the BBs are
11647 traversed needs to be reconsidered. */
11649 for (i = 0; i < nbbs; i++)
11651 basic_block bb = bbs[i];
11652 stmt_vec_info stmt_info;
11654 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11655 gsi_next (&si))
11657 gphi *phi = si.phi ();
11658 if (dump_enabled_p ())
11659 dump_printf_loc (MSG_NOTE, vect_location,
11660 "------>vectorizing phi: %G", (gimple *) phi);
11661 stmt_info = loop_vinfo->lookup_stmt (phi);
11662 if (!stmt_info)
11663 continue;
11665 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11666 vect_loop_kill_debug_uses (loop, stmt_info);
11668 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11669 && !STMT_VINFO_LIVE_P (stmt_info))
11670 continue;
11672 if (STMT_VINFO_VECTYPE (stmt_info)
11673 && (maybe_ne
11674 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11675 && dump_enabled_p ())
11676 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11678 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11679 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11680 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11681 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11682 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11683 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11684 && ! PURE_SLP_STMT (stmt_info))
11686 if (dump_enabled_p ())
11687 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11688 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11692 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11693 gsi_next (&si))
11695 gphi *phi = si.phi ();
11696 stmt_info = loop_vinfo->lookup_stmt (phi);
11697 if (!stmt_info)
11698 continue;
11700 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11701 && !STMT_VINFO_LIVE_P (stmt_info))
11702 continue;
11704 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11705 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11706 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11707 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11708 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11709 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11710 && ! PURE_SLP_STMT (stmt_info))
11711 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11714 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11715 !gsi_end_p (si);)
11717 stmt = gsi_stmt (si);
11718 /* During vectorization remove existing clobber stmts. */
11719 if (gimple_clobber_p (stmt))
11721 unlink_stmt_vdef (stmt);
11722 gsi_remove (&si, true);
11723 release_defs (stmt);
11725 else
11727 /* Ignore vector stmts created in the outer loop. */
11728 stmt_info = loop_vinfo->lookup_stmt (stmt);
11730 /* vector stmts created in the outer-loop during vectorization of
11731 stmts in an inner-loop may not have a stmt_info, and do not
11732 need to be vectorized. */
11733 stmt_vec_info seen_store = NULL;
11734 if (stmt_info)
11736 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11738 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11739 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11740 !gsi_end_p (subsi); gsi_next (&subsi))
11742 stmt_vec_info pat_stmt_info
11743 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11744 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11745 &si, &seen_store);
11747 stmt_vec_info pat_stmt_info
11748 = STMT_VINFO_RELATED_STMT (stmt_info);
11749 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11750 &si, &seen_store))
11751 maybe_set_vectorized_backedge_value (loop_vinfo,
11752 pat_stmt_info);
11754 else
11756 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11757 &seen_store))
11758 maybe_set_vectorized_backedge_value (loop_vinfo,
11759 stmt_info);
11762 gsi_next (&si);
11763 if (seen_store)
11765 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11766 /* Interleaving.  The vectorization of the
11767 interleaving chain was completed;
11768 free all the stores in the chain. */
11769 vect_remove_stores (loop_vinfo,
11770 DR_GROUP_FIRST_ELEMENT (seen_store));
11771 else
11772 /* Free the attached stmt_vec_info and remove the stmt. */
11773 loop_vinfo->remove_stmt (stmt_info);
11778 /* Stub out scalar statements that must not survive vectorization.
11779 Doing this here helps with grouped statements, or statements that
11780 are involved in patterns. */
11781 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11782 !gsi_end_p (gsi); gsi_next (&gsi))
11784 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11785 if (!call || !gimple_call_internal_p (call))
11786 continue;
11787 internal_fn ifn = gimple_call_internal_fn (call);
11788 if (ifn == IFN_MASK_LOAD)
11790 tree lhs = gimple_get_lhs (call);
11791 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11793 tree zero = build_zero_cst (TREE_TYPE (lhs));
11794 gimple *new_stmt = gimple_build_assign (lhs, zero);
11795 gsi_replace (&gsi, new_stmt, true);
11798 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11800 tree lhs = gimple_get_lhs (call);
11801 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11803 tree else_arg
11804 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11805 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11806 gsi_replace (&gsi, new_stmt, true);
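/* Illustrative effect of the stubbing above (SSA names are made up):
   a left-over scalar  _1 = .MASK_LOAD (ptr_5, 8B, mask_7);  becomes
   _1 = 0;  and a left-over scalar  _2 = .COND_ADD (mask_7, a_3, b_4, else_6);
   becomes  _2 = else_6;  because the else value is the last call argument.  */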
11810 } /* BBs in loop */
11812 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11813 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11814 if (integer_onep (step_vector))
11815 niters_no_overflow = true;
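/* Illustration: in the worst case a 32-bit NITERS wraps to 0 (2^32 scalar
   iterations); with VF = 4 the corresponding NITERS_VECTOR is 2^30, which
   is nonzero and representable, so an IV stepping by 1 up to it cannot
   wrap.  */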
11816 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11817 niters_vector, step_vector, niters_vector_mult_vf,
11818 !niters_no_overflow);
11820 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11822 /* True if the final iteration might not handle a full vector's
11823 worth of scalar iterations. */
11824 bool final_iter_may_be_partial
11825 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11826 /* The minimum number of iterations performed by the epilogue. This
11827 is 1 when peeling for gaps because we always need a final scalar
11828 iteration. */
11829 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11830 /* +1 to convert latch counts to loop iteration counts,
11831 -min_epilogue_iters to remove iterations that cannot be performed
11832 by the vector code. */
11833 int bias_for_lowest = 1 - min_epilogue_iters;
11834 int bias_for_assumed = bias_for_lowest;
11835 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11836 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11838 /* When the amount of peeling is known at compile time, the first
11839 iteration will have exactly alignment_npeels active elements.
11840 In the worst case it will have at least one. */
11841 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11842 bias_for_lowest += lowest_vf - min_first_active;
11843 bias_for_assumed += assumed_vf - min_first_active;
11845 /* In these calculations the "- 1" converts loop iteration counts
11846 back to latch counts. */
11847 if (loop->any_upper_bound)
11849 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11850 loop->nb_iterations_upper_bound
11851 = (final_iter_may_be_partial
11852 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11853 lowest_vf) - 1
11854 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11855 lowest_vf) - 1);
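/* Worked example (hypothetical numbers, full vectors): with lowest_vf = 4
   and PEELING_FOR_GAPS set, min_epilogue_iters = 1 and bias_for_lowest = 0;
   a scalar latch bound of 18 (19 iterations) then becomes
   floor ((18 + 0) / 4) - 1 = 3 latch iterations of the vector loop,
   i.e. at most 4 vector iterations.  */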
11856 if (main_vinfo
11857 /* Both peeling for alignment and peeling for gaps can end up
11858 with the scalar epilogue running for more than VF-1 iterations. */
11859 && !main_vinfo->peeling_for_alignment
11860 && !main_vinfo->peeling_for_gaps)
11862 unsigned int bound;
11863 poly_uint64 main_iters
11864 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11865 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11866 main_iters
11867 = upper_bound (main_iters,
11868 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11869 if (can_div_away_from_zero_p (main_iters,
11870 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11871 &bound))
11872 loop->nb_iterations_upper_bound
11873 = wi::umin ((bound_wide_int) (bound - 1),
11874 loop->nb_iterations_upper_bound);
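/* Illustration (hypothetical numbers): for an epilogue loop with VF = 4
   whose main loop has VF = 8 and cost/versioning thresholds of at most 8,
   main_iters = 8; dividing by 4 (rounding away from zero) gives bound = 2,
   so the epilogue's latch bound is capped at bound - 1 = 1.  */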
11877 if (loop->any_likely_upper_bound)
11878 loop->nb_iterations_likely_upper_bound
11879 = (final_iter_may_be_partial
11880 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11881 + bias_for_lowest, lowest_vf) - 1
11882 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11883 + bias_for_lowest, lowest_vf) - 1);
11884 if (loop->any_estimate)
11885 loop->nb_iterations_estimate
11886 = (final_iter_may_be_partial
11887 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11888 assumed_vf) - 1
11889 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11890 assumed_vf) - 1);
11891 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11892 assumed_vf, flat);
11894 if (dump_enabled_p ())
11896 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11898 dump_printf_loc (MSG_NOTE, vect_location,
11899 "LOOP VECTORIZED\n");
11900 if (loop->inner)
11901 dump_printf_loc (MSG_NOTE, vect_location,
11902 "OUTER LOOP VECTORIZED\n");
11903 dump_printf (MSG_NOTE, "\n");
11905 else
11906 dump_printf_loc (MSG_NOTE, vect_location,
11907 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11908 GET_MODE_NAME (loop_vinfo->vector_mode));
11911 /* Loops vectorized with a variable factor won't benefit from
11912 unrolling/peeling. */
11913 if (!vf.is_constant ())
11915 loop->unroll = 1;
11916 if (dump_enabled_p ())
11917 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11918 " variable-length vectorization factor\n");
11920 /* Free SLP instances here because otherwise stmt reference counting
11921 won't work. */
11922 slp_instance instance;
11923 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11924 vect_free_slp_instance (instance);
11925 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11926 /* Clear the safelen field since its value is no longer valid after
11927 vectorization: the vectorized loop can have loop-carried dependencies. */
11928 loop->safelen = 0;
11930 if (epilogue)
11932 update_epilogue_loop_vinfo (epilogue, advance);
11934 epilogue->simduid = loop->simduid;
11935 epilogue->force_vectorize = loop->force_vectorize;
11936 epilogue->dont_vectorize = false;
11939 return epilogue;
11942 /* The code below performs a simple optimization: it reverts if-conversion
11943 for masked stores, i.e. if the mask of a store is zero, the store is not
11944 performed and, where possible, neither are the statements producing the stored values.
11945 For example,
11946 for (i=0; i<n; i++)
11947 if (c[i])
11949 p1[i] += 1;
11950 p2[i] = p3[i] +2;
11952 this transformation will produce the following semi-hammock:
11954 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11956 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11957 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11958 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11959 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11960 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11961 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11965 void
11966 optimize_mask_stores (class loop *loop)
11968 basic_block *bbs = get_loop_body (loop);
11969 unsigned nbbs = loop->num_nodes;
11970 unsigned i;
11971 basic_block bb;
11972 class loop *bb_loop;
11973 gimple_stmt_iterator gsi;
11974 gimple *stmt;
11975 auto_vec<gimple *> worklist;
11976 auto_purge_vect_location sentinel;
11978 vect_location = find_loop_location (loop);
11979 /* Pick up all masked stores in the loop, if any. */
11980 for (i = 0; i < nbbs; i++)
11982 bb = bbs[i];
11983 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11984 gsi_next (&gsi))
11986 stmt = gsi_stmt (gsi);
11987 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11988 worklist.safe_push (stmt);
11992 free (bbs);
11993 if (worklist.is_empty ())
11994 return;
11996 /* Loop has masked stores. */
11997 while (!worklist.is_empty ())
11999 gimple *last, *last_store;
12000 edge e, efalse;
12001 tree mask;
12002 basic_block store_bb, join_bb;
12003 gimple_stmt_iterator gsi_to;
12004 tree vdef, new_vdef;
12005 gphi *phi;
12006 tree vectype;
12007 tree zero;
12009 last = worklist.pop ();
12010 mask = gimple_call_arg (last, 2);
12011 bb = gimple_bb (last);
12012 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
12013 to the same loop as if_bb.  That loop can differ from LOOP when a
12014 two-level loop nest is vectorized and the mask_store belongs to the
12015 inner loop. */
12016 e = split_block (bb, last);
12017 bb_loop = bb->loop_father;
12018 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12019 join_bb = e->dest;
12020 store_bb = create_empty_bb (bb);
12021 add_bb_to_loop (store_bb, bb_loop);
12022 e->flags = EDGE_TRUE_VALUE;
12023 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12024 /* Put STORE_BB on the likely path. */
12025 efalse->probability = profile_probability::likely ();
12026 e->probability = efalse->probability.invert ();
12027 store_bb->count = efalse->count ();
12028 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12029 if (dom_info_available_p (CDI_DOMINATORS))
12030 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
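/* Sketch of the CFG being built here (the comparison is added just below):

       bb:  if (mask == { 0, ... })
        | \
   true |  \ false
        |   store_bb   <- masked stores are sunk here
        |  /
     join_bb

   so when the whole mask is zero the stores are skipped.  */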
12031 if (dump_enabled_p ())
12032 dump_printf_loc (MSG_NOTE, vect_location,
12033 "Create new block %d to sink mask stores.",
12034 store_bb->index);
12035 /* Create vector comparison with boolean result. */
12036 vectype = TREE_TYPE (mask);
12037 zero = build_zero_cst (vectype);
12038 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12039 gsi = gsi_last_bb (bb);
12040 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12041 /* Create new PHI node for vdef of the last masked store:
12042 .MEM_2 = VDEF <.MEM_1>
12043 will be converted to
12044 .MEM.3 = VDEF <.MEM_1>
12045 and new PHI node will be created in join bb
12046 .MEM_2 = PHI <.MEM_1, .MEM_3>
12048 vdef = gimple_vdef (last);
12049 new_vdef = make_ssa_name (gimple_vop (cfun), last);
12050 gimple_set_vdef (last, new_vdef);
12051 phi = create_phi_node (vdef, join_bb);
12052 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12054 /* Put all masked stores with the same mask to STORE_BB if possible. */
12055 while (true)
12057 gimple_stmt_iterator gsi_from;
12058 gimple *stmt1 = NULL;
12060 /* Move masked store to STORE_BB. */
12061 last_store = last;
12062 gsi = gsi_for_stmt (last);
12063 gsi_from = gsi;
12064 /* Shift GSI to the previous stmt for further traversal. */
12065 gsi_prev (&gsi);
12066 gsi_to = gsi_start_bb (store_bb);
12067 gsi_move_before (&gsi_from, &gsi_to);
12069 /* Set GSI_TO to the start of the now non-empty block. */
12069 gsi_to = gsi_start_bb (store_bb);
12070 if (dump_enabled_p ())
12071 dump_printf_loc (MSG_NOTE, vect_location,
12072 "Move stmt to created bb\n%G", last);
12073 /* Move all stored value producers if possible. */
12074 while (!gsi_end_p (gsi))
12076 tree lhs;
12077 imm_use_iterator imm_iter;
12078 use_operand_p use_p;
12079 bool res;
12081 /* Skip debug statements. */
12082 if (is_gimple_debug (gsi_stmt (gsi)))
12084 gsi_prev (&gsi);
12085 continue;
12087 stmt1 = gsi_stmt (gsi);
12088 /* Do not consider statements writing to memory or having
12089 a volatile operand. */
12090 if (gimple_vdef (stmt1)
12091 || gimple_has_volatile_ops (stmt1))
12092 break;
12093 gsi_from = gsi;
12094 gsi_prev (&gsi);
12095 lhs = gimple_get_lhs (stmt1);
12096 if (!lhs)
12097 break;
12099 /* LHS of vectorized stmt must be SSA_NAME. */
12100 if (TREE_CODE (lhs) != SSA_NAME)
12101 break;
12103 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12105 /* Remove dead scalar statement. */
12106 if (has_zero_uses (lhs))
12108 gsi_remove (&gsi_from, true);
12109 continue;
12113 /* Check that LHS does not have uses outside of STORE_BB. */
12114 res = true;
12115 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12117 gimple *use_stmt;
12118 use_stmt = USE_STMT (use_p);
12119 if (is_gimple_debug (use_stmt))
12120 continue;
12121 if (gimple_bb (use_stmt) != store_bb)
12123 res = false;
12124 break;
12127 if (!res)
12128 break;
12130 if (gimple_vuse (stmt1)
12131 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12132 break;
12134 /* Can move STMT1 to STORE_BB. */
12135 if (dump_enabled_p ())
12136 dump_printf_loc (MSG_NOTE, vect_location,
12137 "Move stmt to created bb\n%G", stmt1);
12138 gsi_move_before (&gsi_from, &gsi_to);
12139 /* Shift GSI_TO for further insertion. */
12140 gsi_prev (&gsi_to);
12142 /* Put other masked stores with the same mask to STORE_BB. */
12143 if (worklist.is_empty ()
12144 || gimple_call_arg (worklist.last (), 2) != mask
12145 || worklist.last () != stmt1)
12146 break;
12147 last = worklist.pop ();
12149 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12153 /* Decide whether it is possible to use a zero-based induction variable
12154 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12155 the value that the induction variable must be able to hold in order
12156 to ensure that the rgroups eventually have no active vector elements.
12157 Return -1 otherwise. */
12159 widest_int
12160 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12162 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12163 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12164 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12166 /* Calculate the value that the induction variable must be able
12167 to hit in order to ensure that we end the loop with an all-false mask.
12168 This involves adding the maximum number of inactive trailing scalar
12169 iterations. */
12170 widest_int iv_limit = -1;
12171 if (max_loop_iterations (loop, &iv_limit))
12173 if (niters_skip)
12175 /* Add the maximum number of skipped iterations to the
12176 maximum iteration count. */
12177 if (TREE_CODE (niters_skip) == INTEGER_CST)
12178 iv_limit += wi::to_widest (niters_skip);
12179 else
12180 iv_limit += max_vf - 1;
12182 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12183 /* Make a conservatively-correct assumption. */
12184 iv_limit += max_vf - 1;
12186 /* IV_LIMIT is the maximum number of latch iterations, which is also
12187 the maximum in-range IV value. Round this value down to the previous
12188 vector alignment boundary and then add an extra full iteration. */
12189 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12190 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
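/* Worked example (hypothetical numbers): with a maximum latch count of 103,
   no skipped or peeled iterations, a constant VF of 8 and max_vf = 8,
   this gives (103 & -8) + 8 = 96 + 8 = 104, so the IV must be able to
   hold the value 104.  */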
12192 return iv_limit;
12195 /* For the given rgroup_controls RGC, check whether an induction variable
12196 would ever hit a value that produces a set of all-false masks or zero
12197 lengths before wrapping around. Return true if it's possible to wrap
12198 around before hitting the desirable value, otherwise return false. */
12200 bool
12201 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12203 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12205 if (iv_limit == -1)
12206 return true;
12208 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12209 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12210 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12212 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12213 return true;
12215 return false;
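/* Worked example (hypothetical numbers): with iv_limit = 200 and an rgroup
   where max_nscalars_per_iter = 2 and factor = 1 (nitems = 2), the IV may
   need to reach 200 * 2 = 400, which needs 9 unsigned bits; an 8-bit
   compare type therefore gives true (the IV might wrap), while a 16-bit
   or wider compare type gives false.  */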