gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
62 /* Loop Vectorization Pass.
64 This pass tries to vectorize loops.
66 For example, the vectorizer transforms the following simple loop:
68 short a[N]; short b[N]; short c[N]; int i;
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
74 as if it was manually vectorized by rewriting the source code into:
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
96 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
122 For example, say stmt S1 was vectorized into stmt VS1:
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
138 Operands that are not SSA_NAMEs are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors will, for now, need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
149 Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
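/* For illustration, the support check described above boils down to a
   query like the following (a minimal sketch, not the exact code used
   in this file):

     machine_mode vmode = V8HImode;
     bool supported = optab_handler (add_optab, vmode) != CODE_FOR_nothing;

   If SUPPORTED is false the target has no V8HI addition pattern and the
   statement cannot be vectorized with this vector mode.  */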
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
165 /* Subroutine of vect_determine_vf_for_stmt that handles only one
166 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
167 may already be set for general statements (not just data refs). */
169 static opt_result
170 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
171 bool vectype_maybe_set_p,
172 poly_uint64 *vf)
174 gimple *stmt = stmt_info->stmt;
176 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
177 && !STMT_VINFO_LIVE_P (stmt_info))
178 || gimple_clobber_p (stmt))
180 if (dump_enabled_p ())
181 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
182 return opt_result::success ();
185 tree stmt_vectype, nunits_vectype;
186 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
187 &stmt_vectype,
188 &nunits_vectype);
189 if (!res)
190 return res;
192 if (stmt_vectype)
194 if (STMT_VINFO_VECTYPE (stmt_info))
195 /* The only case when a vectype had been already set is for stmts
196 that contain a data ref, or for "pattern-stmts" (stmts generated
197 by the vectorizer to represent/replace a certain idiom). */
198 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
199 || vectype_maybe_set_p)
200 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
201 else
202 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
205 if (nunits_vectype)
206 vect_update_max_nunits (vf, nunits_vectype);
208 return opt_result::success ();
211 /* Subroutine of vect_determine_vectorization_factor. Set the vector
212 types of STMT_INFO and all attached pattern statements and update
213 the vectorization factor VF accordingly. Return true on success
214 or false if something prevented vectorization. */
216 static opt_result
217 vect_determine_vf_for_stmt (vec_info *vinfo,
218 stmt_vec_info stmt_info, poly_uint64 *vf)
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
224 if (!res)
225 return res;
227 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
228 && STMT_VINFO_RELATED_STMT (stmt_info))
230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233 /* If a pattern statement has def stmts, analyze them too. */
234 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
235 !gsi_end_p (si); gsi_next (&si))
237 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
238 if (dump_enabled_p ())
239 dump_printf_loc (MSG_NOTE, vect_location,
240 "==> examining pattern def stmt: %G",
241 def_stmt_info->stmt);
242 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
243 if (!res)
244 return res;
247 if (dump_enabled_p ())
248 dump_printf_loc (MSG_NOTE, vect_location,
249 "==> examining pattern statement: %G",
250 stmt_info->stmt);
251 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
252 if (!res)
253 return res;
256 return opt_result::success ();
259 /* Function vect_determine_vectorization_factor
261 Determine the vectorization factor (VF). VF is the number of data elements
262 that are operated upon in parallel in a single iteration of the vectorized
263 loop. For example, when vectorizing a loop that operates on 4-byte elements,
264 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
265 elements can fit in a single vector register.
267 We currently support vectorization of loops in which all types operated upon
268 are of the same size. Therefore this function currently sets VF according to
269 the size of the types operated upon, and fails if there are multiple sizes
270 in the loop.
272 VF is also the factor by which the loop iterations are strip-mined, e.g.:
273 original loop:
274 for (i=0; i<N; i++){
275 a[i] = b[i] + c[i];
278 vectorized loop:
279 for (i=0; i<N; i+=VF){
280 a[i:VF] = b[i:VF] + c[i:VF];
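/* As a worked example (a sketch assuming a 16-byte vector size): with
   4-byte ints the VF is 16 / 4 = 4, so

     for (i=0; i<N; i++)
       a[i] = b[i] + c[i];

   is strip-mined into

     for (i=0; i<N; i+=4)
       a[i:4] = b[i:4] + c[i:4];

   with any leftover iterations (N % 4) handled by a scalar epilogue
   loop or by partial vectors.  */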
284 static opt_result
285 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
287 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
288 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
289 unsigned nbbs = loop->num_nodes;
290 poly_uint64 vectorization_factor = 1;
291 tree scalar_type = NULL_TREE;
292 gphi *phi;
293 tree vectype;
294 stmt_vec_info stmt_info;
295 unsigned i;
297 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
299 for (i = 0; i < nbbs; i++)
301 basic_block bb = bbs[i];
303 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
304 gsi_next (&si))
306 phi = si.phi ();
307 stmt_info = loop_vinfo->lookup_stmt (phi);
308 if (dump_enabled_p ())
309 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
310 (gimple *) phi);
312 gcc_assert (stmt_info);
314 if (STMT_VINFO_RELEVANT_P (stmt_info)
315 || STMT_VINFO_LIVE_P (stmt_info))
317 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
318 scalar_type = TREE_TYPE (PHI_RESULT (phi));
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "get vectype for scalar type: %T\n",
323 scalar_type);
325 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
326 if (!vectype)
327 return opt_result::failure_at (phi,
328 "not vectorized: unsupported "
329 "data-type %T\n",
330 scalar_type);
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
337 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
344 vect_update_max_nunits (&vectorization_factor, vectype);
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
351 if (is_gimple_debug (gsi_stmt (si)))
352 continue;
353 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
354 opt_result res
355 = vect_determine_vf_for_stmt (loop_vinfo,
356 stmt_info, &vectorization_factor);
357 if (!res)
358 return res;
362 /* TODO: Analyze cost. Decide if worth while to vectorize. */
363 if (dump_enabled_p ())
365 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
366 dump_dec (MSG_NOTE, vectorization_factor);
367 dump_printf (MSG_NOTE, "\n");
370 if (known_le (vectorization_factor, 1U))
371 return opt_result::failure_at (vect_location,
372 "not vectorized: unsupported data-type\n");
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374 return opt_result::success ();
378 /* Function vect_is_simple_iv_evolution.
380 FORNOW: A simple evolution of an induction variable in the loop is
381 considered a polynomial evolution. */
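/* For instance (an illustrative sketch), for the pointer IV in

     for (i = 0; i < n; i++)
       *p++ = 0;

   scev describes p as the polynomial chrec {p_0, +, 4}_1 (assuming
   4-byte elements), so *INIT is p_0 and *STEP is 4.  The actual trees
   come from analyze_scalar_evolution.  */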
383 static bool
384 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
385 tree * step)
387 tree init_expr;
388 tree step_expr;
389 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
390 basic_block bb;
392 /* When there is no evolution in this loop, the evolution function
393 is not "simple". */
394 if (evolution_part == NULL_TREE)
395 return false;
397 /* When the evolution is a polynomial of degree >= 2
398 the evolution function is not "simple". */
399 if (tree_is_chrec (evolution_part))
400 return false;
402 step_expr = evolution_part;
403 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
405 if (dump_enabled_p ())
406 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
407 step_expr, init_expr);
409 *init = init_expr;
410 *step = step_expr;
412 if (TREE_CODE (step_expr) != INTEGER_CST
413 && (TREE_CODE (step_expr) != SSA_NAME
414 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
415 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
416 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
417 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
418 || !flag_associative_math)))
419 && (TREE_CODE (step_expr) != REAL_CST
420 || !flag_associative_math))
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
424 "step unknown.\n");
425 return false;
428 return true;
431 /* Function vect_is_nonlinear_iv_evolution
433 Nonlinear induction is only supported for integer types:
434 1. neg
435 2. mul by constant
436 3. lshift/rshift by constant.
438 For neg induction, return a fake step as integer -1. */
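/* Illustrative source forms of the supported cases:

     for (i = 0; i != n; i++) x = -x;      neg, fake step -1
     for (i = 0; i != n; i++) x *= 3;      mul by constant, step 3
     for (i = 0; i != n; i++) x >>= 1;     rshift by constant, step 1  */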
439 static bool
440 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
441 gphi* loop_phi_node, tree *init, tree *step)
443 tree init_expr, ev_expr, result, op1, op2;
444 gimple* def;
446 if (gimple_phi_num_args (loop_phi_node) != 2)
447 return false;
449 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
450 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
452 /* Support nonlinear induction only for integer type. */
453 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
454 return false;
456 *init = init_expr;
457 result = PHI_RESULT (loop_phi_node);
459 if (TREE_CODE (ev_expr) != SSA_NAME
460 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
461 || !is_gimple_assign (def))
462 return false;
464 enum tree_code t_code = gimple_assign_rhs_code (def);
465 switch (t_code)
467 case NEGATE_EXPR:
468 if (gimple_assign_rhs1 (def) != result)
469 return false;
470 *step = build_int_cst (TREE_TYPE (init_expr), -1);
471 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
472 break;
474 case RSHIFT_EXPR:
475 case LSHIFT_EXPR:
476 case MULT_EXPR:
477 op1 = gimple_assign_rhs1 (def);
478 op2 = gimple_assign_rhs2 (def);
479 if (TREE_CODE (op2) != INTEGER_CST
480 || op1 != result)
481 return false;
482 *step = op2;
483 if (t_code == LSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
485 else if (t_code == RSHIFT_EXPR)
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
487 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
488 else
489 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
490 break;
492 default:
493 return false;
496 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
499 return true;
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
513 x_3 = ...;
516 outer2:
517 x_4 = PHI <x_3(inner)>;
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
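/* At the source level a double reduction typically comes from a nest
   like (an illustrative sketch):

     for (j = 0; j < m; j++)
       for (i = 0; i < n; i++)
         sum += a[j][i];

   where the inner-loop PHI for sum plays the role of x_2 above.  */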
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
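/* An illustrative source form (a sketch) is

     t = init;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] + t;
         t = a[i];
       }

   where the loop PHI for t carries the previous iteration's a[i] into
   the current one.  */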
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
544 /* A nested cycle isn't vectorizable as a first-order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
557 tree def = gimple_phi_result (phi);
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
576 return true;
579 /* Function vect_analyze_scalar_cycles_1.
581 Examine the cross iteration def-use cycles of scalar variables
582 in LOOP. LOOP_VINFO represents the loop that is now being
583 considered for vectorization (can be LOOP, or an outer-loop
584 enclosing LOOP). SLP indicates whether there will be some subsequent
585 SLP analyses. */
587 static void
588 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
589 bool slp)
591 basic_block bb = loop->header;
592 tree init, step;
593 auto_vec<stmt_vec_info, 64> worklist;
594 gphi_iterator gsi;
595 bool double_reduc, reduc_chain;
597 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
599 /* First - identify all inductions. Reduction detection assumes that all the
600 inductions have been identified, therefore, this order must not be
601 changed. */
602 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
604 gphi *phi = gsi.phi ();
605 tree access_fn = NULL;
606 tree def = PHI_RESULT (phi);
607 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
609 if (dump_enabled_p ())
610 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
611 (gimple *) phi);
613 /* Skip virtual phi's. The data dependences that are associated with
614 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
615 if (virtual_operand_p (def))
616 continue;
618 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
620 /* Analyze the evolution function. */
621 access_fn = analyze_scalar_evolution (loop, def);
622 if (access_fn)
624 STRIP_NOPS (access_fn);
625 if (dump_enabled_p ())
626 dump_printf_loc (MSG_NOTE, vect_location,
627 "Access function of PHI: %T\n", access_fn);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
629 = initial_condition_in_loop_num (access_fn, loop->num);
630 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
631 = evolution_part_in_loop_num (access_fn, loop->num);
634 if ((!access_fn
635 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
636 || !vect_is_simple_iv_evolution (loop->num, access_fn,
637 &init, &step)
638 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
639 && TREE_CODE (step) != INTEGER_CST))
640 /* Only handle nonlinear iv for same loop. */
641 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
642 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
643 phi, &init, &step)))
645 worklist.safe_push (stmt_vinfo);
646 continue;
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
650 != NULL_TREE);
651 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
655 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
659 /* Second - identify all reductions and nested cycles. */
660 while (worklist.length () > 0)
662 stmt_vec_info stmt_vinfo = worklist.pop ();
663 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
664 tree def = PHI_RESULT (phi);
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
668 (gimple *) phi);
670 gcc_assert (!virtual_operand_p (def)
671 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
673 stmt_vec_info reduc_stmt_info
674 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
675 &reduc_chain, slp);
676 if (reduc_stmt_info)
678 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
679 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
680 if (double_reduc)
682 if (dump_enabled_p ())
683 dump_printf_loc (MSG_NOTE, vect_location,
684 "Detected double reduction.\n");
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
687 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
689 else
691 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_NOTE, vect_location,
695 "Detected vectorizable nested cycle.\n");
697 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
699 else
701 if (dump_enabled_p ())
702 dump_printf_loc (MSG_NOTE, vect_location,
703 "Detected reduction.\n");
705 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
706 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
707 /* Store the reduction cycles for possible vectorization in
708 loop-aware SLP if it was not detected as reduction
709 chain. */
710 if (! reduc_chain)
711 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
712 (reduc_stmt_info);
716 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
718 else
719 if (dump_enabled_p ())
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown def-use cycle pattern.\n");
726 /* Function vect_analyze_scalar_cycles.
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also for its
732 inner-loop, if it exists.
733 Examples for scalar cycles:
735 Example1: reduction:
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
741 Example2: induction:
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
767 /* Transfer group and reduction information from STMT_INFO to its
768 pattern stmt. */
770 static void
771 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
773 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
774 stmt_vec_info stmtp;
775 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
776 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
777 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
780 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
781 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
782 == STMT_VINFO_DEF_TYPE (stmt_info));
783 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
784 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
785 if (stmt_info)
786 REDUC_GROUP_NEXT_ELEMENT (stmtp)
787 = STMT_VINFO_RELATED_STMT (stmt_info);
789 while (stmt_info);
792 /* Fixup scalar cycles that now have their stmts detected as patterns. */
794 static void
795 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
797 stmt_vec_info first;
798 unsigned i;
800 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
802 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
803 while (next)
805 if ((STMT_VINFO_IN_PATTERN_P (next)
806 != STMT_VINFO_IN_PATTERN_P (first))
807 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
808 break;
809 next = REDUC_GROUP_NEXT_ELEMENT (next);
811 /* If all reduction chain members are well-formed patterns adjust
812 the group to group the pattern stmts instead. */
813 if (! next
814 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
816 if (STMT_VINFO_IN_PATTERN_P (first))
818 vect_fixup_reduc_chain (first);
819 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
820 = STMT_VINFO_RELATED_STMT (first);
823 /* If not all stmts in the chain are patterns or if we failed
824 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
825 it as regular reduction instead. */
826 else
828 stmt_vec_info vinfo = first;
829 stmt_vec_info last = NULL;
830 while (vinfo)
832 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
833 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
834 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
835 last = vinfo;
836 vinfo = next;
838 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
839 = vect_internal_def;
840 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
841 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
842 --i;
847 /* Function vect_get_loop_niters.
849 Determine how many iterations the loop is executed and place it
850 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
851 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
852 niter information holds in ASSUMPTIONS.
854 Return the loop exit conditions. */
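/* As a concrete example (a sketch): for a counted loop of the form

     for (i = 0; i < n; i++)
       ...

   that is known to execute at least once, NUMBER_OF_ITERATIONSM1 is the
   latch execution count n - 1 and NUMBER_OF_ITERATIONS is n; ASSUMPTIONS
   records any condition (e.g. on n) under which this niter analysis is
   valid.  */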
857 static vec<gcond *>
858 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
859 tree *number_of_iterations, tree *number_of_iterationsm1)
861 auto_vec<edge> exits = get_loop_exit_edges (loop);
862 vec<gcond *> conds;
863 conds.create (exits.length ());
864 class tree_niter_desc niter_desc;
865 tree niter_assumptions, niter, may_be_zero;
867 *assumptions = boolean_true_node;
868 *number_of_iterationsm1 = chrec_dont_know;
869 *number_of_iterations = chrec_dont_know;
871 DUMP_VECT_SCOPE ("get_loop_niters");
873 if (exits.is_empty ())
874 return conds;
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
878 exits.length ());
880 edge exit;
881 unsigned int i;
882 FOR_EACH_VEC_ELT (exits, i, exit)
884 gcond *cond = get_loop_exit_condition (exit);
885 if (cond)
886 conds.safe_push (cond);
888 if (dump_enabled_p ())
889 dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
891 if (exit != main_exit)
892 continue;
894 may_be_zero = NULL_TREE;
895 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
896 || chrec_contains_undetermined (niter_desc.niter))
897 continue;
899 niter_assumptions = niter_desc.assumptions;
900 may_be_zero = niter_desc.may_be_zero;
901 niter = niter_desc.niter;
903 if (may_be_zero && integer_zerop (may_be_zero))
904 may_be_zero = NULL_TREE;
906 if (may_be_zero)
908 if (COMPARISON_CLASS_P (may_be_zero))
910 /* Try to combine may_be_zero with assumptions; this can simplify
911 the computation of the niter expression. */
912 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
913 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
914 niter_assumptions,
915 fold_build1 (TRUTH_NOT_EXPR,
916 boolean_type_node,
917 may_be_zero));
918 else
919 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
920 build_int_cst (TREE_TYPE (niter), 0),
921 rewrite_to_non_trapping_overflow (niter));
923 may_be_zero = NULL_TREE;
925 else if (integer_nonzerop (may_be_zero))
927 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
928 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
929 continue;
931 else
932 continue;
935 /* Loop assumptions are based on the normal exit. */
936 *assumptions = niter_assumptions;
937 *number_of_iterationsm1 = niter;
939 /* We want the number of loop header executions which is the number
940 of latch executions plus one.
941 ??? For UINT_MAX latch executions this number overflows to zero
942 for loops like do { n++; } while (n != 0); */
943 if (niter && !chrec_contains_undetermined (niter))
944 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
945 unshare_expr (niter),
946 build_int_cst (TREE_TYPE (niter), 1));
947 *number_of_iterations = niter;
950 if (dump_enabled_p ())
951 dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
953 return conds;
956 /* Determine the main loop exit for the vectorizer. */
958 edge
959 vec_init_loop_exit_info (class loop *loop)
961 /* Before we begin we must first determine which exit is the main one and
962 which are auxiliary exits. */
963 auto_vec<edge> exits = get_loop_exit_edges (loop);
964 if (exits.length () == 1)
965 return exits[0];
967 /* If we have multiple exits we only support a counting IV at the moment.
968 Analyze all exits and return one. */
969 class tree_niter_desc niter_desc;
970 edge candidate = NULL;
971 for (edge exit : exits)
973 if (!get_loop_exit_condition (exit))
974 continue;
976 if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
977 && !chrec_contains_undetermined (niter_desc.niter))
979 if (!niter_desc.may_be_zero || !candidate)
980 candidate = exit;
984 return candidate;
987 /* Function bb_in_loop_p
989 Used as predicate for dfs order traversal of the loop bbs. */
991 static bool
992 bb_in_loop_p (const_basic_block bb, const void *data)
994 const class loop *const loop = (const class loop *)data;
995 if (flow_bb_inside_loop_p (loop, bb))
996 return true;
997 return false;
1001 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1002 stmt_vec_info structs for all the stmts in LOOP_IN. */
1004 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1005 : vec_info (vec_info::loop, shared),
1006 loop (loop_in),
1007 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1008 num_itersm1 (NULL_TREE),
1009 num_iters (NULL_TREE),
1010 num_iters_unchanged (NULL_TREE),
1011 num_iters_assumptions (NULL_TREE),
1012 vector_costs (nullptr),
1013 scalar_costs (nullptr),
1014 th (0),
1015 versioning_threshold (0),
1016 vectorization_factor (0),
1017 main_loop_edge (nullptr),
1018 skip_main_loop_edge (nullptr),
1019 skip_this_loop_edge (nullptr),
1020 reusable_accumulators (),
1021 suggested_unroll_factor (1),
1022 max_vectorization_factor (0),
1023 mask_skip_niters (NULL_TREE),
1024 rgroup_compare_type (NULL_TREE),
1025 simd_if_cond (NULL_TREE),
1026 partial_vector_style (vect_partial_vectors_none),
1027 unaligned_dr (NULL),
1028 peeling_for_alignment (0),
1029 ptr_mask (0),
1030 ivexpr_map (NULL),
1031 scan_map (NULL),
1032 slp_unrolling_factor (1),
1033 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1034 vectorizable (false),
1035 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1036 using_partial_vectors_p (false),
1037 using_decrementing_iv_p (false),
1038 using_select_vl_p (false),
1039 epil_using_partial_vectors_p (false),
1040 partial_load_store_bias (0),
1041 peeling_for_gaps (false),
1042 peeling_for_niter (false),
1043 no_data_dependencies (false),
1044 has_mask_store (false),
1045 scalar_loop_scaling (profile_probability::uninitialized ()),
1046 scalar_loop (NULL),
1047 orig_loop_info (NULL),
1048 vec_loop_iv_exit (NULL),
1049 vec_epilogue_loop_iv_exit (NULL),
1050 scalar_loop_iv_exit (NULL)
1052 /* CHECKME: We want to visit all BBs before their successors (except for
1053 latch blocks, for which this assertion wouldn't hold). In the simple
1054 case of the loop forms we allow, a dfs order of the BBs would be the same
1055 as a reversed postorder traversal, so we are safe. */
1057 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1058 bbs, loop->num_nodes, loop);
1059 gcc_assert (nbbs == loop->num_nodes);
1061 for (unsigned int i = 0; i < nbbs; i++)
1063 basic_block bb = bbs[i];
1064 gimple_stmt_iterator si;
1066 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1068 gimple *phi = gsi_stmt (si);
1069 gimple_set_uid (phi, 0);
1070 add_stmt (phi);
1073 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1075 gimple *stmt = gsi_stmt (si);
1076 gimple_set_uid (stmt, 0);
1077 if (is_gimple_debug (stmt))
1078 continue;
1079 add_stmt (stmt);
1080 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1081 third argument is the #pragma omp simd if (x) condition: when it is 0,
1082 the loop shouldn't be vectorized; when it is a non-zero constant, it should
1083 be vectorized normally; otherwise the loop is versioned, with the vectorized
1084 loop used if the condition is non-zero at runtime. */
1085 if (loop_in->simduid
1086 && is_gimple_call (stmt)
1087 && gimple_call_internal_p (stmt)
1088 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1089 && gimple_call_num_args (stmt) >= 3
1090 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1091 && (loop_in->simduid
1092 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1094 tree arg = gimple_call_arg (stmt, 2);
1095 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1096 simd_if_cond = arg;
1097 else
1098 gcc_assert (integer_nonzerop (arg));
1103 epilogue_vinfos.create (6);
1106 /* Free all levels of rgroup CONTROLS. */
1108 void
1109 release_vec_loop_controls (vec<rgroup_controls> *controls)
1111 rgroup_controls *rgc;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (*controls, i, rgc)
1114 rgc->controls.release ();
1115 controls->release ();
1118 /* Free all memory used by the _loop_vec_info, as well as all the
1119 stmt_vec_info structs of all the stmts in the loop. */
1121 _loop_vec_info::~_loop_vec_info ()
1123 free (bbs);
1125 release_vec_loop_controls (&masks.rgc_vec);
1126 release_vec_loop_controls (&lens);
1127 delete ivexpr_map;
1128 delete scan_map;
1129 epilogue_vinfos.release ();
1130 delete scalar_costs;
1131 delete vector_costs;
1133 /* When we release an epilogue vinfo that we do not intend to use,
1134 avoid clearing AUX of the main loop, which should continue to
1135 point to the main loop vinfo since otherwise we'll leak that. */
1136 if (loop->aux == this)
1137 loop->aux = NULL;
1140 /* Return an invariant or register for EXPR and emit necessary
1141 computations in the LOOP_VINFO loop preheader. */
1143 tree
1144 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1146 if (is_gimple_reg (expr)
1147 || is_gimple_min_invariant (expr))
1148 return expr;
1150 if (! loop_vinfo->ivexpr_map)
1151 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1152 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1153 if (! cached)
1155 gimple_seq stmts = NULL;
1156 cached = force_gimple_operand (unshare_expr (expr),
1157 &stmts, true, NULL_TREE);
1158 if (stmts)
1160 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1161 gsi_insert_seq_on_edge_immediate (e, stmts);
1164 return cached;
1167 /* Return true if we can use CMP_TYPE as the comparison type to produce
1168 all masks required to mask LOOP_VINFO. */
1170 static bool
1171 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1173 rgroup_controls *rgm;
1174 unsigned int i;
1175 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1176 if (rgm->type != NULL_TREE
1177 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1178 cmp_type, rgm->type,
1179 OPTIMIZE_FOR_SPEED))
1180 return false;
1181 return true;
1184 /* Calculate the maximum number of scalars per iteration for every
1185 rgroup in LOOP_VINFO. */
1187 static unsigned int
1188 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1190 unsigned int res = 1;
1191 unsigned int i;
1192 rgroup_controls *rgm;
1193 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1194 res = MAX (res, rgm->max_nscalars_per_iter);
1195 return res;
1198 /* Calculate the minimum precision necessary to represent:
1200 MAX_NITERS * FACTOR
1202 as an unsigned integer, where MAX_NITERS is the maximum number of
1203 loop header iterations for the original scalar form of LOOP_VINFO. */
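/* For example (a sketch): if the scalar loop is known to execute at most
   1000 iterations and FACTOR is 2, the product is at most 2000, so the
   function returns 11, the minimum number of bits of an unsigned integer
   that can represent 2000.  */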
1205 static unsigned
1206 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1208 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1210 /* Get the maximum number of iterations that is representable
1211 in the counter type. */
1212 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1213 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1215 /* Get a more refined estimate for the number of iterations. */
1216 widest_int max_back_edges;
1217 if (max_loop_iterations (loop, &max_back_edges))
1218 max_ni = wi::smin (max_ni, max_back_edges + 1);
1220 /* Work out how many bits we need to represent the limit. */
1221 return wi::min_precision (max_ni * factor, UNSIGNED);
1224 /* True if the loop needs peeling or partial vectors when vectorized. */
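/* For instance (a sketch): with a known iteration count of 100 and a
   vectorization factor of 8, 100 % 8 == 4 scalar iterations are left
   over, so the loop needs either peeling (an epilogue) or partial
   vectors; with a count of 96 and no other peeling requirements it
   needs neither.  */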
1226 static bool
1227 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1229 unsigned HOST_WIDE_INT const_vf;
1230 HOST_WIDE_INT max_niter
1231 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1233 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1234 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1235 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1236 (loop_vinfo));
1238 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1239 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1241 /* Work out the (constant) number of iterations that need to be
1242 peeled for reasons other than niters. */
1243 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1244 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1245 peel_niter += 1;
1246 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1247 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1248 return true;
1250 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1251 /* ??? When peeling for gaps but not alignment, we could
1252 try to check whether the (variable) niters is known to be
1253 VF * N + 1. That's something of a niche case though. */
1254 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1255 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1256 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1257 < (unsigned) exact_log2 (const_vf))
1258 /* In case of versioning, check if the maximum number of
1259 iterations is greater than th. If they are identical,
1260 the epilogue is unnecessary. */
1261 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1262 || ((unsigned HOST_WIDE_INT) max_niter
1263 > (th / const_vf) * const_vf))))
1264 return true;
1266 return false;
1269 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1270 whether we can actually generate the masks required. Return true if so,
1271 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
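/* Conceptually (a sketch of the WHILE_ULT style of masking used here),
   the mask for a vector iteration starting at scalar index IV is

     mask[j] = (IV + j < niters)   for j = 0 .. nunits - 1

   so the final, partial iteration simply has its trailing lanes
   disabled instead of requiring a scalar epilogue.  */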
1273 static bool
1274 vect_verify_full_masking (loop_vec_info loop_vinfo)
1276 unsigned int min_ni_width;
1278 /* Use a normal loop if there are no statements that need masking.
1279 This only happens in rare degenerate cases: it means that the loop
1280 has no loads, no stores, and no live-out values. */
1281 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1282 return false;
1284 /* Produce the rgroup controls. */
1285 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1287 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1288 tree vectype = mask.first;
1289 unsigned nvectors = mask.second;
1291 if (masks->rgc_vec.length () < nvectors)
1292 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1293 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1294 /* The number of scalars per iteration and the number of vectors are
1295 both compile-time constants. */
1296 unsigned int nscalars_per_iter
1297 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1298 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1300 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1302 rgm->max_nscalars_per_iter = nscalars_per_iter;
1303 rgm->type = truth_type_for (vectype);
1304 rgm->factor = 1;
1308 unsigned int max_nscalars_per_iter
1309 = vect_get_max_nscalars_per_iter (loop_vinfo);
1311 /* Work out how many bits we need to represent the limit. */
1312 min_ni_width
1313 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1315 /* Find a scalar mode for which WHILE_ULT is supported. */
1316 opt_scalar_int_mode cmp_mode_iter;
1317 tree cmp_type = NULL_TREE;
1318 tree iv_type = NULL_TREE;
1319 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1320 unsigned int iv_precision = UINT_MAX;
1322 if (iv_limit != -1)
1323 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1324 UNSIGNED);
1326 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1328 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1329 if (cmp_bits >= min_ni_width
1330 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1332 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1333 if (this_type
1334 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1336 /* Although we could stop as soon as we find a valid mode,
1337 there are at least two reasons why that's not always the
1338 best choice:
1340 - An IV that's Pmode or wider is more likely to be reusable
1341 in address calculations than an IV that's narrower than
1342 Pmode.
1344 - Doing the comparison in IV_PRECISION or wider allows
1345 a natural 0-based IV, whereas using a narrower comparison
1346 type requires mitigations against wrap-around.
1348 Conversely, if the IV limit is variable, doing the comparison
1349 in a wider type than the original type can introduce
1350 unnecessary extensions, so picking the widest valid mode
1351 is not always a good choice either.
1353 Here we prefer the first IV type that's Pmode or wider,
1354 and the first comparison type that's IV_PRECISION or wider.
1355 (The comparison type must be no wider than the IV type,
1356 to avoid extensions in the vector loop.)
1358 ??? We might want to try continuing beyond Pmode for ILP32
1359 targets if CMP_BITS < IV_PRECISION. */
1360 iv_type = this_type;
1361 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1362 cmp_type = this_type;
1363 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1364 break;
1369 if (!cmp_type)
1371 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1372 return false;
1375 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1376 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1378 return true;
1381 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1382 whether we can actually generate AVX512 style masks. Return true if so,
1383 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1385 static bool
1386 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1388 /* Produce a differently organized rgc_vec and check differently whether
1389 we can produce the masks. */
1391 /* Use a normal loop if there are no statements that need masking.
1392 This only happens in rare degenerate cases: it means that the loop
1393 has no loads, no stores, and no live-out values. */
1394 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1395 return false;
1397 /* For the decrementing IV we need to represent all values in
1398 [0, niter + niter_skip] where niter_skip is the elements we
1399 skip in the first iteration for prologue peeling. */
1400 tree iv_type = NULL_TREE;
1401 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1402 unsigned int iv_precision = UINT_MAX;
1403 if (iv_limit != -1)
1404 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1406 /* First compute the type for the IV we use to track the remaining
1407 scalar iterations. */
1408 opt_scalar_int_mode cmp_mode_iter;
1409 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1411 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1412 if (cmp_bits >= iv_precision
1413 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1415 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1416 if (iv_type)
1417 break;
1420 if (!iv_type)
1421 return false;
1423 /* Produce the rgroup controls. */
1424 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1426 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1427 tree vectype = mask.first;
1428 unsigned nvectors = mask.second;
1430 /* The number of scalars per iteration and the number of vectors are
1431 both compile-time constants. */
1432 unsigned int nscalars_per_iter
1433 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1434 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1436 /* We index the rgroup_controls vector with nscalars_per_iter
1437 which we keep constant and instead have a varying nvectors,
1438 remembering the vector mask with the fewest nV. */
1439 if (masks->rgc_vec.length () < nscalars_per_iter)
1440 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1441 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1443 if (!rgm->type || rgm->factor > nvectors)
1445 rgm->type = truth_type_for (vectype);
1446 rgm->compare_type = NULL_TREE;
1447 rgm->max_nscalars_per_iter = nscalars_per_iter;
1448 rgm->factor = nvectors;
1449 rgm->bias_adjusted_ctrl = NULL_TREE;
1453 /* There is no fixed compare type we are going to use but we have to
1454 be able to get at one for each mask group. */
1455 unsigned int min_ni_width
1456 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1458 bool ok = true;
1459 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1461 tree mask_type = rgc.type;
1462 if (!mask_type)
1463 continue;
1465 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1467 ok = false;
1468 break;
1471 /* If iv_type is usable as compare type use that - we can elide the
1472 saturation in that case. */
1473 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1475 tree cmp_vectype
1476 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1477 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1478 rgc.compare_type = cmp_vectype;
1480 if (!rgc.compare_type)
1481 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1483 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1484 if (cmp_bits >= min_ni_width
1485 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1487 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1488 if (!cmp_type)
1489 continue;
1491 /* Check whether we can produce the mask with cmp_type. */
1492 tree cmp_vectype
1493 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1494 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1496 rgc.compare_type = cmp_vectype;
1497 break;
1501 if (!rgc.compare_type)
1503 ok = false;
1504 break;
1507 if (!ok)
1509 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1510 return false;
1513 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1514 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1515 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1516 return true;
1519 /* Check whether we can use vector access with length based on a precision
1520 comparison. So far, to keep it simple, we only allow the case that the
1521 precision of the target-supported length is larger than the precision
1522 required by the loop niters. */
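/* As an example of the precision check (a sketch): if the maximum number
   of scalar iterations times the maximum number of items per iteration
   is 2^16, min_ni_prec below is 17 bits, and the IV type chosen must be
   at least that wide; it is further widened to the precision of the
   niters type and to Pmode, as described in the code below.  */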
1524 static bool
1525 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1527 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1528 return false;
1530 machine_mode len_load_mode, len_store_mode;
1531 if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1532 .exists (&len_load_mode))
1533 return false;
1534 if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1535 .exists (&len_store_mode))
1536 return false;
1538 signed char partial_load_bias = internal_len_load_store_bias
1539 (IFN_LEN_LOAD, len_load_mode);
1541 signed char partial_store_bias = internal_len_load_store_bias
1542 (IFN_LEN_STORE, len_store_mode);
1544 gcc_assert (partial_load_bias == partial_store_bias);
1546 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1547 return false;
1549 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1550 len_loads with a length of zero. In order to avoid that we prohibit
1551 more than one loop length here. */
1552 if (partial_load_bias == -1
1553 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1554 return false;
1556 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1558 unsigned int max_nitems_per_iter = 1;
1559 unsigned int i;
1560 rgroup_controls *rgl;
1561 /* Find the maximum number of items per iteration for every rgroup. */
1562 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1564 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1565 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1568 /* Work out how many bits we need to represent the length limit. */
1569 unsigned int min_ni_prec
1570 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1572 /* Now use the maximum of below precisions for one suitable IV type:
1573 - the IV's natural precision
1574 - the precision needed to hold: the maximum number of scalar
1575 iterations multiplied by the scale factor (min_ni_prec above)
1576 - the Pmode precision
1578 If min_ni_prec is less than the precision of the current niters,
1579 we prefer to still use the niters type. Prefer to use Pmode and
1580 a wider IV to avoid narrow conversions. */
1582 unsigned int ni_prec
1583 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1584 min_ni_prec = MAX (min_ni_prec, ni_prec);
1585 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1587 tree iv_type = NULL_TREE;
1588 opt_scalar_int_mode tmode_iter;
1589 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1591 scalar_mode tmode = tmode_iter.require ();
1592 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1594 /* ??? Do we really want to construct one IV whose precision exceeds
1595 BITS_PER_WORD? */
1596 if (tbits > BITS_PER_WORD)
1597 break;
1599 /* Find the first available standard integral type. */
1600 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1602 iv_type = build_nonstandard_integer_type (tbits, true);
1603 break;
1607 if (!iv_type)
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "can't vectorize with length-based partial vectors"
1612 " because there is no suitable iv type.\n");
1613 return false;
1616 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1617 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1618 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1620 return true;
1623 /* Calculate the cost of one scalar iteration of the loop. */
1624 static void
1625 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1629 int nbbs = loop->num_nodes, factor;
1630 int innerloop_iters, i;
1632 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1634 /* Gather costs for statements in the scalar loop. */
1636 /* FORNOW. */
1637 innerloop_iters = 1;
1638 if (loop->inner)
1639 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1641 for (i = 0; i < nbbs; i++)
1643 gimple_stmt_iterator si;
1644 basic_block bb = bbs[i];
1646 if (bb->loop_father == loop->inner)
1647 factor = innerloop_iters;
1648 else
1649 factor = 1;
1651 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1653 gimple *stmt = gsi_stmt (si);
1654 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1656 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1657 continue;
1659 /* Skip stmts that are not vectorized inside the loop. */
1660 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1661 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1662 && (!STMT_VINFO_LIVE_P (vstmt_info)
1663 || !VECTORIZABLE_CYCLE_DEF
1664 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1665 continue;
1667 vect_cost_for_stmt kind;
1668 if (STMT_VINFO_DATA_REF (stmt_info))
1670 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1671 kind = scalar_load;
1672 else
1673 kind = scalar_store;
1675 else if (vect_nop_conversion_p (stmt_info))
1676 continue;
1677 else
1678 kind = scalar_stmt;
1680 /* We are using vect_prologue here to avoid scaling twice
1681 by the inner loop factor. */
1682 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1683 factor, kind, stmt_info, 0, vect_prologue);
1687 /* Now accumulate cost. */
1688 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1689 add_stmt_costs (loop_vinfo->scalar_costs,
1690 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1691 loop_vinfo->scalar_costs->finish_cost (nullptr);
1695 /* Function vect_analyze_loop_form.
1697 Verify that certain CFG restrictions hold, including:
1698 - the loop has a pre-header
1699 - the loop has a single entry and exit
1700 - the loop exit condition is simple enough
1701 - the number of iterations can be analyzed, i.e., it is a countable loop.
1702 The niter may be analyzed under some assumptions. */
1704 opt_result
1705 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1707 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1709 edge exit_e = vec_init_loop_exit_info (loop);
1710 if (!exit_e)
1711 return opt_result::failure_at (vect_location,
1712 "not vectorized:"
1713 " could not determine main exit from"
1714 " loop with multiple exits.\n");
1715 info->loop_exit = exit_e;
1716 if (dump_enabled_p ())
1717 dump_printf_loc (MSG_NOTE, vect_location,
1718 "using as main loop exit: %d -> %d [AUX: %p]\n",
1719 exit_e->src->index, exit_e->dest->index, exit_e->aux);
1721 /* Different restrictions apply when we are considering an inner-most loop,
1722 vs. an outer (nested) loop.
1723 (FORNOW. May want to relax some of these restrictions in the future). */
1725 info->inner_loop_cond = NULL;
1726 if (!loop->inner)
1728 /* Inner-most loop. We currently require that the number of BBs is
1729 exactly 2 (the header and latch). Vectorizable inner-most loops
1730 look like this:
1732 (pre-header)
1734 header <--------+
1735 | | |
1736 | +--> latch --+
1738 (exit-bb) */
1740 if (loop->num_nodes != 2)
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized:"
1743 " control flow in loop.\n");
1745 if (empty_block_p (loop->header))
1746 return opt_result::failure_at (vect_location,
1747 "not vectorized: empty loop.\n");
1749 else
1751 class loop *innerloop = loop->inner;
1752 edge entryedge;
1754 /* Nested loop. We currently require that the loop is doubly-nested,
1755 contains a single inner loop, and the number of BBs is exactly 5.
1756 Vectorizable outer-loops look like this:
1758 (pre-header)
1760 header <---+
1762 inner-loop |
1764 tail ------+
1766 (exit-bb)
1768 The inner-loop has the properties expected of inner-most loops
1769 as described above. */
1771 if ((loop->inner)->inner || (loop->inner)->next)
1772 return opt_result::failure_at (vect_location,
1773 "not vectorized:"
1774 " multiple nested loops.\n");
1776 if (loop->num_nodes != 5)
1777 return opt_result::failure_at (vect_location,
1778 "not vectorized:"
1779 " control flow in loop.\n");
1781 entryedge = loop_preheader_edge (innerloop);
1782 if (entryedge->src != loop->header
1783 || !single_exit (innerloop)
1784 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1785 return opt_result::failure_at (vect_location,
1786 "not vectorized:"
1787 " unsupported outerloop form.\n");
1789 /* Analyze the inner-loop. */
1790 vect_loop_form_info inner;
1791 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1792 if (!res)
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1796 "not vectorized: Bad inner loop.\n");
1797 return res;
1800 /* We don't support analyzing the niter under assumptions for the
1801 inner loop. */
1802 if (!integer_onep (inner.assumptions))
1803 return opt_result::failure_at (vect_location,
1804 "not vectorized: Bad inner loop.\n");
1806 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1807 return opt_result::failure_at (vect_location,
1808 "not vectorized: inner-loop count not"
1809 " invariant.\n");
1811 if (dump_enabled_p ())
1812 dump_printf_loc (MSG_NOTE, vect_location,
1813 "Considering outer-loop vectorization.\n");
1814 info->inner_loop_cond = inner.conds[0];
1817 if (!single_exit (loop))
1818 return opt_result::failure_at (vect_location,
1819 "not vectorized: multiple exits.\n");
1820 if (EDGE_COUNT (loop->header->preds) != 2)
1821 return opt_result::failure_at (vect_location,
1822 "not vectorized:"
1823 " too many incoming edges.\n");
1825 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1826 that the loop is represented as a do-while (with a proper if-guard
1827 before the loop if needed), where the loop header contains all the
1828 executable statements, and the latch is empty. */
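/* Illustrative sketch (assuming the usual effect of earlier loop-header
   copying, not something checked here): a source loop

       while (i < n) { body; i++; }

   is typically rotated into

       if (i < n)
         do { body; i++; } while (i < n);

   so that the header block ends in the exit test and the latch block is
   empty, as required below.  */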
1829 if (!empty_block_p (loop->latch)
1830 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1831 return opt_result::failure_at (vect_location,
1832 "not vectorized: latch block not empty.\n");
1834 /* Make sure the exit is not abnormal. */
1835 if (exit_e->flags & EDGE_ABNORMAL)
1836 return opt_result::failure_at (vect_location,
1837 "not vectorized:"
1838 " abnormal loop exit edge.\n");
1840 info->conds
1841 = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1842 &info->number_of_iterations,
1843 &info->number_of_iterationsm1);
1845 if (info->conds.is_empty ())
1846 return opt_result::failure_at
1847 (vect_location,
1848 "not vectorized: complicated exit condition.\n");
1850 /* Determine what the primary and alternate exit conds are. */
1851 for (unsigned i = 0; i < info->conds.length (); i++)
1853 gcond *cond = info->conds[i];
1854 if (exit_e->src == gimple_bb (cond))
1855 std::swap (info->conds[0], info->conds[i]);
1858 if (integer_zerop (info->assumptions)
1859 || !info->number_of_iterations
1860 || chrec_contains_undetermined (info->number_of_iterations))
1861 return opt_result::failure_at
1862 (info->conds[0],
1863 "not vectorized: number of iterations cannot be computed.\n");
1865 if (integer_zerop (info->number_of_iterations))
1866 return opt_result::failure_at
1867 (info->conds[0],
1868 "not vectorized: number of iterations = 0.\n");
1870 if (!(tree_fits_shwi_p (info->number_of_iterations)
1871 && tree_to_shwi (info->number_of_iterations) > 0))
1873 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "Symbolic number of iterations is ");
1877 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1878 dump_printf (MSG_NOTE, "\n");
1882 return opt_result::success ();
1885 /* Create a loop_vec_info for LOOP with SHARED and the
1886 vect_analyze_loop_form result. */
1888 loop_vec_info
1889 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1890 const vect_loop_form_info *info,
1891 loop_vec_info main_loop_info)
1893 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1894 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1895 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1896 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1897 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1898 /* Also record the assumptions for versioning. */
1899 if (!integer_onep (info->assumptions) && !main_loop_info)
1900 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1902 for (gcond *cond : info->conds)
1904 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1905 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1908 for (unsigned i = 1; i < info->conds.length (); i ++)
1909 LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1910 LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1912 LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1914 if (info->inner_loop_cond)
1916 stmt_vec_info inner_loop_cond_info
1917 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1918 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1919 /* If we have an estimate on the number of iterations of the inner
1920 loop, use that to limit the scale for costing, otherwise use
1921 --param vect-inner-loop-cost-factor literally. */
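/* Worked example (illustrative only, assuming the parameter is at its
   usual default of 50): an inner loop estimated to execute 4 times per
   outer iteration gives a cost factor of MIN (4, 50) = 4, while an inner
   loop estimated at 1000 iterations is clamped to 50.  */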
1922 widest_int nit;
1923 if (estimated_stmt_executions (loop->inner, &nit))
1924 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1925 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1928 return loop_vinfo;
1933 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1934 statements, update the vectorization factor. */
1936 static void
1937 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1939 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1940 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1941 int nbbs = loop->num_nodes;
1942 poly_uint64 vectorization_factor;
1943 int i;
1945 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1947 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1948 gcc_assert (known_ne (vectorization_factor, 0U));
1950 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1951 the vectorization factor of the loop is the unrolling factor required by
1952 the SLP instances. If that unrolling factor is 1, we say that we
1953 perform pure SLP on the loop - cross-iteration parallelism is not
1954 exploited. */
1955 bool only_slp_in_loop = true;
1956 for (i = 0; i < nbbs; i++)
1958 basic_block bb = bbs[i];
1959 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1960 gsi_next (&si))
1962 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1963 if (!stmt_info)
1964 continue;
1965 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1966 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1967 && !PURE_SLP_STMT (stmt_info))
1968 /* STMT needs both SLP and loop-based vectorization. */
1969 only_slp_in_loop = false;
1971 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1972 gsi_next (&si))
1974 if (is_gimple_debug (gsi_stmt (si)))
1975 continue;
1976 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1977 stmt_info = vect_stmt_to_vectorize (stmt_info);
1978 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1979 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1980 && !PURE_SLP_STMT (stmt_info))
1981 /* STMT needs both SLP and loop-based vectorization. */
1982 only_slp_in_loop = false;
1986 if (only_slp_in_loop)
1988 if (dump_enabled_p ())
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "Loop contains only SLP stmts\n");
1991 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1993 else
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_NOTE, vect_location,
1997 "Loop contains SLP and non-SLP stmts\n");
1998 /* Both the vectorization factor and unroll factor have the form
1999 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2000 so they must have a common multiple. */
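/* For example (illustrative numbers only): a loop vectorization factor of
   4 combined with an SLP unrolling factor of 6 forces a common multiple
   of 12, which becomes the updated vectorization factor below.  */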
2001 vectorization_factor
2002 = force_common_multiple (vectorization_factor,
2003 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2006 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2007 if (dump_enabled_p ())
2009 dump_printf_loc (MSG_NOTE, vect_location,
2010 "Updating vectorization factor to ");
2011 dump_dec (MSG_NOTE, vectorization_factor);
2012 dump_printf (MSG_NOTE, ".\n");
2016 /* Return true if STMT_INFO describes a double reduction phi and if
2017 the other phi in the reduction is also relevant for vectorization.
2018 This rejects cases such as:
2020 outer1:
2021 x_1 = PHI <x_3(outer2), ...>;
2024 inner:
2025 x_2 = ...;
2028 outer2:
2029 x_3 = PHI <x_2(inner)>;
2031 if nothing in x_2 or elsewhere makes x_1 relevant. */
2033 static bool
2034 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2036 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2037 return false;
2039 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2042 /* Function vect_analyze_loop_operations.
2044 Scan the loop stmts and make sure they are all vectorizable. */
2046 static opt_result
2047 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2049 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2050 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2051 int nbbs = loop->num_nodes;
2052 int i;
2053 stmt_vec_info stmt_info;
2054 bool need_to_vectorize = false;
2055 bool ok;
2057 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2059 auto_vec<stmt_info_for_cost> cost_vec;
2061 for (i = 0; i < nbbs; i++)
2063 basic_block bb = bbs[i];
2065 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2066 gsi_next (&si))
2068 gphi *phi = si.phi ();
2069 ok = true;
2071 stmt_info = loop_vinfo->lookup_stmt (phi);
2072 if (dump_enabled_p ())
2073 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2074 (gimple *) phi);
2075 if (virtual_operand_p (gimple_phi_result (phi)))
2076 continue;
2078 /* Inner-loop loop-closed exit phi in outer-loop vectorization
2079 (i.e., a phi in the tail of the outer-loop). */
2080 if (! is_loop_header_bb_p (bb))
2082 /* FORNOW: we currently don't support the case that these phis
2083 are not used in the outer loop (unless it is a double reduction,
2084 i.e., this phi is vect_reduction_def), because this case
2085 would require us to actually do something here. */
2086 if (STMT_VINFO_LIVE_P (stmt_info)
2087 && !vect_active_double_reduction_p (stmt_info))
2088 return opt_result::failure_at (phi,
2089 "Unsupported loop-closed phi"
2090 " in outer-loop.\n");
2092 /* If PHI is used in the outer loop, we check that its operand
2093 is defined in the inner loop. */
2094 if (STMT_VINFO_RELEVANT_P (stmt_info))
2096 tree phi_op;
2098 if (gimple_phi_num_args (phi) != 1)
2099 return opt_result::failure_at (phi, "unsupported phi");
2101 phi_op = PHI_ARG_DEF (phi, 0);
2102 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2103 if (!op_def_info)
2104 return opt_result::failure_at (phi, "unsupported phi\n");
2106 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2107 && (STMT_VINFO_RELEVANT (op_def_info)
2108 != vect_used_in_outer_by_reduction))
2109 return opt_result::failure_at (phi, "unsupported phi\n");
2111 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2112 || (STMT_VINFO_DEF_TYPE (stmt_info)
2113 == vect_double_reduction_def))
2114 && !vectorizable_lc_phi (loop_vinfo,
2115 stmt_info, NULL, NULL))
2116 return opt_result::failure_at (phi, "unsupported phi\n");
2119 continue;
2122 gcc_assert (stmt_info);
2124 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2125 || STMT_VINFO_LIVE_P (stmt_info))
2126 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2127 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2128 /* A scalar-dependence cycle that we don't support. */
2129 return opt_result::failure_at (phi,
2130 "not vectorized:"
2131 " scalar dependence cycle.\n");
2133 if (STMT_VINFO_RELEVANT_P (stmt_info))
2135 need_to_vectorize = true;
2136 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2137 && ! PURE_SLP_STMT (stmt_info))
2138 ok = vectorizable_induction (loop_vinfo,
2139 stmt_info, NULL, NULL,
2140 &cost_vec);
2141 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2142 || (STMT_VINFO_DEF_TYPE (stmt_info)
2143 == vect_double_reduction_def)
2144 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2145 && ! PURE_SLP_STMT (stmt_info))
2146 ok = vectorizable_reduction (loop_vinfo,
2147 stmt_info, NULL, NULL, &cost_vec);
2148 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2149 == vect_first_order_recurrence)
2150 && ! PURE_SLP_STMT (stmt_info))
2151 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2152 &cost_vec);
2155 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2156 if (ok
2157 && STMT_VINFO_LIVE_P (stmt_info)
2158 && !PURE_SLP_STMT (stmt_info))
2159 ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2160 -1, false, &cost_vec);
2162 if (!ok)
2163 return opt_result::failure_at (phi,
2164 "not vectorized: relevant phi not "
2165 "supported: %G",
2166 static_cast <gimple *> (phi));
2169 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2170 gsi_next (&si))
2172 gimple *stmt = gsi_stmt (si);
2173 if (!gimple_clobber_p (stmt)
2174 && !is_gimple_debug (stmt))
2176 opt_result res
2177 = vect_analyze_stmt (loop_vinfo,
2178 loop_vinfo->lookup_stmt (stmt),
2179 &need_to_vectorize,
2180 NULL, NULL, &cost_vec);
2181 if (!res)
2182 return res;
2185 } /* bbs */
2187 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2189 /* All operations in the loop are either irrelevant (they deal with loop
2190 control, or are dead), or only used outside the loop and can be moved
2191 out of the loop (e.g. invariants, inductions). The loop can be
2192 optimized away by scalar optimizations. We're better off not
2193 touching this loop. */
2194 if (!need_to_vectorize)
2196 if (dump_enabled_p ())
2197 dump_printf_loc (MSG_NOTE, vect_location,
2198 "All the computation can be taken out of the loop.\n");
2199 return opt_result::failure_at
2200 (vect_location,
2201 "not vectorized: redundant loop. no profit to vectorize.\n");
2204 return opt_result::success ();
2207 /* Return true if we know that the iteration count is smaller than the
2208 vectorization factor. Return false if it isn't, or if we can't be sure
2209 either way. */
2211 static bool
2212 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2214 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2216 HOST_WIDE_INT max_niter;
2217 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2218 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2219 else
2220 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2222 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2223 return true;
2225 return false;
2228 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2229 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2230 definitely no, or -1 if it's worth retrying. */
2232 static int
2233 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2234 unsigned *suggested_unroll_factor)
2236 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2237 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2239 /* Only loops that can handle partially-populated vectors can have iteration
2240 counts less than the vectorization factor. */
2241 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2242 && vect_known_niters_smaller_than_vf (loop_vinfo))
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "not vectorized: iteration count smaller than "
2247 "vectorization factor.\n");
2248 return 0;
2251 /* If we know the number of iterations we can do better: for the
2252 epilogue we can also decide whether the main loop leaves us
2253 with enough iterations, preferring a smaller vector epilogue that
2254 is then also possibly used for the case we skip the vector loop. */
2255 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2257 widest_int scalar_niters
2258 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2259 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2261 loop_vec_info orig_loop_vinfo
2262 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2263 unsigned lowest_vf
2264 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2265 int prolog_peeling = 0;
2266 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2267 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2268 if (prolog_peeling >= 0
2269 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2270 lowest_vf))
2272 unsigned gap
2273 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2274 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2275 % lowest_vf + gap);
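/* Worked example (illustrative only): with 100 scalar iterations, a
   main-loop VF of 16, prologue peeling of 3 iterations and no peeling
   for gaps, the epilogue is left with (100 - 3) % 16 = 1 scalar
   iteration.  */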
2278 /* Reject vectorizing for a single scalar iteration, even if
2279 we could in principle implement that using partial vectors. */
2280 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2281 if (scalar_niters <= peeling_gap + 1)
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "not vectorized: loop only has a single "
2286 "scalar iteration.\n");
2287 return 0;
2290 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2292 /* Check that the loop processes at least one full vector. */
2293 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2294 if (known_lt (scalar_niters, vf))
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2298 "loop does not have enough iterations "
2299 "to support vectorization.\n");
2300 return 0;
2303 /* If we need to peel an extra epilogue iteration to handle data
2304 accesses with gaps, check that there are enough scalar iterations
2305 available.
2307 The check above is redundant with this one when peeling for gaps,
2308 but the distinction is useful for diagnostics. */
2309 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2310 && known_le (scalar_niters, vf))
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2314 "loop does not have enough iterations "
2315 "to support peeling for gaps.\n");
2316 return 0;
2321 /* If using the "very cheap" model, reject cases in which we'd keep
2322 a copy of the scalar code (even if we might be able to vectorize it). */
2323 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2324 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2325 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2326 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2328 if (dump_enabled_p ())
2329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2330 "some scalar iterations would need to be peeled\n");
2331 return 0;
2334 int min_profitable_iters, min_profitable_estimate;
2335 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2336 &min_profitable_estimate,
2337 suggested_unroll_factor);
2339 if (min_profitable_iters < 0)
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343 "not vectorized: vectorization not profitable.\n");
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "not vectorized: vector version will never be "
2347 "profitable.\n");
2348 return -1;
2351 int min_scalar_loop_bound = (param_min_vect_loop_bound
2352 * assumed_vf);
2354 /* Use the cost model only if it is more conservative than the
2355 user-specified threshold. */
2356 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2357 min_profitable_iters);
2359 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2361 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2362 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2364 if (dump_enabled_p ())
2365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2366 "not vectorized: vectorization not profitable.\n");
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "not vectorized: iteration count smaller than user "
2370 "specified loop bound parameter or minimum profitable "
2371 "iterations (whichever is more conservative).\n");
2372 return 0;
2375 /* The static profitability threshold min_profitable_estimate includes
2376 the cost of having to check at runtime whether the scalar loop
2377 should be used instead. If it turns out that we don't need or want
2378 such a check, the threshold we should use for the static estimate
2379 is simply the point at which the vector loop becomes more profitable
2380 than the scalar loop. */
2381 if (min_profitable_estimate > min_profitable_iters
2382 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2383 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2384 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2385 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2387 if (dump_enabled_p ())
2388 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2389 " choice between the scalar and vector loops\n");
2390 min_profitable_estimate = min_profitable_iters;
2393 /* If the vector loop needs multiple iterations to be beneficial then
2394 things are probably too close to call, and the conservative thing
2395 would be to stick with the scalar code. */
2396 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2397 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2399 if (dump_enabled_p ())
2400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401 "one iteration of the vector loop would be"
2402 " more expensive than the equivalent number of"
2403 " iterations of the scalar loop\n");
2404 return 0;
2407 HOST_WIDE_INT estimated_niter;
2409 /* If we are vectorizing an epilogue then we know the maximum number of
2410 scalar iterations it will cover is at least one lower than the
2411 vectorization factor of the main loop. */
2412 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2413 estimated_niter
2414 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2415 else
2417 estimated_niter = estimated_stmt_executions_int (loop);
2418 if (estimated_niter == -1)
2419 estimated_niter = likely_max_stmt_executions_int (loop);
2421 if (estimated_niter != -1
2422 && ((unsigned HOST_WIDE_INT) estimated_niter
2423 < MAX (th, (unsigned) min_profitable_estimate)))
2425 if (dump_enabled_p ())
2426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2427 "not vectorized: estimated iteration count too "
2428 "small.\n");
2429 if (dump_enabled_p ())
2430 dump_printf_loc (MSG_NOTE, vect_location,
2431 "not vectorized: estimated iteration count smaller "
2432 "than specified loop bound parameter or minimum "
2433 "profitable iterations (whichever is more "
2434 "conservative).\n");
2435 return -1;
2438 return 1;
2441 static opt_result
2442 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2443 vec<data_reference_p> *datarefs,
2444 unsigned int *n_stmts)
2446 *n_stmts = 0;
2447 for (unsigned i = 0; i < loop->num_nodes; i++)
2448 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2449 !gsi_end_p (gsi); gsi_next (&gsi))
2451 gimple *stmt = gsi_stmt (gsi);
2452 if (is_gimple_debug (stmt))
2453 continue;
2454 ++(*n_stmts);
2455 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2456 NULL, 0);
2457 if (!res)
2459 if (is_gimple_call (stmt) && loop->safelen)
2461 tree fndecl = gimple_call_fndecl (stmt), op;
2462 if (fndecl == NULL_TREE
2463 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2465 fndecl = gimple_call_arg (stmt, 0);
2466 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2467 fndecl = TREE_OPERAND (fndecl, 0);
2468 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2470 if (fndecl != NULL_TREE)
2472 cgraph_node *node = cgraph_node::get (fndecl);
2473 if (node != NULL && node->simd_clones != NULL)
2475 unsigned int j, n = gimple_call_num_args (stmt);
2476 for (j = 0; j < n; j++)
2478 op = gimple_call_arg (stmt, j);
2479 if (DECL_P (op)
2480 || (REFERENCE_CLASS_P (op)
2481 && get_base_address (op)))
2482 break;
2484 op = gimple_call_lhs (stmt);
2485 /* Ignore #pragma omp declare simd functions
2486 if they don't have data references in the
2487 call stmt itself. */
2488 if (j == n
2489 && !(op
2490 && (DECL_P (op)
2491 || (REFERENCE_CLASS_P (op)
2492 && get_base_address (op)))))
2493 continue;
2497 return res;
2499 /* If dependence analysis will give up due to the limit on the
2500 number of datarefs, stop here and fail fatally. */
2501 if (datarefs->length ()
2502 > (unsigned)param_loop_max_datarefs_for_datadeps)
2503 return opt_result::failure_at (stmt, "exceeded param "
2504 "loop-max-datarefs-for-datadeps\n");
2506 return opt_result::success ();
2509 /* Look for SLP-only access groups and turn each individual access into its own
2510 group. */
2511 static void
2512 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2514 unsigned int i;
2515 struct data_reference *dr;
2517 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2519 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2520 FOR_EACH_VEC_ELT (datarefs, i, dr)
2522 gcc_assert (DR_REF (dr));
2523 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2525 /* Check if the load is a part of an interleaving chain. */
2526 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2528 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2529 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2530 unsigned int group_size = DR_GROUP_SIZE (first_element);
2532 /* Check if this is an SLP-only group. */
2533 if (!STMT_SLP_TYPE (stmt_info)
2534 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2536 /* Dissolve the group. */
2537 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2539 stmt_vec_info vinfo = first_element;
2540 while (vinfo)
2542 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2543 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2544 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2545 DR_GROUP_SIZE (vinfo) = 1;
2546 if (STMT_VINFO_STRIDED_P (first_element)
2547 /* We cannot handle stores with gaps. */
2548 || DR_IS_WRITE (dr_info->dr))
2550 STMT_VINFO_STRIDED_P (vinfo) = true;
2551 DR_GROUP_GAP (vinfo) = 0;
2553 else
2554 DR_GROUP_GAP (vinfo) = group_size - 1;
2555 /* Duplicate and adjust alignment info, it needs to
2556 be present on each group leader, see dr_misalignment. */
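/* Worked example (illustrative numbers only): with a target alignment of
   16 bytes, a group leader misaligned by 4 bytes and a member whose
   DR_INIT is 8 bytes further on, the member's misalignment becomes
   (4 + 8) % 16 = 12.  */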
2557 if (vinfo != first_element)
2559 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2560 dr_info2->target_alignment = dr_info->target_alignment;
2561 int misalignment = dr_info->misalignment;
2562 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2564 HOST_WIDE_INT diff
2565 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2566 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2567 unsigned HOST_WIDE_INT align_c
2568 = dr_info->target_alignment.to_constant ();
2569 misalignment = (misalignment + diff) % align_c;
2571 dr_info2->misalignment = misalignment;
2573 vinfo = next;
2580 /* Determine if operating on full vectors for LOOP_VINFO might leave
2581 some scalar iterations still to do. If so, decide how we should
2582 handle those scalar iterations. The possibilities are:
2584 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2585 In this case:
2587 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2588 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2589 LOOP_VINFO_PEELING_FOR_NITER == false
2591 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2592 to handle the remaining scalar iterations. In this case:
2594 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2595 LOOP_VINFO_PEELING_FOR_NITER == true
2597 There are two choices:
2599 (2a) Consider vectorizing the epilogue loop at the same VF as the
2600 main loop, but using partial vectors instead of full vectors.
2601 In this case:
2603 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2605 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2606 In this case:
2608 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false. */
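/* Worked example (illustrative only): with a VF of 4 and 10 scalar
   iterations, (1) runs three partial-vector iterations, the last with
   only two active lanes, while (2) runs two full-vector iterations and
   leaves two scalar iterations for the epilogue, which may itself be
   vectorized as described in (2a) or (2b).  */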
2611 opt_result
2612 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2614 /* Determine whether there would be any scalar iterations left over. */
2615 bool need_peeling_or_partial_vectors_p
2616 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2618 /* Decide whether to vectorize the loop with partial vectors. */
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2621 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2622 && need_peeling_or_partial_vectors_p)
2624 /* For partial-vector-usage=1, try to push the handling of partial
2625 vectors to the epilogue, with the main loop continuing to operate
2626 on full vectors.
2628 If we are unrolling we also do not want to use partial vectors. This
2629 is to avoid the overhead of generating multiple masks and also to
2630 avoid having to execute entire iterations of FALSE masked instructions
2631 when dealing with one or fewer full iterations.
2633 ??? We could then end up failing to use partial vectors if we
2634 decide to peel iterations into a prologue, and if the main loop
2635 then ends up processing fewer than VF iterations. */
2636 if ((param_vect_partial_vector_usage == 1
2637 || loop_vinfo->suggested_unroll_factor > 1)
2638 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2639 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2641 else
2642 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2645 if (dump_enabled_p ())
2646 dump_printf_loc (MSG_NOTE, vect_location,
2647 "operating on %s vectors%s.\n",
2648 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2649 ? "partial" : "full",
2650 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2651 ? " for epilogue loop" : "");
2653 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2654 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2655 && need_peeling_or_partial_vectors_p);
2657 return opt_result::success ();
2660 /* Function vect_analyze_loop_2.
2662 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2663 analyses will record information in some members of LOOP_VINFO. FATAL
2664 indicates whether some analysis hit a fatal error. If the non-NULL
2665 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2666 with a suggested unroll factor worked out during this analysis, while a
2667 NULL pointer means we are applying a previously suggested unroll factor.
2668 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the suggested
2669 unroll factor was worked out. */
2670 static opt_result
2671 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2672 unsigned *suggested_unroll_factor,
2673 bool& slp_done_for_suggested_uf)
2675 opt_result ok = opt_result::success ();
2676 int res;
2677 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2678 poly_uint64 min_vf = 2;
2679 loop_vec_info orig_loop_vinfo = NULL;
2681 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2682 loop_vec_info of the first vectorized loop. */
2683 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2684 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2685 else
2686 orig_loop_vinfo = loop_vinfo;
2687 gcc_assert (orig_loop_vinfo);
2689 /* The first group of checks is independent of the vector size. */
2690 fatal = true;
2692 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2693 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2694 return opt_result::failure_at (vect_location,
2695 "not vectorized: simd if(0)\n");
2697 /* Find all data references in the loop (which correspond to vdefs/vuses)
2698 and analyze their evolution in the loop. */
2700 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2702 /* Gather the data references and count stmts in the loop. */
2703 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2705 opt_result res
2706 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2707 &LOOP_VINFO_DATAREFS (loop_vinfo),
2708 &LOOP_VINFO_N_STMTS (loop_vinfo));
2709 if (!res)
2711 if (dump_enabled_p ())
2712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2713 "not vectorized: loop contains function "
2714 "calls or data references that cannot "
2715 "be analyzed\n");
2716 return res;
2718 loop_vinfo->shared->save_datarefs ();
2720 else
2721 loop_vinfo->shared->check_datarefs ();
2723 /* Analyze the data references and also adjust the minimal
2724 vectorization factor according to the loads and stores. */
2726 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2727 if (!ok)
2729 if (dump_enabled_p ())
2730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2731 "bad data references.\n");
2732 return ok;
2735 /* Check if we are applying unroll factor now. */
2736 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2737 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2739 /* If the SLP decision was false when the suggested unroll factor was
2740 worked out, and we are now applying that unroll factor, we can simply
2741 skip all SLP-related analyses this time. */
2742 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2744 /* Classify all cross-iteration scalar data-flow cycles.
2745 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2746 vect_analyze_scalar_cycles (loop_vinfo, slp);
2748 vect_pattern_recog (loop_vinfo);
2750 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2752 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2753 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2755 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2756 if (!ok)
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "bad data access.\n");
2761 return ok;
2764 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2766 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2767 if (!ok)
2769 if (dump_enabled_p ())
2770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2771 "unexpected pattern.\n");
2772 return ok;
2775 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are no longer considered fatal. */
2776 fatal = false;
2778 /* Analyze data dependences between the data-refs in the loop
2779 and adjust the maximum vectorization factor according to
2780 the dependences.
2781 FORNOW: fail at the first data dependence that we encounter. */
2783 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2784 if (!ok)
2786 if (dump_enabled_p ())
2787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2788 "bad data dependence.\n");
2789 return ok;
2791 if (max_vf != MAX_VECTORIZATION_FACTOR
2792 && maybe_lt (max_vf, min_vf))
2793 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2794 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2796 ok = vect_determine_vectorization_factor (loop_vinfo);
2797 if (!ok)
2799 if (dump_enabled_p ())
2800 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801 "can't determine vectorization factor.\n");
2802 return ok;
2804 if (max_vf != MAX_VECTORIZATION_FACTOR
2805 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2806 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2808 /* Compute the scalar iteration cost. */
2809 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2811 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2813 if (slp)
2815 /* Check the SLP opportunities in the loop, analyze and build
2816 SLP trees. */
2817 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2818 if (!ok)
2819 return ok;
2821 /* If there are any SLP instances mark them as pure_slp. */
2822 slp = vect_make_slp_decision (loop_vinfo);
2823 if (slp)
2825 /* Find stmts that need to be both vectorized and SLPed. */
2826 vect_detect_hybrid_slp (loop_vinfo);
2828 /* Update the vectorization factor based on the SLP decision. */
2829 vect_update_vf_for_slp (loop_vinfo);
2831 /* Optimize the SLP graph with the vectorization factor fixed. */
2832 vect_optimize_slp (loop_vinfo);
2834 /* Gather the loads reachable from the SLP graph entries. */
2835 vect_gather_slp_loads (loop_vinfo);
2839 bool saved_can_use_partial_vectors_p
2840 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2842 /* We don't expect to have to roll back to anything other than an empty
2843 set of rgroups. */
2844 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2846 /* This is the point where we can re-start analysis with SLP forced off. */
2847 start_over:
2849 /* Apply the suggested unrolling factor; this was determined by the backend
2850 during finish_cost the first time we ran the analysis for this
2851 vector mode. */
2852 if (applying_suggested_uf)
2853 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2855 /* Now the vectorization factor is final. */
2856 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857 gcc_assert (known_ne (vectorization_factor, 0U));
2859 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2861 dump_printf_loc (MSG_NOTE, vect_location,
2862 "vectorization_factor = ");
2863 dump_dec (MSG_NOTE, vectorization_factor);
2864 dump_printf (MSG_NOTE, ", niters = %wd\n",
2865 LOOP_VINFO_INT_NITERS (loop_vinfo));
2868 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2870 /* Analyze the alignment of the data-refs in the loop.
2871 Fail if a data reference is found that cannot be vectorized. */
2873 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2874 if (!ok)
2876 if (dump_enabled_p ())
2877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2878 "bad data alignment.\n");
2879 return ok;
2882 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2883 It is important to call pruning after vect_analyze_data_ref_accesses,
2884 since we use grouping information gathered by interleaving analysis. */
2885 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2886 if (!ok)
2887 return ok;
2889 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2890 vectorization, since we do not want to add extra peeling or
2891 add versioning for alignment. */
2892 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2893 /* This pass will decide on using loop versioning and/or loop peeling in
2894 order to enhance the alignment of data references in the loop. */
2895 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2896 if (!ok)
2897 return ok;
2899 if (slp)
2901 /* Analyze operations in the SLP instances. Note this may
2902 remove unsupported SLP instances which makes the above
2903 SLP kind detection invalid. */
2904 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2905 vect_slp_analyze_operations (loop_vinfo);
2906 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2908 ok = opt_result::failure_at (vect_location,
2909 "unsupported SLP instances\n");
2910 goto again;
2913 /* Check whether any load in ALL SLP instances is possibly permuted. */
2914 slp_tree load_node, slp_root;
2915 unsigned i, x;
2916 slp_instance instance;
2917 bool can_use_lanes = true;
2918 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2920 slp_root = SLP_INSTANCE_TREE (instance);
2921 int group_size = SLP_TREE_LANES (slp_root);
2922 tree vectype = SLP_TREE_VECTYPE (slp_root);
2923 bool loads_permuted = false;
2924 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2926 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2927 continue;
2928 unsigned j;
2929 stmt_vec_info load_info;
2930 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2931 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2933 loads_permuted = true;
2934 break;
2938 /* If the loads and stores can be handled with load/store-lane
2939 instructions, record it and move on to the next instance. */
2940 if (loads_permuted
2941 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2942 && vect_store_lanes_supported (vectype, group_size, false)
2943 != IFN_LAST)
2945 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2947 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2948 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2949 /* Use SLP for strided accesses (or if we can't use
2950 load-lanes). */
2951 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2952 || vect_load_lanes_supported
2953 (STMT_VINFO_VECTYPE (stmt_vinfo),
2954 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
2955 break;
2958 can_use_lanes
2959 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2961 if (can_use_lanes && dump_enabled_p ())
2962 dump_printf_loc (MSG_NOTE, vect_location,
2963 "SLP instance %p can use load/store-lanes\n",
2964 (void *) instance);
2966 else
2968 can_use_lanes = false;
2969 break;
2973 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2974 with SLP disabled. */
2975 if (can_use_lanes)
2977 ok = opt_result::failure_at (vect_location,
2978 "Built SLP cancelled: can use "
2979 "load/store-lanes\n");
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "Built SLP cancelled: all SLP instances support "
2983 "load/store-lanes\n");
2984 goto again;
2988 /* Dissolve SLP-only groups. */
2989 vect_dissolve_slp_only_groups (loop_vinfo);
2991 /* Scan all the remaining operations in the loop that are not subject
2992 to SLP and make sure they are vectorizable. */
2993 ok = vect_analyze_loop_operations (loop_vinfo);
2994 if (!ok)
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "bad operation or unsupported loop bound.\n");
2999 return ok;
3002 /* For now, we don't expect to mix both masking and length approaches for one
3003 loop; disable the use of partial vectors if both are recorded. */
3004 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3005 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3006 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3008 if (dump_enabled_p ())
3009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010 "can't vectorize a loop with partial vectors"
3011 " because we don't expect to mix different"
3012 " approaches with partial vectors for the"
3013 " same loop.\n");
3014 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3017 /* If we still have the option of using partial vectors,
3018 check whether we can generate the necessary loop controls. */
3019 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3021 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3023 if (!vect_verify_full_masking (loop_vinfo)
3024 && !vect_verify_full_masking_avx512 (loop_vinfo))
3025 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3027 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3028 if (!vect_verify_loop_lens (loop_vinfo))
3029 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3032 /* If we're vectorizing a loop that uses length "controls" and
3033 can iterate more than once, we apply the decrementing IV approach
3034 in loop control. */
3035 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3036 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3037 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3038 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3039 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3040 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3041 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3043 /* If a loop uses length controls and has a decrementing loop control IV,
3044 we will normally pass that IV through a MIN_EXPR to calculate the
3045 basis for the length controls. E.g. in a loop that processes one
3046 element per scalar iteration, the number of elements would be
3047 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3049 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3050 step, since only the final iteration of the vector loop can have
3051 inactive lanes.
3053 However, some targets have a dedicated instruction for calculating the
3054 preferred length, given the total number of elements that still need to
3055 be processed. This is encapsulated in the SELECT_VL internal function.
3057 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3058 to determine the basis for the length controls. However, unlike the
3059 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3060 lanes inactive in any iteration of the vector loop, not just the last
3061 iteration. This SELECT_VL approach therefore requires us to use pointer
3062 IVs with variable steps.
3064 Once we've decided how many elements should be processed by one
3065 iteration of the vector loop, we need to populate the rgroup controls.
3066 If a loop has multiple rgroups, we need to make sure that those rgroups
3067 "line up" (that is, they must be consistent about which elements are
3068 active and which aren't). This is done by vect_adjust_loop_lens_control.
3070 In principle, it would be possible to use vect_adjust_loop_lens_control
3071 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3072 However:
3074 (1) In practice, it only makes sense to use SELECT_VL when a vector
3075 operation will be controlled directly by the result. It is not
3076 worth using SELECT_VL if it would only be the input to other
3077 calculations.
3079 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3080 pointer IV will need N updates by a variable amount (N-1 updates
3081 within the iteration and 1 update to move to the next iteration).
3083 Because of this, we prefer to use the MIN_EXPR approach whenever there
3084 is more than one length control.
3086 In addition, SELECT_VL always operates to a granularity of 1 unit.
3087 If we wanted to use it to control an SLP operation on N consecutive
3088 elements, we would need to make the SELECT_VL inputs measure scalar
3089 iterations (rather than elements) and then multiply the SELECT_VL
3090 result by N. But using SELECT_VL this way is inefficient because
3091 of (1) above.
3093 In addition, we don't apply SELECT_VL on a single rgroup when both (1)
3094 and (2) below are satisfied:
3096 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3097 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3099 Since SELECT_VL (variable step) will make SCEV analysis fail, and we
3100 will then fail to gain the benefits of subsequent unroll optimizations,
3101 we prefer using the MIN_EXPR approach in this situation. */
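/* Illustrative sketch (not actual generated code): with a VF of 4 and N
   elements remaining, the MIN_EXPR scheme computes

       len = MIN_EXPR <N, 4>;  N = N - len;

   so only the final iteration can have inactive lanes and pointer IVs can
   step by a constant amount, whereas the SELECT_VL scheme computes

       len = .SELECT_VL (N, 4);  N = N - len;

   where the target may choose len < 4 in any iteration, so pointer IVs
   must step by the variable len.  */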
3102 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3104 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3105 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3106 OPTIMIZE_FOR_SPEED)
3107 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3108 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3109 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3110 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3111 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3114 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3115 assuming that the loop will be used as a main loop. We will redo
3116 this analysis later if we instead decide to use the loop as an
3117 epilogue loop. */
3118 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3119 if (!ok)
3120 return ok;
3122 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3123 to be able to handle fewer than VF scalars, or needs to have a lower VF
3124 than the main loop. */
3125 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3126 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3128 poly_uint64 unscaled_vf
3129 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3130 orig_loop_vinfo->suggested_unroll_factor);
3131 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3132 return opt_result::failure_at (vect_location,
3133 "Vectorization factor too high for"
3134 " epilogue loop.\n");
3137 /* Check the costings of the loop make vectorizing worthwhile. */
3138 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3139 if (res < 0)
3141 ok = opt_result::failure_at (vect_location,
3142 "Loop costings may not be worthwhile.\n");
3143 goto again;
3145 if (!res)
3146 return opt_result::failure_at (vect_location,
3147 "Loop costings not worthwhile.\n");
3149 /* If an epilogue loop is required make sure we can create one. */
3150 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3151 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3153 if (dump_enabled_p ())
3154 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3155 if (!vect_can_advance_ivs_p (loop_vinfo)
3156 || !slpeel_can_duplicate_loop_p (loop,
3157 LOOP_VINFO_IV_EXIT (loop_vinfo),
3158 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3160 ok = opt_result::failure_at (vect_location,
3161 "not vectorized: can't create required "
3162 "epilog loop\n");
3163 goto again;
3167 /* During peeling, we need to check if the number of loop iterations is
3168 enough for both the peeled prolog loop and the vector loop. This check
3169 can be merged along with threshold check of loop versioning, so
3170 increase threshold for this case if necessary.
3172 If we are analyzing an epilogue we still want to check what its
3173 versioning threshold would be. If we decide to vectorize the epilogues we
3174 will want to use the lowest versioning threshold of all epilogues and main
3175 loop. This will enable us to enter a vectorized epilogue even when
3176 versioning the loop. We can't simply check whether the epilogue requires
3177 versioning though since we may have skipped some versioning checks when
3178 analyzing the epilogue. For instance, checks for alias versioning will be
3179 skipped when dealing with epilogues as we assume we already checked them
3180 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
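/* Worked example (illustrative numbers only): with prologue peeling of 3
   iterations, a VF of 8 and peeling for gaps, niters_th below becomes
   3 + 8 + 1 = 12; if the cost-model threshold th is larger and a runtime
   profitability check is applied, th is used instead.  */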
3181 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3183 poly_uint64 niters_th = 0;
3184 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3186 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3188 /* Niters for peeled prolog loop. */
3189 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3191 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3192 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3193 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3195 else
3196 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3199 /* Niters for at least one iteration of vectorized loop. */
3200 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3201 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3202 /* One additional iteration because of peeling for gap. */
3203 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3204 niters_th += 1;
3206 /* Use the same condition as vect_transform_loop to decide when to use
3207 the cost to determine a versioning threshold. */
3208 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3209 && ordered_p (th, niters_th))
3210 niters_th = ordered_max (poly_uint64 (th), niters_th);
3212 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3215 gcc_assert (known_eq (vectorization_factor,
3216 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3218 slp_done_for_suggested_uf = slp;
3220 /* Ok to vectorize! */
3221 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3222 return opt_result::success ();
3224 again:
3225 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3226 gcc_assert (!ok);
3228 /* Try again with SLP forced off but if we didn't do any SLP there is
3229 no point in re-trying. */
3230 if (!slp)
3231 return ok;
3233 /* If the SLP decision was true when the suggested unroll factor was
3234 worked out, and we are now applying that unroll factor, we don't need
3235 to re-try any more. */
3236 if (applying_suggested_uf && slp_done_for_suggested_uf)
3237 return ok;
3239 /* If there are reduction chains re-trying will fail anyway. */
3240 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3241 return ok;
3243 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3244 via interleaving or lane instructions. */
3245 slp_instance instance;
3246 slp_tree node;
3247 unsigned i, j;
3248 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3250 stmt_vec_info vinfo;
3251 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3252 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3253 continue;
3254 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3255 unsigned int size = DR_GROUP_SIZE (vinfo);
3256 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3257 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3258 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3259 && ! vect_grouped_store_supported (vectype, size))
3260 return opt_result::failure_at (vinfo->stmt,
3261 "unsupported grouped store\n");
3262 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3264 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3265 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3266 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3267 size = DR_GROUP_SIZE (vinfo);
3268 vectype = STMT_VINFO_VECTYPE (vinfo);
3269 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3270 && ! vect_grouped_load_supported (vectype, single_element_p,
3271 size))
3272 return opt_result::failure_at (vinfo->stmt,
3273 "unsupported grouped load\n");
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_NOTE, vect_location,
3279 "re-trying with SLP disabled\n");
3281 /* Roll back state appropriately. No SLP this time. */
3282 slp = false;
3284 /* Restore the vectorization factor as it was without SLP. */
3284 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3285 /* Free the SLP instances. */
3286 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3287 vect_free_slp_instance (instance);
3288 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3289 /* Reset SLP type to loop_vect on all stmts. */
3290 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3292 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3293 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3294 !gsi_end_p (si); gsi_next (&si))
3296 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3297 STMT_SLP_TYPE (stmt_info) = loop_vect;
3298 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3299 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3301 /* vectorizable_reduction adjusts reduction stmt def-types,
3302 restore them to that of the PHI. */
3303 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3304 = STMT_VINFO_DEF_TYPE (stmt_info);
3305 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3306 (STMT_VINFO_REDUC_DEF (stmt_info)))
3307 = STMT_VINFO_DEF_TYPE (stmt_info);
3310 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3311 !gsi_end_p (si); gsi_next (&si))
3313 if (is_gimple_debug (gsi_stmt (si)))
3314 continue;
3315 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3316 STMT_SLP_TYPE (stmt_info) = loop_vect;
3317 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3319 stmt_vec_info pattern_stmt_info
3320 = STMT_VINFO_RELATED_STMT (stmt_info);
3321 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3322 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3324 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3325 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3326 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3327 !gsi_end_p (pi); gsi_next (&pi))
3328 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3329 = loop_vect;
3333 /* Free optimized alias test DDRS. */
3334 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3335 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3336 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3337 /* Reset target cost data. */
3338 delete loop_vinfo->vector_costs;
3339 loop_vinfo->vector_costs = nullptr;
3340 /* Reset accumulated rgroup information. */
3341 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3342 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3343 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3344 /* Reset assorted flags. */
3345 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3346 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3347 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3348 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3349 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3350 = saved_can_use_partial_vectors_p;
3352 goto start_over;
3355 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3356 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3357 OLD_LOOP_VINFO is better unless something specifically indicates
3358 otherwise.
3360 Note that this deliberately isn't a partial order. */
3362 static bool
3363 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3364 loop_vec_info old_loop_vinfo)
3366 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3367 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3369 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3370 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3372 /* Always prefer a VF of loop->simdlen over any other VF. */
3373 if (loop->simdlen)
3375 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3376 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3377 if (new_simdlen_p != old_simdlen_p)
3378 return new_simdlen_p;
3381 const auto *old_costs = old_loop_vinfo->vector_costs;
3382 const auto *new_costs = new_loop_vinfo->vector_costs;
3383 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3384 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3386 return new_costs->better_main_loop_than_p (old_costs);
3389 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3390 true if we should. */
3392 static bool
3393 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3394 loop_vec_info old_loop_vinfo)
3396 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3397 return false;
3399 if (dump_enabled_p ())
3400 dump_printf_loc (MSG_NOTE, vect_location,
3401 "***** Preferring vector mode %s to vector mode %s\n",
3402 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3403 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3404 return true;
3407 /* Analyze LOOP with VECTOR_MODES[MODE_I], as an epilogue loop if MAIN_LOOP_VINFO
3408 is not NULL. Record AUTODETECTED_VECTOR_MODE if VECTOR_MODES[MODE_I] is
3409 VOIDmode and advance MODE_I to the next mode worth analyzing.
3410 Return the loop_vinfo on success and a wrapped null on failure. */
3412 static opt_loop_vec_info
3413 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3414 const vect_loop_form_info *loop_form_info,
3415 loop_vec_info main_loop_vinfo,
3416 const vector_modes &vector_modes, unsigned &mode_i,
3417 machine_mode &autodetected_vector_mode,
3418 bool &fatal)
3420 loop_vec_info loop_vinfo
3421 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3423 machine_mode vector_mode = vector_modes[mode_i];
3424 loop_vinfo->vector_mode = vector_mode;
3425 unsigned int suggested_unroll_factor = 1;
3426 bool slp_done_for_suggested_uf = false;
3428 /* Run the main analysis. */
3429 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3430 &suggested_unroll_factor,
3431 slp_done_for_suggested_uf);
3432 if (dump_enabled_p ())
3433 dump_printf_loc (MSG_NOTE, vect_location,
3434 "***** Analysis %s with vector mode %s\n",
3435 res ? "succeeded" : " failed",
3436 GET_MODE_NAME (loop_vinfo->vector_mode));
3438 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3440 if (dump_enabled_p ())
3441 dump_printf_loc (MSG_NOTE, vect_location,
3442 "***** Re-trying analysis for unrolling"
3443 " with unroll factor %d and slp %s.\n",
3444 suggested_unroll_factor,
3445 slp_done_for_suggested_uf ? "on" : "off");
3446 loop_vec_info unroll_vinfo
3447 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3448 unroll_vinfo->vector_mode = vector_mode;
3449 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3450 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3451 slp_done_for_suggested_uf);
3452 if (new_res)
3454 delete loop_vinfo;
3455 loop_vinfo = unroll_vinfo;
3457 else
3458 delete unroll_vinfo;
3461 /* Remember the autodetected vector mode. */
3462 if (vector_mode == VOIDmode)
3463 autodetected_vector_mode = loop_vinfo->vector_mode;
3465 /* Advance mode_i, first skipping modes that would give the same
3466 analysis result. */
3467 while (mode_i + 1 < vector_modes.length ()
3468 && vect_chooses_same_modes_p (loop_vinfo,
3469 vector_modes[mode_i + 1]))
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "***** The result for vector mode %s would"
3474 " be the same\n",
3475 GET_MODE_NAME (vector_modes[mode_i + 1]));
3476 mode_i += 1;
3478 if (mode_i + 1 < vector_modes.length ()
3479 && VECTOR_MODE_P (autodetected_vector_mode)
3480 && (related_vector_mode (vector_modes[mode_i + 1],
3481 GET_MODE_INNER (autodetected_vector_mode))
3482 == autodetected_vector_mode)
3483 && (related_vector_mode (autodetected_vector_mode,
3484 GET_MODE_INNER (vector_modes[mode_i + 1]))
3485 == vector_modes[mode_i + 1]))
3487 if (dump_enabled_p ())
3488 dump_printf_loc (MSG_NOTE, vect_location,
3489 "***** Skipping vector mode %s, which would"
3490 " repeat the analysis for %s\n",
3491 GET_MODE_NAME (vector_modes[mode_i + 1]),
3492 GET_MODE_NAME (autodetected_vector_mode));
3493 mode_i += 1;
3495 mode_i++;
3497 if (!res)
3499 delete loop_vinfo;
3500 if (fatal)
3501 gcc_checking_assert (main_loop_vinfo == NULL);
3502 return opt_loop_vec_info::propagate_failure (res);
3505 return opt_loop_vec_info::success (loop_vinfo);
3508 /* Function vect_analyze_loop.
3510 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3511 for it. The different analyses will record information in the
3512 loop_vec_info struct. */
3513 opt_loop_vec_info
3514 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3516 DUMP_VECT_SCOPE ("analyze_loop_nest");
3518 if (loop_outer (loop)
3519 && loop_vec_info_for_loop (loop_outer (loop))
3520 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3521 return opt_loop_vec_info::failure_at (vect_location,
3522 "outer-loop already vectorized.\n");
3524 if (!find_loop_nest (loop, &shared->loop_nest))
3525 return opt_loop_vec_info::failure_at
3526 (vect_location,
3527 "not vectorized: loop nest containing two or more consecutive inner"
3528 " loops cannot be vectorized\n");
3530 /* Analyze the loop form. */
3531 vect_loop_form_info loop_form_info;
3532 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3533 if (!res)
3535 if (dump_enabled_p ())
3536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3537 "bad loop form.\n");
3538 return opt_loop_vec_info::propagate_failure (res);
3540 if (!integer_onep (loop_form_info.assumptions))
3542 /* We consider vectorizing this loop by versioning it under
3543 some assumptions. In order to do this, we need to clear
3544 existing information computed by the scev and niter analyzers. */
3545 scev_reset_htab ();
3546 free_numbers_of_iterations_estimates (loop);
3547 /* Also set a flag for this loop so that the following scev and niter
3548 analyses are done under those assumptions. */
3549 loop_constraint_set (loop, LOOP_C_FINITE);
3552 auto_vector_modes vector_modes;
3553 /* Autodetect the first vector mode to try. */
3554 vector_modes.safe_push (VOIDmode);
3555 unsigned int autovec_flags
3556 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3557 loop->simdlen != 0);
3558 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3559 && !unlimited_cost_model (loop));
3560 machine_mode autodetected_vector_mode = VOIDmode;
3561 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3562 unsigned int mode_i = 0;
3563 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3565 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3566 a mode has not been analyzed. */
3567 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3568 for (unsigned i = 0; i < vector_modes.length (); ++i)
3569 cached_vf_per_mode.safe_push (0);
3571 /* First determine the main loop vectorization mode, either the first
3572 one that works, starting with auto-detecting the vector mode and then
3573 following the target's order of preference, or the one with the
3574 lowest cost if pick_lowest_cost_p. */
3575 while (1)
3577 bool fatal;
3578 unsigned int last_mode_i = mode_i;
3579 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3580 failed. */
3581 cached_vf_per_mode[last_mode_i] = -1;
3582 opt_loop_vec_info loop_vinfo
3583 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3584 NULL, vector_modes, mode_i,
3585 autodetected_vector_mode, fatal);
3586 if (fatal)
3587 break;
3589 if (loop_vinfo)
3591 /* Analysis has been successful, so update the VF value. The
3592 VF should always be a multiple of unroll_factor and we want to
3593 capture the original VF here. */
3594 cached_vf_per_mode[last_mode_i]
3595 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3596 loop_vinfo->suggested_unroll_factor);
3597 /* Once we hit the desired simdlen for the first time,
3598 discard any previous attempts. */
3599 if (simdlen
3600 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3602 delete first_loop_vinfo;
3603 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3604 simdlen = 0;
3606 else if (pick_lowest_cost_p
3607 && first_loop_vinfo
3608 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3610 /* Pick loop_vinfo over first_loop_vinfo. */
3611 delete first_loop_vinfo;
3612 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3614 if (first_loop_vinfo == NULL)
3615 first_loop_vinfo = loop_vinfo;
3616 else
3618 delete loop_vinfo;
3619 loop_vinfo = opt_loop_vec_info::success (NULL);
3622 /* Commit to first_loop_vinfo if we have no reason to try
3623 alternatives. */
3624 if (!simdlen && !pick_lowest_cost_p)
3625 break;
3627 if (mode_i == vector_modes.length ()
3628 || autodetected_vector_mode == VOIDmode)
3629 break;
3631 /* Try the next biggest vector size. */
3632 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_NOTE, vect_location,
3634 "***** Re-trying analysis with vector mode %s\n",
3635 GET_MODE_NAME (vector_modes[mode_i]));
3637 if (!first_loop_vinfo)
3638 return opt_loop_vec_info::propagate_failure (res);
3640 if (dump_enabled_p ())
3641 dump_printf_loc (MSG_NOTE, vect_location,
3642 "***** Choosing vector mode %s\n",
3643 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3645 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3646 enabled, SIMDUID is not set, it is the innermost loop and we have
3647 either already found the loop's SIMDLEN or there was no SIMDLEN to
3648 begin with.
3649 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3650 bool vect_epilogues = (!simdlen
3651 && loop->inner == NULL
3652 && param_vect_epilogues_nomask
3653 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3654 && !loop->simduid);
3655 if (!vect_epilogues)
3656 return first_loop_vinfo;
3658 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3659 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3661 /* For epilogues start the analysis from the first mode. The motivation
3662 behind starting from the beginning comes from cases where the VECTOR_MODES
3663 array may contain length-agnostic and length-specific modes. Their
3664 ordering is not guaranteed, so we could end up picking a mode for the main
3665 loop that is after the epilogue's optimal mode. */
3666 vector_modes[0] = autodetected_vector_mode;
3667 mode_i = 0;
3669 bool supports_partial_vectors =
3670 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3671 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3673 while (1)
3675 /* If the target does not support partial vectors we can shorten the
3676 number of modes to analyze for the epilogue as we know we can't pick a
3677 mode that would lead to a VF at least as big as the
3678 FIRST_VINFO_VF. */
3679 if (!supports_partial_vectors
3680 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3682 mode_i++;
3683 if (mode_i == vector_modes.length ())
3684 break;
3685 continue;
3688 if (dump_enabled_p ())
3689 dump_printf_loc (MSG_NOTE, vect_location,
3690 "***** Re-trying epilogue analysis with vector "
3691 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3693 bool fatal;
3694 opt_loop_vec_info loop_vinfo
3695 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3696 first_loop_vinfo,
3697 vector_modes, mode_i,
3698 autodetected_vector_mode, fatal);
3699 if (fatal)
3700 break;
3702 if (loop_vinfo)
3704 if (pick_lowest_cost_p)
3706 /* Keep trying to roll back vectorization attempts while the
3707 loop_vec_infos they produced were worse than this one. */
3708 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3709 while (!vinfos.is_empty ()
3710 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3712 gcc_assert (vect_epilogues);
3713 delete vinfos.pop ();
3716 /* For now only allow one epilogue loop. */
3717 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3719 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3720 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3721 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3722 || maybe_ne (lowest_th, 0U));
3723 /* Keep track of the known smallest versioning
3724 threshold. */
3725 if (ordered_p (lowest_th, th))
3726 lowest_th = ordered_min (lowest_th, th);
3728 else
3730 delete loop_vinfo;
3731 loop_vinfo = opt_loop_vec_info::success (NULL);
3734 /* For now only allow one epilogue loop, but allow
3735 pick_lowest_cost_p to replace it, so commit to the
3736 first epilogue if we have no reason to try alternatives. */
3737 if (!pick_lowest_cost_p)
3738 break;
3741 if (mode_i == vector_modes.length ())
3742 break;
3746 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3748 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "***** Choosing epilogue vector mode %s\n",
3752 GET_MODE_NAME
3753 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3756 return first_loop_vinfo;
3759 /* Return true if there is an in-order reduction function for CODE, storing
3760 it in *REDUC_FN if so. */
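/* For illustration only: a loop such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must preserve the left-to-right
   order of the additions; PLUS_EXPR is therefore mapped to
   IFN_FOLD_LEFT_PLUS here, and no other in-order reduction function is
   currently provided.  */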
3762 static bool
3763 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3765 if (code == PLUS_EXPR)
3767 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3768 return true;
3770 return false;
3773 /* Function reduction_fn_for_scalar_code
3775 Input:
3776 CODE - tree_code of a reduction operation.
3778 Output:
3779 REDUC_FN - the corresponding internal function to be used to reduce the
3780 vector of partial results into a single scalar result, or IFN_LAST
3781 if the operation is a supported reduction operation, but does not have
3782 such an internal function.
3784 Return FALSE if CODE currently cannot be vectorized as a reduction. */
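/* For illustration: a MAX_EXPR reduction can collapse the vector of
   partial results with a single IFN_REDUC_MAX, while MULT_EXPR returns
   IFN_LAST, i.e. the reduction is still supported but the partial
   results have to be combined without a dedicated reduction internal
   function (typically via a generic shift/extract epilogue).  */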
3786 bool
3787 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3789 if (code.is_tree_code ())
3790 switch (tree_code (code))
3792 case MAX_EXPR:
3793 *reduc_fn = IFN_REDUC_MAX;
3794 return true;
3796 case MIN_EXPR:
3797 *reduc_fn = IFN_REDUC_MIN;
3798 return true;
3800 case PLUS_EXPR:
3801 *reduc_fn = IFN_REDUC_PLUS;
3802 return true;
3804 case BIT_AND_EXPR:
3805 *reduc_fn = IFN_REDUC_AND;
3806 return true;
3808 case BIT_IOR_EXPR:
3809 *reduc_fn = IFN_REDUC_IOR;
3810 return true;
3812 case BIT_XOR_EXPR:
3813 *reduc_fn = IFN_REDUC_XOR;
3814 return true;
3816 case MULT_EXPR:
3817 case MINUS_EXPR:
3818 *reduc_fn = IFN_LAST;
3819 return true;
3821 default:
3822 return false;
3824 else
3825 switch (combined_fn (code))
3827 CASE_CFN_FMAX:
3828 *reduc_fn = IFN_REDUC_FMAX;
3829 return true;
3831 CASE_CFN_FMIN:
3832 *reduc_fn = IFN_REDUC_FMIN;
3833 return true;
3835 default:
3836 return false;
3840 /* If there is a neutral value X such that a reduction would not be affected
3841 by the introduction of additional X elements, return that X, otherwise
3842 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3843 of the scalar elements. If the reduction has just a single initial value
3844 then INITIAL_VALUE is that value, otherwise it is null. */
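/* For example, padding the final vector of a PLUS_EXPR sum with extra 0
   elements (or a MULT_EXPR product with 1, a BIT_AND_EXPR reduction with
   all-ones) does not change the result, so those constants are the
   neutral values; MIN_EXPR and MAX_EXPR have no universal neutral
   constant, so the single initial value is used when one is known.  */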
3846 tree
3847 neutral_op_for_reduction (tree scalar_type, code_helper code,
3848 tree initial_value)
3850 if (code.is_tree_code ())
3851 switch (tree_code (code))
3853 case WIDEN_SUM_EXPR:
3854 case DOT_PROD_EXPR:
3855 case SAD_EXPR:
3856 case PLUS_EXPR:
3857 case MINUS_EXPR:
3858 case BIT_IOR_EXPR:
3859 case BIT_XOR_EXPR:
3860 return build_zero_cst (scalar_type);
3862 case MULT_EXPR:
3863 return build_one_cst (scalar_type);
3865 case BIT_AND_EXPR:
3866 return build_all_ones_cst (scalar_type);
3868 case MAX_EXPR:
3869 case MIN_EXPR:
3870 return initial_value;
3872 default:
3873 return NULL_TREE;
3875 else
3876 switch (combined_fn (code))
3878 CASE_CFN_FMIN:
3879 CASE_CFN_FMAX:
3880 return initial_value;
3882 default:
3883 return NULL_TREE;
3887 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3888 STMT is printed with a message MSG. */
3890 static void
3891 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3893 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3896 /* Return true if we need an in-order reduction for operation CODE
3897 on type TYPE. */
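/* For illustration: with -ftrapv a signed sum such as

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   may trap on overflow, so reassociating the additions could introduce a
   trap the scalar code would not have had; hence an in-order reduction is
   required.  The floating-point cases below likewise demand in-order
   evaluation unless -fassociative-math is given, except for MIN/MAX style
   operations, which are insensitive to ordering.  */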
3900 bool
3901 needs_fold_left_reduction_p (tree type, code_helper code)
3903 /* CHECKME: check for !flag_finite_math_only too? */
3904 if (SCALAR_FLOAT_TYPE_P (type))
3906 if (code.is_tree_code ())
3907 switch (tree_code (code))
3909 case MIN_EXPR:
3910 case MAX_EXPR:
3911 return false;
3913 default:
3914 return !flag_associative_math;
3916 else
3917 switch (combined_fn (code))
3919 CASE_CFN_FMIN:
3920 CASE_CFN_FMAX:
3921 return false;
3923 default:
3924 return !flag_associative_math;
3928 if (INTEGRAL_TYPE_P (type))
3929 return (!code.is_tree_code ()
3930 || !operation_no_trapping_overflow (type, tree_code (code)));
3932 if (SAT_FIXED_POINT_TYPE_P (type))
3933 return true;
3935 return false;
3938 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3939 has a handled computation expression. Store the main reduction
3940 operation in *CODE. */
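/* As an illustration (SSA names invented for the example), for

     sum_1 = PHI <sum_0(preheader), sum_4(latch)>
     ...
     sum_3 = _2 + sum_1;
     sum_4 = _5 + sum_3;

   the walk from the latch value sum_4 back to the PHI result sum_1
   records the path sum_4 -> sum_3 -> sum_1 and sets *CODE to
   PLUS_EXPR.  */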
3942 static bool
3943 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3944 tree loop_arg, code_helper *code,
3945 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3947 auto_bitmap visited;
3948 tree lookfor = PHI_RESULT (phi);
3949 ssa_op_iter curri;
3950 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3951 while (USE_FROM_PTR (curr) != loop_arg)
3952 curr = op_iter_next_use (&curri);
3953 curri.i = curri.numops;
3956 path.safe_push (std::make_pair (curri, curr));
3957 tree use = USE_FROM_PTR (curr);
3958 if (use == lookfor)
3959 break;
3960 gimple *def = SSA_NAME_DEF_STMT (use);
3961 if (gimple_nop_p (def)
3962 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3964 pop:
3967 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3968 curri = x.first;
3969 curr = x.second;
3971 curr = op_iter_next_use (&curri);
3972 /* Skip already visited or non-SSA operands (from iterating
3973 over PHI args). */
3974 while (curr != NULL_USE_OPERAND_P
3975 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3976 || ! bitmap_set_bit (visited,
3977 SSA_NAME_VERSION
3978 (USE_FROM_PTR (curr)))));
3980 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3981 if (curr == NULL_USE_OPERAND_P)
3982 break;
3984 else
3986 if (gimple_code (def) == GIMPLE_PHI)
3987 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3988 else
3989 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3990 while (curr != NULL_USE_OPERAND_P
3991 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3992 || ! bitmap_set_bit (visited,
3993 SSA_NAME_VERSION
3994 (USE_FROM_PTR (curr)))))
3995 curr = op_iter_next_use (&curri);
3996 if (curr == NULL_USE_OPERAND_P)
3997 goto pop;
4000 while (1);
4001 if (dump_file && (dump_flags & TDF_DETAILS))
4003 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4004 unsigned i;
4005 std::pair<ssa_op_iter, use_operand_p> *x;
4006 FOR_EACH_VEC_ELT (path, i, x)
4007 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4008 dump_printf (MSG_NOTE, "\n");
4011 /* Check whether the reduction path detected is valid. */
4012 bool fail = path.length () == 0;
4013 bool neg = false;
4014 int sign = -1;
4015 *code = ERROR_MARK;
4016 for (unsigned i = 1; i < path.length (); ++i)
4018 gimple *use_stmt = USE_STMT (path[i].second);
4019 gimple_match_op op;
4020 if (!gimple_extract_op (use_stmt, &op))
4022 fail = true;
4023 break;
4025 unsigned int opi = op.num_ops;
4026 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4028 /* The following makes sure we can compute the operand index
4029 easily, plus it mostly disallows chaining via COND_EXPR condition
4030 operands. */
4031 for (opi = 0; opi < op.num_ops; ++opi)
4032 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4033 break;
4035 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4037 for (opi = 0; opi < op.num_ops; ++opi)
4038 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4039 break;
4041 if (opi == op.num_ops)
4043 fail = true;
4044 break;
4046 op.code = canonicalize_code (op.code, op.type);
4047 if (op.code == MINUS_EXPR)
4049 op.code = PLUS_EXPR;
4050 /* Track whether we negate the reduction value each iteration. */
4051 if (op.ops[1] == op.ops[opi])
4052 neg = ! neg;
4054 if (CONVERT_EXPR_CODE_P (op.code)
4055 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4057 else if (*code == ERROR_MARK)
4059 *code = op.code;
4060 sign = TYPE_SIGN (op.type);
4062 else if (op.code != *code)
4064 fail = true;
4065 break;
4067 else if ((op.code == MIN_EXPR
4068 || op.code == MAX_EXPR)
4069 && sign != TYPE_SIGN (op.type))
4071 fail = true;
4072 break;
4074 /* Check that the op is used in only a single stmt. For the
4075 non-value-changing tail and the last stmt, allow out-of-loop uses.
4076 ??? We could relax this and handle arbitrary live stmts by
4077 forcing a scalar epilogue for example. */
4078 imm_use_iterator imm_iter;
4079 use_operand_p use_p;
4080 gimple *op_use_stmt;
4081 unsigned cnt = 0;
4082 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4083 if (!is_gimple_debug (op_use_stmt)
4084 && (*code != ERROR_MARK
4085 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
4086 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4087 cnt++;
4088 if (cnt != 1)
4090 fail = true;
4091 break;
4094 return ! fail && ! neg && *code != ERROR_MARK;
4097 bool
4098 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4099 tree loop_arg, enum tree_code code)
4101 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4102 code_helper code_;
4103 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4104 && code_ == code);
4109 /* Function vect_is_simple_reduction
4111 (1) Detect a cross-iteration def-use cycle that represents a simple
4112 reduction computation. We look for the following pattern:
4114 loop_header:
4115 a1 = phi < a0, a2 >
4116 a3 = ...
4117 a2 = operation (a3, a1)
4121 a3 = ...
4122 loop_header:
4123 a1 = phi < a0, a2 >
4124 a2 = operation (a3, a1)
4126 such that:
4127 1. operation is commutative and associative and it is safe to
4128 change the order of the computation
4129 2. no uses for a2 in the loop (a2 is used out of the loop)
4130 3. no uses of a1 in the loop besides the reduction operation
4131 4. no uses of a1 outside the loop.
4133 Conditions 1,4 are tested here.
4134 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4136 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4137 nested cycles.
4139 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4140 reductions:
4142 a1 = phi < a0, a2 >
4143 inner loop (def of a3)
4144 a2 = phi < a3 >
4146 (4) Detect condition expressions, i.e.:
4147 for (int i = 0; i < N; i++)
4148 if (a[i] < val)
4149 ret_val = a[i];
4153 static stmt_vec_info
4154 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4155 bool *double_reduc, bool *reduc_chain_p, bool slp)
4157 gphi *phi = as_a <gphi *> (phi_info->stmt);
4158 gimple *phi_use_stmt = NULL;
4159 imm_use_iterator imm_iter;
4160 use_operand_p use_p;
4162 *double_reduc = false;
4163 *reduc_chain_p = false;
4164 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4166 tree phi_name = PHI_RESULT (phi);
4167 /* ??? If there are no uses of the PHI result the inner loop reduction
4168 won't be detected as possibly double-reduction by vectorizable_reduction
4169 because that tries to walk the PHI arg from the preheader edge which
4170 can be constant. See PR60382. */
4171 if (has_zero_uses (phi_name))
4172 return NULL;
4173 class loop *loop = (gimple_bb (phi))->loop_father;
4174 unsigned nphi_def_loop_uses = 0;
4175 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4177 gimple *use_stmt = USE_STMT (use_p);
4178 if (is_gimple_debug (use_stmt))
4179 continue;
4181 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4183 if (dump_enabled_p ())
4184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4185 "intermediate value used outside loop.\n");
4187 return NULL;
4190 nphi_def_loop_uses++;
4191 phi_use_stmt = use_stmt;
4194 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4195 if (TREE_CODE (latch_def) != SSA_NAME)
4197 if (dump_enabled_p ())
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "reduction: not ssa_name: %T\n", latch_def);
4200 return NULL;
4203 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4204 if (!def_stmt_info
4205 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4206 return NULL;
4208 bool nested_in_vect_loop
4209 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4210 unsigned nlatch_def_loop_uses = 0;
4211 auto_vec<gphi *, 3> lcphis;
4212 bool inner_loop_of_double_reduc = false;
4213 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4215 gimple *use_stmt = USE_STMT (use_p);
4216 if (is_gimple_debug (use_stmt))
4217 continue;
4218 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4219 nlatch_def_loop_uses++;
4220 else
4222 /* We can have more than one loop-closed PHI. */
4223 lcphis.safe_push (as_a <gphi *> (use_stmt));
4224 if (nested_in_vect_loop
4225 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4226 == vect_double_reduction_def))
4227 inner_loop_of_double_reduc = true;
4231 /* If we are vectorizing an inner reduction, we execute it
4232 in the original order only when we are not dealing with a
4233 double reduction. */
4234 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4236 if (dump_enabled_p ())
4237 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4238 "detected nested cycle: ");
4239 return def_stmt_info;
4242 /* When the inner loop of a double reduction ends up with more than
4243 one loop-closed PHI we have failed to classify alternate such
4244 PHIs as double reduction, leading to wrong code. See PR103237. */
4245 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4247 if (dump_enabled_p ())
4248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4249 "unhandle double reduction\n");
4250 return NULL;
4253 /* If this isn't a nested cycle or if the nested cycle reduction value
4254 is used outside of the inner loop we cannot handle uses of the reduction
4255 value. */
4256 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4258 if (dump_enabled_p ())
4259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4260 "reduction used in loop.\n");
4261 return NULL;
4264 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4265 defined in the inner loop. */
4266 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4268 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4269 if (gimple_phi_num_args (def_stmt) != 1
4270 || TREE_CODE (op1) != SSA_NAME)
4272 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4274 "unsupported phi node definition.\n");
4276 return NULL;
4279 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4280 and the latch definition op1. */
4281 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4282 if (gimple_bb (def1)
4283 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4284 && loop->inner
4285 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4286 && (is_gimple_assign (def1) || is_gimple_call (def1))
4287 && is_a <gphi *> (phi_use_stmt)
4288 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4289 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4290 loop_latch_edge (loop->inner))))
4292 if (dump_enabled_p ())
4293 report_vect_op (MSG_NOTE, def_stmt,
4294 "detected double reduction: ");
4296 *double_reduc = true;
4297 return def_stmt_info;
4300 return NULL;
4303 /* Look for the expression computing latch_def from the loop PHI result. */
4304 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4305 code_helper code;
4306 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4307 path))
4309 STMT_VINFO_REDUC_CODE (phi_info) = code;
4310 if (code == COND_EXPR && !nested_in_vect_loop)
4311 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4313 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4314 reduction chain for which the additional restriction is that
4315 all operations in the chain are the same. */
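/* For illustration, a source loop of the form

     for (int i = 0; i < n; i++)
       {
         s += a[2 * i];
         s += a[2 * i + 1];
       }

   produces two PLUS_EXPR statements feeding the same reduction PHI; the
   REDUC_GROUP_* links built below record them as a reduction chain for
   SLP analysis.  */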
4316 auto_vec<stmt_vec_info, 8> reduc_chain;
4317 unsigned i;
4318 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4319 for (i = path.length () - 1; i >= 1; --i)
4321 gimple *stmt = USE_STMT (path[i].second);
4322 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4323 gimple_match_op op;
4324 if (!gimple_extract_op (stmt, &op))
4325 gcc_unreachable ();
4326 if (gassign *assign = dyn_cast<gassign *> (stmt))
4327 STMT_VINFO_REDUC_IDX (stmt_info)
4328 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4329 else
4331 gcall *call = as_a<gcall *> (stmt);
4332 STMT_VINFO_REDUC_IDX (stmt_info)
4333 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4335 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4336 && (i == 1 || i == path.length () - 1));
4337 if ((op.code != code && !leading_conversion)
4338 /* We can only handle the final value in epilogue
4339 generation for reduction chains. */
4340 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4341 is_slp_reduc = false;
4342 /* For reduction chains we support trailing/leading
4343 conversions. We do not store those in the actual chain. */
4344 if (leading_conversion)
4345 continue;
4346 reduc_chain.safe_push (stmt_info);
4348 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4350 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4352 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4353 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4355 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4356 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4358 /* Save the chain for further analysis in SLP detection. */
4359 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4360 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4362 *reduc_chain_p = true;
4363 if (dump_enabled_p ())
4364 dump_printf_loc (MSG_NOTE, vect_location,
4365 "reduction: detected reduction chain\n");
4367 else if (dump_enabled_p ())
4368 dump_printf_loc (MSG_NOTE, vect_location,
4369 "reduction: detected reduction\n");
4371 return def_stmt_info;
4374 if (dump_enabled_p ())
4375 dump_printf_loc (MSG_NOTE, vect_location,
4376 "reduction: unknown pattern\n");
4378 return NULL;
4381 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4382 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4383 or -1 if not known. */
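/* Worked example with made-up numbers: with an assumed VF of 8, 100 known
   scalar iterations and 3 prologue iterations peeled, the epilogue gets
   (100 - 3) % 8 = 1 iteration; if peeling for gaps is required and that
   remainder had been 0, a full VF (8 iterations) is used instead.  */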
4385 static int
4386 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4388 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4389 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4391 if (dump_enabled_p ())
4392 dump_printf_loc (MSG_NOTE, vect_location,
4393 "cost model: epilogue peel iters set to vf/2 "
4394 "because loop iterations are unknown .\n");
4395 return assumed_vf / 2;
4397 else
4399 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4400 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4401 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4402 /* If we need to peel for gaps but no epilogue peeling would otherwise
4403 be required, we have to peel VF iterations. */
4404 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4405 peel_iters_epilogue = assumed_vf;
4406 return peel_iters_epilogue;
4410 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4411 int
4412 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4413 int *peel_iters_epilogue,
4414 stmt_vector_for_cost *scalar_cost_vec,
4415 stmt_vector_for_cost *prologue_cost_vec,
4416 stmt_vector_for_cost *epilogue_cost_vec)
4418 int retval = 0;
4420 *peel_iters_epilogue
4421 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4423 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4425 /* If peeled iterations are known but the number of scalar loop
4426 iterations is unknown, count a taken branch per peeled loop. */
4427 if (peel_iters_prologue > 0)
4428 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4429 vect_prologue);
4430 if (*peel_iters_epilogue > 0)
4431 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4432 vect_epilogue);
4435 stmt_info_for_cost *si;
4436 int j;
4437 if (peel_iters_prologue)
4438 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4439 retval += record_stmt_cost (prologue_cost_vec,
4440 si->count * peel_iters_prologue,
4441 si->kind, si->stmt_info, si->misalign,
4442 vect_prologue);
4443 if (*peel_iters_epilogue)
4444 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4445 retval += record_stmt_cost (epilogue_cost_vec,
4446 si->count * *peel_iters_epilogue,
4447 si->kind, si->stmt_info, si->misalign,
4448 vect_epilogue);
4450 return retval;
4453 /* Function vect_estimate_min_profitable_iters
4455 Return the number of iterations required for the vector version of the
4456 loop to be profitable relative to the cost of the scalar version of the
4457 loop.
4459 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4460 of iterations for vectorization. A value of -1 means loop vectorization
4461 is not profitable. This returned value may be used for a dynamic
4462 profitability check.
4464 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4465 for static check against estimated number of iterations. */
4467 static void
4468 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4469 int *ret_min_profitable_niters,
4470 int *ret_min_profitable_estimate,
4471 unsigned *suggested_unroll_factor)
4473 int min_profitable_iters;
4474 int min_profitable_estimate;
4475 int peel_iters_prologue;
4476 int peel_iters_epilogue;
4477 unsigned vec_inside_cost = 0;
4478 int vec_outside_cost = 0;
4479 unsigned vec_prologue_cost = 0;
4480 unsigned vec_epilogue_cost = 0;
4481 int scalar_single_iter_cost = 0;
4482 int scalar_outside_cost = 0;
4483 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4484 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4485 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4487 /* Cost model disabled. */
4488 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4490 if (dump_enabled_p ())
4491 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4492 *ret_min_profitable_niters = 0;
4493 *ret_min_profitable_estimate = 0;
4494 return;
4497 /* Requires loop versioning tests to handle misalignment. */
4498 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4500 /* FIXME: Make cost depend on complexity of individual check. */
4501 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4502 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4503 if (dump_enabled_p ())
4504 dump_printf (MSG_NOTE,
4505 "cost model: Adding cost of checks for loop "
4506 "versioning to treat misalignment.\n");
4509 /* Requires loop versioning with alias checks. */
4510 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4512 /* FIXME: Make cost depend on complexity of individual check. */
4513 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4514 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4515 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4516 if (len)
4517 /* Count LEN - 1 ANDs and LEN comparisons. */
4518 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4519 scalar_stmt, vect_prologue);
4520 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4521 if (len)
4523 /* Count LEN - 1 ANDs and LEN comparisons. */
4524 unsigned int nstmts = len * 2 - 1;
4525 /* +1 for each bias that needs adding. */
4526 for (unsigned int i = 0; i < len; ++i)
4527 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4528 nstmts += 1;
4529 (void) add_stmt_cost (target_cost_data, nstmts,
4530 scalar_stmt, vect_prologue);
4532 if (dump_enabled_p ())
4533 dump_printf (MSG_NOTE,
4534 "cost model: Adding cost of checks for loop "
4535 "versioning aliasing.\n");
4538 /* Requires loop versioning with niter checks. */
4539 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4541 /* FIXME: Make cost depend on complexity of individual check. */
4542 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4543 NULL, NULL, NULL_TREE, 0, vect_prologue);
4544 if (dump_enabled_p ())
4545 dump_printf (MSG_NOTE,
4546 "cost model: Adding cost of checks for loop "
4547 "versioning niters.\n");
4550 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4551 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4552 vect_prologue);
4554 /* Count statements in scalar loop. Using this as scalar cost for a single
4555 iteration for now.
4557 TODO: Add outer loop support.
4559 TODO: Consider assigning different costs to different scalar
4560 statements. */
4562 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4564 /* Add additional cost for the peeled instructions in prologue and epilogue
4565 loop. (For fully-masked loops there will be no peeling.)
4567 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4568 at compile time, we assume it's vf/2 (the worst case would be vf-1).
4570 TODO: Build an expression that represents peel_iters for prologue and
4571 epilogue to be used in a run-time test. */
4573 bool prologue_need_br_taken_cost = false;
4574 bool prologue_need_br_not_taken_cost = false;
4576 /* Calculate peel_iters_prologue. */
4577 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4578 peel_iters_prologue = 0;
4579 else if (npeel < 0)
4581 peel_iters_prologue = assumed_vf / 2;
4582 if (dump_enabled_p ())
4583 dump_printf (MSG_NOTE, "cost model: "
4584 "prologue peel iters set to vf/2.\n");
4586 /* If peeled iterations are unknown, count a taken branch and a not taken
4587 branch per peeled loop. Even if scalar loop iterations are known,
4588 vector iterations are not known since peeled prologue iterations are
4589 not known. Hence guards remain the same. */
4590 prologue_need_br_taken_cost = true;
4591 prologue_need_br_not_taken_cost = true;
4593 else
4595 peel_iters_prologue = npeel;
4596 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4597 /* If peeled iterations are known but the number of scalar loop
4598 iterations is unknown, count a taken branch per peeled loop. */
4599 prologue_need_br_taken_cost = true;
4602 bool epilogue_need_br_taken_cost = false;
4603 bool epilogue_need_br_not_taken_cost = false;
4605 /* Calculate peel_iters_epilogue. */
4606 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4607 /* We need to peel exactly one iteration for gaps. */
4608 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4609 else if (npeel < 0)
4611 /* If peeling for alignment is unknown, the loop bound of the main
4612 loop becomes unknown. */
4613 peel_iters_epilogue = assumed_vf / 2;
4614 if (dump_enabled_p ())
4615 dump_printf (MSG_NOTE, "cost model: "
4616 "epilogue peel iters set to vf/2 because "
4617 "peeling for alignment is unknown.\n");
4619 /* See the same reason above in peel_iters_prologue calculation. */
4620 epilogue_need_br_taken_cost = true;
4621 epilogue_need_br_not_taken_cost = true;
4623 else
4625 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4626 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4627 /* If peeled iterations are known but the number of scalar loop
4628 iterations is unknown, count a taken branch per peeled loop. */
4629 epilogue_need_br_taken_cost = true;
4632 stmt_info_for_cost *si;
4633 int j;
4634 /* Add costs associated with peel_iters_prologue. */
4635 if (peel_iters_prologue)
4636 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4638 (void) add_stmt_cost (target_cost_data,
4639 si->count * peel_iters_prologue, si->kind,
4640 si->stmt_info, si->node, si->vectype,
4641 si->misalign, vect_prologue);
4644 /* Add costs associated with peel_iters_epilogue. */
4645 if (peel_iters_epilogue)
4646 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4648 (void) add_stmt_cost (target_cost_data,
4649 si->count * peel_iters_epilogue, si->kind,
4650 si->stmt_info, si->node, si->vectype,
4651 si->misalign, vect_epilogue);
4654 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4656 if (prologue_need_br_taken_cost)
4657 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4658 vect_prologue);
4660 if (prologue_need_br_not_taken_cost)
4661 (void) add_stmt_cost (target_cost_data, 1,
4662 cond_branch_not_taken, vect_prologue);
4664 if (epilogue_need_br_taken_cost)
4665 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4666 vect_epilogue);
4668 if (epilogue_need_br_not_taken_cost)
4669 (void) add_stmt_cost (target_cost_data, 1,
4670 cond_branch_not_taken, vect_epilogue);
4672 /* Take care of special costs for rgroup controls of partial vectors. */
4673 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4674 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4675 == vect_partial_vectors_avx512))
4677 /* Calculate how many masks we need to generate. */
4678 unsigned int num_masks = 0;
4679 bool need_saturation = false;
4680 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4681 if (rgm.type)
4683 unsigned nvectors = rgm.factor;
4684 num_masks += nvectors;
4685 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4686 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4687 need_saturation = true;
4690 /* ??? The target isn't able to identify the costs below as
4691 producing masks so it cannot penalize cases where we'd run
4692 out of mask registers for example. */
4694 /* ??? We are also failing to account for smaller vector masks
4695 we generate by splitting larger masks in vect_get_loop_mask. */
4697 /* In the worst case, we need to generate each mask in the prologue
4698 and in the loop body. We need one splat per group and one
4699 compare per mask.
4701 Sometimes the prologue mask will fold to a constant,
4702 so the actual prologue cost might be smaller. However, it's
4703 simpler and safer to use the worst-case cost; if this ends up
4704 being the tie-breaker between vectorizing or not, then it's
4705 probably better not to vectorize. */
4706 (void) add_stmt_cost (target_cost_data,
4707 num_masks
4708 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4709 vector_stmt, NULL, NULL, NULL_TREE, 0,
4710 vect_prologue);
4711 (void) add_stmt_cost (target_cost_data,
4712 num_masks
4713 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4714 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4716 /* When we need saturation we need it both in the prologue and
4717 in the loop body. */
4718 if (need_saturation)
4720 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4721 NULL, NULL, NULL_TREE, 0, vect_prologue);
4722 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4723 NULL, NULL, NULL_TREE, 0, vect_body);
4726 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4727 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4728 == vect_partial_vectors_while_ult))
4730 /* Calculate how many masks we need to generate. */
4731 unsigned int num_masks = 0;
4732 rgroup_controls *rgm;
4733 unsigned int num_vectors_m1;
4734 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4735 num_vectors_m1, rgm)
4736 if (rgm->type)
4737 num_masks += num_vectors_m1 + 1;
4738 gcc_assert (num_masks > 0);
4740 /* In the worst case, we need to generate each mask in the prologue
4741 and in the loop body. One of the loop body mask instructions
4742 replaces the comparison in the scalar loop, and since we don't
4743 count the scalar comparison against the scalar body, we shouldn't
4744 count that vector instruction against the vector body either.
4746 Sometimes we can use unpacks instead of generating prologue
4747 masks and sometimes the prologue mask will fold to a constant,
4748 so the actual prologue cost might be smaller. However, it's
4749 simpler and safer to use the worst-case cost; if this ends up
4750 being the tie-breaker between vectorizing or not, then it's
4751 probably better not to vectorize. */
4752 (void) add_stmt_cost (target_cost_data, num_masks,
4753 vector_stmt, NULL, NULL, NULL_TREE, 0,
4754 vect_prologue);
4755 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4756 vector_stmt, NULL, NULL, NULL_TREE, 0,
4757 vect_body);
4759 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4761 /* Referring to the functions vect_set_loop_condition_partial_vectors
4762 and vect_set_loop_controls_directly, we need to generate each
4763 length in the prologue and in the loop body if required. Although
4764 there are some possible optimizations, we consider the worst case
4765 here. */
4767 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4768 signed char partial_load_store_bias
4769 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4770 bool need_iterate_p
4771 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4772 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4774 /* Calculate how many statements need to be added. */
4775 unsigned int prologue_stmts = 0;
4776 unsigned int body_stmts = 0;
4778 rgroup_controls *rgc;
4779 unsigned int num_vectors_m1;
4780 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4781 if (rgc->type)
4783 /* May need one SHIFT for nitems_total computation. */
4784 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4785 if (nitems != 1 && !niters_known_p)
4786 prologue_stmts += 1;
4788 /* May need one MAX and one MINUS for wrap around. */
4789 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4790 prologue_stmts += 2;
4792 /* Need one MAX and one MINUS for each batch limit except for
4793 the first one. */
4794 prologue_stmts += num_vectors_m1 * 2;
4796 unsigned int num_vectors = num_vectors_m1 + 1;
4798 /* Need to set up lengths in prologue, only one MIN required
4799 for each since start index is zero. */
4800 prologue_stmts += num_vectors;
4802 /* If we have a non-zero partial load bias, we need one PLUS
4803 to adjust the load length. */
4804 if (partial_load_store_bias != 0)
4805 body_stmts += 1;
4807 /* Each may need two MINs and one MINUS to update lengths in body
4808 for next iteration. */
4809 if (need_iterate_p)
4810 body_stmts += 3 * num_vectors;
4813 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4814 scalar_stmt, vect_prologue);
4815 (void) add_stmt_cost (target_cost_data, body_stmts,
4816 scalar_stmt, vect_body);
4819 /* FORNOW: The scalar outside cost is incremented in one of the
4820 following ways:
4822 1. The vectorizer checks for alignment and aliasing and generates
4823 a condition that allows dynamic vectorization. A cost model
4824 check is ANDed with the versioning condition. Hence scalar code
4825 path now has the added cost of the versioning check.
4827 if (cost > th & versioning_check)
4828 jmp to vector code
4830 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4832 2. The vectorizer then checks if a prologue is required. If the
4833 cost model check was not done before during versioning, it has to
4834 be done before the prologue check.
4836 if (cost <= th)
4837 prologue = scalar_iters
4838 if (prologue == 0)
4839 jmp to vector code
4840 else
4841 execute prologue
4842 if (prologue == num_iters)
4843 go to exit
4845 Hence the run-time scalar cost is incremented by a taken branch,
4846 plus a not-taken branch, plus a taken branch cost.
4848 3. The vectorizer then checks if an epilogue is required. If the
4849 cost model check was not done before during prologue check, it
4850 has to be done with the epilogue check.
4852 if (prologue == 0)
4853 jmp to vector code
4854 else
4855 execute prologue
4856 if (prologue == num_iters)
4857 go to exit
4858 vector code:
4859 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4860 jmp to epilogue
4862 Hence the run-time scalar cost should be incremented by 2 taken
4863 branches.
4865 TODO: The back end may reorder the BBs differently and reverse
4866 conditions/branch directions. Change the estimates below to
4867 something more reasonable. */
4869 /* If the number of iterations is known and we do not do versioning, we can
4870 decide whether to vectorize at compile time. Hence the scalar version
4871 does not carry cost model guard costs. */
4872 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4873 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4875 /* Cost model check occurs at versioning. */
4876 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4877 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4878 else
4880 /* Cost model check occurs at prologue generation. */
4881 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4882 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4883 + vect_get_stmt_cost (cond_branch_not_taken);
4884 /* Cost model check occurs at epilogue generation. */
4885 else
4886 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4890 /* Complete the target-specific cost calculations. */
4891 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4892 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4893 suggested_unroll_factor);
4895 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4896 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4897 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4898 *suggested_unroll_factor,
4899 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4901 if (dump_enabled_p ())
4902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4903 "can't unroll as unrolled vectorization factor larger"
4904 " than maximum vectorization factor: "
4905 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4906 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4907 *suggested_unroll_factor = 1;
4910 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4912 if (dump_enabled_p ())
4914 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4915 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4916 vec_inside_cost);
4917 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4918 vec_prologue_cost);
4919 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4920 vec_epilogue_cost);
4921 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4922 scalar_single_iter_cost);
4923 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4924 scalar_outside_cost);
4925 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4926 vec_outside_cost);
4927 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4928 peel_iters_prologue);
4929 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4930 peel_iters_epilogue);
4933 /* Calculate number of iterations required to make the vector version
4934 profitable, relative to the loop bodies only. The following condition
4935 must hold true:
4936 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4937 where
4938 SIC = scalar iteration cost, VIC = vector iteration cost,
4939 VOC = vector outside cost, VF = vectorization factor,
4940 NPEEL = prologue iterations + epilogue iterations,
4941 SOC = scalar outside cost for run time cost model check. */
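/* Worked example with illustrative costs only: for SIC = 4, VIC = 12,
   VF = 4, VOC = 20, NPEEL = 2 and SOC = 0, and treating the division as
   exact, the condition becomes 4 * niters > 3 * (niters - 2) + 20, i.e.
   niters > 14, so roughly 15 iterations are needed before the vector
   version wins.  */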
4943 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4944 - vec_inside_cost);
4945 if (saving_per_viter <= 0)
4947 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4948 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4949 "vectorization did not happen for a simd loop");
4951 if (dump_enabled_p ())
4952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4953 "cost model: the vector iteration cost = %d "
4954 "divided by the scalar iteration cost = %d "
4955 "is greater or equal to the vectorization factor = %d"
4956 ".\n",
4957 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4958 *ret_min_profitable_niters = -1;
4959 *ret_min_profitable_estimate = -1;
4960 return;
4963 /* ??? The "if" arm is written to handle all cases; see below for what
4964 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4965 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4967 /* Rewriting the condition above in terms of the number of
4968 vector iterations (vniters) rather than the number of
4969 scalar iterations (niters) gives:
4971 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4973 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4975 For integer N, X and Y when X > 0:
4977 N * X > Y <==> N >= (Y /[floor] X) + 1. */
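/* E.g. for Y = 10 and X = 4: N * 4 > 10 first holds at N = 10/4 + 1 = 3.  */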
4978 int outside_overhead = (vec_outside_cost
4979 - scalar_single_iter_cost * peel_iters_prologue
4980 - scalar_single_iter_cost * peel_iters_epilogue
4981 - scalar_outside_cost);
4982 /* We're only interested in cases that require at least one
4983 vector iteration. */
4984 int min_vec_niters = 1;
4985 if (outside_overhead > 0)
4986 min_vec_niters = outside_overhead / saving_per_viter + 1;
4988 if (dump_enabled_p ())
4989 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4990 min_vec_niters);
4992 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4994 /* Now that we know the minimum number of vector iterations,
4995 find the minimum niters for which the scalar cost is larger:
4997 SIC * niters > VIC * vniters + VOC - SOC
4999 We know that the minimum niters is no more than
5000 vniters * VF + NPEEL, but it might be (and often is) less
5001 than that if a partial vector iteration is cheaper than the
5002 equivalent scalar code. */
5003 int threshold = (vec_inside_cost * min_vec_niters
5004 + vec_outside_cost
5005 - scalar_outside_cost);
5006 if (threshold <= 0)
5007 min_profitable_iters = 1;
5008 else
5009 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5011 else
5012 /* Convert the number of vector iterations into a number of
5013 scalar iterations. */
5014 min_profitable_iters = (min_vec_niters * assumed_vf
5015 + peel_iters_prologue
5016 + peel_iters_epilogue);
5018 else
5020 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5021 * assumed_vf
5022 - vec_inside_cost * peel_iters_prologue
5023 - vec_inside_cost * peel_iters_epilogue);
5024 if (min_profitable_iters <= 0)
5025 min_profitable_iters = 0;
5026 else
5028 min_profitable_iters /= saving_per_viter;
5030 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5031 <= (((int) vec_inside_cost * min_profitable_iters)
5032 + (((int) vec_outside_cost - scalar_outside_cost)
5033 * assumed_vf)))
5034 min_profitable_iters++;
5038 if (dump_enabled_p ())
5039 dump_printf (MSG_NOTE,
5040 " Calculated minimum iters for profitability: %d\n",
5041 min_profitable_iters);
5043 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5044 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5045 /* We want the vectorized loop to execute at least once. */
5046 min_profitable_iters = assumed_vf + peel_iters_prologue;
5047 else if (min_profitable_iters < peel_iters_prologue)
5048 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5049 vectorized loop executes at least once. */
5050 min_profitable_iters = peel_iters_prologue;
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_NOTE, vect_location,
5054 " Runtime profitability threshold = %d\n",
5055 min_profitable_iters);
5057 *ret_min_profitable_niters = min_profitable_iters;
5059 /* Calculate number of iterations required to make the vector version
5060 profitable, relative to the loop bodies only.
5062 The non-vectorized variant costs SIC * niters and it must win over the vector
5063 variant on the expected loop trip count. The following condition must hold true:
5064 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5066 if (vec_outside_cost <= 0)
5067 min_profitable_estimate = 0;
5068 /* ??? This "else if" arm is written to handle all cases; see below for
5069 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5070 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5072 /* This is a repeat of the code above, but with + SOC rather
5073 than - SOC. */
5074 int outside_overhead = (vec_outside_cost
5075 - scalar_single_iter_cost * peel_iters_prologue
5076 - scalar_single_iter_cost * peel_iters_epilogue
5077 + scalar_outside_cost);
5078 int min_vec_niters = 1;
5079 if (outside_overhead > 0)
5080 min_vec_niters = outside_overhead / saving_per_viter + 1;
5082 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5084 int threshold = (vec_inside_cost * min_vec_niters
5085 + vec_outside_cost
5086 + scalar_outside_cost);
5087 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5089 else
5090 min_profitable_estimate = (min_vec_niters * assumed_vf
5091 + peel_iters_prologue
5092 + peel_iters_epilogue);
5094 else
5096 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5097 * assumed_vf
5098 - vec_inside_cost * peel_iters_prologue
5099 - vec_inside_cost * peel_iters_epilogue)
5100 / ((scalar_single_iter_cost * assumed_vf)
5101 - vec_inside_cost);
5103 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5104 if (dump_enabled_p ())
5105 dump_printf_loc (MSG_NOTE, vect_location,
5106 " Static estimate profitability threshold = %d\n",
5107 min_profitable_estimate);
5109 *ret_min_profitable_estimate = min_profitable_estimate;
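   /* As a purely illustrative example (the costs below are made up and do
      not come from any particular target): assume a loop using partial
      vectors, no peeling, SIC = 4, VIC = 6, VOC = 14, SOC = 2 and a
      minimum of two vector iterations.  The computations above then give

	runtime threshold = (VIC * 2 + VOC - SOC) / SIC + 1 = 24 / 4 + 1 = 7
	static estimate   = (VIC * 2 + VOC + SOC) / SIC + 1 = 28 / 4 + 1 = 8

      so the two results differ only in the sign applied to SOC, matching
      the two conditions quoted in the comments above.  */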
5112 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5113 vector elements (not bits) for a vector with NELT elements. */
5114 static void
5115 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5116 vec_perm_builder *sel)
5118 /* The encoding is a single stepped pattern. Any wrap-around is handled
5119 by vec_perm_indices. */
5120 sel->new_vector (nelt, 1, 3);
5121 for (unsigned int i = 0; i < 3; i++)
5122 sel->quick_push (i + offset);
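/* For example, OFFSET == 2 and NELT == 8 encode the stepped series
   { 2, 3, 4, ... }, which vec_perm_indices expands to the selector
   { 2, 3, 4, 5, 6, 7, 8, 9 }: elements 2..7 of the first input followed
   by elements 0..1 of the second.  With a zero second input this is a
   whole-vector shift down by two elements.  */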
5125 /* Checks whether the target supports whole-vector shifts for vectors of mode
5126 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5127 it supports vec_perm_const with masks for all necessary shift amounts. */
5128 static bool
5129 have_whole_vector_shift (machine_mode mode)
5131 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5132 return true;
5134 /* Variable-length vectors should be handled via the optab. */
5135 unsigned int nelt;
5136 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5137 return false;
5139 vec_perm_builder sel;
5140 vec_perm_indices indices;
5141 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5143 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5144 indices.new_vector (sel, 2, nelt);
5145 if (!can_vec_perm_const_p (mode, mode, indices, false))
5146 return false;
5148 return true;
5151 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5152 multiplication operands have differing signs and (b) we intend
5153 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5154 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5156 static bool
5157 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5158 stmt_vec_info stmt_info)
5160 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5161 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5162 return false;
5164 tree rhs1 = gimple_assign_rhs1 (assign);
5165 tree rhs2 = gimple_assign_rhs2 (assign);
5166 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5167 return false;
5169 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5170 gcc_assert (reduc_info->is_reduc_info);
5171 return !directly_supported_p (DOT_PROD_EXPR,
5172 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5173 optab_vector_mixed_sign);
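/* As an illustration: for a reduction like

     int sum = 0;
     for (int i = 0; i < n; ++i)
       sum += (signed char) a[i] * (unsigned char) b[i];

   the two multiplication operands have different signs, so this returns
   true whenever the target has no mixed-sign dot-product support
   (optab_vector_mixed_sign) for the input vector type; the emulation then
   costs extra prologue invariants in vect_model_reduction_cost below.  */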
5176 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5177 functions. Design better to avoid maintenance issues. */
5179 /* Function vect_model_reduction_cost.
5181 Models cost for a reduction operation, including the vector ops
5182 generated within the strip-mine loop in some cases, the initial
5183 definition before the loop, and the epilogue code that must be generated. */
5185 static void
5186 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5187 stmt_vec_info stmt_info, internal_fn reduc_fn,
5188 vect_reduction_type reduction_type,
5189 int ncopies, stmt_vector_for_cost *cost_vec)
5191 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5192 tree vectype;
5193 machine_mode mode;
5194 class loop *loop = NULL;
5196 if (loop_vinfo)
5197 loop = LOOP_VINFO_LOOP (loop_vinfo);
5199 /* Condition reductions generate two reductions in the loop. */
5200 if (reduction_type == COND_REDUCTION)
5201 ncopies *= 2;
5203 vectype = STMT_VINFO_VECTYPE (stmt_info);
5204 mode = TYPE_MODE (vectype);
5205 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5207 gimple_match_op op;
5208 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5209 gcc_unreachable ();
5211 bool emulated_mixed_dot_prod
5212 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5213 if (reduction_type == EXTRACT_LAST_REDUCTION)
5214 /* No extra instructions are needed in the prologue. The loop body
5215 operations are costed in vectorizable_condition. */
5216 inside_cost = 0;
5217 else if (reduction_type == FOLD_LEFT_REDUCTION)
5219 /* No extra instructions needed in the prologue. */
5220 prologue_cost = 0;
5222 if (reduc_fn != IFN_LAST)
5223 /* Count one reduction-like operation per vector. */
5224 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5225 stmt_info, 0, vect_body);
5226 else
5228 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5229 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5230 inside_cost = record_stmt_cost (cost_vec, nelements,
5231 vec_to_scalar, stmt_info, 0,
5232 vect_body);
5233 inside_cost += record_stmt_cost (cost_vec, nelements,
5234 scalar_stmt, stmt_info, 0,
5235 vect_body);
5238 else
5240 /* Add in the cost of the initial definitions. */
5241 int prologue_stmts;
5242 if (reduction_type == COND_REDUCTION)
5243 /* For cond reductions we have four vectors: initial index, step,
5244 initial result of the data reduction, initial value of the index
5245 reduction. */
5246 prologue_stmts = 4;
5247 else if (emulated_mixed_dot_prod)
5248 /* We need the initial reduction value and two invariants:
5249 one that contains the minimum signed value and one that
5250 contains half of its negative. */
5251 prologue_stmts = 3;
5252 else
5253 prologue_stmts = 1;
5254 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5255 scalar_to_vec, stmt_info, 0,
5256 vect_prologue);
5259 /* Determine cost of epilogue code.
5261 We have a reduction operator that will reduce the vector in one statement.
5262 Also requires scalar extract. */
5264 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5266 if (reduc_fn != IFN_LAST)
5268 if (reduction_type == COND_REDUCTION)
5270	          /* An EQ stmt and a COND_EXPR stmt.  */
5271 epilogue_cost += record_stmt_cost (cost_vec, 2,
5272 vector_stmt, stmt_info, 0,
5273 vect_epilogue);
5274 /* Reduction of the max index and a reduction of the found
5275 values. */
5276 epilogue_cost += record_stmt_cost (cost_vec, 2,
5277 vec_to_scalar, stmt_info, 0,
5278 vect_epilogue);
5279 /* A broadcast of the max value. */
5280 epilogue_cost += record_stmt_cost (cost_vec, 1,
5281 scalar_to_vec, stmt_info, 0,
5282 vect_epilogue);
5284 else
5286 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5287 stmt_info, 0, vect_epilogue);
5288 epilogue_cost += record_stmt_cost (cost_vec, 1,
5289 vec_to_scalar, stmt_info, 0,
5290 vect_epilogue);
5293 else if (reduction_type == COND_REDUCTION)
5295 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5296 /* Extraction of scalar elements. */
5297 epilogue_cost += record_stmt_cost (cost_vec,
5298 2 * estimated_nunits,
5299 vec_to_scalar, stmt_info, 0,
5300 vect_epilogue);
5301 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5302 epilogue_cost += record_stmt_cost (cost_vec,
5303 2 * estimated_nunits - 3,
5304 scalar_stmt, stmt_info, 0,
5305 vect_epilogue);
5307 else if (reduction_type == EXTRACT_LAST_REDUCTION
5308 || reduction_type == FOLD_LEFT_REDUCTION)
5309	    /* No extra instructions are needed in the epilogue.  */
5311 else
5313 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5314 tree bitsize = TYPE_SIZE (op.type);
5315 int element_bitsize = tree_to_uhwi (bitsize);
5316 int nelements = vec_size_in_bits / element_bitsize;
5318 if (op.code == COND_EXPR)
5319 op.code = MAX_EXPR;
5321 /* We have a whole vector shift available. */
5322 if (VECTOR_MODE_P (mode)
5323 && directly_supported_p (op.code, vectype)
5324 && have_whole_vector_shift (mode))
5326 /* Final reduction via vector shifts and the reduction operator.
5327 Also requires scalar extract. */
5328 epilogue_cost += record_stmt_cost (cost_vec,
5329 exact_log2 (nelements) * 2,
5330 vector_stmt, stmt_info, 0,
5331 vect_epilogue);
5332 epilogue_cost += record_stmt_cost (cost_vec, 1,
5333 vec_to_scalar, stmt_info, 0,
5334 vect_epilogue);
5336 else
5337 /* Use extracts and reduction op for final reduction. For N
5338 elements, we have N extracts and N-1 reduction ops. */
5339 epilogue_cost += record_stmt_cost (cost_vec,
5340 nelements + nelements - 1,
5341 vector_stmt, stmt_info, 0,
5342 vect_epilogue);
5346 if (dump_enabled_p ())
5347 dump_printf (MSG_NOTE,
5348 "vect_model_reduction_cost: inside_cost = %d, "
5349 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5350 prologue_cost, epilogue_cost);
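/* As a purely illustrative count (the weights come from the target's cost
   hooks): a COND_REDUCTION handled via reduc_fn records a prologue of
   4 scalar_to_vec statements (initial index, step, initial data value and
   initial index value) and an epilogue of 2 vector_stmt, 2 vec_to_scalar
   and 1 scalar_to_vec, matching the EQ/COND_EXPR pair, the two reductions
   and the broadcast described in the branches above.  */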
5353 /* SEQ is a sequence of instructions that initialize the reduction
5354 described by REDUC_INFO. Emit them in the appropriate place. */
5356 static void
5357 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5358 stmt_vec_info reduc_info, gimple *seq)
5360 if (reduc_info->reused_accumulator)
5362 /* When reusing an accumulator from the main loop, we only need
5363 initialization instructions if the main loop can be skipped.
5364 In that case, emit the initialization instructions at the end
5365 of the guard block that does the skip. */
5366 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5367 gcc_assert (skip_edge);
5368 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5369 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5371 else
5373 /* The normal case: emit the initialization instructions on the
5374 preheader edge. */
5375 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5376 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5380 /* Function get_initial_def_for_reduction
5382 Input:
5383 REDUC_INFO - the info_for_reduction
5384 INIT_VAL - the initial value of the reduction variable
5385 NEUTRAL_OP - a value that has no effect on the reduction, as per
5386 neutral_op_for_reduction
5388 Output:
5389 Return a vector variable, initialized according to the operation that
5390 STMT_VINFO performs. This vector will be used as the initial value
5391 of the vector of partial results.
5393 The value we need is a vector in which element 0 has value INIT_VAL
5394 and every other element has value NEUTRAL_OP. */
5396 static tree
5397 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5398 stmt_vec_info reduc_info,
5399 tree init_val, tree neutral_op)
5401 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5402 tree scalar_type = TREE_TYPE (init_val);
5403 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5404 tree init_def;
5405 gimple_seq stmts = NULL;
5407 gcc_assert (vectype);
5409 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5410 || SCALAR_FLOAT_TYPE_P (scalar_type));
5412 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5413 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5415 if (operand_equal_p (init_val, neutral_op))
5417 /* If both elements are equal then the vector described above is
5418 just a splat. */
5419 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5420 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5422 else
5424 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5425 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5426 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5428 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5429 element 0. */
5430 init_def = gimple_build_vector_from_val (&stmts, vectype,
5431 neutral_op);
5432 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5433 vectype, init_def, init_val);
5435 else
5437 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5438 tree_vector_builder elts (vectype, 1, 2);
5439 elts.quick_push (init_val);
5440 elts.quick_push (neutral_op);
5441 init_def = gimple_build_vector (&stmts, &elts);
5445 if (stmts)
5446 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5447 return init_def;
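/* For example, a PLUS reduction of int with INIT_VAL s0 and NEUTRAL_OP 0
   on V4SI yields { s0, 0, 0, 0 }, whereas a MIN or MAX reduction has no
   separate neutral value (NEUTRAL_OP is INIT_VAL itself) and therefore
   yields the splat { s0, s0, s0, s0 }.  For variable-length vectors the
   same layout is obtained by splatting NEUTRAL_OP and shifting INIT_VAL
   into element 0 with VEC_SHL_INSERT, as above.  */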
5450 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5451 which performs a reduction involving GROUP_SIZE scalar statements.
5452 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5453 is nonnull, introducing extra elements of that value will not change the
5454 result. */
5456 static void
5457 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5458 stmt_vec_info reduc_info,
5459 vec<tree> *vec_oprnds,
5460 unsigned int number_of_vectors,
5461 unsigned int group_size, tree neutral_op)
5463 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5464 unsigned HOST_WIDE_INT nunits;
5465 unsigned j, number_of_places_left_in_vector;
5466 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5467 unsigned int i;
5469 gcc_assert (group_size == initial_values.length () || neutral_op);
5471 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5472 created vectors. It is greater than 1 if unrolling is performed.
5474 For example, we have two scalar operands, s1 and s2 (e.g., group of
5475 strided accesses of size two), while NUNITS is four (i.e., four scalars
5476 of this type can be packed in a vector). The output vector will contain
5477 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5478 will be 2).
5480 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5481 vectors containing the operands.
5483 For example, NUNITS is four as before, and the group size is 8
5484 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5485 {s5, s6, s7, s8}. */
5487 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5488 nunits = group_size;
5490 number_of_places_left_in_vector = nunits;
5491 bool constant_p = true;
5492 tree_vector_builder elts (vector_type, nunits, 1);
5493 elts.quick_grow (nunits);
5494 gimple_seq ctor_seq = NULL;
5495 for (j = 0; j < nunits * number_of_vectors; ++j)
5497 tree op;
5498 i = j % group_size;
5500	      /* Get the def before the loop.  In a reduction chain we have only
5501	         one initial value.  Otherwise we have as many initial values as PHIs in the group.  */
5502 if (i >= initial_values.length () || (j > i && neutral_op))
5503 op = neutral_op;
5504 else
5505 op = initial_values[i];
5507 /* Create 'vect_ = {op0,op1,...,opn}'. */
5508 number_of_places_left_in_vector--;
5509 elts[nunits - number_of_places_left_in_vector - 1] = op;
5510 if (!CONSTANT_CLASS_P (op))
5511 constant_p = false;
5513 if (number_of_places_left_in_vector == 0)
5515 tree init;
5516 if (constant_p && !neutral_op
5517 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5518 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5519 /* Build the vector directly from ELTS. */
5520 init = gimple_build_vector (&ctor_seq, &elts);
5521 else if (neutral_op)
5523 /* Build a vector of the neutral value and shift the
5524 other elements into place. */
5525 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5526 neutral_op);
5527 int k = nunits;
5528 while (k > 0 && elts[k - 1] == neutral_op)
5529 k -= 1;
5530 while (k > 0)
5532 k -= 1;
5533 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5534 vector_type, init, elts[k]);
5537 else
5539 /* First time round, duplicate ELTS to fill the
5540 required number of vectors. */
5541 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5542 elts, number_of_vectors, *vec_oprnds);
5543 break;
5545 vec_oprnds->quick_push (init);
5547 number_of_places_left_in_vector = nunits;
5548 elts.new_vector (vector_type, nunits, 1);
5549 elts.quick_grow (nunits);
5550 constant_p = true;
5553 if (ctor_seq != NULL)
5554 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
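/* As an illustration of the variable-length path above: for an SLP
   reduction with GROUP_SIZE == 2, initial values { a, b } and neutral
   value 0, NUNITS is taken to be the group size, so ELTS is { a, b }.
   The code splats 0 and then applies two VEC_SHL_INSERTs, first with b
   and then with a, producing { a, b, 0, 0, ... }; the trailing zeros are
   harmless because extra NEUTRAL_OP elements do not change the result.  */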
5557 /* For a statement STMT_INFO taking part in a reduction operation return
5558 the stmt_vec_info the meta information is stored on. */
5560 stmt_vec_info
5561 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5563 stmt_info = vect_orig_stmt (stmt_info);
5564 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5565 if (!is_a <gphi *> (stmt_info->stmt)
5566 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5567 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5568 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5569 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5571 if (gimple_phi_num_args (phi) == 1)
5572 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5574 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5576 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5577 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5578 stmt_info = info;
5580 return stmt_info;
5583 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5584 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5585 return false. */
5587 static bool
5588 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5589 stmt_vec_info reduc_info)
5591 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5592 if (!main_loop_vinfo)
5593 return false;
5595 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5596 return false;
5598 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5599 auto_vec<tree, 16> main_loop_results (num_phis);
5600 auto_vec<tree, 16> initial_values (num_phis);
5601 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5603 /* The epilogue loop can be entered either from the main loop or
5604 from an earlier guard block. */
5605 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5606 for (tree incoming_value : reduc_info->reduc_initial_values)
5608 /* Look for:
5610 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5611 INITIAL_VALUE(guard block)>. */
5612 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5614 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5615 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5617 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5618 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5620 main_loop_results.quick_push (from_main_loop);
5621 initial_values.quick_push (from_skip);
5624 else
5625 /* The main loop dominates the epilogue loop. */
5626 main_loop_results.splice (reduc_info->reduc_initial_values);
5628 /* See if the main loop has the kind of accumulator we need. */
5629 vect_reusable_accumulator *accumulator
5630 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5631 if (!accumulator
5632 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5633 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5634 accumulator->reduc_info->reduc_scalar_results.begin ()))
5635 return false;
5637 /* Handle the case where we can reduce wider vectors to narrower ones. */
5638 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5639 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5640 unsigned HOST_WIDE_INT m;
5641 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5642 TYPE_VECTOR_SUBPARTS (vectype), &m))
5643 return false;
5644	  /* Check that the intermediate vector types and operations are available.  */
5645 tree prev_vectype = old_vectype;
5646 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5647 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5649 intermediate_nunits = exact_div (intermediate_nunits, 2);
5650 tree intermediate_vectype = get_related_vectype_for_scalar_type
5651 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5652 if (!intermediate_vectype
5653 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5654 intermediate_vectype)
5655 || !can_vec_extract (TYPE_MODE (prev_vectype),
5656 TYPE_MODE (intermediate_vectype)))
5657 return false;
5658 prev_vectype = intermediate_vectype;
5661 /* Non-SLP reductions might apply an adjustment after the reduction
5662 operation, in order to simplify the initialization of the accumulator.
5663 If the epilogue loop carries on from where the main loop left off,
5664 it should apply the same adjustment to the final reduction result.
5666 If the epilogue loop can also be entered directly (rather than via
5667 the main loop), we need to be able to handle that case in the same way,
5668 with the same adjustment. (In principle we could add a PHI node
5669 to select the correct adjustment, but in practice that shouldn't be
5670 necessary.) */
5671 tree main_adjustment
5672 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5673 if (loop_vinfo->main_loop_edge && main_adjustment)
5675 gcc_assert (num_phis == 1);
5676 tree initial_value = initial_values[0];
5677 /* Check that we can use INITIAL_VALUE as the adjustment and
5678 initialize the accumulator with a neutral value instead. */
5679 if (!operand_equal_p (initial_value, main_adjustment))
5680 return false;
5681 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5682 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5683 code, initial_value);
5685 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5686 reduc_info->reduc_initial_values.truncate (0);
5687 reduc_info->reduc_initial_values.splice (initial_values);
5688 reduc_info->reused_accumulator = accumulator;
5689 return true;
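/* As an illustration: if the main loop accumulated into a V8SI vector and
   the epilogue loop uses V4SI, then M == 2 and the single intermediate
   step above checks that the reduction code is supported on V4SI and that
   a V4SI half can be extracted from a V8SI.  The wider accumulator can
   then be folded down to V4SI (see vect_create_partial_epilog below)
   before the epilogue loop continues the reduction.  */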
5692 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5693 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5695 static tree
5696 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5697 gimple_seq *seq)
5699 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5700 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5701 tree stype = TREE_TYPE (vectype);
5702 tree new_temp = vec_def;
5703 while (nunits > nunits1)
5705 nunits /= 2;
5706 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5707 stype, nunits);
5708 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5710 /* The target has to make sure we support lowpart/highpart
5711 extraction, either via direct vector extract or through
5712 an integer mode punning. */
5713 tree dst1, dst2;
5714 gimple *epilog_stmt;
5715 if (convert_optab_handler (vec_extract_optab,
5716 TYPE_MODE (TREE_TYPE (new_temp)),
5717 TYPE_MODE (vectype1))
5718 != CODE_FOR_nothing)
5720 /* Extract sub-vectors directly once vec_extract becomes
5721 a conversion optab. */
5722 dst1 = make_ssa_name (vectype1);
5723 epilog_stmt
5724 = gimple_build_assign (dst1, BIT_FIELD_REF,
5725 build3 (BIT_FIELD_REF, vectype1,
5726 new_temp, TYPE_SIZE (vectype1),
5727 bitsize_int (0)));
5728 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5729 dst2 = make_ssa_name (vectype1);
5730 epilog_stmt
5731 = gimple_build_assign (dst2, BIT_FIELD_REF,
5732 build3 (BIT_FIELD_REF, vectype1,
5733 new_temp, TYPE_SIZE (vectype1),
5734 bitsize_int (bitsize)));
5735 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5737 else
5739 /* Extract via punning to appropriately sized integer mode
5740 vector. */
5741 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5742 tree etype = build_vector_type (eltype, 2);
5743 gcc_assert (convert_optab_handler (vec_extract_optab,
5744 TYPE_MODE (etype),
5745 TYPE_MODE (eltype))
5746 != CODE_FOR_nothing);
5747 tree tem = make_ssa_name (etype);
5748 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5749 build1 (VIEW_CONVERT_EXPR,
5750 etype, new_temp));
5751 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5752 new_temp = tem;
5753 tem = make_ssa_name (eltype);
5754 epilog_stmt
5755 = gimple_build_assign (tem, BIT_FIELD_REF,
5756 build3 (BIT_FIELD_REF, eltype,
5757 new_temp, TYPE_SIZE (eltype),
5758 bitsize_int (0)));
5759 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5760 dst1 = make_ssa_name (vectype1);
5761 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5762 build1 (VIEW_CONVERT_EXPR,
5763 vectype1, tem));
5764 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5765 tem = make_ssa_name (eltype);
5766 epilog_stmt
5767 = gimple_build_assign (tem, BIT_FIELD_REF,
5768 build3 (BIT_FIELD_REF, eltype,
5769 new_temp, TYPE_SIZE (eltype),
5770 bitsize_int (bitsize)));
5771 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5772 dst2 = make_ssa_name (vectype1);
5773 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5774 build1 (VIEW_CONVERT_EXPR,
5775 vectype1, tem));
5776 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5779 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5782 return new_temp;
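/* As an illustration: reducing a V8SI VEC_DEF to V4SI with a PLUS
   reduction emits roughly

     lo  = BIT_FIELD_REF <vec_def, 128, 0>;
     hi  = BIT_FIELD_REF <vec_def, 128, 128>;
     res = lo + hi;

   when vec_extract supports the (V8SI, V4SI) pair, and otherwise performs
   the same extraction after punning VEC_DEF to a two-element vector of
   128-bit integers.  The halving repeats until VECTYPE is reached.  */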
5785 /* Function vect_create_epilog_for_reduction
5787 Create code at the loop-epilog to finalize the result of a reduction
5788 computation.
5790 STMT_INFO is the scalar reduction stmt that is being vectorized.
5791 SLP_NODE is an SLP node containing a group of reduction statements. The
5792 first one in this group is STMT_INFO.
5793 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5794 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5795 (counting from 0)
5797 This function:
5798 1. Completes the reduction def-use cycles.
5799 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5800 by calling the function specified by REDUC_FN if available, or by
5801 other means (whole-vector shifts or a scalar loop).
5802 The function also creates a new phi node at the loop exit to preserve
5803 loop-closed form, as illustrated below.
5805 The flow at the entry to this function:
5807 loop:
5808 vec_def = phi <vec_init, null> # REDUCTION_PHI
5809 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5810 s_loop = scalar_stmt # (scalar) STMT_INFO
5811 loop_exit:
5812 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5813 use <s_out0>
5814 use <s_out0>
5816 The above is transformed by this function into:
5818 loop:
5819 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5820 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5821 s_loop = scalar_stmt # (scalar) STMT_INFO
5822 loop_exit:
5823 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5824 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5825 v_out2 = reduce <v_out1>
5826 s_out3 = extract_field <v_out2, 0>
5827 s_out4 = adjust_result <s_out3>
5828 use <s_out4>
5829 use <s_out4>
5832 static void
5833 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5834 stmt_vec_info stmt_info,
5835 slp_tree slp_node,
5836 slp_instance slp_node_instance)
5838 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5839 gcc_assert (reduc_info->is_reduc_info);
5840 /* For double reductions we need to get at the inner loop reduction
5841 stmt which has the meta info attached. Our stmt_info is that of the
5842 loop-closed PHI of the inner loop which we remember as
5843 def for the reduction PHI generation. */
5844 bool double_reduc = false;
5845 stmt_vec_info rdef_info = stmt_info;
5846 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5848 gcc_assert (!slp_node);
5849 double_reduc = true;
5850 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5851 (stmt_info->stmt, 0));
5852 stmt_info = vect_stmt_to_vectorize (stmt_info);
5854 gphi *reduc_def_stmt
5855 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5856 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5857 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5858 tree vectype;
5859 machine_mode mode;
5860 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5861 basic_block exit_bb;
5862 tree scalar_dest;
5863 tree scalar_type;
5864 gimple *new_phi = NULL, *phi = NULL;
5865 gimple_stmt_iterator exit_gsi;
5866 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5867 gimple *epilog_stmt = NULL;
5868 gimple *exit_phi;
5869 tree bitsize;
5870 tree def;
5871 tree orig_name, scalar_result;
5872 imm_use_iterator imm_iter, phi_imm_iter;
5873 use_operand_p use_p, phi_use_p;
5874 gimple *use_stmt;
5875 auto_vec<tree> reduc_inputs;
5876 int j, i;
5877 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5878 unsigned int group_size = 1, k;
5879 auto_vec<gimple *> phis;
5880 /* SLP reduction without reduction chain, e.g.,
5881 # a1 = phi <a2, a0>
5882 # b1 = phi <b2, b0>
5883 a2 = operation (a1)
5884 b2 = operation (b1) */
5885 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5886 bool direct_slp_reduc;
5887 tree induction_index = NULL_TREE;
5889 if (slp_node)
5890 group_size = SLP_TREE_LANES (slp_node);
5892 if (nested_in_vect_loop_p (loop, stmt_info))
5894 outer_loop = loop;
5895 loop = loop->inner;
5896 gcc_assert (!slp_node && double_reduc);
5899 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5900 gcc_assert (vectype);
5901 mode = TYPE_MODE (vectype);
5903 tree induc_val = NULL_TREE;
5904 tree adjustment_def = NULL;
5905 if (slp_node)
5907 else
5909 /* Optimize: for induction condition reduction, if we can't use zero
5910 for induc_val, use initial_def. */
5911 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5912 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5913 else if (double_reduc)
5915 else
5916 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5919 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5920 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5921 if (slp_reduc)
5922 /* All statements produce live-out values. */
5923 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5924 else if (slp_node)
5926 /* The last statement in the reduction chain produces the live-out
5927 value. Note SLP optimization can shuffle scalar stmts to
5928 optimize permutations so we have to search for the last stmt. */
5929 for (k = 0; k < group_size; ++k)
5930 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5932 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5933 break;
5937 unsigned vec_num;
5938 int ncopies;
5939 if (slp_node)
5941 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5942 ncopies = 1;
5944 else
5946 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5947 vec_num = 1;
5948 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5951 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5952 which is updated with the current index of the loop for every match of
5953 the original loop's cond_expr (VEC_STMT). This results in a vector
5954 containing the last time the condition passed for that vector lane.
5955 The first match will be a 1 to allow 0 to be used for non-matching
5956 indexes. If there are no matches at all then the vector will be all
5957 zeroes.
5959 PR92772: This algorithm is broken for architectures that support
5960 masked vectors, but do not provide fold_extract_last. */
5961 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5963 auto_vec<std::pair<tree, bool>, 2> ccompares;
5964 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5965 cond_info = vect_stmt_to_vectorize (cond_info);
5966 while (cond_info != reduc_info)
5968 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5970 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5971 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5972 ccompares.safe_push
5973 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5974 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5976 cond_info
5977 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5978 1 + STMT_VINFO_REDUC_IDX
5979 (cond_info)));
5980 cond_info = vect_stmt_to_vectorize (cond_info);
5982 gcc_assert (ccompares.length () != 0);
5984 tree indx_before_incr, indx_after_incr;
5985 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5986 int scalar_precision
5987 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5988 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5989 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5990 (TYPE_MODE (vectype), cr_index_scalar_type,
5991 TYPE_VECTOR_SUBPARTS (vectype));
5993 /* First we create a simple vector induction variable which starts
5994 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5995 vector size (STEP). */
5997 /* Create a {1,2,3,...} vector. */
5998 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6000 /* Create a vector of the step value. */
6001 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6002 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6004 /* Create an induction variable. */
6005 gimple_stmt_iterator incr_gsi;
6006 bool insert_after;
6007 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6008 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6009 insert_after, &indx_before_incr, &indx_after_incr);
6011 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6012 filled with zeros (VEC_ZERO). */
6014 /* Create a vector of 0s. */
6015 tree zero = build_zero_cst (cr_index_scalar_type);
6016 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6018 /* Create a vector phi node. */
6019 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6020 new_phi = create_phi_node (new_phi_tree, loop->header);
6021 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6022 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6024	      /* Now take the condition from the loop's original cond_exprs
6025	         and produce a new cond_expr (INDEX_COND_EXPR) which for
6026	         every match uses values from the induction variable
6027	         (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
6028 (NEW_PHI_TREE).
6029 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6030 the new cond_expr (INDEX_COND_EXPR). */
6031 gimple_seq stmts = NULL;
6032 for (int i = ccompares.length () - 1; i != -1; --i)
6034 tree ccompare = ccompares[i].first;
6035 if (ccompares[i].second)
6036 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6037 cr_index_vector_type,
6038 ccompare,
6039 indx_before_incr, new_phi_tree);
6040 else
6041 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6042 cr_index_vector_type,
6043 ccompare,
6044 new_phi_tree, indx_before_incr);
6046 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6048 /* Update the phi with the vec cond. */
6049 induction_index = new_phi_tree;
6050 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6051 loop_latch_edge (loop), UNKNOWN_LOCATION);
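      /* As an illustration: with V4SI and two vector iterations, the IV
	 above takes the values { 1, 2, 3, 4 } and then { 5, 6, 7, 8 }.
	 If only lane 1 matches, and only in the first iteration, the phi
	 ends up as { 0, 2, 0, 0 }: the value 0 is reserved for lanes that
	 never matched, and the maximum over the final vector identifies
	 the last iteration in which any lane's condition held.  */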
6054 /* 2. Create epilog code.
6055 The reduction epilog code operates across the elements of the vector
6056 of partial results computed by the vectorized loop.
6057 The reduction epilog code consists of:
6059 step 1: compute the scalar result in a vector (v_out2)
6060 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6061 step 3: adjust the scalar result (s_out3) if needed.
6063	     Step 1 can be accomplished using one of the following three schemes:
6064 (scheme 1) using reduc_fn, if available.
6065 (scheme 2) using whole-vector shifts, if available.
6066 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6067 combined.
6069 The overall epilog code looks like this:
6071 s_out0 = phi <s_loop> # original EXIT_PHI
6072 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6073 v_out2 = reduce <v_out1> # step 1
6074 s_out3 = extract_field <v_out2, 0> # step 2
6075 s_out4 = adjust_result <s_out3> # step 3
6077 (step 3 is optional, and steps 1 and 2 may be combined).
6078 Lastly, the uses of s_out0 are replaced by s_out4. */
6081 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6082 v_out1 = phi <VECT_DEF>
6083 Store them in NEW_PHIS. */
6084 if (double_reduc)
6085 loop = outer_loop;
6086 exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
6087 exit_gsi = gsi_after_labels (exit_bb);
6088 reduc_inputs.create (slp_node ? vec_num : ncopies);
6089 for (unsigned i = 0; i < vec_num; i++)
6091 gimple_seq stmts = NULL;
6092 if (slp_node)
6093 def = vect_get_slp_vect_def (slp_node, i);
6094 else
6095 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6096 for (j = 0; j < ncopies; j++)
6098 tree new_def = copy_ssa_name (def);
6099 phi = create_phi_node (new_def, exit_bb);
6100 if (j)
6101 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6102 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
6103 new_def = gimple_convert (&stmts, vectype, new_def);
6104 reduc_inputs.quick_push (new_def);
6106 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6109 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6110 (i.e. when reduc_fn is not available) and in the final adjustment
6111 code (if needed). Also get the original scalar reduction variable as
6112 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6113 represents a reduction pattern), the tree-code and scalar-def are
6114 taken from the original stmt that the pattern-stmt (STMT) replaces.
6115 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6116 are taken from STMT. */
6118 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6119 if (orig_stmt_info != stmt_info)
6121 /* Reduction pattern */
6122 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6123 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6126 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6127 scalar_type = TREE_TYPE (scalar_dest);
6128 scalar_results.truncate (0);
6129 scalar_results.reserve_exact (group_size);
6130 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6131 bitsize = TYPE_SIZE (scalar_type);
6133 /* True if we should implement SLP_REDUC using native reduction operations
6134 instead of scalar operations. */
6135 direct_slp_reduc = (reduc_fn != IFN_LAST
6136 && slp_reduc
6137 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6139 /* In case of reduction chain, e.g.,
6140 # a1 = phi <a3, a0>
6141 a2 = operation (a1)
6142 a3 = operation (a2),
6144 we may end up with more than one vector result. Here we reduce them
6145 to one vector.
6147	     The same is true for an SLP reduction, e.g.,
6148 # a1 = phi <a2, a0>
6149 # b1 = phi <b2, b0>
6150 a2 = operation (a1)
6151	       b2 = operation (b1),
6153 where we can end up with more than one vector as well. We can
6154 easily accumulate vectors when the number of vector elements is
6155 a multiple of the SLP group size.
6157 The same is true if we couldn't use a single defuse cycle. */
6158 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6159 || direct_slp_reduc
6160 || (slp_reduc
6161 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6162 || ncopies > 1)
6164 gimple_seq stmts = NULL;
6165 tree single_input = reduc_inputs[0];
6166 for (k = 1; k < reduc_inputs.length (); k++)
6167 single_input = gimple_build (&stmts, code, vectype,
6168 single_input, reduc_inputs[k]);
6169 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6171 reduc_inputs.truncate (0);
6172 reduc_inputs.safe_push (single_input);
6175 tree orig_reduc_input = reduc_inputs[0];
6177 /* If this loop is an epilogue loop that can be skipped after the
6178 main loop, we can only share a reduction operation between the
6179 main loop and the epilogue if we put it at the target of the
6180 skip edge.
6182 We can still reuse accumulators if this check fails. Doing so has
6183 the minor(?) benefit of making the epilogue loop's scalar result
6184 independent of the main loop's scalar result. */
6185 bool unify_with_main_loop_p = false;
6186 if (reduc_info->reused_accumulator
6187 && loop_vinfo->skip_this_loop_edge
6188 && single_succ_p (exit_bb)
6189 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6191 unify_with_main_loop_p = true;
6193 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6194 reduc_inputs[0] = make_ssa_name (vectype);
6195 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6196 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6197 UNKNOWN_LOCATION);
6198 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6199 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6200 exit_gsi = gsi_after_labels (reduc_block);
6203 /* Shouldn't be used beyond this point. */
6204 exit_bb = nullptr;
6206 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6207 && reduc_fn != IFN_LAST)
6209 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6210 various data values where the condition matched and another vector
6211 (INDUCTION_INDEX) containing all the indexes of those matches. We
6212 need to extract the last matching index (which will be the index with
6213 highest value) and use this to index into the data vector.
6214 For the case where there were no matches, the data vector will contain
6215 all default values and the index vector will be all zeros. */
6217 /* Get various versions of the type of the vector of indexes. */
6218 tree index_vec_type = TREE_TYPE (induction_index);
6219 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6220 tree index_scalar_type = TREE_TYPE (index_vec_type);
6221 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6223 /* Get an unsigned integer version of the type of the data vector. */
6224 int scalar_precision
6225 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6226 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6227 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6228 vectype);
6230 /* First we need to create a vector (ZERO_VEC) of zeros and another
6231 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6232 can create using a MAX reduction and then expanding.
6233 In the case where the loop never made any matches, the max index will
6234 be zero. */
6236 /* Vector of {0, 0, 0,...}. */
6237 tree zero_vec = build_zero_cst (vectype);
6239 /* Find maximum value from the vector of found indexes. */
6240 tree max_index = make_ssa_name (index_scalar_type);
6241 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6242 1, induction_index);
6243 gimple_call_set_lhs (max_index_stmt, max_index);
6244 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6246 /* Vector of {max_index, max_index, max_index,...}. */
6247 tree max_index_vec = make_ssa_name (index_vec_type);
6248 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6249 max_index);
6250 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6251 max_index_vec_rhs);
6252 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6254 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6255 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6256 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6257 otherwise. Only one value should match, resulting in a vector
6258 (VEC_COND) with one data value and the rest zeros.
6259 In the case where the loop never made any matches, every index will
6260 match, resulting in a vector with all data values (which will all be
6261 the default value). */
6263 /* Compare the max index vector to the vector of found indexes to find
6264 the position of the max value. */
6265 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6266 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6267 induction_index,
6268 max_index_vec);
6269 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6271 /* Use the compare to choose either values from the data vector or
6272 zero. */
6273 tree vec_cond = make_ssa_name (vectype);
6274 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6275 vec_compare,
6276 reduc_inputs[0],
6277 zero_vec);
6278 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6280 /* Finally we need to extract the data value from the vector (VEC_COND)
6281	         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
6282 reduction, but because this doesn't exist, we can use a MAX reduction
6283 instead. The data value might be signed or a float so we need to cast
6284 it first.
6285 In the case where the loop never made any matches, the data values are
6286 all identical, and so will reduce down correctly. */
6288 /* Make the matched data values unsigned. */
6289 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6290 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6291 vec_cond);
6292 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6293 VIEW_CONVERT_EXPR,
6294 vec_cond_cast_rhs);
6295 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6297 /* Reduce down to a scalar value. */
6298 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6299 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6300 1, vec_cond_cast);
6301 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6302 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6304 /* Convert the reduced value back to the result type and set as the
6305 result. */
6306 gimple_seq stmts = NULL;
6307 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6308 data_reduc);
6309 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6310 scalar_results.safe_push (new_temp);
6312 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6313 && reduc_fn == IFN_LAST)
6315 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6316 idx = 0;
6317 idx_val = induction_index[0];
6318 val = data_reduc[0];
6319 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6320 if (induction_index[i] > idx_val)
6321 val = data_reduc[i], idx_val = induction_index[i];
6322 return val; */
6324 tree data_eltype = TREE_TYPE (vectype);
6325 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6326 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6327 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6328 /* Enforced by vectorizable_reduction, which ensures we have target
6329 support before allowing a conditional reduction on variable-length
6330 vectors. */
6331 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6332 tree idx_val = NULL_TREE, val = NULL_TREE;
6333 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6335 tree old_idx_val = idx_val;
6336 tree old_val = val;
6337 idx_val = make_ssa_name (idx_eltype);
6338 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6339 build3 (BIT_FIELD_REF, idx_eltype,
6340 induction_index,
6341 bitsize_int (el_size),
6342 bitsize_int (off)));
6343 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6344 val = make_ssa_name (data_eltype);
6345 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6346 build3 (BIT_FIELD_REF,
6347 data_eltype,
6348 reduc_inputs[0],
6349 bitsize_int (el_size),
6350 bitsize_int (off)));
6351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6352 if (off != 0)
6354 tree new_idx_val = idx_val;
6355 if (off != v_size - el_size)
6357 new_idx_val = make_ssa_name (idx_eltype);
6358 epilog_stmt = gimple_build_assign (new_idx_val,
6359 MAX_EXPR, idx_val,
6360 old_idx_val);
6361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6363 tree cond = make_ssa_name (boolean_type_node);
6364 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6365 idx_val, old_idx_val);
6366 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6367 tree new_val = make_ssa_name (data_eltype);
6368 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6369 cond, val, old_val);
6370 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6371 idx_val = new_idx_val;
6372 val = new_val;
6375 /* Convert the reduced value back to the result type and set as the
6376 result. */
6377 gimple_seq stmts = NULL;
6378 val = gimple_convert (&stmts, scalar_type, val);
6379 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6380 scalar_results.safe_push (val);
6383 /* 2.3 Create the reduction code, using one of the three schemes described
6384 above. In SLP we simply need to extract all the elements from the
6385 vector (without reducing them), so we use scalar shifts. */
6386 else if (reduc_fn != IFN_LAST && !slp_reduc)
6388 tree tmp;
6389 tree vec_elem_type;
6391 /* Case 1: Create:
6392 v_out2 = reduc_expr <v_out1> */
6394 if (dump_enabled_p ())
6395 dump_printf_loc (MSG_NOTE, vect_location,
6396 "Reduce using direct vector reduction.\n");
6398 gimple_seq stmts = NULL;
6399 vec_elem_type = TREE_TYPE (vectype);
6400 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6401 vec_elem_type, reduc_inputs[0]);
6402 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6403 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6405 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6406 && induc_val)
6408	          /* Earlier we set the initial value to be a vector of induc_val
6409 values. Check the result and if it is induc_val then replace
6410 with the original initial value, unless induc_val is
6411 the same as initial_def already. */
6412 tree zcompare = make_ssa_name (boolean_type_node);
6413 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6414 new_temp, induc_val);
6415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6416 tree initial_def = reduc_info->reduc_initial_values[0];
6417 tmp = make_ssa_name (new_scalar_dest);
6418 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6419 initial_def, new_temp);
6420 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6421 new_temp = tmp;
6424 scalar_results.safe_push (new_temp);
6426 else if (direct_slp_reduc)
6428 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6429 with the elements for other SLP statements replaced with the
6430 neutral value. We can then do a normal reduction on each vector. */
6432 /* Enforced by vectorizable_reduction. */
6433 gcc_assert (reduc_inputs.length () == 1);
6434 gcc_assert (pow2p_hwi (group_size));
6436 gimple_seq seq = NULL;
6438 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6439 and the same element size as VECTYPE. */
6440 tree index = build_index_vector (vectype, 0, 1);
6441 tree index_type = TREE_TYPE (index);
6442 tree index_elt_type = TREE_TYPE (index_type);
6443 tree mask_type = truth_type_for (index_type);
6445 /* Create a vector that, for each element, identifies which of
6446 the REDUC_GROUP_SIZE results should use it. */
6447 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6448 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6449 build_vector_from_val (index_type, index_mask));
6451 /* Get a neutral vector value. This is simply a splat of the neutral
6452 scalar value if we have one, otherwise the initial scalar value
6453 is itself a neutral value. */
6454 tree vector_identity = NULL_TREE;
6455 tree neutral_op = NULL_TREE;
6456 if (slp_node)
6458 tree initial_value = NULL_TREE;
6459 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6460 initial_value = reduc_info->reduc_initial_values[0];
6461 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6462 initial_value);
6464 if (neutral_op)
6465 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6466 neutral_op);
6467 for (unsigned int i = 0; i < group_size; ++i)
6469	          /* If there's no universal neutral value, we can use the
6470 initial scalar value from the original PHI. This is used
6471 for MIN and MAX reduction, for example. */
6472 if (!neutral_op)
6474 tree scalar_value = reduc_info->reduc_initial_values[i];
6475 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6476 scalar_value);
6477 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6478 scalar_value);
6481 /* Calculate the equivalent of:
6483 sel[j] = (index[j] == i);
6485 which selects the elements of REDUC_INPUTS[0] that should
6486 be included in the result. */
6487 tree compare_val = build_int_cst (index_elt_type, i);
6488 compare_val = build_vector_from_val (index_type, compare_val);
6489 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6490 index, compare_val);
6492 /* Calculate the equivalent of:
6494	               vec = sel ? reduc_inputs[0] : vector_identity;
6496 VEC is now suitable for a full vector reduction. */
6497 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6498 sel, reduc_inputs[0], vector_identity);
6500 /* Do the reduction and convert it to the appropriate type. */
6501 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6502 TREE_TYPE (vectype), vec);
6503 scalar = gimple_convert (&seq, scalar_type, scalar);
6504 scalar_results.safe_push (scalar);
6506 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6508 else
6510 bool reduce_with_shift;
6511 tree vec_temp;
6513 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6515 /* See if the target wants to do the final (shift) reduction
6516 in a vector mode of smaller size and first reduce upper/lower
6517 halves against each other. */
6518 enum machine_mode mode1 = mode;
6519 tree stype = TREE_TYPE (vectype);
6520 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6521 unsigned nunits1 = nunits;
6522 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6523 && reduc_inputs.length () == 1)
6525 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6526	          /* For SLP reductions we have to make sure lanes match up, but
6527	             since we're doing an individual-element final reduction, reducing
6528	             the vector width here is even more important.
6529	             ??? We can also separate lanes with permutes; for the common
6530	             case of a power-of-two group size, odd/even extracts would work.  */
6531 if (slp_reduc && nunits != nunits1)
6533 nunits1 = least_common_multiple (nunits1, group_size);
6534 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6537 if (!slp_reduc
6538 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6539 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6541 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6542 stype, nunits1);
6543 reduce_with_shift = have_whole_vector_shift (mode1);
6544 if (!VECTOR_MODE_P (mode1)
6545 || !directly_supported_p (code, vectype1))
6546 reduce_with_shift = false;
6548	      /* First reduce the vector to the desired size for the shift
6549	         reduction by combining the upper and lower halves.  */
6550 gimple_seq stmts = NULL;
6551 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6552 code, &stmts);
6553 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6554 reduc_inputs[0] = new_temp;
6556 if (reduce_with_shift && !slp_reduc)
6558 int element_bitsize = tree_to_uhwi (bitsize);
6559 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6560 for variable-length vectors and also requires direct target support
6561 for loop reductions. */
6562 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6563 int nelements = vec_size_in_bits / element_bitsize;
6564 vec_perm_builder sel;
6565 vec_perm_indices indices;
6567 int elt_offset;
6569 tree zero_vec = build_zero_cst (vectype1);
6570 /* Case 2: Create:
6571 for (offset = nelements/2; offset >= 1; offset/=2)
6573 Create: va' = vec_shift <va, offset>
6574 Create: va = vop <va, va'>
6575 } */
6577 tree rhs;
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_NOTE, vect_location,
6581 "Reduce using vector shifts\n");
6583 gimple_seq stmts = NULL;
6584 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6585 for (elt_offset = nelements / 2;
6586 elt_offset >= 1;
6587 elt_offset /= 2)
6589 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6590 indices.new_vector (sel, 2, nelements);
6591 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6592 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6593 new_temp, zero_vec, mask);
6594 new_temp = gimple_build (&stmts, code,
6595 vectype1, new_name, new_temp);
6597 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6599 /* 2.4 Extract the final scalar result. Create:
6600 s_out3 = extract_field <v_out2, bitpos> */
6602 if (dump_enabled_p ())
6603 dump_printf_loc (MSG_NOTE, vect_location,
6604 "extract scalar result\n");
6606 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6607 bitsize, bitsize_zero_node);
6608 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6609 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6610 gimple_assign_set_lhs (epilog_stmt, new_temp);
6611 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6612 scalar_results.safe_push (new_temp);
6614 else
6616 /* Case 3: Create:
6617 s = extract_field <v_out2, 0>
6618 for (offset = element_size;
6619 offset < vector_size;
6620 offset += element_size;)
6622 Create: s' = extract_field <v_out2, offset>
6623 Create: s = op <s, s'> // For non SLP cases
6624 } */
6626 if (dump_enabled_p ())
6627 dump_printf_loc (MSG_NOTE, vect_location,
6628 "Reduce using scalar code.\n");
6630 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6631 int element_bitsize = tree_to_uhwi (bitsize);
6632 tree compute_type = TREE_TYPE (vectype);
6633 gimple_seq stmts = NULL;
6634 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6636 int bit_offset;
6637 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6638 vec_temp, bitsize, bitsize_zero_node);
6640 /* In SLP we don't need to apply the reduction operation, so we just
6641 collect the s' values in SCALAR_RESULTS. */
6642 if (slp_reduc)
6643 scalar_results.safe_push (new_temp);
6645 for (bit_offset = element_bitsize;
6646 bit_offset < vec_size_in_bits;
6647 bit_offset += element_bitsize)
6649 tree bitpos = bitsize_int (bit_offset);
6650 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6651 compute_type, vec_temp,
6652 bitsize, bitpos);
6653 if (slp_reduc)
6655 /* In SLP we don't need to apply the reduction operation, so
6656 we just collect the s' values in SCALAR_RESULTS. */
6657 new_temp = new_name;
6658 scalar_results.safe_push (new_name);
6660 else
6661 new_temp = gimple_build (&stmts, code, compute_type,
6662 new_name, new_temp);
6666 /* The only case where we need to reduce scalar results in SLP is
6667 unrolling. If the size of SCALAR_RESULTS is greater than
6668 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6669 REDUC_GROUP_SIZE. */
6670 if (slp_reduc)
6672 tree res, first_res, new_res;
6674 /* Reduce multiple scalar results in case of SLP unrolling. */
6675 for (j = group_size; scalar_results.iterate (j, &res);
6676 j++)
6678 first_res = scalar_results[j % group_size];
6679 new_res = gimple_build (&stmts, code, compute_type,
6680 first_res, res);
6681 scalar_results[j % group_size] = new_res;
6683 scalar_results.truncate (group_size);
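/* Finally convert the combined values to the scalar type of the
   reduction.  */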
6684 for (k = 0; k < group_size; k++)
6685 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6686 scalar_results[k]);
6688 else
6690 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6691 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6692 scalar_results.safe_push (new_temp);
6695 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6698 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6699 && induc_val)
6701 /* Earlier we set the initial value to be a vector of induc_val
6702 values. Check the result and if it is induc_val then replace
6703 it with the original initial value, unless induc_val is already
6704 the same as initial_def. */
6705 tree zcompare = make_ssa_name (boolean_type_node);
6706 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6707 induc_val);
6708 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6709 tree initial_def = reduc_info->reduc_initial_values[0];
6710 tree tmp = make_ssa_name (new_scalar_dest);
6711 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6712 initial_def, new_temp);
6713 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6714 scalar_results[0] = tmp;
6718 /* 2.5 Adjust the final result by the initial value of the reduction
6719 variable. (When such adjustment is not needed, then
6720 'adjustment_def' is zero). For example, if code is PLUS we create:
6721 new_temp = loop_exit_def + adjustment_def */
6723 if (adjustment_def)
6725 gcc_assert (!slp_reduc);
6726 gimple_seq stmts = NULL;
6727 if (double_reduc)
6729 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6730 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6731 new_temp = gimple_build (&stmts, code, vectype,
6732 reduc_inputs[0], adjustment_def);
6734 else
6736 new_temp = scalar_results[0];
6737 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6738 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6739 adjustment_def);
6740 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6741 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6742 new_temp, adjustment_def);
6743 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6746 epilog_stmt = gimple_seq_last_stmt (stmts);
6747 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6748 scalar_results[0] = new_temp;
6751 /* Record this operation if it could be reused by the epilogue loop. */
6752 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6753 && reduc_inputs.length () == 1)
6754 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6755 { orig_reduc_input, reduc_info });
6757 if (double_reduc)
6758 loop = outer_loop;
6760 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6761 phis with new adjusted scalar results, i.e., replace use <s_out0>
6762 with use <s_out4>.
6764 Transform:
6765 loop_exit:
6766 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6767 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6768 v_out2 = reduce <v_out1>
6769 s_out3 = extract_field <v_out2, 0>
6770 s_out4 = adjust_result <s_out3>
6771 use <s_out0>
6772 use <s_out0>
6774 into:
6776 loop_exit:
6777 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6778 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6779 v_out2 = reduce <v_out1>
6780 s_out3 = extract_field <v_out2, 0>
6781 s_out4 = adjust_result <s_out3>
6782 use <s_out4>
6783 use <s_out4> */
6785 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6786 for (k = 0; k < live_out_stmts.size (); k++)
6788 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6789 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6791 phis.create (3);
6792 /* Find the loop-closed-use at the loop exit of the original scalar
6793 result. (The reduction result is expected to have two immediate uses,
6794 one at the latch block, and one at the loop exit). For double
6795 reductions we are looking for exit phis of the outer loop. */
6796 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6798 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6800 if (!is_gimple_debug (USE_STMT (use_p)))
6801 phis.safe_push (USE_STMT (use_p));
6803 else
6805 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6807 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6809 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6811 if (!flow_bb_inside_loop_p (loop,
6812 gimple_bb (USE_STMT (phi_use_p)))
6813 && !is_gimple_debug (USE_STMT (phi_use_p)))
6814 phis.safe_push (USE_STMT (phi_use_p));
6820 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6822 /* Replace the uses: */
6823 orig_name = PHI_RESULT (exit_phi);
6825 /* Look for a single use at the target of the skip edge. */
6826 if (unify_with_main_loop_p)
6828 use_operand_p use_p;
6829 gimple *user;
6830 if (!single_imm_use (orig_name, &use_p, &user))
6831 gcc_unreachable ();
6832 orig_name = gimple_get_lhs (user);
6835 scalar_result = scalar_results[k];
6836 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6838 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6839 SET_USE (use_p, scalar_result);
6840 update_stmt (use_stmt);
6844 phis.release ();
6848 /* Return a vector of type VECTYPE that is equal to the vector select
6849 operation "MASK ? VEC : IDENTITY". Insert the select statements
6850 before GSI. */
6852 static tree
6853 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6854 tree vec, tree identity)
6856 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6857 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6858 mask, vec, identity);
6859 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6860 return cond;
6863 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6864 order, starting with LHS. Insert the extraction statements before GSI and
6865 associate the new scalar SSA names with variable SCALAR_DEST.
6866 Return the SSA name for the result. */
6868 static tree
6869 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6870 tree_code code, tree lhs, tree vector_rhs)
6872 tree vectype = TREE_TYPE (vector_rhs);
6873 tree scalar_type = TREE_TYPE (vectype);
6874 tree bitsize = TYPE_SIZE (scalar_type);
6875 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6876 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6878 for (unsigned HOST_WIDE_INT bit_offset = 0;
6879 bit_offset < vec_size_in_bits;
6880 bit_offset += element_bitsize)
6882 tree bitpos = bitsize_int (bit_offset);
6883 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6884 bitsize, bitpos);
6886 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6887 rhs = make_ssa_name (scalar_dest, stmt);
6888 gimple_assign_set_lhs (stmt, rhs);
6889 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6891 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6892 tree new_name = make_ssa_name (scalar_dest, stmt);
6893 gimple_assign_set_lhs (stmt, new_name);
6894 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6895 lhs = new_name;
6897 return lhs;
6900 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6901 type of the vector input. */
6903 static internal_fn
6904 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6906 internal_fn mask_reduc_fn;
6907 internal_fn mask_len_reduc_fn;
6909 switch (reduc_fn)
6911 case IFN_FOLD_LEFT_PLUS:
6912 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6913 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6914 break;
6916 default:
6917 return IFN_LAST;
6920 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6921 OPTIMIZE_FOR_SPEED))
6922 return mask_reduc_fn;
6923 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6924 OPTIMIZE_FOR_SPEED))
6925 return mask_len_reduc_fn;
6926 return IFN_LAST;
6929 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6930 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6931 statement. CODE is the operation performed by STMT_INFO and OPS are
6932 its scalar operands. REDUC_INDEX is the index of the operand in
6933 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6934 implements in-order reduction, or IFN_LAST if we should open-code it.
6935 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6936 that should be used to control the operation in a fully-masked loop. */
6938 static bool
6939 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6940 stmt_vec_info stmt_info,
6941 gimple_stmt_iterator *gsi,
6942 gimple **vec_stmt, slp_tree slp_node,
6943 gimple *reduc_def_stmt,
6944 tree_code code, internal_fn reduc_fn,
6945 tree ops[3], tree vectype_in,
6946 int reduc_index, vec_loop_masks *masks,
6947 vec_loop_lens *lens)
6949 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6950 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6951 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6953 int ncopies;
6954 if (slp_node)
6955 ncopies = 1;
6956 else
6957 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6959 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6960 gcc_assert (ncopies == 1);
6961 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6963 if (slp_node)
6964 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6965 TYPE_VECTOR_SUBPARTS (vectype_in)));
6967 tree op0 = ops[1 - reduc_index];
6969 int group_size = 1;
6970 stmt_vec_info scalar_dest_def_info;
6971 auto_vec<tree> vec_oprnds0;
6972 if (slp_node)
6974 auto_vec<vec<tree> > vec_defs (2);
6975 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6976 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6977 vec_defs[0].release ();
6978 vec_defs[1].release ();
6979 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6980 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6982 else
6984 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6985 op0, &vec_oprnds0);
6986 scalar_dest_def_info = stmt_info;
6989 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6990 tree scalar_type = TREE_TYPE (scalar_dest);
6991 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6993 int vec_num = vec_oprnds0.length ();
6994 gcc_assert (vec_num == 1 || slp_node);
6995 tree vec_elem_type = TREE_TYPE (vectype_out);
6996 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
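/* When inactive lanes have to be merged with an identity value (see
   merge_with_identity below), plain zero works unless signed zeros
   must be honored, in which case -0.0 is used since x + -0.0 == x
   for every x.  */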
6998 tree vector_identity = NULL_TREE;
6999 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7001 vector_identity = build_zero_cst (vectype_out);
7002 if (!HONOR_SIGNED_ZEROS (vectype_out))
7004 else
7006 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7007 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7008 vector_identity);
7012 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7013 int i;
7014 tree def0;
7015 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7017 gimple *new_stmt;
7018 tree mask = NULL_TREE;
7019 tree len = NULL_TREE;
7020 tree bias = NULL_TREE;
7021 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7022 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7023 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7025 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7026 i, 1);
7027 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7028 bias = build_int_cst (intQI_type_node, biasval);
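/* With length-based partial vectors the mask does not deactivate any
   lanes; pass an all-true mask and let LEN (adjusted by BIAS) limit
   the active elements.  */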
7029 mask = build_minus_one_cst (truth_type_for (vectype_in));
7032 /* Handle MINUS by adding the negative. */
7033 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7035 tree negated = make_ssa_name (vectype_out);
7036 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7037 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7038 def0 = negated;
7041 if (mask && mask_reduc_fn == IFN_LAST)
7042 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7043 vector_identity);
7045 /* On the first iteration the input is simply the scalar phi
7046 result, and for subsequent iterations it is the output of
7047 the preceding operation. */
7048 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7050 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7051 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7052 def0, mask, len, bias);
7053 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7054 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7055 def0, mask);
7056 else
7057 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7058 def0);
7059 /* For chained SLP reductions the output of the previous reduction
7060 operation serves as the input of the next. For the final statement
7061 the output cannot be a temporary - we reuse the original
7062 scalar destination of the last statement. */
7063 if (i != vec_num - 1)
7065 gimple_set_lhs (new_stmt, scalar_dest_var);
7066 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7067 gimple_set_lhs (new_stmt, reduc_var);
7070 else
7072 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
7073 reduc_var, def0);
7074 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7075 /* Remove the statement, so that we can use the same code paths
7076 as for statements that we've just created. */
7077 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7078 gsi_remove (&tmp_gsi, true);
7081 if (i == vec_num - 1)
7083 gimple_set_lhs (new_stmt, scalar_dest);
7084 vect_finish_replace_stmt (loop_vinfo,
7085 scalar_dest_def_info,
7086 new_stmt);
7088 else
7089 vect_finish_stmt_generation (loop_vinfo,
7090 scalar_dest_def_info,
7091 new_stmt, gsi);
7093 if (slp_node)
7094 slp_node->push_vec_def (new_stmt);
7095 else
7097 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7098 *vec_stmt = new_stmt;
7102 return true;
7105 /* Function is_nonwrapping_integer_induction.
7107 Check if STMT_VINFO (which is part of loop LOOP) both increments and
7108 does not cause overflow. */
7110 static bool
7111 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7113 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7114 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7115 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7116 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7117 widest_int ni, max_loop_value, lhs_max;
7118 wi::overflow_type overflow = wi::OVF_NONE;
7120 /* Make sure the loop is integer based. */
7121 if (TREE_CODE (base) != INTEGER_CST
7122 || TREE_CODE (step) != INTEGER_CST)
7123 return false;
7125 /* Check that the max size of the loop will not wrap. */
7127 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7128 return true;
7130 if (! max_stmt_executions (loop, &ni))
7131 return false;
7133 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7134 &overflow);
7135 if (overflow)
7136 return false;
7138 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7139 TYPE_SIGN (lhs_type), &overflow);
7140 if (overflow)
7141 return false;
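/* The induction reaches BASE + STEP * NI; check that this value is
   representable in LHS_TYPE, i.e. the induction cannot wrap.  */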
7143 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7144 <= TYPE_PRECISION (lhs_type));
7147 /* Check if masking can be supported by inserting a conditional expression.
7148 CODE is the code for the operation. COND_FN is the conditional internal
7149 function, if it exists. VECTYPE_IN is the type of the vector input. */
7150 static bool
7151 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7152 tree vectype_in)
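/* If the target supports the conditional internal function directly
   there is no need to emulate masking with a VEC_COND_EXPR.  */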
7154 if (cond_fn != IFN_LAST
7155 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7156 OPTIMIZE_FOR_SPEED))
7157 return false;
7159 if (code.is_tree_code ())
7160 switch (tree_code (code))
7162 case DOT_PROD_EXPR:
7163 case SAD_EXPR:
7164 return true;
7166 default:
7167 break;
7169 return false;
7172 /* Insert a conditional expression to enable masked vectorization. CODE is the
7173 code for the operation. VOP is the array of operands. MASK is the loop
7174 mask. GSI is a statement iterator used to place the new conditional
7175 expression. */
7176 static void
7177 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7178 gimple_stmt_iterator *gsi)
7180 switch (tree_code (code))
7182 case DOT_PROD_EXPR:
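/* Zero the inactive lanes of the second multiplication operand so
   their products contribute nothing to the accumulator.  */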
7184 tree vectype = TREE_TYPE (vop[1]);
7185 tree zero = build_zero_cst (vectype);
7186 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7187 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7188 mask, vop[1], zero);
7189 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7190 vop[1] = masked_op1;
7191 break;
7194 case SAD_EXPR:
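/* For inactive lanes use the first operand as the second one so the
   absolute difference, and thus the contribution to the sum, is
   zero.  */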
7196 tree vectype = TREE_TYPE (vop[1]);
7197 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7198 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7199 mask, vop[1], vop[0]);
7200 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7201 vop[1] = masked_op1;
7202 break;
7205 default:
7206 gcc_unreachable ();
7210 /* Function vectorizable_reduction.
7212 Check if STMT_INFO performs a reduction operation that can be vectorized.
7213 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7214 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7215 Return true if STMT_INFO is vectorizable in this way.
7217 This function also handles reduction idioms (patterns) that have been
7218 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7219 may be of this form:
7220 X = pattern_expr (arg0, arg1, ..., X)
7221 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7222 sequence that had been detected and replaced by the pattern-stmt
7223 (STMT_INFO).
7225 This function also handles reduction of condition expressions, for example:
7226 for (int i = 0; i < N; i++)
7227 if (a[i] < value)
7228 last = a[i];
7229 This is handled by vectorising the loop and creating an additional vector
7230 containing the loop indexes for which "a[i] < value" was true. In the
7231 function epilogue this is reduced to a single max value and then used to
7232 index into the vector of results.
7234 In some cases of reduction patterns, the type of the reduction variable X is
7235 different than the type of the other arguments of STMT_INFO.
7236 In such cases, the vectype that is used when transforming STMT_INFO into
7237 a vector stmt is different than the vectype that is used to determine the
7238 vectorization factor, because it consists of a different number of elements
7239 than the actual number of elements that are being operated upon in parallel.
7241 For example, consider an accumulation of shorts into an int accumulator.
7242 On some targets it's possible to vectorize this pattern operating on 8
7243 shorts at a time (hence, the vectype for purposes of determining the
7244 vectorization factor should be V8HI); on the other hand, the vectype that
7245 is used to create the vector form is actually V4SI (the type of the result).
7247 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7248 indicates what is the actual level of parallelism (V8HI in the example), so
7249 that the right vectorization factor would be derived. This vectype
7250 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7251 be used to create the vectorized stmt. The right vectype for the vectorized
7252 stmt is obtained from the type of the result X:
7253 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7255 This means that, contrary to "regular" reductions (or "regular" stmts in
7256 general), the following equation:
7257 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7258 does *NOT* necessarily hold for reduction patterns. */
7260 bool
7261 vectorizable_reduction (loop_vec_info loop_vinfo,
7262 stmt_vec_info stmt_info, slp_tree slp_node,
7263 slp_instance slp_node_instance,
7264 stmt_vector_for_cost *cost_vec)
7266 tree vectype_in = NULL_TREE;
7267 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7268 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7269 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7270 stmt_vec_info cond_stmt_vinfo = NULL;
7271 int i;
7272 int ncopies;
7273 bool single_defuse_cycle = false;
7274 bool nested_cycle = false;
7275 bool double_reduc = false;
7276 int vec_num;
7277 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7278 tree cond_reduc_val = NULL_TREE;
7280 /* Make sure it was already recognized as a reduction computation. */
7281 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7282 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7283 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7284 return false;
7286 /* The stmt we store reduction analysis meta on. */
7287 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7288 reduc_info->is_reduc_info = true;
7290 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7292 if (is_a <gphi *> (stmt_info->stmt))
7294 if (slp_node)
7296 /* We eventually need to set a vector type on invariant
7297 arguments. */
7298 unsigned j;
7299 slp_tree child;
7300 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7301 if (!vect_maybe_update_slp_op_vectype
7302 (child, SLP_TREE_VECTYPE (slp_node)))
7304 if (dump_enabled_p ())
7305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7306 "incompatible vector types for "
7307 "invariants\n");
7308 return false;
7311 /* Analysis for double-reduction is done on the outer
7312 loop PHI, nested cycles have no further restrictions. */
7313 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7315 else
7316 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7317 return true;
7320 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7321 stmt_vec_info phi_info = stmt_info;
7322 if (!is_a <gphi *> (stmt_info->stmt))
7324 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7325 return true;
7327 if (slp_node)
7329 slp_node_instance->reduc_phis = slp_node;
7330 /* ??? We're leaving slp_node to point to the PHIs; we only
7331 need it to get at the number of vector stmts, which wasn't
7332 yet initialized for the instance root. */
7334 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7336 use_operand_p use_p;
7337 gimple *use_stmt;
7338 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7339 &use_p, &use_stmt);
7340 gcc_assert (res);
7341 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7344 /* PHIs should not participate in patterns. */
7345 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7346 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7348 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7349 and compute the reduction chain length. Discover the real
7350 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7351 tree reduc_def
7352 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7353 loop_latch_edge
7354 (gimple_bb (reduc_def_phi)->loop_father));
7355 unsigned reduc_chain_length = 0;
7356 bool only_slp_reduc_chain = true;
7357 stmt_info = NULL;
7358 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7359 while (reduc_def != PHI_RESULT (reduc_def_phi))
7361 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7362 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7363 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7365 if (dump_enabled_p ())
7366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7367 "reduction chain broken by patterns.\n");
7368 return false;
7370 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7371 only_slp_reduc_chain = false;
7372 /* For epilogue generation live members of the chain need
7373 to point back to the PHI via their original stmt for
7374 info_for_reduction to work. For SLP we need to look at
7375 all lanes here - even though we will only vectorize from
7376 the SLP node with live lane zero, the other live lanes also
7377 need to be identified as part of a reduction to be able
7378 to skip code generation for them. */
7379 if (slp_for_stmt_info)
7381 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7382 if (STMT_VINFO_LIVE_P (s))
7383 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7385 else if (STMT_VINFO_LIVE_P (vdef))
7386 STMT_VINFO_REDUC_DEF (def) = phi_info;
7387 gimple_match_op op;
7388 if (!gimple_extract_op (vdef->stmt, &op))
7390 if (dump_enabled_p ())
7391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7392 "reduction chain includes unsupported"
7393 " statement type.\n");
7394 return false;
7396 if (CONVERT_EXPR_CODE_P (op.code))
7398 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7400 if (dump_enabled_p ())
7401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7402 "conversion in the reduction chain.\n");
7403 return false;
7406 else if (!stmt_info)
7407 /* First non-conversion stmt. */
7408 stmt_info = vdef;
7409 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7410 reduc_chain_length++;
7411 if (!stmt_info && slp_node)
7412 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7414 /* PHIs should not participate in patterns. */
7415 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7417 if (nested_in_vect_loop_p (loop, stmt_info))
7419 loop = loop->inner;
7420 nested_cycle = true;
7423 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7424 element. */
7425 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7427 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7428 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7430 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7431 gcc_assert (slp_node
7432 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7434 /* 1. Is vectorizable reduction? */
7435 /* Not supportable if the reduction variable is used in the loop, unless
7436 it's a reduction chain. */
7437 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7438 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7439 return false;
7441 /* Reductions that are not used even in an enclosing outer-loop,
7442 are expected to be "live" (used out of the loop). */
7443 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7444 && !STMT_VINFO_LIVE_P (stmt_info))
7445 return false;
7447 /* 2. Has this been recognized as a reduction pattern?
7449 Check if STMT represents a pattern that has been recognized
7450 in earlier analysis stages. For stmts that represent a pattern,
7451 the STMT_VINFO_RELATED_STMT field records the last stmt in
7452 the original sequence that constitutes the pattern. */
7454 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7455 if (orig_stmt_info)
7457 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7458 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7461 /* 3. Check the operands of the operation. The first operands are defined
7462 inside the loop body. The last operand is the reduction variable,
7463 which is defined by the loop-header-phi. */
7465 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7466 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7467 gimple_match_op op;
7468 if (!gimple_extract_op (stmt_info->stmt, &op))
7469 gcc_unreachable ();
7470 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7471 || op.code == WIDEN_SUM_EXPR
7472 || op.code == SAD_EXPR);
7474 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7475 && !SCALAR_FLOAT_TYPE_P (op.type))
7476 return false;
7478 /* Do not try to vectorize bit-precision reductions. */
7479 if (!type_has_mode_precision_p (op.type))
7480 return false;
7482 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7483 which means the only use of the PHI may be in the lane-reducing operation. */
7484 if (lane_reduc_code_p
7485 && reduc_chain_length != 1
7486 && !only_slp_reduc_chain)
7488 if (dump_enabled_p ())
7489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7490 "lane-reducing reduction with extra stmts.\n");
7491 return false;
7494 /* All uses but the last are expected to be defined in the loop.
7495 The last use is the reduction variable. In case of nested cycle this
7496 assumption is not true: we use reduc_index to record the index of the
7497 reduction variable. */
7498 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7499 /* We need to skip an extra operand for COND_EXPRs with embedded
7500 comparison. */
7501 unsigned opno_adjust = 0;
7502 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7503 opno_adjust = 1;
7504 for (i = 0; i < (int) op.num_ops; i++)
7506 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7507 if (i == 0 && op.code == COND_EXPR)
7508 continue;
7510 stmt_vec_info def_stmt_info;
7511 enum vect_def_type dt;
7512 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7513 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7514 &vectype_op[i], &def_stmt_info))
7516 if (dump_enabled_p ())
7517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7518 "use not simple.\n");
7519 return false;
7521 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7522 continue;
7524 /* There should be only one cycle def in the stmt, the one
7525 leading to reduc_def. */
7526 if (VECTORIZABLE_CYCLE_DEF (dt))
7527 return false;
7529 if (!vectype_op[i])
7530 vectype_op[i]
7531 = get_vectype_for_scalar_type (loop_vinfo,
7532 TREE_TYPE (op.ops[i]), slp_op[i]);
7534 /* To properly compute ncopies we are interested in the widest
7535 non-reduction input type in case we're looking at a widening
7536 accumulation that we later handle in vect_transform_reduction. */
7537 if (lane_reduc_code_p
7538 && vectype_op[i]
7539 && (!vectype_in
7540 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7541 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7542 vectype_in = vectype_op[i];
7544 if (op.code == COND_EXPR)
7546 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7547 if (dt == vect_constant_def)
7549 cond_reduc_dt = dt;
7550 cond_reduc_val = op.ops[i];
7552 if (dt == vect_induction_def
7553 && def_stmt_info
7554 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7556 cond_reduc_dt = dt;
7557 cond_stmt_vinfo = def_stmt_info;
7561 if (!vectype_in)
7562 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7563 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7565 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7566 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7567 /* If we have a condition reduction, see if we can simplify it further. */
7568 if (v_reduc_type == COND_REDUCTION)
7570 if (slp_node)
7571 return false;
7573 /* When the condition uses the reduction value in the condition, fail. */
7574 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7576 if (dump_enabled_p ())
7577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7578 "condition depends on previous iteration\n");
7579 return false;
7582 if (reduc_chain_length == 1
7583 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7584 OPTIMIZE_FOR_SPEED)
7585 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7586 vectype_in,
7587 OPTIMIZE_FOR_SPEED)))
7589 if (dump_enabled_p ())
7590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7591 "optimizing condition reduction with"
7592 " FOLD_EXTRACT_LAST.\n");
7593 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7595 else if (cond_reduc_dt == vect_induction_def)
7597 tree base
7598 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7599 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7601 gcc_assert (TREE_CODE (base) == INTEGER_CST
7602 && TREE_CODE (step) == INTEGER_CST);
7603 cond_reduc_val = NULL_TREE;
7604 enum tree_code cond_reduc_op_code = ERROR_MARK;
7605 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7606 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7608 /* Find a suitable value: below base for MAX_EXPR, above base for
7609 MIN_EXPR; for now punt if base is the minimum value of the type
7610 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7611 else if (tree_int_cst_sgn (step) == -1)
7613 cond_reduc_op_code = MIN_EXPR;
7614 if (tree_int_cst_sgn (base) == -1)
7615 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7616 else if (tree_int_cst_lt (base,
7617 TYPE_MAX_VALUE (TREE_TYPE (base))))
7618 cond_reduc_val
7619 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7621 else
7623 cond_reduc_op_code = MAX_EXPR;
7624 if (tree_int_cst_sgn (base) == 1)
7625 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7626 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7627 base))
7628 cond_reduc_val
7629 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7631 if (cond_reduc_val)
7633 if (dump_enabled_p ())
7634 dump_printf_loc (MSG_NOTE, vect_location,
7635 "condition expression based on "
7636 "integer induction.\n");
7637 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7638 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7639 = cond_reduc_val;
7640 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7643 else if (cond_reduc_dt == vect_constant_def)
7645 enum vect_def_type cond_initial_dt;
7646 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7647 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7648 if (cond_initial_dt == vect_constant_def
7649 && types_compatible_p (TREE_TYPE (cond_initial_val),
7650 TREE_TYPE (cond_reduc_val)))
7652 tree e = fold_binary (LE_EXPR, boolean_type_node,
7653 cond_initial_val, cond_reduc_val);
7654 if (e && (integer_onep (e) || integer_zerop (e)))
7656 if (dump_enabled_p ())
7657 dump_printf_loc (MSG_NOTE, vect_location,
7658 "condition expression based on "
7659 "compile time constant.\n");
7660 /* Record reduction code at analysis stage. */
7661 STMT_VINFO_REDUC_CODE (reduc_info)
7662 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7663 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7669 if (STMT_VINFO_LIVE_P (phi_info))
7670 return false;
7672 if (slp_node)
7673 ncopies = 1;
7674 else
7675 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7677 gcc_assert (ncopies >= 1);
7679 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7681 if (nested_cycle)
7683 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7684 == vect_double_reduction_def);
7685 double_reduc = true;
7688 /* 4.2. Check support for the epilog operation.
7690 If STMT represents a reduction pattern, then the type of the
7691 reduction variable may be different than the type of the rest
7692 of the arguments. For example, consider the case of accumulation
7693 of shorts into an int accumulator; The original code:
7694 S1: int_a = (int) short_a;
7695 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7697 was replaced with:
7698 STMT: int_acc = widen_sum <short_a, int_acc>
7700 This means that:
7701 1. The tree-code that is used to create the vector operation in the
7702 epilog code (that reduces the partial results) is not the
7703 tree-code of STMT, but is rather the tree-code of the original
7704 stmt from the pattern that STMT is replacing. I.e., in the example
7705 above we want to use 'widen_sum' in the loop, but 'plus' in the
7706 epilog.
7707 2. The type (mode) we use to check available target support
7708 for the vector operation to be created in the *epilog*, is
7709 determined by the type of the reduction variable (in the example
7710 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7711 However the type (mode) we use to check available target support
7712 for the vector operation to be created *inside the loop*, is
7713 determined by the type of the other arguments to STMT (in the
7714 example we'd check this: optab_handler (widen_sum_optab,
7715 vect_short_mode)).
7717 This is contrary to "regular" reductions, in which the types of all
7718 the arguments are the same as the type of the reduction variable.
7719 For "regular" reductions we can therefore use the same vector type
7720 (and also the same tree-code) when generating the epilog code and
7721 when generating the code inside the loop. */
7723 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7724 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7726 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7727 if (reduction_type == TREE_CODE_REDUCTION)
7729 /* Check whether it's ok to change the order of the computation.
7730 Generally, when vectorizing a reduction we change the order of the
7731 computation. This may change the behavior of the program in some
7732 cases, so we need to check that this is ok. One exception is when
7733 vectorizing an outer-loop: the inner-loop is executed sequentially,
7734 and therefore vectorizing reductions in the inner-loop during
7735 outer-loop vectorization is safe. Likewise when we are vectorizing
7736 a series of reductions using SLP and the VF is one, the reductions
7737 are performed in scalar order. */
7738 if (slp_node
7739 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7740 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7742 else if (needs_fold_left_reduction_p (op.type, orig_code))
7744 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7745 is not directly used in stmt. */
7746 if (!only_slp_reduc_chain
7747 && reduc_chain_length != 1)
7749 if (dump_enabled_p ())
7750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7751 "in-order reduction chain without SLP.\n");
7752 return false;
7754 STMT_VINFO_REDUC_TYPE (reduc_info)
7755 = reduction_type = FOLD_LEFT_REDUCTION;
7757 else if (!commutative_binary_op_p (orig_code, op.type)
7758 || !associative_binary_op_p (orig_code, op.type))
7760 if (dump_enabled_p ())
7761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7762 "reduction: not commutative/associative");
7763 return false;
7767 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7768 && ncopies > 1)
7770 if (dump_enabled_p ())
7771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7772 "multiple types in double reduction or condition "
7773 "reduction or fold-left reduction.\n");
7774 return false;
7777 internal_fn reduc_fn = IFN_LAST;
7778 if (reduction_type == TREE_CODE_REDUCTION
7779 || reduction_type == FOLD_LEFT_REDUCTION
7780 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7781 || reduction_type == CONST_COND_REDUCTION)
7783 if (reduction_type == FOLD_LEFT_REDUCTION
7784 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7785 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7787 if (reduc_fn != IFN_LAST
7788 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7789 OPTIMIZE_FOR_SPEED))
7791 if (dump_enabled_p ())
7792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7793 "reduc op not supported by target.\n");
7795 reduc_fn = IFN_LAST;
7798 else
7800 if (!nested_cycle || double_reduc)
7802 if (dump_enabled_p ())
7803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7804 "no reduc code for scalar code.\n");
7806 return false;
7810 else if (reduction_type == COND_REDUCTION)
7812 int scalar_precision
7813 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7814 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7815 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7816 vectype_out);
7818 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7819 OPTIMIZE_FOR_SPEED))
7820 reduc_fn = IFN_REDUC_MAX;
7822 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7824 if (reduction_type != EXTRACT_LAST_REDUCTION
7825 && (!nested_cycle || double_reduc)
7826 && reduc_fn == IFN_LAST
7827 && !nunits_out.is_constant ())
7829 if (dump_enabled_p ())
7830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7831 "missing target support for reduction on"
7832 " variable-length vectors.\n");
7833 return false;
7836 /* For SLP reductions, see if there is a neutral value we can use. */
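/* A neutral value is one that leaves the reduction result unchanged
   when it fills the unused lanes of the initial vector, e.g. 0 for
   PLUS and 1 for MULT.  */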
7837 tree neutral_op = NULL_TREE;
7838 if (slp_node)
7840 tree initial_value = NULL_TREE;
7841 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7842 initial_value = vect_phi_initial_value (reduc_def_phi);
7843 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7844 orig_code, initial_value);
7847 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7849 /* We can't support in-order reductions of code such as this:
7851 for (int i = 0; i < n1; ++i)
7852 for (int j = 0; j < n2; ++j)
7853 l += a[j];
7855 since GCC effectively transforms the loop when vectorizing:
7857 for (int i = 0; i < n1 / VF; ++i)
7858 for (int j = 0; j < n2; ++j)
7859 for (int k = 0; k < VF; ++k)
7860 l += a[j];
7862 which is a reassociation of the original operation. */
7863 if (dump_enabled_p ())
7864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7865 "in-order double reduction not supported.\n");
7867 return false;
7870 if (reduction_type == FOLD_LEFT_REDUCTION
7871 && slp_node
7872 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7874 /* We cannot use in-order reductions in this case because there is
7875 an implicit reassociation of the operations involved. */
7876 if (dump_enabled_p ())
7877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7878 "in-order unchained SLP reductions not supported.\n");
7879 return false;
7882 /* For double reductions, and for SLP reductions with a neutral value,
7883 we construct a variable-length initial vector by loading a vector
7884 full of the neutral value and then shift-and-inserting the start
7885 values into the low-numbered elements. */
7886 if ((double_reduc || neutral_op)
7887 && !nunits_out.is_constant ()
7888 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7889 vectype_out, OPTIMIZE_FOR_SPEED))
7891 if (dump_enabled_p ())
7892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7893 "reduction on variable-length vectors requires"
7894 " target support for a vector-shift-and-insert"
7895 " operation.\n");
7896 return false;
7899 /* Check extra constraints for variable-length unchained SLP reductions. */
7900 if (slp_node
7901 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7902 && !nunits_out.is_constant ())
7904 /* We checked above that we could build the initial vector when
7905 there's a neutral element value. Check here for the case in
7906 which each SLP statement has its own initial value and in which
7907 that value needs to be repeated for every instance of the
7908 statement within the initial vector. */
7909 unsigned int group_size = SLP_TREE_LANES (slp_node);
7910 if (!neutral_op
7911 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7912 TREE_TYPE (vectype_out)))
7914 if (dump_enabled_p ())
7915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7916 "unsupported form of SLP reduction for"
7917 " variable-length vectors: cannot build"
7918 " initial vector.\n");
7919 return false;
7921 /* The epilogue code relies on the number of elements being a multiple
7922 of the group size. The duplicate-and-interleave approach to setting
7923 up the initial vector does too. */
7924 if (!multiple_p (nunits_out, group_size))
7926 if (dump_enabled_p ())
7927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7928 "unsupported form of SLP reduction for"
7929 " variable-length vectors: the vector size"
7930 " is not a multiple of the number of results.\n");
7931 return false;
7935 if (reduction_type == COND_REDUCTION)
7937 widest_int ni;
7939 if (! max_loop_iterations (loop, &ni))
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_NOTE, vect_location,
7943 "loop count not known, cannot create cond "
7944 "reduction.\n");
7945 return false;
7947 /* Convert backedges to iterations. */
7948 ni += 1;
7950 /* The additional index will be the same type as the condition. Check
7951 that the loop count fits into this type less one (because we'll use up the
7952 zero slot for when there are no matches). */
7953 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7954 if (wi::geu_p (ni, wi::to_widest (max_index)))
7956 if (dump_enabled_p ())
7957 dump_printf_loc (MSG_NOTE, vect_location,
7958 "loop size is greater than data size.\n");
7959 return false;
7963 /* In case the vectorization factor (VF) is bigger than the number
7964 of elements that we can fit in a vectype (nunits), we have to generate
7965 more than one vector stmt - i.e - we need to "unroll" the
7966 vector stmt by a factor VF/nunits. For more details see documentation
7967 in vectorizable_operation. */
7969 /* If the reduction is used in an outer loop we need to generate
7970 VF intermediate results, like so (e.g. for ncopies=2):
7971 r0 = phi (init, r0)
7972 r1 = phi (init, r1)
7973 r0 = x0 + r0;
7974 r1 = x1 + r1;
7975 (i.e. we generate VF results in 2 registers).
7976 In this case we have a separate def-use cycle for each copy, and therefore
7977 for each copy we get the vector def for the reduction variable from the
7978 respective phi node created for this copy.
7980 Otherwise (the reduction is unused in the loop nest), we can combine
7981 together intermediate results, like so (e.g. for ncopies=2):
7982 r = phi (init, r)
7983 r = x0 + r;
7984 r = x1 + r;
7985 (i.e. we generate VF/2 results in a single register).
7986 In this case for each copy we get the vector def for the reduction variable
7987 from the vectorized reduction operation generated in the previous iteration.
7989 This only works when we see both the reduction PHI and its only consumer
7990 in vectorizable_reduction and there are no intermediate stmts
7991 participating. When unrolling we want each unrolled iteration to have its
7992 own reduction accumulator since one of the main goals of unrolling a
7993 reduction is to reduce the aggregate loop-carried latency. */
7994 if (ncopies > 1
7995 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7996 && reduc_chain_length == 1
7997 && loop_vinfo->suggested_unroll_factor == 1)
7998 single_defuse_cycle = true;
8000 if (single_defuse_cycle || lane_reduc_code_p)
8002 gcc_assert (op.code != COND_EXPR);
8004 /* 4. Supportable by target? */
8005 bool ok = true;
8007 /* 4.1. check support for the operation in the loop
8009 This isn't necessary for the lane reduction codes, since they
8010 can only be produced by pattern matching, and it's up to the
8011 pattern matcher to test for support. The main reason for
8012 specifically skipping this step is to avoid rechecking whether
8013 mixed-sign dot-products can be implemented using signed
8014 dot-products. */
8015 machine_mode vec_mode = TYPE_MODE (vectype_in);
8016 if (!lane_reduc_code_p
8017 && !directly_supported_p (op.code, vectype_in, optab_vector))
8019 if (dump_enabled_p ())
8020 dump_printf (MSG_NOTE, "op not supported by target.\n");
8021 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8022 || !vect_can_vectorize_without_simd_p (op.code))
8023 ok = false;
8024 else
8025 if (dump_enabled_p ())
8026 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8029 if (vect_emulated_vector_p (vectype_in)
8030 && !vect_can_vectorize_without_simd_p (op.code))
8032 if (dump_enabled_p ())
8033 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8034 return false;
8037 /* lane-reducing operations have to go through vect_transform_reduction.
8038 For the other cases try without the single cycle optimization. */
8039 if (!ok)
8041 if (lane_reduc_code_p)
8042 return false;
8043 else
8044 single_defuse_cycle = false;
8047 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8049 /* If the reduction stmt is one of the patterns that have lane
8050 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8051 if ((ncopies > 1 && ! single_defuse_cycle)
8052 && lane_reduc_code_p)
8054 if (dump_enabled_p ())
8055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8056 "multi def-use cycle not possible for lane-reducing "
8057 "reduction operation\n");
8058 return false;
8061 if (slp_node
8062 && !(!single_defuse_cycle
8063 && !lane_reduc_code_p
8064 && reduction_type != FOLD_LEFT_REDUCTION))
8065 for (i = 0; i < (int) op.num_ops; i++)
8066 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8068 if (dump_enabled_p ())
8069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8070 "incompatible vector types for invariants\n");
8071 return false;
8074 if (slp_node)
8075 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8076 else
8077 vec_num = 1;
8079 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8080 reduction_type, ncopies, cost_vec);
8081 /* Cost the reduction op inside the loop if transformed via
8082 vect_transform_reduction. Otherwise this is costed by the
8083 separate vectorizable_* routines. */
8084 if (single_defuse_cycle || lane_reduc_code_p)
8086 int factor = 1;
8087 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8088 /* Three dot-products and a subtraction. */
8089 factor = 4;
8090 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8091 stmt_info, 0, vect_body);
8094 if (dump_enabled_p ()
8095 && reduction_type == FOLD_LEFT_REDUCTION)
8096 dump_printf_loc (MSG_NOTE, vect_location,
8097 "using an in-order (fold-left) reduction.\n");
8098 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8099 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8100 reductions go through their own vectorizable_* routines. */
8101 if (!single_defuse_cycle
8102 && !lane_reduc_code_p
8103 && reduction_type != FOLD_LEFT_REDUCTION)
8105 stmt_vec_info tem
8106 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8107 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8109 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8110 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8112 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8113 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8115 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8117 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8118 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8119 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
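/* To operate on partial vectors we must be able to ignore inactive
   lanes: either via a conditional internal function or cond-expr
   masking for ordinary reductions, or, for open-coded fold-left
   reductions, via a VEC_COND_EXPR that substitutes the identity
   value.  */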
8121 if (reduction_type != FOLD_LEFT_REDUCTION
8122 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8123 && (cond_fn == IFN_LAST
8124 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8125 OPTIMIZE_FOR_SPEED)))
8127 if (dump_enabled_p ())
8128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8129 "can't operate on partial vectors because"
8130 " no conditional operation is available.\n");
8131 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8133 else if (reduction_type == FOLD_LEFT_REDUCTION
8134 && reduc_fn == IFN_LAST
8135 && !expand_vec_cond_expr_p (vectype_in,
8136 truth_type_for (vectype_in),
8137 SSA_NAME))
8139 if (dump_enabled_p ())
8140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8141 "can't operate on partial vectors because"
8142 " no conditional operation is available.\n");
8143 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8145 else if (reduction_type == FOLD_LEFT_REDUCTION
8146 && reduc_fn == IFN_LAST
8147 && FLOAT_TYPE_P (vectype_in)
8148 && HONOR_SIGNED_ZEROS (vectype_in)
8149 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8151 if (dump_enabled_p ())
8152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8153 "can't operate on partial vectors because"
8154 " signed zeros cannot be preserved.\n");
8155 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8157 else
8159 internal_fn mask_reduc_fn
8160 = get_masked_reduction_fn (reduc_fn, vectype_in);
8162 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8163 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8164 vectype_in, 1);
8165 else
8166 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8167 vectype_in, NULL);
8170 return true;
8173 /* STMT_INFO is a dot-product reduction whose multiplication operands
8174 have different signs. Emit a sequence to emulate the operation
8175 using a series of signed DOT_PROD_EXPRs and return the last
8176 statement generated. VEC_DEST is the result of the vector operation
8177 and VOP lists its inputs. */
8179 static gassign *
8180 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8181 gimple_stmt_iterator *gsi, tree vec_dest,
8182 tree vop[3])
8184 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8185 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8186 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8187 gimple *new_stmt;
8189 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8190 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8191 std::swap (vop[0], vop[1]);
8193 /* Convert all inputs to signed types. */
8194 for (int i = 0; i < 3; ++i)
8195 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8197 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8198 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8199 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8200 vop[i] = tmp;
8203 /* In the comments below we assume 8-bit inputs for simplicity,
8204 but the approach works for any full integer type. */
8206 /* Create a vector of -128. */
8207 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8208 tree min_narrow = build_vector_from_val (narrow_vectype,
8209 min_narrow_elttype);
8211 /* Create a vector of 64. */
8212 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8213 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8214 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8216 /* Emit: SUB_RES = VOP[0] - 128 (computed as VOP[0] + (-128)). */
8217 tree sub_res = make_ssa_name (narrow_vectype);
8218 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8219 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8221 /* Emit:
8223 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8224 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8225 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8227 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8228 Doing the two 64 * y steps first allows more time to compute x. */
8229 tree stage1 = make_ssa_name (wide_vectype);
8230 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8231 vop[1], half_narrow, vop[2]);
8232 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8234 tree stage2 = make_ssa_name (wide_vectype);
8235 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8236 vop[1], half_narrow, stage1);
8237 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8239 tree stage3 = make_ssa_name (wide_vectype);
8240 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8241 sub_res, vop[1], stage2);
8242 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8244 /* Convert STAGE3 to the reduction type. */
8245 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8248 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8249 value. */
8251 bool
8252 vect_transform_reduction (loop_vec_info loop_vinfo,
8253 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8254 gimple **vec_stmt, slp_tree slp_node)
8256 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8257 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8258 int i;
8259 int ncopies;
8260 int vec_num;
8262 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8263 gcc_assert (reduc_info->is_reduc_info);
8265 if (nested_in_vect_loop_p (loop, stmt_info))
8267 loop = loop->inner;
8268 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8271 gimple_match_op op;
8272 if (!gimple_extract_op (stmt_info->stmt, &op))
8273 gcc_unreachable ();
8275 /* All uses but the last are expected to be defined in the loop.
8276 The last use is the reduction variable. In case of nested cycle this
8277 assumption is not true: we use reduc_index to record the index of the
8278 reduction variable. */
8279 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8280 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8281 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8282 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8284 if (slp_node)
8286 ncopies = 1;
8287 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8289 else
8291 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8292 vec_num = 1;
8295 code_helper code = canonicalize_code (op.code, op.type);
8296 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8297 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8298 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8299 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8301 /* Transform. */
8302 tree new_temp = NULL_TREE;
8303 auto_vec<tree> vec_oprnds0;
8304 auto_vec<tree> vec_oprnds1;
8305 auto_vec<tree> vec_oprnds2;
8306 tree def0;
8308 if (dump_enabled_p ())
8309 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8311 /* FORNOW: Multiple types are not supported for condition. */
8312 if (code == COND_EXPR)
8313 gcc_assert (ncopies == 1);
8315 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8317 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8318 if (reduction_type == FOLD_LEFT_REDUCTION)
8320 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8321 gcc_assert (code.is_tree_code ());
8322 return vectorize_fold_left_reduction
8323 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8324 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8325 lens);
8328 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8329 gcc_assert (single_defuse_cycle
8330 || code == DOT_PROD_EXPR
8331 || code == WIDEN_SUM_EXPR
8332 || code == SAD_EXPR);
8334 /* Create the destination vector */
8335 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8336 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8338 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8339 single_defuse_cycle && reduc_index == 0
8340 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8341 single_defuse_cycle && reduc_index == 1
8342 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8343 op.num_ops == 3
8344 && !(single_defuse_cycle && reduc_index == 2)
8345 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8346 if (single_defuse_cycle)
8348 gcc_assert (!slp_node);
8349 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8350 op.ops[reduc_index],
8351 reduc_index == 0 ? &vec_oprnds0
8352 : (reduc_index == 1 ? &vec_oprnds1
8353 : &vec_oprnds2));
8356 bool emulated_mixed_dot_prod
8357 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8358 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8360 gimple *new_stmt;
8361 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8362 if (masked_loop_p && !mask_by_cond_expr)
8364 /* No conditional ifns have been defined for dot-product yet. */
8365 gcc_assert (code != DOT_PROD_EXPR);
8367 /* Make sure that the reduction accumulator is vop[0]. */
8368 if (reduc_index == 1)
8370 gcc_assert (commutative_binary_op_p (code, op.type));
8371 std::swap (vop[0], vop[1]);
8373 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8374 vec_num * ncopies, vectype_in, i);
8375 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8376 vop[0], vop[1], vop[0]);
8377 new_temp = make_ssa_name (vec_dest, call);
8378 gimple_call_set_lhs (call, new_temp);
8379 gimple_call_set_nothrow (call, true);
8380 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8381 new_stmt = call;
8383 else
8385 if (op.num_ops == 3)
8386 vop[2] = vec_oprnds2[i];
8388 if (masked_loop_p && mask_by_cond_expr)
8390 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8391 vec_num * ncopies, vectype_in, i);
8392 build_vect_cond_expr (code, vop, mask, gsi);
8395 if (emulated_mixed_dot_prod)
8396 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8397 vec_dest, vop);
8398 else if (code.is_internal_fn ())
8399 new_stmt = gimple_build_call_internal (internal_fn (code),
8400 op.num_ops,
8401 vop[0], vop[1], vop[2]);
8402 else
8403 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8404 vop[0], vop[1], vop[2]);
8405 new_temp = make_ssa_name (vec_dest, new_stmt);
8406 gimple_set_lhs (new_stmt, new_temp);
8407 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8410 if (slp_node)
8411 slp_node->push_vec_def (new_stmt);
8412 else if (single_defuse_cycle
8413 && i < ncopies - 1)
8415 if (reduc_index == 0)
8416 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8417 else if (reduc_index == 1)
8418 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8419 else if (reduc_index == 2)
8420 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8422 else
8423 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8426 if (!slp_node)
8427 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8429 return true;
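/* Aside: a standalone sketch (plain C++) of the masked-reduction scheme
   used above for a PLUS reduction: the conditional internal function call
   (or the VEC_COND_EXPR fallback) makes inactive lanes pass the
   accumulator through unchanged.  Names are illustrative only.  */
static int
masked_sum_sketch (const int *a, const bool *mask, int n, int acc)
{
  for (int i = 0; i < n; ++i)
    /* Per lane: mask ? acc + a : acc, i.e. COND_ADD (mask, acc, a, acc).  */
    acc = mask[i] ? acc + a[i] : acc;
  return acc;
}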
8432 /* Transform phase of a cycle PHI. */
8434 bool
8435 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8436 stmt_vec_info stmt_info, gimple **vec_stmt,
8437 slp_tree slp_node, slp_instance slp_node_instance)
8439 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8440 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8441 int i;
8442 int ncopies;
8443 int j;
8444 bool nested_cycle = false;
8445 int vec_num;
8447 if (nested_in_vect_loop_p (loop, stmt_info))
8449 loop = loop->inner;
8450 nested_cycle = true;
8453 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8454 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8455 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8456 gcc_assert (reduc_info->is_reduc_info);
8458 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8459 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8460 /* Leave the scalar phi in place. */
8461 return true;
8463 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8464 /* For a nested cycle we do not fill the above. */
8465 if (!vectype_in)
8466 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8467 gcc_assert (vectype_in);
8469 if (slp_node)
8471 /* The size vect_schedule_slp_instance computes is off for us. */
8472 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8473 * SLP_TREE_LANES (slp_node), vectype_in);
8474 ncopies = 1;
8476 else
8478 vec_num = 1;
8479 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8482 /* Check whether we should use a single PHI node and accumulate
8483 vectors to one before the backedge. */
8484 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8485 ncopies = 1;
8487 /* Create the destination vector */
8488 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8489 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8490 vectype_out);
8492 /* Get the loop-entry arguments. */
8493 tree vec_initial_def = NULL_TREE;
8494 auto_vec<tree> vec_initial_defs;
8495 if (slp_node)
8497 vec_initial_defs.reserve (vec_num);
8498 if (nested_cycle)
8500 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8501 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8502 &vec_initial_defs);
8504 else
8506 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8507 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8508 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8510 unsigned int num_phis = stmts.length ();
8511 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8512 num_phis = 1;
8513 initial_values.reserve (num_phis);
8514 for (unsigned int i = 0; i < num_phis; ++i)
8516 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8517 initial_values.quick_push (vect_phi_initial_value (this_phi));
8519 if (vec_num == 1)
8520 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8521 if (!initial_values.is_empty ())
8523 tree initial_value
8524 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8525 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8526 tree neutral_op
8527 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8528 code, initial_value);
8529 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8530 &vec_initial_defs, vec_num,
8531 stmts.length (), neutral_op);
8535 else
8537 /* Get at the scalar def before the loop that defines the initial
8538 value of the reduction variable. */
8539 tree initial_def = vect_phi_initial_value (phi);
8540 reduc_info->reduc_initial_values.safe_push (initial_def);
8541 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8542 and we can't use zero for induc_val, use initial_def. Similarly
8543 for REDUC_MIN and initial_def larger than the base. */
8544 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8546 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8547 if (TREE_CODE (initial_def) == INTEGER_CST
8548 && !integer_zerop (induc_val)
8549 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8550 && tree_int_cst_lt (initial_def, induc_val))
8551 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8552 && tree_int_cst_lt (induc_val, initial_def))))
8554 induc_val = initial_def;
8555 /* Communicate we used the initial_def to epilogue
8556 generation. */
8557 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8559 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8561 else if (nested_cycle)
8563 /* Do not use an adjustment def as that case is not supported
8564 correctly if ncopies is not one. */
8565 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8566 ncopies, initial_def,
8567 &vec_initial_defs);
8569 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8570 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8571 /* Fill the initial vector with the initial scalar value. */
8572 vec_initial_def
8573 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8574 initial_def, initial_def);
8575 else
8577 if (ncopies == 1)
8578 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8579 if (!reduc_info->reduc_initial_values.is_empty ())
8581 initial_def = reduc_info->reduc_initial_values[0];
8582 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8583 tree neutral_op
8584 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8585 code, initial_def);
8586 gcc_assert (neutral_op);
8587 /* Try to simplify the vector initialization by applying an
8588 adjustment after the reduction has been performed. */
8589 if (!reduc_info->reused_accumulator
8590 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8591 && !operand_equal_p (neutral_op, initial_def))
8593 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8594 = initial_def;
8595 initial_def = neutral_op;
8597 vec_initial_def
8598 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8599 initial_def, neutral_op);
8604 if (vec_initial_def)
8606 vec_initial_defs.create (ncopies);
8607 for (i = 0; i < ncopies; ++i)
8608 vec_initial_defs.quick_push (vec_initial_def);
8611 if (auto *accumulator = reduc_info->reused_accumulator)
8613 tree def = accumulator->reduc_input;
8614 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8616 unsigned int nreduc;
8617 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8618 (TREE_TYPE (def)),
8619 TYPE_VECTOR_SUBPARTS (vectype_out),
8620 &nreduc);
8621 gcc_assert (res);
8622 gimple_seq stmts = NULL;
8623 /* Reduce the single vector to a smaller one. */
8624 if (nreduc != 1)
8626 /* Perform the reduction in the appropriate type. */
8627 tree rvectype = vectype_out;
8628 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8629 TREE_TYPE (TREE_TYPE (def))))
8630 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8631 TYPE_VECTOR_SUBPARTS
8632 (vectype_out));
8633 def = vect_create_partial_epilog (def, rvectype,
8634 STMT_VINFO_REDUC_CODE
8635 (reduc_info),
8636 &stmts);
8638 /* The epilogue loop might use a different vector mode, like
8639 VNx2DI vs. V2DI. */
8640 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8642 tree reduc_type = build_vector_type_for_mode
8643 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8644 def = gimple_convert (&stmts, reduc_type, def);
8646 /* Adjust the input so we pick up the partially reduced value
8647 for the skip edge in vect_create_epilog_for_reduction. */
8648 accumulator->reduc_input = def;
8649 /* And the reduction could be carried out using a different sign. */
8650 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8651 def = gimple_convert (&stmts, vectype_out, def);
8652 if (loop_vinfo->main_loop_edge)
8654 /* While we'd like to insert on the edge, doing so would split
8655 blocks and disturb bookkeeping; we will also eventually need
8656 this on the skip edge. Rely on sinking to fix up the optimal
8657 placement and insert in the predecessor instead. */
8658 gimple_stmt_iterator gsi
8659 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8660 /* Insert before a cond that eventually skips the
8661 epilogue. */
8662 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8663 gsi_prev (&gsi);
8664 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8666 else
8667 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8668 stmts);
8670 if (loop_vinfo->main_loop_edge)
8671 vec_initial_defs[0]
8672 = vect_get_main_loop_result (loop_vinfo, def,
8673 vec_initial_defs[0]);
8674 else
8675 vec_initial_defs.safe_push (def);
8678 /* Generate the reduction PHIs upfront. */
8679 for (i = 0; i < vec_num; i++)
8681 tree vec_init_def = vec_initial_defs[i];
8682 for (j = 0; j < ncopies; j++)
8684 /* Create the reduction-phi that defines the reduction
8685 operand. */
8686 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8688 /* Set the loop-entry arg of the reduction-phi. */
8689 if (j != 0 && nested_cycle)
8690 vec_init_def = vec_initial_defs[j];
8691 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8692 UNKNOWN_LOCATION);
8694 /* The loop-latch arg is set in epilogue processing. */
8696 if (slp_node)
8697 slp_node->push_vec_def (new_phi);
8698 else
8700 if (j == 0)
8701 *vec_stmt = new_phi;
8702 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8707 return true;
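/* Aside: a standalone sketch (plain C++) of the initial-value handling
   above for a PLUS reduction: instead of baking the scalar initial value
   into the initial vector, seed the accumulator with the neutral element
   and apply the initial value as an epilogue adjustment after the final
   reduction.  Names are illustrative only.  */
static int
reduc_epilogue_adjustment_sketch (const int *a, int n, int initial_def)
{
  const int neutral_op = 0;	/* Neutral element for PLUS.  */
  int acc = neutral_op;		/* What the reduction PHI starts from.  */
  for (int i = 0; i < n; ++i)
    acc += a[i];
  /* The recorded epilogue adjustment: fold initial_def back in.  */
  return acc + initial_def;
}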
8710 /* Vectorizes LC PHIs. */
8712 bool
8713 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8714 stmt_vec_info stmt_info, gimple **vec_stmt,
8715 slp_tree slp_node)
8717 if (!loop_vinfo
8718 || !is_a <gphi *> (stmt_info->stmt)
8719 || gimple_phi_num_args (stmt_info->stmt) != 1)
8720 return false;
8722 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8723 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8724 return false;
8726 if (!vec_stmt) /* transformation not required. */
8728 /* Deal with copies from externs or constants that are disguised as
8729 loop-closed PHI nodes (PR97886). */
8730 if (slp_node
8731 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8732 SLP_TREE_VECTYPE (slp_node)))
8734 if (dump_enabled_p ())
8735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8736 "incompatible vector types for invariants\n");
8737 return false;
8739 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8740 return true;
8743 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8744 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8745 basic_block bb = gimple_bb (stmt_info->stmt);
8746 edge e = single_pred_edge (bb);
8747 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8748 auto_vec<tree> vec_oprnds;
8749 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8750 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8751 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8752 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8754 /* Create the vectorized LC PHI node. */
8755 gphi *new_phi = create_phi_node (vec_dest, bb);
8756 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8757 if (slp_node)
8758 slp_node->push_vec_def (new_phi);
8759 else
8760 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8762 if (!slp_node)
8763 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8765 return true;
8768 /* Vectorizes PHIs. */
8770 bool
8771 vectorizable_phi (vec_info *,
8772 stmt_vec_info stmt_info, gimple **vec_stmt,
8773 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8775 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8776 return false;
8778 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8779 return false;
8781 tree vectype = SLP_TREE_VECTYPE (slp_node);
8783 if (!vec_stmt) /* transformation not required. */
8785 slp_tree child;
8786 unsigned i;
8787 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8788 if (!child)
8790 if (dump_enabled_p ())
8791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8792 "PHI node with unvectorized backedge def\n");
8793 return false;
8795 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8797 if (dump_enabled_p ())
8798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8799 "incompatible vector types for invariants\n");
8800 return false;
8802 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8803 && !useless_type_conversion_p (vectype,
8804 SLP_TREE_VECTYPE (child)))
8806 /* With bools we can have mask and non-mask precision vectors
8807 or different non-mask precisions. While pattern recognition
8808 is supposed to guarantee consistency here, bugs in it can
8809 cause mismatches (PR103489 and PR103800 for example).
8810 Deal with them here instead of ICEing later. */
8811 if (dump_enabled_p ())
8812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8813 "incompatible vector type setup from "
8814 "bool pattern detection\n");
8815 return false;
8818 /* For single-argument PHIs assume coalescing which means zero cost
8819 for the scalar and the vector PHIs. This avoids artificially
8820 favoring the vector path (but may pessimize it in some cases). */
8821 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8822 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8823 vector_stmt, stmt_info, vectype, 0, vect_body);
8824 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8825 return true;
8828 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8829 basic_block bb = gimple_bb (stmt_info->stmt);
8830 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8831 auto_vec<gphi *> new_phis;
8832 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8834 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8836 /* Skip not yet vectorized defs. */
8837 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8838 && SLP_TREE_VEC_DEFS (child).is_empty ())
8839 continue;
8841 auto_vec<tree> vec_oprnds;
8842 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8843 if (!new_phis.exists ())
8845 new_phis.create (vec_oprnds.length ());
8846 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8848 /* Create the vectorized PHI node. */
8849 new_phis.quick_push (create_phi_node (vec_dest, bb));
8850 slp_node->push_vec_def (new_phis[j]);
8853 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8854 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8855 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8857 /* We should have at least one already vectorized child. */
8858 gcc_assert (new_phis.exists ());
8860 return true;
8863 /* Vectorizes first order recurrences. An overview of the transformation
8864 is described below. Suppose we have the following loop.
8866 int t = 0;
8867 for (int i = 0; i < n; ++i)
8869 b[i] = a[i] - t;
8870 t = a[i];
8873 There is a first-order recurrence on 't' (the previous a[i]). For this loop, the scalar IR
8874 looks (simplified) like:
8876 scalar.preheader:
8877 init = 0;
8879 scalar.body:
8880 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8881 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8882 _1 = a[i]
8883 b[i] = _1 - _2
8884 if (i < n) goto scalar.body
8886 In this example, _2 is a recurrence because its value depends on the
8887 previous iteration. We vectorize this as (VF = 4)
8889 vector.preheader:
8890 vect_init = vect_cst(..., ..., ..., 0)
8892 vector.body
8893 i = PHI <0(vector.preheader), i+4(vector.body)>
8894 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8895 vect_2 = a[i, i+1, i+2, i+3];
8896 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8897 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8898 if (..) goto vector.body
8900 In this function, vectorizable_recurr, we code generate both the
8901 vector PHI node and the permute since those together compute the
8902 vectorized value of the scalar PHI. We do not yet have the
8903 backedge value to fill in there nor into the vec_perm. Those
8904 are filled in maybe_set_vectorized_backedge_value and
8905 vect_schedule_scc.
8907 TODO: Since the scalar loop does not have a use of the recurrence
8908 outside of the loop, the natural way to implement peeling via
8909 vectorizing the live value doesn't work. For now peeling of loops
8910 with a recurrence is not implemented. For SLP the supported cases
8911 are restricted to those requiring a single vector recurrence PHI. */
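/* Aside: a standalone sketch (plain C++) of the transformation described
   above for VF = 4.  The permuted vector { prev[3], cur[0], cur[1], cur[2] }
   is exactly vec_perm (vect_1, vect_2, { 3, 4, 5, 6 }) and supplies the
   value of 't' for each lane.  The scalar tail is omitted; names are
   illustrative only.  */
static void
first_order_recurrence_sketch (const int *a, int *b, int n)
{
  int vect_1[4] = { 0, 0, 0, 0 };	/* vect_init; only lane 3 matters.  */
  for (int i = 0; i + 4 <= n; i += 4)
    {
      int vect_2[4] = { a[i], a[i + 1], a[i + 2], a[i + 3] };
      int vect_3[4] = { vect_1[3], vect_2[0], vect_2[1], vect_2[2] };
      for (int lane = 0; lane < 4; ++lane)
	b[i + lane] = vect_2[lane] - vect_3[lane];
      for (int lane = 0; lane < 4; ++lane)
	vect_1[lane] = vect_2[lane];	/* Backedge value of the vector PHI.  */
    }
}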
8913 bool
8914 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8915 gimple **vec_stmt, slp_tree slp_node,
8916 stmt_vector_for_cost *cost_vec)
8918 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8919 return false;
8921 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8923 /* So far we only support first-order recurrence auto-vectorization. */
8924 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8925 return false;
8927 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8928 unsigned ncopies;
8929 if (slp_node)
8930 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8931 else
8932 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8933 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8934 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8935 /* We need to be able to make progress with a single vector. */
8936 if (maybe_gt (dist * 2, nunits))
8938 if (dump_enabled_p ())
8939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8940 "first order recurrence exceeds half of "
8941 "a vector\n");
8942 return false;
8945 /* First-order recurrence autovectorization needs to handle permutation
8946 with indices = [nunits-1, nunits, nunits+1, ...]. */
8947 vec_perm_builder sel (nunits, 1, 3);
8948 for (int i = 0; i < 3; ++i)
8949 sel.quick_push (nunits - dist + i);
8950 vec_perm_indices indices (sel, 2, nunits);
8952 if (!vec_stmt) /* transformation not required. */
8954 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8955 indices))
8956 return false;
8958 if (slp_node)
8960 /* We eventually need to set a vector type on invariant
8961 arguments. */
8962 unsigned j;
8963 slp_tree child;
8964 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8965 if (!vect_maybe_update_slp_op_vectype
8966 (child, SLP_TREE_VECTYPE (slp_node)))
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8970 "incompatible vector types for "
8971 "invariants\n");
8972 return false;
8975 /* The recurrence costs the initialization vector and one permute
8976 for each copy. */
8977 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8978 stmt_info, 0, vect_prologue);
8979 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8980 stmt_info, 0, vect_body);
8981 if (dump_enabled_p ())
8982 dump_printf_loc (MSG_NOTE, vect_location,
8983 "vectorizable_recurr: inside_cost = %d, "
8984 "prologue_cost = %d .\n", inside_cost,
8985 prologue_cost);
8987 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8988 return true;
8991 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8992 basic_block bb = gimple_bb (phi);
8993 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8994 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8996 gimple_seq stmts = NULL;
8997 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8998 gsi_insert_seq_on_edge_immediate (pe, stmts);
9000 tree vec_init = build_vector_from_val (vectype, preheader);
9001 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9003 /* Create the vectorized first-order PHI node. */
9004 tree vec_dest = vect_get_new_vect_var (vectype,
9005 vect_simple_var, "vec_recur_");
9006 gphi *new_phi = create_phi_node (vec_dest, bb);
9007 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9009 /* Insert the shuffles needed for the first-order recurrence autovectorization:
9010 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
9011 tree perm = vect_gen_perm_mask_checked (vectype, indices);
9013 /* Insert the required permute after the latch definition. The
9014 second and later operands are tentative and will be updated when we have
9015 vectorized the latch definition. */
9016 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9017 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9018 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9019 gsi_next (&gsi2);
9021 for (unsigned i = 0; i < ncopies; ++i)
9023 vec_dest = make_ssa_name (vectype);
9024 gassign *vperm
9025 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9026 i == 0 ? gimple_phi_result (new_phi) : NULL,
9027 NULL, perm);
9028 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9030 if (slp_node)
9031 slp_node->push_vec_def (vperm);
9032 else
9033 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9036 if (!slp_node)
9037 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9038 return true;
9041 /* Return true if VECTYPE represents a vector that requires lowering
9042 by the vector lowering pass. */
9044 bool
9045 vect_emulated_vector_p (tree vectype)
9047 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9048 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9049 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9052 /* Return true if we can emulate CODE on an integer mode representation
9053 of a vector. */
9055 bool
9056 vect_can_vectorize_without_simd_p (tree_code code)
9058 switch (code)
9060 case PLUS_EXPR:
9061 case MINUS_EXPR:
9062 case NEGATE_EXPR:
9063 case BIT_AND_EXPR:
9064 case BIT_IOR_EXPR:
9065 case BIT_XOR_EXPR:
9066 case BIT_NOT_EXPR:
9067 return true;
9069 default:
9070 return false;
9074 /* Likewise, but taking a code_helper. */
9076 bool
9077 vect_can_vectorize_without_simd_p (code_helper code)
9079 return (code.is_tree_code ()
9080 && vect_can_vectorize_without_simd_p (tree_code (code)));
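/* Aside: a standalone sketch (plain C++) of emulating a vector operation
   on an integer mode, as allowed by the two predicates above.  Bitwise
   AND/IOR/XOR/NOT never carry across lane boundaries, so one word-sized
   operation acts on all lanes of e.g. a four-times-8-bit "vector" held in
   a 32-bit integer.  Names are illustrative only.  */
static unsigned int
emulated_v4qi_xor_sketch (unsigned int x, unsigned int y)
{
  /* Each of the four 8-bit lanes is XORed independently.  */
  return x ^ y;
}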
9083 /* Create vector init for vectorized iv. */
9084 static tree
9085 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9086 tree step_expr, poly_uint64 nunits,
9087 tree vectype,
9088 enum vect_induction_op_type induction_type)
9090 unsigned HOST_WIDE_INT const_nunits;
9091 tree vec_shift, vec_init, new_name;
9092 unsigned i;
9093 tree itype = TREE_TYPE (vectype);
9095 /* iv_loop is the loop to be vectorized. Create:
9096 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
9097 new_name = gimple_convert (stmts, itype, init_expr);
9098 switch (induction_type)
9100 case vect_step_op_shr:
9101 case vect_step_op_shl:
9102 /* Build the Initial value from shift_expr. */
9103 vec_init = gimple_build_vector_from_val (stmts,
9104 vectype,
9105 new_name);
9106 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9107 build_zero_cst (itype), step_expr);
9108 vec_init = gimple_build (stmts,
9109 (induction_type == vect_step_op_shr
9110 ? RSHIFT_EXPR : LSHIFT_EXPR),
9111 vectype, vec_init, vec_shift);
9112 break;
9114 case vect_step_op_neg:
9116 vec_init = gimple_build_vector_from_val (stmts,
9117 vectype,
9118 new_name);
9119 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9120 vectype, vec_init);
9121 /* The encoding has 2 interleaved stepped patterns. */
9122 vec_perm_builder sel (nunits, 2, 3);
9123 sel.quick_grow (6);
9124 for (i = 0; i < 3; i++)
9126 sel[2 * i] = i;
9127 sel[2 * i + 1] = i + nunits;
9129 vec_perm_indices indices (sel, 2, nunits);
9130 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9131 fail when vec_init is const vector. In that situation vec_perm is not
9132 really needed. */
9133 tree perm_mask_even
9134 = vect_gen_perm_mask_any (vectype, indices);
9135 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9136 vectype,
9137 vec_init, vec_neg,
9138 perm_mask_even);
9140 break;
9142 case vect_step_op_mul:
9144 /* Use unsigned mult to avoid undefined signed integer overflow. */
9145 gcc_assert (nunits.is_constant (&const_nunits));
9146 tree utype = unsigned_type_for (itype);
9147 tree uvectype = build_vector_type (utype,
9148 TYPE_VECTOR_SUBPARTS (vectype));
9149 new_name = gimple_convert (stmts, utype, new_name);
9150 vec_init = gimple_build_vector_from_val (stmts,
9151 uvectype,
9152 new_name);
9153 tree_vector_builder elts (uvectype, const_nunits, 1);
9154 tree elt_step = build_one_cst (utype);
9156 elts.quick_push (elt_step);
9157 for (i = 1; i < const_nunits; i++)
9159 /* Create: elt_step = elt_step * step_expr, i.e. pow (step_expr, i). */
9160 elt_step = gimple_build (stmts, MULT_EXPR,
9161 utype, elt_step, step_expr);
9162 elts.quick_push (elt_step);
9164 /* Create the vector [1, step_expr, pow (step_expr, 2), ...,
9165 pow (step_expr, nunits-1)]. */
9166 tree vec_mul = gimple_build_vector (stmts, &elts);
9167 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9168 vec_init, vec_mul);
9169 vec_init = gimple_convert (stmts, vectype, vec_init);
9171 break;
9173 default:
9174 gcc_unreachable ();
9177 return vec_init;
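/* Aside: a standalone sketch (plain C++) of the vect_step_op_mul initial
   vector built above, for four lanes:
   vec_init = [X, X*S, X*S^2, X*S^3], computed in unsigned arithmetic to
   avoid undefined signed overflow.  Names are illustrative only.  */
static void
mul_iv_init_sketch (unsigned int init, unsigned int step, unsigned int out[4])
{
  unsigned int elt_step = 1;		/* 1, S, S^2, S^3 in turn.  */
  for (int i = 0; i < 4; ++i)
    {
      out[i] = init * elt_step;
      elt_step *= step;
    }
}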
9180 /* Peel init_expr by skip_niters iterations for induction_type. */
9181 tree
9182 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9183 tree skip_niters, tree step_expr,
9184 enum vect_induction_op_type induction_type)
9186 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9187 tree type = TREE_TYPE (init_expr);
9188 unsigned prec = TYPE_PRECISION (type);
9189 switch (induction_type)
9191 case vect_step_op_neg:
9192 if (TREE_INT_CST_LOW (skip_niters) % 2)
9193 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9194 /* else no change. */
9195 break;
9197 case vect_step_op_shr:
9198 case vect_step_op_shl:
9199 skip_niters = gimple_convert (stmts, type, skip_niters);
9200 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9201 /* When the shift amount >= precision, we need to avoid undefined behavior.
9202 In the original loop there is no UB, and according to the semantics
9203 init_expr should become 0 for lshr and ashl, and init_expr >> (prec - 1) for ashr. */
9204 if (!tree_fits_uhwi_p (step_expr)
9205 || tree_to_uhwi (step_expr) >= prec)
9207 if (induction_type == vect_step_op_shl
9208 || TYPE_UNSIGNED (type))
9209 init_expr = build_zero_cst (type);
9210 else
9211 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9212 init_expr,
9213 wide_int_to_tree (type, prec - 1));
9215 else
9216 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9217 ? RSHIFT_EXPR : LSHIFT_EXPR),
9218 type, init_expr, step_expr);
9219 break;
9221 case vect_step_op_mul:
9223 tree utype = unsigned_type_for (type);
9224 init_expr = gimple_convert (stmts, utype, init_expr);
9225 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9226 wide_int begin = wi::to_wide (step_expr);
9227 for (unsigned i = 0; i != skipn - 1; i++)
9228 begin = wi::mul (begin, wi::to_wide (step_expr));
9229 tree mult_expr = wide_int_to_tree (utype, begin);
9230 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9231 init_expr = gimple_convert (stmts, type, init_expr);
9233 break;
9235 default:
9236 gcc_unreachable ();
9239 return init_expr;
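/* Aside: a standalone sketch (plain C++) of peeling a vect_step_op_mul IV
   as done above: after skipping SKIP iterations the start value becomes
   init * step^skip, again in unsigned arithmetic.  Names are illustrative
   only.  */
static unsigned int
peel_mul_iv_init_sketch (unsigned int init, unsigned int step,
			 unsigned int skip)
{
  unsigned int mult = 1;
  for (unsigned int i = 0; i < skip; ++i)
    mult *= step;
  return init * mult;
}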
9242 /* Create vector step for vectorized iv. */
9243 static tree
9244 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9245 poly_uint64 vf,
9246 enum vect_induction_op_type induction_type)
9248 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9249 tree new_name = NULL;
9250 /* Step should be pow (step, vf) for mult induction. */
9251 if (induction_type == vect_step_op_mul)
9253 gcc_assert (vf.is_constant ());
9254 wide_int begin = wi::to_wide (step_expr);
9256 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9257 begin = wi::mul (begin, wi::to_wide (step_expr));
9259 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9261 else if (induction_type == vect_step_op_neg)
9262 /* Do nothing. */
9264 else
9265 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9266 expr, step_expr);
9267 return new_name;
9270 static tree
9271 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9272 stmt_vec_info stmt_info,
9273 tree new_name, tree vectype,
9274 enum vect_induction_op_type induction_type)
9276 /* No step is needed for neg induction. */
9277 if (induction_type == vect_step_op_neg)
9278 return NULL;
9280 tree t = unshare_expr (new_name);
9281 gcc_assert (CONSTANT_CLASS_P (new_name)
9282 || TREE_CODE (new_name) == SSA_NAME);
9283 tree new_vec = build_vector_from_val (vectype, t);
9284 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9285 new_vec, vectype, NULL);
9286 return vec_step;
9289 /* Update the vectorized iv with vec_step; induc_def is the initial value. */
9290 static tree
9291 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9292 tree induc_def, tree vec_step,
9293 enum vect_induction_op_type induction_type)
9295 tree vec_def = induc_def;
9296 switch (induction_type)
9298 case vect_step_op_mul:
9300 /* Use unsigned mult to avoid undefined signed integer overflow. */
9301 tree uvectype
9302 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9303 TYPE_VECTOR_SUBPARTS (vectype));
9304 vec_def = gimple_convert (stmts, uvectype, vec_def);
9305 vec_step = gimple_convert (stmts, uvectype, vec_step);
9306 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9307 vec_def, vec_step);
9308 vec_def = gimple_convert (stmts, vectype, vec_def);
9310 break;
9312 case vect_step_op_shr:
9313 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9314 vec_def, vec_step);
9315 break;
9317 case vect_step_op_shl:
9318 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9319 vec_def, vec_step);
9320 break;
9321 case vect_step_op_neg:
9322 vec_def = induc_def;
9323 /* Do nothing. */
9324 break;
9325 default:
9326 gcc_unreachable ();
9329 return vec_def;
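/* Aside: a standalone sketch (plain C++) of one vector iteration of a
   vect_step_op_shr IV with VF = 4.  The prolog builds
   [X>>0, X>>S, X>>2*S, X>>3*S]; the in-loop update then shifts every lane
   by the same step * VF.  This assumes 4 * step is smaller than the
   element precision, mirroring the check performed during analysis.
   Names are illustrative only.  */
static void
shr_iv_update_sketch (unsigned int iv[4], unsigned int step)
{
  unsigned int vec_step = 4 * step;	/* step * VF, splat into the vector step.  */
  for (int lane = 0; lane < 4; ++lane)
    iv[lane] >>= vec_step;
}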
9333 /* Function vectorizable_induction
9335 Check if STMT_INFO performs a nonlinear induction computation that can be
9336 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9337 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9338 basic block.
9339 Return true if STMT_INFO is vectorizable in this way. */
9341 static bool
9342 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9343 stmt_vec_info stmt_info,
9344 gimple **vec_stmt, slp_tree slp_node,
9345 stmt_vector_for_cost *cost_vec)
9347 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9348 unsigned ncopies;
9349 bool nested_in_vect_loop = false;
9350 class loop *iv_loop;
9351 tree vec_def;
9352 edge pe = loop_preheader_edge (loop);
9353 basic_block new_bb;
9354 tree vec_init, vec_step;
9355 tree new_name;
9356 gimple *new_stmt;
9357 gphi *induction_phi;
9358 tree induc_def, vec_dest;
9359 tree init_expr, step_expr;
9360 tree niters_skip;
9361 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9362 unsigned i;
9363 gimple_stmt_iterator si;
9365 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9367 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9368 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9369 enum vect_induction_op_type induction_type
9370 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9372 gcc_assert (induction_type > vect_step_op_add);
9374 if (slp_node)
9375 ncopies = 1;
9376 else
9377 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9378 gcc_assert (ncopies >= 1);
9380 /* FORNOW. Only handle nonlinear induction in the same loop. */
9381 if (nested_in_vect_loop_p (loop, stmt_info))
9383 if (dump_enabled_p ())
9384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9385 "nonlinear induction in nested loop.\n");
9386 return false;
9389 iv_loop = loop;
9390 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9392 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9393 update for each iv and a permutation to generate the wanted vector iv. */
9394 if (slp_node)
9396 if (dump_enabled_p ())
9397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9398 "SLP induction not supported for nonlinear"
9399 " induction.\n");
9400 return false;
9403 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9405 if (dump_enabled_p ())
9406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9407 "floating point nonlinear induction vectorization"
9408 " not supported.\n");
9409 return false;
9412 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9413 init_expr = vect_phi_initial_value (phi);
9414 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9415 && TREE_CODE (step_expr) == INTEGER_CST);
9416 /* step_expr should have the same type as init_expr,
9417 i.e. for uint64 a >> 1 the scalar step is int but a vector<uint64> shift is used. */
9418 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9420 if (TREE_CODE (init_expr) == INTEGER_CST)
9421 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9422 else
9423 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9424 TREE_TYPE (init_expr)));
9426 switch (induction_type)
9428 case vect_step_op_neg:
9429 if (TREE_CODE (init_expr) != INTEGER_CST
9430 && TREE_CODE (init_expr) != REAL_CST)
9432 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9433 if (!directly_supported_p (NEGATE_EXPR, vectype))
9434 return false;
9436 /* The encoding has 2 interleaved stepped patterns. */
9437 vec_perm_builder sel (nunits, 2, 3);
9438 machine_mode mode = TYPE_MODE (vectype);
9439 sel.quick_grow (6);
9440 for (i = 0; i < 3; i++)
9442 sel[i * 2] = i;
9443 sel[i * 2 + 1] = i + nunits;
9445 vec_perm_indices indices (sel, 2, nunits);
9446 if (!can_vec_perm_const_p (mode, mode, indices))
9447 return false;
9449 break;
9451 case vect_step_op_mul:
9453 /* Check for backend support of MULT_EXPR. */
9454 if (!directly_supported_p (MULT_EXPR, vectype))
9455 return false;
9457 /* ?? How to construct the vector step for variable-length vectors:
9458 [ 1, step, pow (step, 2), pow (step, 3), .. ]. */
9459 if (!vf.is_constant ())
9460 return false;
9462 break;
9464 case vect_step_op_shr:
9465 /* Check for backend support of RSHIFT_EXPR. */
9466 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9467 return false;
9469 /* Don't shift more than the type precision to avoid undefined behavior. */
9470 if (!tree_fits_uhwi_p (step_expr)
9471 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9472 TYPE_PRECISION (TREE_TYPE (init_expr))))
9473 return false;
9474 break;
9476 case vect_step_op_shl:
9477 /* Check for backend support of LSHIFT_EXPR. */
9478 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9479 return false;
9481 /* Don't shift more than the type precision to avoid undefined behavior. */
9482 if (!tree_fits_uhwi_p (step_expr)
9483 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9484 TYPE_PRECISION (TREE_TYPE (init_expr))))
9485 return false;
9487 break;
9489 default:
9490 gcc_unreachable ();
9493 if (!vec_stmt) /* transformation not required. */
9495 unsigned inside_cost = 0, prologue_cost = 0;
9496 /* loop cost for vec_loop. */
9498 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9499 stmt_info, 0, vect_body);
9501 /* loop cost for vec_loop. Neg induction doesn't have any
9502 inside_cost. */
9503 if (induction_type == vect_step_op_neg)
9504 inside_cost = 0;
9506 /* prologue cost for vec_init and vec_step. */
9507 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9508 stmt_info, 0, vect_prologue);
9510 if (dump_enabled_p ())
9511 dump_printf_loc (MSG_NOTE, vect_location,
9512 "vect_model_induction_cost: inside_cost = %d, "
9513 "prologue_cost = %d. \n", inside_cost,
9514 prologue_cost);
9516 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9517 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9518 return true;
9521 /* Transform. */
9523 /* Compute a vector variable, initialized with the first VF values of
9524 the induction variable. E.g., for an iv with IV_PHI='X' and
9525 evolution S, for a vector of 4 units, we want to compute:
9526 [X, X + S, X + 2*S, X + 3*S]. */
9528 if (dump_enabled_p ())
9529 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9531 pe = loop_preheader_edge (iv_loop);
9532 /* Find the first insertion point in the BB. */
9533 basic_block bb = gimple_bb (phi);
9534 si = gsi_after_labels (bb);
9536 gimple_seq stmts = NULL;
9538 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9539 /* If we are using the loop mask to "peel" for alignment then we need
9540 to adjust the start value here. */
9541 if (niters_skip != NULL_TREE)
9542 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9543 step_expr, induction_type);
9545 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9546 step_expr, nunits, vectype,
9547 induction_type);
9548 if (stmts)
9550 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9551 gcc_assert (!new_bb);
9554 stmts = NULL;
9555 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9556 vf, induction_type);
9557 if (stmts)
9559 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9560 gcc_assert (!new_bb);
9563 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9564 new_name, vectype,
9565 induction_type);
9566 /* Create the following def-use cycle:
9567 loop prolog:
9568 vec_init = ...
9569 vec_step = ...
9570 loop:
9571 vec_iv = PHI <vec_init, vec_loop>
9573 STMT
9575 vec_loop = vec_iv + vec_step; */
9577 /* Create the induction-phi that defines the induction-operand. */
9578 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9579 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9580 induc_def = PHI_RESULT (induction_phi);
9582 /* Create the iv update inside the loop. */
9583 stmts = NULL;
9584 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9585 induc_def, vec_step,
9586 induction_type);
9588 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9589 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9591 /* Set the arguments of the phi node: */
9592 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9593 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9594 UNKNOWN_LOCATION);
9596 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9597 *vec_stmt = induction_phi;
9599 /* In case the vectorization factor (VF) is bigger than the number
9600 of elements that we can fit in a vectype (nunits), we have to generate
9601 more than one vector stmt, i.e. we need to "unroll" the
9602 vector stmt by a factor of VF/nunits. For more details see the
9603 documentation in vectorizable_operation. */
9605 if (ncopies > 1)
9607 stmts = NULL;
9608 /* FORNOW. This restriction should be relaxed. */
9609 gcc_assert (!nested_in_vect_loop);
9611 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9612 nunits, induction_type);
9614 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9615 new_name, vectype,
9616 induction_type);
9617 vec_def = induc_def;
9618 for (i = 1; i < ncopies; i++)
9620 /* vec_i = vec_prev + vec_step. */
9621 stmts = NULL;
9622 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9623 vec_def, vec_step,
9624 induction_type);
9625 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9626 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9627 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9631 if (dump_enabled_p ())
9632 dump_printf_loc (MSG_NOTE, vect_location,
9633 "transform induction: created def-use cycle: %G%G",
9634 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9636 return true;
9639 /* Function vectorizable_induction
9641 Check if STMT_INFO performs an induction computation that can be vectorized.
9642 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9643 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9644 Return true if STMT_INFO is vectorizable in this way. */
9646 bool
9647 vectorizable_induction (loop_vec_info loop_vinfo,
9648 stmt_vec_info stmt_info,
9649 gimple **vec_stmt, slp_tree slp_node,
9650 stmt_vector_for_cost *cost_vec)
9652 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9653 unsigned ncopies;
9654 bool nested_in_vect_loop = false;
9655 class loop *iv_loop;
9656 tree vec_def;
9657 edge pe = loop_preheader_edge (loop);
9658 basic_block new_bb;
9659 tree new_vec, vec_init, vec_step, t;
9660 tree new_name;
9661 gimple *new_stmt;
9662 gphi *induction_phi;
9663 tree induc_def, vec_dest;
9664 tree init_expr, step_expr;
9665 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9666 unsigned i;
9667 tree expr;
9668 gimple_stmt_iterator si;
9669 enum vect_induction_op_type induction_type
9670 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9672 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9673 if (!phi)
9674 return false;
9676 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9677 return false;
9679 /* Make sure it was recognized as induction computation. */
9680 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9681 return false;
9683 /* Handle nonlinear induction in a separate place. */
9684 if (induction_type != vect_step_op_add)
9685 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9686 vec_stmt, slp_node, cost_vec);
9688 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9689 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9691 if (slp_node)
9692 ncopies = 1;
9693 else
9694 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9695 gcc_assert (ncopies >= 1);
9697 /* FORNOW. These restrictions should be relaxed. */
9698 if (nested_in_vect_loop_p (loop, stmt_info))
9700 imm_use_iterator imm_iter;
9701 use_operand_p use_p;
9702 gimple *exit_phi;
9703 edge latch_e;
9704 tree loop_arg;
9706 if (ncopies > 1)
9708 if (dump_enabled_p ())
9709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9710 "multiple types in nested loop.\n");
9711 return false;
9714 exit_phi = NULL;
9715 latch_e = loop_latch_edge (loop->inner);
9716 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9717 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9719 gimple *use_stmt = USE_STMT (use_p);
9720 if (is_gimple_debug (use_stmt))
9721 continue;
9723 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9725 exit_phi = use_stmt;
9726 break;
9729 if (exit_phi)
9731 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9732 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9733 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9735 if (dump_enabled_p ())
9736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9737 "inner-loop induction only used outside "
9738 "of the outer vectorized loop.\n");
9739 return false;
9743 nested_in_vect_loop = true;
9744 iv_loop = loop->inner;
9746 else
9747 iv_loop = loop;
9748 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9750 if (slp_node && !nunits.is_constant ())
9752 /* The current SLP code creates the step value element-by-element. */
9753 if (dump_enabled_p ())
9754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9755 "SLP induction not supported for variable-length"
9756 " vectors.\n");
9757 return false;
9760 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9762 if (dump_enabled_p ())
9763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9764 "floating point induction vectorization disabled\n");
9765 return false;
9768 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9769 gcc_assert (step_expr != NULL_TREE);
9770 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9772 /* Check for backend support of PLUS/MINUS_EXPR. */
9773 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9774 || !directly_supported_p (MINUS_EXPR, step_vectype))
9775 return false;
9777 if (!vec_stmt) /* transformation not required. */
9779 unsigned inside_cost = 0, prologue_cost = 0;
9780 if (slp_node)
9782 /* We eventually need to set a vector type on invariant
9783 arguments. */
9784 unsigned j;
9785 slp_tree child;
9786 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9787 if (!vect_maybe_update_slp_op_vectype
9788 (child, SLP_TREE_VECTYPE (slp_node)))
9790 if (dump_enabled_p ())
9791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9792 "incompatible vector types for "
9793 "invariants\n");
9794 return false;
9796 /* loop cost for vec_loop. */
9797 inside_cost
9798 = record_stmt_cost (cost_vec,
9799 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9800 vector_stmt, stmt_info, 0, vect_body);
9801 /* prologue cost for vec_init (if not nested) and step. */
9802 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9803 scalar_to_vec,
9804 stmt_info, 0, vect_prologue);
9806 else /* if (!slp_node) */
9808 /* loop cost for vec_loop. */
9809 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9810 stmt_info, 0, vect_body);
9811 /* prologue cost for vec_init and vec_step. */
9812 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9813 stmt_info, 0, vect_prologue);
9815 if (dump_enabled_p ())
9816 dump_printf_loc (MSG_NOTE, vect_location,
9817 "vect_model_induction_cost: inside_cost = %d, "
9818 "prologue_cost = %d .\n", inside_cost,
9819 prologue_cost);
9821 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9822 DUMP_VECT_SCOPE ("vectorizable_induction");
9823 return true;
9826 /* Transform. */
9828 /* Compute a vector variable, initialized with the first VF values of
9829 the induction variable. E.g., for an iv with IV_PHI='X' and
9830 evolution S, for a vector of 4 units, we want to compute:
9831 [X, X + S, X + 2*S, X + 3*S]. */
9833 if (dump_enabled_p ())
9834 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9836 pe = loop_preheader_edge (iv_loop);
9837 /* Find the first insertion point in the BB. */
9838 basic_block bb = gimple_bb (phi);
9839 si = gsi_after_labels (bb);
9841 /* For SLP induction we have to generate several IVs; for example,
9842 with group size 3 and 4-lane vectors we need
9843 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9844 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
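/* Concretely (an illustrative aside): lane ELTN of vector IV number IVN
   starts from scalar IV number (IVN*NUNITS + ELTN) % GROUP_SIZE, advanced
   by (IVN*NUNITS + ELTN) / GROUP_SIZE of that IV's own steps.  For the
   group size 3, 4-lane example above, IVN = 1 covers lanes 4..7, i.e.
   [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1], matching the second vector.  */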
9845 if (slp_node)
9847 /* Enforced above. */
9848 unsigned int const_nunits = nunits.to_constant ();
9850 /* The initial values are vectorized, but any lanes > group_size
9851 need adjustment. */
9852 slp_tree init_node
9853 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9855 /* Gather steps. Since we do not vectorize inductions as
9856 cycles we have to reconstruct the step from SCEV data. */
9857 unsigned group_size = SLP_TREE_LANES (slp_node);
9858 tree *steps = XALLOCAVEC (tree, group_size);
9859 tree *inits = XALLOCAVEC (tree, group_size);
9860 stmt_vec_info phi_info;
9861 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9863 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9864 if (!init_node)
9865 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9866 pe->dest_idx);
9869 /* Now generate the IVs. */
9870 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9871 gcc_assert ((const_nunits * nvects) % group_size == 0);
9872 unsigned nivs;
9873 if (nested_in_vect_loop)
9874 nivs = nvects;
9875 else
9877 /* Compute the number of distinct IVs we need. First reduce
9878 group_size if it is a multiple of const_nunits so we get
9879 one IV for a group_size of 4 but const_nunits 2. */
9880 unsigned group_sizep = group_size;
9881 if (group_sizep % const_nunits == 0)
9882 group_sizep = group_sizep / const_nunits;
9883 nivs = least_common_multiple (group_sizep,
9884 const_nunits) / const_nunits;
9886 tree stept = TREE_TYPE (step_vectype);
9887 tree lupdate_mul = NULL_TREE;
9888 if (!nested_in_vect_loop)
9890 /* The number of iterations covered in one vector iteration. */
9891 unsigned lup_mul = (nvects * const_nunits) / group_size;
9892 lupdate_mul
9893 = build_vector_from_val (step_vectype,
9894 SCALAR_FLOAT_TYPE_P (stept)
9895 ? build_real_from_wide (stept, lup_mul,
9896 UNSIGNED)
9897 : build_int_cstu (stept, lup_mul));
9899 tree peel_mul = NULL_TREE;
9900 gimple_seq init_stmts = NULL;
9901 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9903 if (SCALAR_FLOAT_TYPE_P (stept))
9904 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9905 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9906 else
9907 peel_mul = gimple_convert (&init_stmts, stept,
9908 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9909 peel_mul = gimple_build_vector_from_val (&init_stmts,
9910 step_vectype, peel_mul);
9912 unsigned ivn;
9913 auto_vec<tree> vec_steps;
9914 for (ivn = 0; ivn < nivs; ++ivn)
9916 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9917 tree_vector_builder init_elts (vectype, const_nunits, 1);
9918 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9919 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9921 /* The scalar steps of the IVs. */
9922 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9923 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9924 step_elts.quick_push (elt);
9925 if (!init_node)
9927 /* The scalar inits of the IVs if not vectorized. */
9928 elt = inits[(ivn*const_nunits + eltn) % group_size];
9929 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9930 TREE_TYPE (elt)))
9931 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9932 TREE_TYPE (vectype), elt);
9933 init_elts.quick_push (elt);
9935 /* The number of steps to add to the initial values. */
9936 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9937 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9938 ? build_real_from_wide (stept,
9939 mul_elt, UNSIGNED)
9940 : build_int_cstu (stept, mul_elt));
9942 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9943 vec_steps.safe_push (vec_step);
9944 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9945 if (peel_mul)
9946 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9947 step_mul, peel_mul);
9948 if (!init_node)
9949 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9951 /* Create the induction-phi that defines the induction-operand. */
9952 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9953 "vec_iv_");
9954 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9955 induc_def = PHI_RESULT (induction_phi);
9957 /* Create the iv update inside the loop */
9958 tree up = vec_step;
9959 if (lupdate_mul)
9960 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9961 vec_step, lupdate_mul);
9962 gimple_seq stmts = NULL;
9963 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9964 vec_def = gimple_build (&stmts,
9965 PLUS_EXPR, step_vectype, vec_def, up);
9966 vec_def = gimple_convert (&stmts, vectype, vec_def);
9967 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9968 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9969 UNKNOWN_LOCATION);
9971 if (init_node)
9972 vec_init = vect_get_slp_vect_def (init_node, ivn);
9973 if (!nested_in_vect_loop
9974 && !integer_zerop (step_mul))
9976 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9977 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9978 vec_step, step_mul);
9979 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9980 vec_def, up);
9981 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9984 /* Set the arguments of the phi node: */
9985 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9987 slp_node->push_vec_def (induction_phi);
9989 if (!nested_in_vect_loop)
9991 /* Fill up to the number of vectors we need for the whole group. */
9992 nivs = least_common_multiple (group_size,
9993 const_nunits) / const_nunits;
9994 vec_steps.reserve (nivs-ivn);
9995 for (; ivn < nivs; ++ivn)
9997 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9998 vec_steps.quick_push (vec_steps[0]);
10002 /* Re-use IVs when we can. We are generating further vector
10003 stmts by adding VF' * stride to the IVs generated above. */
10004 if (ivn < nvects)
10006 unsigned vfp
10007 = least_common_multiple (group_size, const_nunits) / group_size;
10008 tree lupdate_mul
10009 = build_vector_from_val (step_vectype,
10010 SCALAR_FLOAT_TYPE_P (stept)
10011 ? build_real_from_wide (stept,
10012 vfp, UNSIGNED)
10013 : build_int_cstu (stept, vfp));
10014 for (; ivn < nvects; ++ivn)
10016 gimple *iv
10017 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10018 tree def = gimple_get_lhs (iv);
10019 if (ivn < 2*nivs)
10020 vec_steps[ivn - nivs]
10021 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10022 vec_steps[ivn - nivs], lupdate_mul);
10023 gimple_seq stmts = NULL;
10024 def = gimple_convert (&stmts, step_vectype, def);
10025 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10026 def, vec_steps[ivn % nivs]);
10027 def = gimple_convert (&stmts, vectype, def);
10028 if (gimple_code (iv) == GIMPLE_PHI)
10029 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10030 else
10032 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10033 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10035 slp_node->push_vec_def (def);
10039 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10040 gcc_assert (!new_bb);
10042 return true;
10045 init_expr = vect_phi_initial_value (phi);
10047 gimple_seq stmts = NULL;
10048 if (!nested_in_vect_loop)
10050 /* Convert the initial value to the IV update type. */
10051 tree new_type = TREE_TYPE (step_expr);
10052 init_expr = gimple_convert (&stmts, new_type, init_expr);
10054 /* If we are using the loop mask to "peel" for alignment then we need
10055 to adjust the start value here. */
10056 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10057 if (skip_niters != NULL_TREE)
10059 if (FLOAT_TYPE_P (vectype))
10060 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10061 skip_niters);
10062 else
10063 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10064 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10065 skip_niters, step_expr);
10066 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10067 init_expr, skip_step);
10071 if (stmts)
10073 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10074 gcc_assert (!new_bb);
10077 /* Create the vector that holds the initial_value of the induction. */
10078 if (nested_in_vect_loop)
10080 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10081 been created during vectorization of previous stmts. We obtain it
10082 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10083 auto_vec<tree> vec_inits;
10084 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10085 init_expr, &vec_inits);
10086 vec_init = vec_inits[0];
10087 /* If the initial value is not of proper type, convert it. */
10088 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10090 new_stmt
10091 = gimple_build_assign (vect_get_new_ssa_name (vectype,
10092 vect_simple_var,
10093 "vec_iv_"),
10094 VIEW_CONVERT_EXPR,
10095 build1 (VIEW_CONVERT_EXPR, vectype,
10096 vec_init));
10097 vec_init = gimple_assign_lhs (new_stmt);
10098 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10099 new_stmt);
10100 gcc_assert (!new_bb);
10103 else
10105 /* iv_loop is the loop to be vectorized. Create:
10106 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10107 stmts = NULL;
10108 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10110 unsigned HOST_WIDE_INT const_nunits;
10111 if (nunits.is_constant (&const_nunits))
10113 tree_vector_builder elts (step_vectype, const_nunits, 1);
10114 elts.quick_push (new_name);
10115 for (i = 1; i < const_nunits; i++)
10117 /* Create: new_name_i = new_name + step_expr */
10118 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10119 new_name, step_expr);
10120 elts.quick_push (new_name);
10122 /* Create a vector from [new_name_0, new_name_1, ...,
10123 new_name_nunits-1] */
10124 vec_init = gimple_build_vector (&stmts, &elts);
10126 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10127 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10128 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10129 new_name, step_expr);
10130 else
10132 /* Build:
10133 [base, base, base, ...]
10134 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10135 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10136 gcc_assert (flag_associative_math);
10137 tree index = build_index_vector (step_vectype, 0, 1);
10138 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10139 new_name);
10140 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10141 step_expr);
10142 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10143 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10144 vec_init, step_vec);
10145 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10146 vec_init, base_vec);
10148 vec_init = gimple_convert (&stmts, vectype, vec_init);
10150 if (stmts)
10152 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10153 gcc_assert (!new_bb);
10158 /* Create the vector that holds the step of the induction. */
10159 if (nested_in_vect_loop)
10160 /* iv_loop is nested in the loop to be vectorized. Generate:
10161 vec_step = [S, S, S, S] */
10162 new_name = step_expr;
10163 else
10165 /* iv_loop is the loop to be vectorized. Generate:
10166 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10167 gimple_seq seq = NULL;
10168 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10170 expr = build_int_cst (integer_type_node, vf);
10171 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10173 else
10174 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10175 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10176 expr, step_expr);
10177 if (seq)
10179 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10180 gcc_assert (!new_bb);
10184 t = unshare_expr (new_name);
10185 gcc_assert (CONSTANT_CLASS_P (new_name)
10186 || TREE_CODE (new_name) == SSA_NAME);
10187 new_vec = build_vector_from_val (step_vectype, t);
10188 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10189 new_vec, step_vectype, NULL);
10192 /* Create the following def-use cycle:
10193 loop prolog:
10194 vec_init = ...
10195 vec_step = ...
10196 loop:
10197 vec_iv = PHI <vec_init, vec_loop>
10199 STMT
10201 vec_loop = vec_iv + vec_step; */
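   /* As a purely illustrative sketch (assuming a V4SI vector type, scalar
      init X and scalar step S), the cycle built below corresponds to:

	loop prolog:
	  vec_init = { X, X+S, X+2*S, X+3*S };
	  vec_step = { 4*S, 4*S, 4*S, 4*S };
	loop:
	  vec_iv = PHI <vec_init (preheader), vec_loop (latch)>;
	  ... STMT uses vec_iv ...
	  vec_loop = vec_iv + vec_step;  */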
10203 /* Create the induction-phi that defines the induction-operand. */
10204 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10205 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10206 induc_def = PHI_RESULT (induction_phi);
10208 /* Create the iv update inside the loop */
10209 stmts = NULL;
10210 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10211 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10212 vec_def = gimple_convert (&stmts, vectype, vec_def);
10213 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10214 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10216 /* Set the arguments of the phi node: */
10217 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10218 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10219 UNKNOWN_LOCATION);
10221 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10222 *vec_stmt = induction_phi;
10224 /* In case that vectorization factor (VF) is bigger than the number
10225 of elements that we can fit in a vectype (nunits), we have to generate
10226 more than one vector stmt - i.e - we need to "unroll" the
10227 vector stmt by a factor VF/nunits. For more details see documentation
10228 in vectorizable_operation. */
10230 if (ncopies > 1)
10232 gimple_seq seq = NULL;
10233 /* FORNOW. This restriction should be relaxed. */
10234 gcc_assert (!nested_in_vect_loop);
10236 /* Create the vector that holds the step of the induction. */
10237 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10239 expr = build_int_cst (integer_type_node, nunits);
10240 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10242 else
10243 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10244 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10245 expr, step_expr);
10246 if (seq)
10248 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10249 gcc_assert (!new_bb);
10252 t = unshare_expr (new_name);
10253 gcc_assert (CONSTANT_CLASS_P (new_name)
10254 || TREE_CODE (new_name) == SSA_NAME);
10255 new_vec = build_vector_from_val (step_vectype, t);
10256 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10257 new_vec, step_vectype, NULL);
10259 vec_def = induc_def;
10260 for (i = 1; i < ncopies + 1; i++)
10262 /* vec_i = vec_prev + vec_step */
10263 gimple_seq stmts = NULL;
10264 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10265 vec_def = gimple_build (&stmts,
10266 PLUS_EXPR, step_vectype, vec_def, vec_step);
10267 vec_def = gimple_convert (&stmts, vectype, vec_def);
10269 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10270 if (i < ncopies)
10272 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10273 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10275 else
10277 /* vec_1 = vec_iv + (VF/n * S)
10278 vec_2 = vec_1 + (VF/n * S)
10280 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10282 vec_n is used as vec_loop to save the large step register and
10283 related operations. */
10284 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10285 UNKNOWN_LOCATION);
10290 if (dump_enabled_p ())
10291 dump_printf_loc (MSG_NOTE, vect_location,
10292 "transform induction: created def-use cycle: %G%G",
10293 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10295 return true;
10298 /* Function vectorizable_live_operation.
10300 STMT_INFO computes a value that is used outside the loop. Check if
10301 it can be supported. */
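   For instance (a made-up example, with a, b, c, n and use () standing in
   for arbitrary declarations), the scalar "last" below is live after the
   loop, so its final value has to be extracted from the last active lane
   of the corresponding vector statement:

     int last = 0;
     for (int i = 0; i < n; i++)
       {
	 last = b[i] + c[i];
	 a[i] = last;
       }
     use (last);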
10303 bool
10304 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10305 slp_tree slp_node, slp_instance slp_node_instance,
10306 int slp_index, bool vec_stmt_p,
10307 stmt_vector_for_cost *cost_vec)
10309 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10310 imm_use_iterator imm_iter;
10311 tree lhs, lhs_type, bitsize;
10312 tree vectype = (slp_node
10313 ? SLP_TREE_VECTYPE (slp_node)
10314 : STMT_VINFO_VECTYPE (stmt_info));
10315 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10316 int ncopies;
10317 gimple *use_stmt;
10318 auto_vec<tree> vec_oprnds;
10319 int vec_entry = 0;
10320 poly_uint64 vec_index = 0;
10322 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10324 /* If a stmt of a reduction is live, vectorize it via
10325 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10326 validity so just trigger the transform here. */
10327 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10329 if (!vec_stmt_p)
10330 return true;
10331 if (slp_node)
10333 /* For reduction chains the meta-info is attached to
10334 the group leader. */
10335 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10336 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10337 /* For SLP reductions we vectorize the epilogue for
10338 all involved stmts together. */
10339 else if (slp_index != 0)
10340 return true;
10342 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10343 gcc_assert (reduc_info->is_reduc_info);
10344 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10345 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10346 return true;
10347 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10348 slp_node_instance);
10349 return true;
10352 /* If STMT is not relevant and it is a simple assignment and its inputs are
10353 invariant then it can remain in place, unvectorized. The original last
10354 scalar value that it computes will be used. */
10355 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10357 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10358 if (dump_enabled_p ())
10359 dump_printf_loc (MSG_NOTE, vect_location,
10360 "statement is simple and uses invariant. Leaving in "
10361 "place.\n");
10362 return true;
10365 if (slp_node)
10366 ncopies = 1;
10367 else
10368 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10370 if (slp_node)
10372 gcc_assert (slp_index >= 0);
10374 /* Get the last occurrence of the scalar index from the concatenation of
10375 all the slp vectors. Calculate which slp vector it is and the index
10376 within. */
10377 int num_scalar = SLP_TREE_LANES (slp_node);
10378 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10379 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10381 /* Calculate which vector contains the result, and which lane of
10382 that vector we need. */
10383 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10385 if (dump_enabled_p ())
10386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10387 "Cannot determine which vector holds the"
10388 " final result.\n");
10389 return false;
10393 if (!vec_stmt_p)
10395 /* No transformation required. */
10396 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10398 if (slp_node)
10400 if (dump_enabled_p ())
10401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10402 "can't operate on partial vectors "
10403 "because an SLP statement is live after "
10404 "the loop.\n");
10405 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10407 else if (ncopies > 1)
10409 if (dump_enabled_p ())
10410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10411 "can't operate on partial vectors "
10412 "because ncopies is greater than 1.\n");
10413 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10415 else
10417 gcc_assert (ncopies == 1 && !slp_node);
10418 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10419 OPTIMIZE_FOR_SPEED))
10420 vect_record_loop_mask (loop_vinfo,
10421 &LOOP_VINFO_MASKS (loop_vinfo),
10422 1, vectype, NULL);
10423 else if (can_vec_extract_var_idx_p (
10424 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10425 vect_record_loop_len (loop_vinfo,
10426 &LOOP_VINFO_LENS (loop_vinfo),
10427 1, vectype, 1);
10428 else
10430 if (dump_enabled_p ())
10431 dump_printf_loc (
10432 MSG_MISSED_OPTIMIZATION, vect_location,
10433 "can't operate on partial vectors "
10434 "because the target doesn't support extract "
10435 "last reduction.\n");
10436 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10440 /* ??? Enable for loop costing as well. */
10441 if (!loop_vinfo)
10442 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10443 0, vect_epilogue);
10444 return true;
10447 /* Use the lhs of the original scalar statement. */
10448 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10449 if (dump_enabled_p ())
10450 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10451 "stmt %G", stmt);
10453 lhs = gimple_get_lhs (stmt);
10454 lhs_type = TREE_TYPE (lhs);
10456 bitsize = vector_element_bits_tree (vectype);
10458 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10459 tree vec_lhs, bitstart;
10460 gimple *vec_stmt;
10461 if (slp_node)
10463 gcc_assert (!loop_vinfo
10464 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10465 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10467 /* Get the correct slp vectorized stmt. */
10468 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10469 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10471 /* Get entry to use. */
10472 bitstart = bitsize_int (vec_index);
10473 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10475 else
10477 /* For multiple copies, get the last copy. */
10478 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10479 vec_lhs = gimple_get_lhs (vec_stmt);
10481 /* Get the last lane in the vector. */
10482 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10485 if (loop_vinfo)
10487       /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
10488 	 PHI requirement; insert one phi node for it.  It looks like:
10489 loop;
10491 # lhs' = PHI <lhs>
10493 loop;
10495 # vec_lhs' = PHI <vec_lhs>
10496 new_tree = lane_extract <vec_lhs', ...>;
10497 lhs' = new_tree; */
10499 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10500 basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
10501 gcc_assert (single_pred_p (exit_bb));
10503 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10504 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10505 SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
10507 gimple_seq stmts = NULL;
10508 tree new_tree;
10509 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10511 /* Emit:
10513 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10515 	     where VEC_LHS is the vectorized live-out result and LEN is
10516 	     the number of active elements in the final iteration.  */
10517 gcc_assert (ncopies == 1 && !slp_node);
10518 gimple_seq tem = NULL;
10519 gimple_stmt_iterator gsi = gsi_last (tem);
10520 tree len
10521 = vect_get_loop_len (loop_vinfo, &gsi,
10522 &LOOP_VINFO_LENS (loop_vinfo),
10523 1, vectype, 0, 0);
10525 /* BIAS - 1. */
10526 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10527 tree bias_minus_one
10528 = int_const_binop (MINUS_EXPR,
10529 build_int_cst (TREE_TYPE (len), biasval),
10530 build_one_cst (TREE_TYPE (len)));
10532 /* LAST_INDEX = LEN + (BIAS - 1). */
10533 tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10534 len, bias_minus_one);
10536 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
10537 tree scalar_res
10538 = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10539 vec_lhs_phi, last_index);
10541 /* Convert the extracted vector element to the scalar type. */
10542 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10544 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10546 /* Emit:
10548 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10550 where VEC_LHS is the vectorized live-out result and MASK is
10551 the loop mask for the final iteration. */
10552 gcc_assert (ncopies == 1 && !slp_node);
10553 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10554 gimple_seq tem = NULL;
10555 gimple_stmt_iterator gsi = gsi_last (tem);
10556 tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
10557 &LOOP_VINFO_MASKS (loop_vinfo),
10558 1, vectype, 0);
10559 gimple_seq_add_seq (&stmts, tem);
10560 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10561 mask, vec_lhs_phi);
10563 /* Convert the extracted vector element to the scalar type. */
10564 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10566 else
10568 tree bftype = TREE_TYPE (vectype);
10569 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10570 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10571 new_tree = build3 (BIT_FIELD_REF, bftype,
10572 vec_lhs_phi, bitsize, bitstart);
10573 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10574 &stmts, true, NULL_TREE);
10577 if (stmts)
10579 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10580 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10582 /* Remove existing phi from lhs and create one copy from new_tree. */
10583 tree lhs_phi = NULL_TREE;
10584 gimple_stmt_iterator gsi;
10585 for (gsi = gsi_start_phis (exit_bb);
10586 !gsi_end_p (gsi); gsi_next (&gsi))
10588 gimple *phi = gsi_stmt (gsi);
10589 if ((gimple_phi_arg_def (phi, 0) == lhs))
10591 remove_phi_node (&gsi, false);
10592 lhs_phi = gimple_phi_result (phi);
10593 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10594 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10595 break;
10600 /* Replace use of lhs with newly computed result. If the use stmt is a
10601 single arg PHI, just replace all uses of PHI result. It's necessary
10602 because lcssa PHI defining lhs may be before newly inserted stmt. */
10603 use_operand_p use_p;
10604 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10605 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10606 && !is_gimple_debug (use_stmt))
10608 if (gimple_code (use_stmt) == GIMPLE_PHI
10609 && gimple_phi_num_args (use_stmt) == 1)
10611 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10613 else
10615 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10616 SET_USE (use_p, new_tree);
10618 update_stmt (use_stmt);
10621 else
10623 /* For basic-block vectorization simply insert the lane-extraction. */
10624 tree bftype = TREE_TYPE (vectype);
10625 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10626 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10627 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10628 vec_lhs, bitsize, bitstart);
10629 gimple_seq stmts = NULL;
10630 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10631 &stmts, true, NULL_TREE);
10632 if (TREE_CODE (new_tree) == SSA_NAME
10633 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10634 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10635 if (is_a <gphi *> (vec_stmt))
10637 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10638 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10640 else
10642 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10643 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10646 /* Replace use of lhs with newly computed result. If the use stmt is a
10647 single arg PHI, just replace all uses of PHI result. It's necessary
10648 because lcssa PHI defining lhs may be before newly inserted stmt. */
10649 use_operand_p use_p;
10650 stmt_vec_info use_stmt_info;
10651 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10652 if (!is_gimple_debug (use_stmt)
10653 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10654 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10656 /* ??? This can happen when the live lane ends up being
10657 used in a vector construction code-generated by an
10658 external SLP node (and code-generation for that already
10659 happened). See gcc.dg/vect/bb-slp-47.c.
10660 Doing this is what would happen if that vector CTOR
10661 were not code-generated yet so it is not too bad.
10662 ??? In fact we'd likely want to avoid this situation
10663 in the first place. */
10664 if (TREE_CODE (new_tree) == SSA_NAME
10665 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10666 && gimple_code (use_stmt) != GIMPLE_PHI
10667 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10668 use_stmt))
10670 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10671 gcc_checking_assert (code == SSA_NAME
10672 || code == CONSTRUCTOR
10673 || code == VIEW_CONVERT_EXPR
10674 || CONVERT_EXPR_CODE_P (code));
10675 if (dump_enabled_p ())
10676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10677 "Using original scalar computation for "
10678 "live lane because use preceeds vector "
10679 "def\n");
10680 continue;
10682 /* ??? It can also happen that we end up pulling a def into
10683 a loop where replacing out-of-loop uses would require
10684 a new LC SSA PHI node. Retain the original scalar in
10685 those cases as well. PR98064. */
10686 if (TREE_CODE (new_tree) == SSA_NAME
10687 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10688 && (gimple_bb (use_stmt)->loop_father
10689 != gimple_bb (vec_stmt)->loop_father)
10690 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10691 gimple_bb (use_stmt)->loop_father))
10693 if (dump_enabled_p ())
10694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10695 "Using original scalar computation for "
10696 "live lane because there is an out-of-loop "
10697 "definition for it\n");
10698 continue;
10700 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10701 SET_USE (use_p, new_tree);
10702 update_stmt (use_stmt);
10706 return true;
10709 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10711 static void
10712 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10714 ssa_op_iter op_iter;
10715 imm_use_iterator imm_iter;
10716 def_operand_p def_p;
10717 gimple *ustmt;
10719 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10721 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10723 basic_block bb;
10725 if (!is_gimple_debug (ustmt))
10726 continue;
10728 bb = gimple_bb (ustmt);
10730 if (!flow_bb_inside_loop_p (loop, bb))
10732 if (gimple_debug_bind_p (ustmt))
10734 if (dump_enabled_p ())
10735 dump_printf_loc (MSG_NOTE, vect_location,
10736 "killing debug use\n");
10738 gimple_debug_bind_reset_value (ustmt);
10739 update_stmt (ustmt);
10741 else
10742 gcc_unreachable ();
10748 /* Given loop represented by LOOP_VINFO, return true if computation of
10749 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10750 otherwise. */
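   For example (with illustrative numbers): if the niters type is a 32-bit
   unsigned type and NITERSM1 is 0xffffffff, NITERS wraps around to 0, the
   checks below fail and we return false; with NITERSM1 equal to 99 the
   constant comparison 99 < 100 holds and we return true.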
10752 static bool
10753 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10755 /* Constant case. */
10756 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10758 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10759 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10761 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10762 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10763 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10764 return true;
10767 widest_int max;
10768 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10769 /* Check the upper bound of loop niters. */
10770 if (get_max_loop_iterations (loop, &max))
10772 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10773 signop sgn = TYPE_SIGN (type);
10774 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10775 if (max < type_max)
10776 return true;
10778 return false;
10781 /* Return a mask type with half the number of elements as OLD_TYPE,
10782 given that it should have mode NEW_MODE. */
10784 tree
10785 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10787 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10788 return build_truth_vector_type_for_mode (nunits, new_mode);
10791 /* Return a mask type with twice as many elements as OLD_TYPE,
10792 given that it should have mode NEW_MODE. */
10794 tree
10795 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10797 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10798 return build_truth_vector_type_for_mode (nunits, new_mode);
10801 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10802 contain a sequence of NVECTORS masks that each control a vector of type
10803 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10804 these vector masks with the vector version of SCALAR_MASK. */
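   A plausible example (assumed, not tied to any testcase): for a store that
   is executed only when a[i] < 0, SCALAR_MASK would be the tree for
   "a[i] < 0", and the fully-masked loop would AND each loop mask with the
   vector form of that comparison before using it to control the store.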
10806 void
10807 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10808 unsigned int nvectors, tree vectype, tree scalar_mask)
10810 gcc_assert (nvectors != 0);
10812 if (scalar_mask)
10814 scalar_cond_masked_key cond (scalar_mask, nvectors);
10815 loop_vinfo->scalar_cond_masked_set.add (cond);
10818 masks->mask_set.add (std::make_pair (vectype, nvectors));
10821 /* Given a complete set of masks MASKS, extract mask number INDEX
10822 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10823 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10825 See the comment above vec_loop_masks for more details about the mask
10826 arrangement. */
10828 tree
10829 vect_get_loop_mask (loop_vec_info loop_vinfo,
10830 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10831 unsigned int nvectors, tree vectype, unsigned int index)
10833 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10834 == vect_partial_vectors_while_ult)
10836 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10837 tree mask_type = rgm->type;
10839 /* Populate the rgroup's mask array, if this is the first time we've
10840 used it. */
10841 if (rgm->controls.is_empty ())
10843 rgm->controls.safe_grow_cleared (nvectors, true);
10844 for (unsigned int i = 0; i < nvectors; ++i)
10846 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10847 /* Provide a dummy definition until the real one is available. */
10848 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10849 rgm->controls[i] = mask;
10853 tree mask = rgm->controls[index];
10854 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10855 TYPE_VECTOR_SUBPARTS (vectype)))
10857 /* A loop mask for data type X can be reused for data type Y
10858 if X has N times more elements than Y and if Y's elements
10859 are N times bigger than X's. In this case each sequence
10860 of N elements in the loop mask will be all-zero or all-one.
10861 We can then view-convert the mask so that each sequence of
10862 N elements is replaced by a single element. */
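	  /* For instance (assuming both vector modes are available): a mask
	     created for 8 x HImode data can control 4 x SImode data, since
	     each SImode element spans two HImode elements; every adjacent
	     pair of mask elements is then all-zero or all-one, and the
	     VIEW_CONVERT_EXPR below folds each pair into a single element
	     of the wider mask type.  */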
10863 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10864 TYPE_VECTOR_SUBPARTS (vectype)));
10865 gimple_seq seq = NULL;
10866 mask_type = truth_type_for (vectype);
10867 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10868 if (seq)
10869 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10871 return mask;
10873 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10874 == vect_partial_vectors_avx512)
10876 /* The number of scalars per iteration and the number of vectors are
10877 both compile-time constants. */
10878 unsigned int nscalars_per_iter
10879 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10880 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10882 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10884 /* The stored nV is dependent on the mask type produced. */
10885 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10886 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10887 == rgm->factor);
10888 nvectors = rgm->factor;
10890 /* Populate the rgroup's mask array, if this is the first time we've
10891 used it. */
10892 if (rgm->controls.is_empty ())
10894 rgm->controls.safe_grow_cleared (nvectors, true);
10895 for (unsigned int i = 0; i < nvectors; ++i)
10897 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10898 /* Provide a dummy definition until the real one is available. */
10899 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10900 rgm->controls[i] = mask;
10903 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10904 TYPE_VECTOR_SUBPARTS (vectype)))
10905 return rgm->controls[index];
10907 /* Split the vector if needed. Since we are dealing with integer mode
10908 masks with AVX512 we can operate on the integer representation
10909 performing the whole vector shifting. */
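      /* An illustrative example (hypothetical sizes): if the stored control
	 RGM->TYPE has 16 elements and VECTYPE has 4, FACTOR is 4, so mask
	 number INDEX == 6 lives in control vector VI == 1 at sub-part
	 VPART == 2; the integer representation of that control is shifted
	 right by 4 * 2 == 8 bits below and its low 4 bits form the mask.  */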
10910 unsigned HOST_WIDE_INT factor;
10911 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10912 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10913 gcc_assert (ok);
10914 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10915 tree mask_type = truth_type_for (vectype);
10916 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10917 unsigned vi = index / factor;
10918 unsigned vpart = index % factor;
10919 tree vec = rgm->controls[vi];
10920 gimple_seq seq = NULL;
10921 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10922 lang_hooks.types.type_for_mode
10923 (TYPE_MODE (rgm->type), 1), vec);
10924 /* For integer mode masks simply shift the right bits into position. */
10925 if (vpart != 0)
10926 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10927 build_int_cst (integer_type_node,
10928 (TYPE_VECTOR_SUBPARTS (vectype)
10929 * vpart)));
10930 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10931 (TYPE_MODE (mask_type), 1), vec);
10932 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10933 if (seq)
10934 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10935 return vec;
10937 else
10938 gcc_unreachable ();
10941 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10942 lengths for controlling an operation on VECTYPE. The operation splits
10943 each element of VECTYPE into FACTOR separate subelements, measuring the
10944 length as a number of these subelements. */
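   For example (an assumed scenario): if a V4SI access has to be implemented
   as a VnQI load or store, FACTOR is 4 and the recorded length counts bytes
   rather than SImode elements.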
10946 void
10947 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10948 unsigned int nvectors, tree vectype, unsigned int factor)
10950 gcc_assert (nvectors != 0);
10951 if (lens->length () < nvectors)
10952 lens->safe_grow_cleared (nvectors, true);
10953 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10955   /* The number of scalars per iteration, the scalar occupied bytes and
10956      the number of vectors are all compile-time constants.  */
10957 unsigned int nscalars_per_iter
10958 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10959 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10961 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10963 /* For now, we only support cases in which all loads and stores fall back
10964 to VnQI or none do. */
10965 gcc_assert (!rgl->max_nscalars_per_iter
10966 || (rgl->factor == 1 && factor == 1)
10967 || (rgl->max_nscalars_per_iter * rgl->factor
10968 == nscalars_per_iter * factor));
10969 rgl->max_nscalars_per_iter = nscalars_per_iter;
10970 rgl->type = vectype;
10971 rgl->factor = factor;
10975 /* Given a complete set of lengths LENS, extract length number INDEX
10976 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10977 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10978    multiplied by the number of elements that should be processed.
10979 Insert any set-up statements before GSI. */
10981 tree
10982 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10983 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10984 unsigned int index, unsigned int factor)
10986 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10987 bool use_bias_adjusted_len =
10988 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10990 /* Populate the rgroup's len array, if this is the first time we've
10991 used it. */
10992 if (rgl->controls.is_empty ())
10994 rgl->controls.safe_grow_cleared (nvectors, true);
10995 for (unsigned int i = 0; i < nvectors; ++i)
10997 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10998 gcc_assert (len_type != NULL_TREE);
11000 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11002 /* Provide a dummy definition until the real one is available. */
11003 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11004 rgl->controls[i] = len;
11006 if (use_bias_adjusted_len)
11008 gcc_assert (i == 0);
11009 tree adjusted_len =
11010 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11011 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11012 rgl->bias_adjusted_ctrl = adjusted_len;
11017 if (use_bias_adjusted_len)
11018 return rgl->bias_adjusted_ctrl;
11020 tree loop_len = rgl->controls[index];
11021 if (rgl->factor == 1 && factor == 1)
11023 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11024 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11025 if (maybe_ne (nunits1, nunits2))
11027 /* A loop len for data type X can be reused for data type Y
11028 if X has N times more elements than Y and if Y's elements
11029 are N times bigger than X's. */
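	  /* Illustrative example (assumed types): if the stored control was
	     created for a 16-element vector and VECTYPE has 4 elements, the
	     length counted in the narrower elements is divided by 4 below to
	     give the number of VECTYPE elements still to be processed.  */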
11030 gcc_assert (multiple_p (nunits1, nunits2));
11031 factor = exact_div (nunits1, nunits2).to_constant ();
11032 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11033 gimple_seq seq = NULL;
11034 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11035 build_int_cst (iv_type, factor));
11036 if (seq)
11037 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11040 return loop_len;
11043 /* Scale profiling counters by estimation for LOOP which is vectorized
11044 by factor VF.
11045 If FLAT is true, the loop we started with had unrealistically flat
11046 profile. */
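   As a rough illustration (made-up counts): with VF == 4, a loop whose
   header count is 1000 and whose preheader (entry) count is 100 should,
   after vectorization, have a header count of about 250, and the exit edge
   probability is scaled up accordingly (from about 10% to about 40%) so
   that the entry count is preserved.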
11048 static void
11049 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11051   /* For flat profiles do not scale down proportionally by VF; only
11052      cap by the known iteration count bounds.  */
11053 if (flat)
11055 if (dump_file && (dump_flags & TDF_DETAILS))
11056 fprintf (dump_file,
11057 "Vectorized loop profile seems flat; not scaling iteration "
11058 "count down by the vectorization factor %i\n", vf);
11059 scale_loop_profile (loop, profile_probability::always (),
11060 get_likely_max_loop_iterations_int (loop));
11061 return;
11063 /* Loop body executes VF fewer times and exit increases VF times. */
11064 profile_count entry_count = loop_preheader_edge (loop)->count ();
11066   /* If we have an unreliable loop profile, avoid dropping the entry
11067      count below the header count.  This can happen since such loops
11068      have unrealistically low trip counts.  */
11069 while (vf > 1
11070 && loop->header->count > entry_count
11071 && loop->header->count < entry_count * vf)
11073 if (dump_file && (dump_flags & TDF_DETAILS))
11074 fprintf (dump_file,
11075 "Vectorization factor %i seems too large for profile "
11076 "prevoiusly believed to be consistent; reducing.\n", vf);
11077 vf /= 2;
11080 if (entry_count.nonzero_p ())
11081 set_edge_probability_and_rescale_others
11082 (exit_e,
11083 entry_count.probability_in (loop->header->count / vf));
11084   /* Avoid producing a very large exit probability when we do not have
11085      a sensible profile.  */
11086 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11087 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11088 loop->latch->count = single_pred_edge (loop->latch)->count ();
11090 scale_loop_profile (loop, profile_probability::always () / vf,
11091 get_likely_max_loop_iterations_int (loop));
11094 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11095 latch edge values originally defined by it. */
11097 static void
11098 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11099 stmt_vec_info def_stmt_info)
11101 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11102 if (!def || TREE_CODE (def) != SSA_NAME)
11103 return;
11104 stmt_vec_info phi_info;
11105 imm_use_iterator iter;
11106 use_operand_p use_p;
11107 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11109 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11110 if (!phi)
11111 continue;
11112 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11113 && (phi_info = loop_vinfo->lookup_stmt (phi))
11114 && STMT_VINFO_RELEVANT_P (phi_info)))
11115 continue;
11116 loop_p loop = gimple_bb (phi)->loop_father;
11117 edge e = loop_latch_edge (loop);
11118 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11119 continue;
11121 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11122 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11123 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11125 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11126 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11127 gcc_assert (phi_defs.length () == latch_defs.length ());
11128 for (unsigned i = 0; i < phi_defs.length (); ++i)
11129 add_phi_arg (as_a <gphi *> (phi_defs[i]),
11130 gimple_get_lhs (latch_defs[i]), e,
11131 gimple_phi_arg_location (phi, e->dest_idx));
11133 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11135 /* For first order recurrences we have to update both uses of
11136 the latch definition, the one in the PHI node and the one
11137 in the generated VEC_PERM_EXPR. */
11138 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11139 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11140 gcc_assert (phi_defs.length () == latch_defs.length ());
11141 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11142 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11143 for (unsigned i = 0; i < phi_defs.length (); ++i)
11145 gassign *perm = as_a <gassign *> (phi_defs[i]);
11146 if (i > 0)
11147 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11148 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11149 update_stmt (perm);
11151 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11152 gimple_phi_arg_location (phi, e->dest_idx));
11157 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11158 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11159 stmt_vec_info. */
11161 static bool
11162 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11163 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11165 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11166 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11168 if (dump_enabled_p ())
11169 dump_printf_loc (MSG_NOTE, vect_location,
11170 "------>vectorizing statement: %G", stmt_info->stmt);
11172 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11173 vect_loop_kill_debug_uses (loop, stmt_info);
11175 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11176 && !STMT_VINFO_LIVE_P (stmt_info))
11177 return false;
11179 if (STMT_VINFO_VECTYPE (stmt_info))
11181 poly_uint64 nunits
11182 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11183 if (!STMT_SLP_TYPE (stmt_info)
11184 && maybe_ne (nunits, vf)
11185 && dump_enabled_p ())
11186 	/* For SLP, VF is set according to the unrolling factor, and not
11187 	   to the vector size, hence for SLP this print is not valid.  */
11188 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11191 /* Pure SLP statements have already been vectorized. We still need
11192 to apply loop vectorization to hybrid SLP statements. */
11193 if (PURE_SLP_STMT (stmt_info))
11194 return false;
11196 if (dump_enabled_p ())
11197 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11199 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11200 *seen_store = stmt_info;
11202 return true;
11205 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11206 in the hash_map with its corresponding values. */
11208 static tree
11209 find_in_mapping (tree t, void *context)
11211 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11213 tree *value = mapping->get (t);
11214 return value ? *value : t;
11217 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11218 original loop that has now been vectorized.
11220 The inits of the data_references need to be advanced with the number of
11221 iterations of the main loop. This has been computed in vect_do_peeling and
11222 is stored in parameter ADVANCE. We first restore the data_references
11223    initial offset with the values recorded in ORIG_DRS_INIT.
11225 Since the loop_vec_info of this EPILOGUE was constructed for the original
11226 loop, its stmt_vec_infos all point to the original statements. These need
11227 to be updated to point to their corresponding copies as well as the SSA_NAMES
11228 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11230 The data_reference's connections also need to be updated. Their
11231 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11232 stmt_vec_infos, their statements need to point to their corresponding copy,
11233 if they are gather loads or scatter stores then their reference needs to be
11234 updated to point to its corresponding copy and finally we set
11235 'base_misaligned' to false as we have already peeled for alignment in the
11236 prologue of the main loop. */
11238 static void
11239 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11241 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11242 auto_vec<gimple *> stmt_worklist;
11243 hash_map<tree,tree> mapping;
11244 gimple *orig_stmt, *new_stmt;
11245 gimple_stmt_iterator epilogue_gsi;
11246 gphi_iterator epilogue_phi_gsi;
11247 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11248 basic_block *epilogue_bbs = get_loop_body (epilogue);
11249 unsigned i;
11251 free (LOOP_VINFO_BBS (epilogue_vinfo));
11252 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11254 /* Advance data_reference's with the number of iterations of the previous
11255 loop and its prologue. */
11256 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11259 /* The EPILOGUE loop is a copy of the original loop so they share the same
11260 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11261 point to the copied statements. We also create a mapping of all LHS' in
11262 the original loop and all the LHS' in the EPILOGUE and create worklists to
11263      update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11264 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11266 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11267 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11269 new_stmt = epilogue_phi_gsi.phi ();
11271 gcc_assert (gimple_uid (new_stmt) > 0);
11272 stmt_vinfo
11273 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11275 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11276 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11278 mapping.put (gimple_phi_result (orig_stmt),
11279 gimple_phi_result (new_stmt));
11280 	  /* PHI nodes cannot have patterns or related statements.  */
11281 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11282 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11285 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11286 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11288 new_stmt = gsi_stmt (epilogue_gsi);
11289 if (is_gimple_debug (new_stmt))
11290 continue;
11292 gcc_assert (gimple_uid (new_stmt) > 0);
11293 stmt_vinfo
11294 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11296 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11297 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11299 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11300 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11302 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11304 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11305 for (gimple_stmt_iterator gsi = gsi_start (seq);
11306 !gsi_end_p (gsi); gsi_next (&gsi))
11307 stmt_worklist.safe_push (gsi_stmt (gsi));
11310 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11311 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11313 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11314 stmt_worklist.safe_push (stmt);
11315 /* Set BB such that the assert in
11316 'get_initial_def_for_reduction' is able to determine that
11317 the BB of the related stmt is inside this loop. */
11318 gimple_set_bb (stmt,
11319 gimple_bb (new_stmt));
11320 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11321 gcc_assert (related_vinfo == NULL
11322 || related_vinfo == stmt_vinfo);
11327 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11328 using the original main loop and thus need to be updated to refer to the
11329 cloned variables used in the epilogue. */
11330 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11332 gimple *stmt = stmt_worklist[i];
11333 tree *new_op;
11335 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11337 tree op = gimple_op (stmt, j);
11338 if ((new_op = mapping.get(op)))
11339 gimple_set_op (stmt, j, *new_op);
11340 else
11342 /* PR92429: The last argument of simplify_replace_tree disables
11343 folding when replacing arguments. This is required as
11344 otherwise you might end up with different statements than the
11345 ones analyzed in vect_loop_analyze, leading to different
11346 vectorization. */
11347 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11348 &find_in_mapping, &mapping, false);
11349 gimple_set_op (stmt, j, op);
11354 struct data_reference *dr;
11355 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11356 FOR_EACH_VEC_ELT (datarefs, i, dr)
11358 orig_stmt = DR_STMT (dr);
11359 gcc_assert (gimple_uid (orig_stmt) > 0);
11360 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11361 /* Data references for gather loads and scatter stores do not use the
11362 updated offset we set using ADVANCE. Instead we have to make sure the
11363 reference in the data references point to the corresponding copy of
11364 the original in the epilogue. Make sure to update both
11365 gather/scatters recognized by dataref analysis and also other
11366 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
11367 auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
11368 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
11369 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
11371 DR_REF (dr)
11372 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11373 &find_in_mapping, &mapping);
11374 DR_BASE_ADDRESS (dr)
11375 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11376 &find_in_mapping, &mapping);
11378 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11379 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11380 /* The vector size of the epilogue is smaller than that of the main loop
11381 	 so the alignment is either the same or lower.  This means the dr
11382 	 will by definition be aligned.  */
11383 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11386 epilogue_vinfo->shared->datarefs_copy.release ();
11387 epilogue_vinfo->shared->save_datarefs ();
11390 /* Function vect_transform_loop.
11392 The analysis phase has determined that the loop is vectorizable.
11393 Vectorize the loop - created vectorized stmts to replace the scalar
11394 stmts in the loop, and update the loop exit condition.
11395 Returns scalar epilogue loop if any. */
11397 class loop *
11398 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11400 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11401 class loop *epilogue = NULL;
11402 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11403 int nbbs = loop->num_nodes;
11404 int i;
11405 tree niters_vector = NULL_TREE;
11406 tree step_vector = NULL_TREE;
11407 tree niters_vector_mult_vf = NULL_TREE;
11408 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11409 unsigned int lowest_vf = constant_lower_bound (vf);
11410 gimple *stmt;
11411 bool check_profitability = false;
11412 unsigned int th;
11413 bool flat = maybe_flat_loop_profile (loop);
11415 DUMP_VECT_SCOPE ("vec_transform_loop");
11417 loop_vinfo->shared->check_datarefs ();
11419 /* Use the more conservative vectorization threshold. If the number
11420 of iterations is constant assume the cost check has been performed
11421 by our caller. If the threshold makes all loops profitable that
11422 run at least the (estimated) vectorization factor number of times
11423 checking is pointless, too. */
11424 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11425 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11427 if (dump_enabled_p ())
11428 dump_printf_loc (MSG_NOTE, vect_location,
11429 "Profitability threshold is %d loop iterations.\n",
11430 th);
11431 check_profitability = true;
11434 /* Make sure there exists a single-predecessor exit bb. Do this before
11435 versioning. */
11436 edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11437 if (! single_pred_p (e->dest))
11439 split_loop_exit_edge (e, true);
11440 if (dump_enabled_p ())
11441 dump_printf (MSG_NOTE, "split exit edge\n");
11444 /* Version the loop first, if required, so the profitability check
11445 comes first. */
11447 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11449 class loop *sloop
11450 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11451 sloop->force_vectorize = false;
11452 check_profitability = false;
11455 /* Make sure there exists a single-predecessor exit bb also on the
11456 scalar loop copy. Do this after versioning but before peeling
11457 so CFG structure is fine for both scalar and if-converted loop
11458 to make slpeel_duplicate_current_defs_from_edges face matched
11459 loop closed PHI nodes on the exit. */
11460 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11462 e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
11463 if (! single_pred_p (e->dest))
11465 split_loop_exit_edge (e, true);
11466 if (dump_enabled_p ())
11467 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11471 tree niters = vect_build_loop_niters (loop_vinfo);
11472 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11473 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11474 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11475 tree advance;
11476 drs_init_vec orig_drs_init;
11478 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11479 &step_vector, &niters_vector_mult_vf, th,
11480 check_profitability, niters_no_overflow,
11481 &advance);
11482 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11483 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11485       /* Ifcvt duplicates loop preheader, loop body and produces a basic
11486 block after loop exit. We need to scale all that. */
11487 basic_block preheader
11488 = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
11489 preheader->count
11490 = preheader->count.apply_probability
11491 (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11492 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11493 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11494 single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->dest->count
11495 = preheader->count;
11498 if (niters_vector == NULL_TREE)
11500 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11501 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11502 && known_eq (lowest_vf, vf))
11504 niters_vector
11505 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11506 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11507 step_vector = build_one_cst (TREE_TYPE (niters));
11509 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11510 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11511 &step_vector, niters_no_overflow);
11512 else
11513 /* vect_do_peeling subtracted the number of peeled prologue
11514 iterations from LOOP_VINFO_NITERS. */
11515 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11516 &niters_vector, &step_vector,
11517 niters_no_overflow);
11520 /* 1) Make sure the loop header has exactly two entries
11521 2) Make sure we have a preheader basic block. */
11523 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11525 split_edge (loop_preheader_edge (loop));
11527 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11528 /* This will deal with any possible peeling. */
11529 vect_prepare_for_masked_peels (loop_vinfo);
11531 /* Schedule the SLP instances first, then handle loop vectorization
11532 below. */
11533 if (!loop_vinfo->slp_instances.is_empty ())
11535 DUMP_VECT_SCOPE ("scheduling SLP instances");
11536 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11539   /* FORNOW: the vectorizer supports only loops whose body consists
11540      of one basic block (header + empty latch).  When the vectorizer
11541      supports more involved loop forms, the order in which the BBs are
11542      traversed needs to be reconsidered.  */
11544 for (i = 0; i < nbbs; i++)
11546 basic_block bb = bbs[i];
11547 stmt_vec_info stmt_info;
11549 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11550 gsi_next (&si))
11552 gphi *phi = si.phi ();
11553 if (dump_enabled_p ())
11554 dump_printf_loc (MSG_NOTE, vect_location,
11555 "------>vectorizing phi: %G", (gimple *) phi);
11556 stmt_info = loop_vinfo->lookup_stmt (phi);
11557 if (!stmt_info)
11558 continue;
11560 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11561 vect_loop_kill_debug_uses (loop, stmt_info);
11563 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11564 && !STMT_VINFO_LIVE_P (stmt_info))
11565 continue;
11567 if (STMT_VINFO_VECTYPE (stmt_info)
11568 && (maybe_ne
11569 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11570 && dump_enabled_p ())
11571 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11573 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11574 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11575 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11576 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11577 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11578 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11579 && ! PURE_SLP_STMT (stmt_info))
11581 if (dump_enabled_p ())
11582 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11583 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11587 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11588 gsi_next (&si))
11590 gphi *phi = si.phi ();
11591 stmt_info = loop_vinfo->lookup_stmt (phi);
11592 if (!stmt_info)
11593 continue;
11595 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11596 && !STMT_VINFO_LIVE_P (stmt_info))
11597 continue;
11599 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11602 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11603 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11604 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11605 && ! PURE_SLP_STMT (stmt_info))
11606 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11609 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11610 !gsi_end_p (si);)
11612 stmt = gsi_stmt (si);
11613 /* During vectorization remove existing clobber stmts. */
11614 if (gimple_clobber_p (stmt))
11616 unlink_stmt_vdef (stmt);
11617 gsi_remove (&si, true);
11618 release_defs (stmt);
11620 else
11622 /* Ignore vector stmts created in the outer loop. */
11623 stmt_info = loop_vinfo->lookup_stmt (stmt);
11625 /* vector stmts created in the outer-loop during vectorization of
11626 stmts in an inner-loop may not have a stmt_info, and do not
11627 need to be vectorized. */
11628 stmt_vec_info seen_store = NULL;
11629 if (stmt_info)
11631 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11633 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11634 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11635 !gsi_end_p (subsi); gsi_next (&subsi))
11637 stmt_vec_info pat_stmt_info
11638 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11639 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11640 &si, &seen_store);
11642 stmt_vec_info pat_stmt_info
11643 = STMT_VINFO_RELATED_STMT (stmt_info);
11644 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11645 &si, &seen_store))
11646 maybe_set_vectorized_backedge_value (loop_vinfo,
11647 pat_stmt_info);
11649 else
11651 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11652 &seen_store))
11653 maybe_set_vectorized_backedge_value (loop_vinfo,
11654 stmt_info);
11657 gsi_next (&si);
11658 if (seen_store)
11660 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11661 /* Interleaving. Once a grouped store has been seen, the
11662 vectorization of the whole interleaving chain has been
11663 completed - free all the stores in the chain. */
11664 vect_remove_stores (loop_vinfo,
11665 DR_GROUP_FIRST_ELEMENT (seen_store));
11666 else
11667 /* Free the attached stmt_vec_info and remove the stmt. */
11668 loop_vinfo->remove_stmt (stmt_info);
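/* Illustration of the pattern case above (made-up SSA names): for a
   widening multiply such as
     a_T = (int) a_5;  b_T = (int) b_6;  prod_7 = a_T * b_T;
   the pattern recognizer replaces prod_7's statement with a pattern
   statement like prod_7 = a_5 w* b_6, possibly together with a pattern
   def sequence of auxiliary statements; the code above vectorizes the
   def sequence first and then the main pattern statement, reaching both
   through the stmt_info of the original scalar statement.  */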
11673 /* Stub out scalar statements that must not survive vectorization.
11674 Doing this here helps with grouped statements, or statements that
11675 are involved in patterns. */
11676 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11677 !gsi_end_p (gsi); gsi_next (&gsi))
11679 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11680 if (!call || !gimple_call_internal_p (call))
11681 continue;
11682 internal_fn ifn = gimple_call_internal_fn (call);
11683 if (ifn == IFN_MASK_LOAD)
11685 tree lhs = gimple_get_lhs (call);
11686 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11688 tree zero = build_zero_cst (TREE_TYPE (lhs));
11689 gimple *new_stmt = gimple_build_assign (lhs, zero);
11690 gsi_replace (&gsi, new_stmt, true);
11693 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11695 tree lhs = gimple_get_lhs (call);
11696 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11698 tree else_arg
11699 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11700 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11701 gsi_replace (&gsi, new_stmt, true);
11705 } /* BBs in loop */
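/* Illustration of the stubbing above (made-up SSA names): a scalar
     _5 = .MASK_LOAD (ptr_3, 0B, mask_7);
   whose lhs did not get a vector type is replaced by _5 = 0, and a scalar
   conditional call such as
     _6 = .COND_ADD (mask_7, _1, _2, _1);
   is replaced by an assignment of its "else" (last) argument, _6 = _1.  */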
11707 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11708 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11709 if (integer_onep (step_vector))
11710 niters_no_overflow = true;
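/* For illustration with made-up numbers: with 32-bit counters a scalar
   iteration count of 2^32 is represented as a zero NITERS, but with VF = 4
   and step_vector = 1 the vector IV only has to count up to
   NITERS_VECTOR = 2^30, which is nonzero and cannot wrap.  */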
11711 vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
11712 niters_vector, step_vector, niters_vector_mult_vf,
11713 !niters_no_overflow);
11715 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11717 /* True if the final iteration might not handle a full vector's
11718 worth of scalar iterations. */
11719 bool final_iter_may_be_partial
11720 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11721 /* The minimum number of iterations performed by the epilogue. This
11722 is 1 when peeling for gaps because we always need a final scalar
11723 iteration. */
11724 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11725 /* +1 to convert latch counts to loop iteration counts,
11726 -min_epilogue_iters to remove iterations that cannot be performed
11727 by the vector code. */
11728 int bias_for_lowest = 1 - min_epilogue_iters;
11729 int bias_for_assumed = bias_for_lowest;
11730 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11731 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11733 /* When the amount of peeling is known at compile time, the first
11734 iteration will have exactly alignment_npeels active elements.
11735 In the worst case it will have at least one. */
11736 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11737 bias_for_lowest += lowest_vf - min_first_active;
11738 bias_for_assumed += assumed_vf - min_first_active;
11740 /* In these calculations the "- 1" converts loop iteration counts
11741 back to latch counts. */
11742 if (loop->any_upper_bound)
11744 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11745 loop->nb_iterations_upper_bound
11746 = (final_iter_may_be_partial
11747 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11748 lowest_vf) - 1
11749 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11750 lowest_vf) - 1);
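/* Worked example with made-up numbers: for a scalar latch-count upper
   bound of 102 (at most 103 iterations), lowest_vf = 4, no peeling for
   gaps (so bias_for_lowest = 1) and no partial final iteration, the
   vector loop runs at most (102 + 1) / 4 = 25 times, i.e. its latch
   executes at most 24 times; with partial vectors the ceiling division
   counts a partial final vector iteration as well.  */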
11751 if (main_vinfo
11752 /* Both peeling for alignment and peeling for gaps can end up
11753 with the scalar epilogue running for more than VF-1 iterations. */
11754 && !main_vinfo->peeling_for_alignment
11755 && !main_vinfo->peeling_for_gaps)
11757 unsigned int bound;
11758 poly_uint64 main_iters
11759 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11760 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11761 main_iters
11762 = upper_bound (main_iters,
11763 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11764 if (can_div_away_from_zero_p (main_iters,
11765 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11766 &bound))
11767 loop->nb_iterations_upper_bound
11768 = wi::umin ((bound_wide_int) (bound - 1),
11769 loop->nb_iterations_upper_bound);
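/* Rough illustration with made-up numbers: for the epilogue of a main
   loop with VF = 16 and no cost-model or versioning threshold,
   main_iters = 16; dividing by the epilogue VF of 8 away from zero gives
   bound = 2, so the epilogue latch is known to run at most once.  */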
11772 if (loop->any_likely_upper_bound)
11773 loop->nb_iterations_likely_upper_bound
11774 = (final_iter_may_be_partial
11775 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11776 + bias_for_lowest, lowest_vf) - 1
11777 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11778 + bias_for_lowest, lowest_vf) - 1);
11779 if (loop->any_estimate)
11780 loop->nb_iterations_estimate
11781 = (final_iter_may_be_partial
11782 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11783 assumed_vf) - 1
11784 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11785 assumed_vf) - 1);
11786 scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
11787 assumed_vf, flat);
11789 if (dump_enabled_p ())
11791 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11793 dump_printf_loc (MSG_NOTE, vect_location,
11794 "LOOP VECTORIZED\n");
11795 if (loop->inner)
11796 dump_printf_loc (MSG_NOTE, vect_location,
11797 "OUTER LOOP VECTORIZED\n");
11798 dump_printf (MSG_NOTE, "\n");
11800 else
11801 dump_printf_loc (MSG_NOTE, vect_location,
11802 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11803 GET_MODE_NAME (loop_vinfo->vector_mode));
11806 /* Loops vectorized with a variable factor won't benefit from
11807 unrolling/peeling. */
11808 if (!vf.is_constant ())
11810 loop->unroll = 1;
11811 if (dump_enabled_p ())
11812 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11813 " variable-length vectorization factor\n");
11815 /* Free SLP instances here because otherwise stmt reference counting
11816 won't work. */
11817 slp_instance instance;
11818 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11819 vect_free_slp_instance (instance);
11820 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11821 /* Clear the safelen field since its value is invalid after vectorization:
11822 the vectorized loop can have loop-carried dependencies. */
11823 loop->safelen = 0;
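/* For instance, a loop annotated with "#pragma omp simd safelen(16)"
   enters the vectorizer with loop->safelen == 16; after vectorizing with
   VF = 4 each vector iteration already covers 4 scalar iterations, so the
   old value no longer describes the transformed loop and could wrongly
   enable later transformations.  */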
11825 if (epilogue)
11827 update_epilogue_loop_vinfo (epilogue, advance);
11829 epilogue->simduid = loop->simduid;
11830 epilogue->force_vectorize = loop->force_vectorize;
11831 epilogue->dont_vectorize = false;
11834 return epilogue;
11837 /* The code below performs a simple optimization - it reverts
11838 if-conversion for masked stores: if the mask of a store is zero,
11839 skip the store and, if possible, the producers of the stored values too.
11840 For example,
11841 for (i=0; i<n; i++)
11842 if (c[i])
11844 p1[i] += 1;
11845 p2[i] = p3[i] +2;
11847 this transformation will produce the following semi-hammock:
11849 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
11851 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11852 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11853 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11854 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11855 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11856 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11860 void
11861 optimize_mask_stores (class loop *loop)
11863 basic_block *bbs = get_loop_body (loop);
11864 unsigned nbbs = loop->num_nodes;
11865 unsigned i;
11866 basic_block bb;
11867 class loop *bb_loop;
11868 gimple_stmt_iterator gsi;
11869 gimple *stmt;
11870 auto_vec<gimple *> worklist;
11871 auto_purge_vect_location sentinel;
11873 vect_location = find_loop_location (loop);
11874 /* Pick up all masked stores in the loop, if any. */
11875 for (i = 0; i < nbbs; i++)
11877 bb = bbs[i];
11878 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11879 gsi_next (&gsi))
11881 stmt = gsi_stmt (gsi);
11882 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11883 worklist.safe_push (stmt);
11887 free (bbs);
11888 if (worklist.is_empty ())
11889 return;
11891 /* Loop has masked stores. */
11892 while (!worklist.is_empty ())
11894 gimple *last, *last_store;
11895 edge e, efalse;
11896 tree mask;
11897 basic_block store_bb, join_bb;
11898 gimple_stmt_iterator gsi_to;
11899 tree vdef, new_vdef;
11900 gphi *phi;
11901 tree vectype;
11902 tree zero;
11904 last = worklist.pop ();
11905 mask = gimple_call_arg (last, 2);
11906 bb = gimple_bb (last);
11907 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
11908 to the same loop as if_bb. That loop can differ from LOOP when a
11909 two-level loop nest is vectorized and the mask_store belongs to the
11910 inner loop. */
11911 e = split_block (bb, last);
11912 bb_loop = bb->loop_father;
11913 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11914 join_bb = e->dest;
11915 store_bb = create_empty_bb (bb);
11916 add_bb_to_loop (store_bb, bb_loop);
11917 e->flags = EDGE_TRUE_VALUE;
11918 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11919 /* Put STORE_BB on the likely path. */
11920 efalse->probability = profile_probability::likely ();
11921 e->probability = efalse->probability.invert ();
11922 store_bb->count = efalse->count ();
11923 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11924 if (dom_info_available_p (CDI_DOMINATORS))
11925 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11926 if (dump_enabled_p ())
11927 dump_printf_loc (MSG_NOTE, vect_location,
11928 "Create new block %d to sink mask stores.",
11929 store_bb->index);
11930 /* Create vector comparison with boolean result. */
11931 vectype = TREE_TYPE (mask);
11932 zero = build_zero_cst (vectype);
11933 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11934 gsi = gsi_last_bb (bb);
11935 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
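/* The resulting CFG fragment looks like this (sketch):
     bb:       if (mask == { 0, ... }) goto join_bb; else goto store_bb;
     store_bb: the masked stores (and sunk producers), the likely path;
     join_bb:  the rest of the original block.  */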
11936 /* Create a new PHI node for the vdef of the last masked store:
11937 .MEM_2 = VDEF <.MEM_1>
11938 will be converted to
11939 .MEM_3 = VDEF <.MEM_1>
11940 and a new PHI node will be created in the join bb:
11941 .MEM_2 = PHI <.MEM_1, .MEM_3>
11943 vdef = gimple_vdef (last);
11944 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11945 gimple_set_vdef (last, new_vdef);
11946 phi = create_phi_node (vdef, join_bb);
11947 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11949 /* Put all masked stores with the same mask into STORE_BB if possible. */
11950 while (true)
11952 gimple_stmt_iterator gsi_from;
11953 gimple *stmt1 = NULL;
11955 /* Move masked store to STORE_BB. */
11956 last_store = last;
11957 gsi = gsi_for_stmt (last);
11958 gsi_from = gsi;
11959 /* Shift GSI to the previous stmt for further traversal. */
11960 gsi_prev (&gsi);
11961 gsi_to = gsi_start_bb (store_bb);
11962 gsi_move_before (&gsi_from, &gsi_to);
11963 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
11964 gsi_to = gsi_start_bb (store_bb);
11965 if (dump_enabled_p ())
11966 dump_printf_loc (MSG_NOTE, vect_location,
11967 "Move stmt to created bb\n%G", last);
11968 /* Move all stored value producers if possible. */
11969 while (!gsi_end_p (gsi))
11971 tree lhs;
11972 imm_use_iterator imm_iter;
11973 use_operand_p use_p;
11974 bool res;
11976 /* Skip debug statements. */
11977 if (is_gimple_debug (gsi_stmt (gsi)))
11979 gsi_prev (&gsi);
11980 continue;
11982 stmt1 = gsi_stmt (gsi);
11983 /* Do not consider statements writing to memory or having a
11984 volatile operand. */
11985 if (gimple_vdef (stmt1)
11986 || gimple_has_volatile_ops (stmt1))
11987 break;
11988 gsi_from = gsi;
11989 gsi_prev (&gsi);
11990 lhs = gimple_get_lhs (stmt1);
11991 if (!lhs)
11992 break;
11994 /* LHS of vectorized stmt must be SSA_NAME. */
11995 if (TREE_CODE (lhs) != SSA_NAME)
11996 break;
11998 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12000 /* Remove dead scalar statement. */
12001 if (has_zero_uses (lhs))
12003 gsi_remove (&gsi_from, true);
12004 continue;
12008 /* Check that LHS does not have uses outside of STORE_BB. */
12009 res = true;
12010 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12012 gimple *use_stmt;
12013 use_stmt = USE_STMT (use_p);
12014 if (is_gimple_debug (use_stmt))
12015 continue;
12016 if (gimple_bb (use_stmt) != store_bb)
12018 res = false;
12019 break;
12022 if (!res)
12023 break;
12025 if (gimple_vuse (stmt1)
12026 && gimple_vuse (stmt1) != gimple_vuse (last_store))
12027 break;
12029 /* Can move STMT1 to STORE_BB. */
12030 if (dump_enabled_p ())
12031 dump_printf_loc (MSG_NOTE, vect_location,
12032 "Move stmt to created bb\n%G", stmt1);
12033 gsi_move_before (&gsi_from, &gsi_to);
12034 /* Shift GSI_TO for further insertion. */
12035 gsi_prev (&gsi_to);
12037 /* Put other masked stores with the same mask into STORE_BB. */
12038 if (worklist.is_empty ()
12039 || gimple_call_arg (worklist.last (), 2) != mask
12040 || worklist.last () != stmt1)
12041 break;
12042 last = worklist.pop ();
12044 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
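/* In the semi-hammock from the comment above this sinks not only the
   MASK_STOREs but also the additions and MASK_LOADs feeding them: they
   have no vdef, their vuse matches the store's, and their results are
   used only inside STORE_BB.  */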
12048 /* Decide whether it is possible to use a zero-based induction variable
12049 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12050 the value that the induction variable must be able to hold in order
12051 to ensure that the rgroups eventually have no active vector elements.
12052 Return -1 otherwise. */
12054 widest_int
12055 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12057 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12058 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12059 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12061 /* Calculate the value that the induction variable must be able
12062 to hit in order to ensure that we end the loop with an all-false mask.
12063 This involves adding the maximum number of inactive trailing scalar
12064 iterations. */
12065 widest_int iv_limit = -1;
12066 if (max_loop_iterations (loop, &iv_limit))
12068 if (niters_skip)
12070 /* Add the maximum number of skipped iterations to the
12071 maximum iteration count. */
12072 if (TREE_CODE (niters_skip) == INTEGER_CST)
12073 iv_limit += wi::to_widest (niters_skip);
12074 else
12075 iv_limit += max_vf - 1;
12077 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12078 /* Make a conservatively-correct assumption. */
12079 iv_limit += max_vf - 1;
12081 /* IV_LIMIT is the maximum number of latch iterations, which is also
12082 the maximum in-range IV value. Round this value down to the previous
12083 vector alignment boundary and then add an extra full iteration. */
12084 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12085 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
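/* Worked example with made-up numbers: for a constant VF of 4 (so
   max_vf = 4), a maximum latch count of 1001 and no skipped or peeled
   iterations, iv_limit = (1001 & -4) + 4 = 1004, i.e. the IV must be
   able to hold 1004.  */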
12087 return iv_limit;
12090 /* For the given rgroup_controls RGC, check whether an induction variable
12091 would ever hit a value that produces a set of all-false masks or zero
12092 lengths before wrapping around. Return true if it's possible to wrap
12093 around before hitting the desirable value, otherwise return false. */
12095 bool
12096 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12098 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12100 if (iv_limit == -1)
12101 return true;
12103 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12104 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12105 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12107 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12108 return true;
12110 return false;
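/* Continuing the example above with made-up numbers: with iv_limit = 1004
   and an rgroup where max_nscalars_per_iter * factor = 2, the compare type
   needs at least wi::min_precision (2008, UNSIGNED) = 11 bits, so a 16-bit
   compare type cannot wrap and the function returns false.  */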