gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
143 Targets that can support different sizes of vectors will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
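
     For example, the support check described above under "Target modeling"
     boils down to something like the following (illustrative sketch only):

        optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

     i.e. the stmt can be vectorized only if the target provides an insn
     pattern for the required operation in the required vector mode.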
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *, bool);
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
171 gimple *stmt = stmt_info->stmt;
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
189 if (stmt_vectype)
191 if (STMT_VINFO_VECTYPE (stmt_info))
192	/* The only case when a vectype had already been set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
205 return opt_result::success ();
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
253 return opt_result::success ();
256 /* Function vect_determine_vectorization_factor
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4-byte elements,
261 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
262 elements can fit in a single vector register.
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296 for (i = 0; i < nbbs; i++)
298 basic_block bb = bbs[i];
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 (gimple *) phi);
309 gcc_assert (stmt_info);
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
334 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
341 vect_update_max_nunits (&vectorization_factor, vectype);
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
359 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
375 /* Function vect_is_simple_iv_evolution.
377 FORNOW: A simple evolution of an induction variable in the loop is
378 considered a polynomial evolution. */
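/* A hedged illustration (variable names are hypothetical): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of 'p' is the degree-1 chrec {p_init, +, 4}, so
   *INIT is p_init and *STEP is 4.  If the evolution part is itself a
   chrec (degree >= 2), or the step does not satisfy the conditions
   checked below, the evolution is not considered "simple".  */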
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
406 *init = init_expr;
407 *step = step_expr;
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
425 return true;
428 /* Function vect_is_nonlinear_iv_evolution
430 Only support nonlinear induction for integer types:
431 1. neg
432 2. mul by constant
433 3. lshift/rshift by constant.
435 For neg induction, return a fake step as integer -1. */
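/* Hypothetical source forms of the supported nonlinear inductions
   (shown for illustration only):

     x = -x;       // neg:     step recorded as -1
     y = y * 3;    // mul:     step is the constant multiplier
     z = z << 2;   // lshift:  step is the constant shift amount
     w = w >> 1;   // rshift:  step is the constant shift amount

   each updating a loop-header PHI whose latch value is the result of
   the shown statement.  */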
436 static bool
437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
438 gphi* loop_phi_node, tree *init, tree *step)
440 tree init_expr, ev_expr, result, op1, op2;
441 gimple* def;
443 if (gimple_phi_num_args (loop_phi_node) != 2)
444 return false;
446 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
447 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449 /* Support nonlinear induction only for integer type. */
450 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
451 return false;
453 *init = init_expr;
454 result = PHI_RESULT (loop_phi_node);
456 if (TREE_CODE (ev_expr) != SSA_NAME
457 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
458 || !is_gimple_assign (def))
459 return false;
461 enum tree_code t_code = gimple_assign_rhs_code (def);
462 switch (t_code)
464 case NEGATE_EXPR:
465 if (gimple_assign_rhs1 (def) != result)
466 return false;
467 *step = build_int_cst (TREE_TYPE (init_expr), -1);
468 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
469 break;
471 case RSHIFT_EXPR:
472 case LSHIFT_EXPR:
473 case MULT_EXPR:
474 op1 = gimple_assign_rhs1 (def);
475 op2 = gimple_assign_rhs2 (def);
476 if (TREE_CODE (op2) != INTEGER_CST
477 || op1 != result)
478 return false;
479 *step = op2;
480 if (t_code == LSHIFT_EXPR)
481 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
482 else if (t_code == RSHIFT_EXPR)
483 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
484 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
485 else
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
487 break;
489 default:
490 return false;
493 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496 return true;
499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
500 what we are assuming is a double reduction. For example, given
501 a structure like this:
503 outer1:
504 x_1 = PHI <x_4(outer2), ...>;
507 inner:
508 x_2 = PHI <x_1(outer1), ...>;
510 x_3 = ...;
513 outer2:
514 x_4 = PHI <x_3(inner)>;
517 outer loop analysis would treat x_1 as a double reduction phi and
518 this function would then return true for x_2. */
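/* A source-level sketch that produces this shape (array and variable
   names are hypothetical):

     s = 0;
     for (i = 0; i < n; i++)     // outer loop: x_1 / x_4
       for (j = 0; j < m; j++)   // inner loop: x_2 / x_3
         s += a[i][j];

   When the outer loop is analyzed, the outer PHI of 's' is the double
   reduction PHI and the inner PHI is what this predicate detects.  */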
520 static bool
521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 use_operand_p use_p;
524 ssa_op_iter op_iter;
525 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
526 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
527 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
528 return true;
529 return false;
532 /* Returns true if Phi is a first-order recurrence. A first-order
533 recurrence is a non-reduction recurrence relation in which the value of
534 the recurrence in the current loop iteration equals a value defined in
535 the previous iteration. */
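/* A hypothetical example of such a recurrence (names for illustration):

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;   // uses the value from the previous iteration
         t = a[i];
       }

   The loop-header PHI for 't' satisfies the conditions checked below.  */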
537 static bool
538 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
539 gphi *phi)
541 /* Ensure the loop latch definition is from within the loop. */
542 edge latch = loop_latch_edge (loop);
543 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
544 if (TREE_CODE (ldef) != SSA_NAME
545 || SSA_NAME_IS_DEFAULT_DEF (ldef)
546 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
547 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
548 return false;
550 tree def = gimple_phi_result (phi);
552 /* Ensure every use_stmt of the phi node is dominated by the latch
553 definition. */
554 imm_use_iterator imm_iter;
555 use_operand_p use_p;
556 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
557 if (!is_gimple_debug (USE_STMT (use_p))
558 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
559 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
560 USE_STMT (use_p))))
561 return false;
563 /* First-order recurrence autovectorization needs shuffle vector. */
564 tree scalar_type = TREE_TYPE (def);
565 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
566 if (!vectype)
567 return false;
569 return true;
572 /* Function vect_analyze_scalar_cycles_1.
574 Examine the cross iteration def-use cycles of scalar variables
575 in LOOP. LOOP_VINFO represents the loop that is now being
576 considered for vectorization (can be LOOP, or an outer-loop
577 enclosing LOOP). SLP indicates whether there will be subsequent
578 SLP analyses. */
580 static void
581 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
582 bool slp)
584 basic_block bb = loop->header;
585 tree init, step;
586 auto_vec<stmt_vec_info, 64> worklist;
587 gphi_iterator gsi;
588 bool double_reduc, reduc_chain;
590 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
592 /* First - identify all inductions. Reduction detection assumes that all the
593 inductions have been identified, therefore, this order must not be
594 changed. */
595 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
597 gphi *phi = gsi.phi ();
598 tree access_fn = NULL;
599 tree def = PHI_RESULT (phi);
600 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
602 if (dump_enabled_p ())
603 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
604 (gimple *) phi);
606 /* Skip virtual phi's. The data dependences that are associated with
607 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
608 if (virtual_operand_p (def))
609 continue;
611 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
613 /* Analyze the evolution function. */
614 access_fn = analyze_scalar_evolution (loop, def);
615 if (access_fn)
617 STRIP_NOPS (access_fn);
618 if (dump_enabled_p ())
619 dump_printf_loc (MSG_NOTE, vect_location,
620 "Access function of PHI: %T\n", access_fn);
621 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
622 = initial_condition_in_loop_num (access_fn, loop->num);
623 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
624 = evolution_part_in_loop_num (access_fn, loop->num);
627 if ((!access_fn
628 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
629 || !vect_is_simple_iv_evolution (loop->num, access_fn,
630 &init, &step)
631 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
632 && TREE_CODE (step) != INTEGER_CST))
633 /* Only handle nonlinear iv for same loop. */
634 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
635 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
636 phi, &init, &step)))
638 worklist.safe_push (stmt_vinfo);
639 continue;
642 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
643 != NULL_TREE);
644 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
648 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
652 /* Second - identify all reductions and nested cycles. */
653 while (worklist.length () > 0)
655 stmt_vec_info stmt_vinfo = worklist.pop ();
656 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
657 tree def = PHI_RESULT (phi);
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
661 (gimple *) phi);
663 gcc_assert (!virtual_operand_p (def)
664 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
666 stmt_vec_info reduc_stmt_info
667 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
668 &reduc_chain, slp);
669 if (reduc_stmt_info)
671 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
672 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
673 if (double_reduc)
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "Detected double reduction.\n");
679 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
680 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
682 else
684 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_NOTE, vect_location,
688 "Detected vectorizable nested cycle.\n");
690 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
692 else
694 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Detected reduction.\n");
698 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
699 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
700 /* Store the reduction cycles for possible vectorization in
701 loop-aware SLP if it was not detected as a reduction
702 chain. */
703 if (! reduc_chain)
704 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
705 (reduc_stmt_info);
709 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
710 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
711 else
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Unknown def-use cycle pattern.\n");
719 /* Function vect_analyze_scalar_cycles.
721 Examine the cross iteration def-use cycles of scalar variables, by
722 analyzing the loop-header PHIs of scalar variables. Classify each
723 cycle as one of the following: invariant, induction, reduction, unknown.
724 We do that for the loop represented by LOOP_VINFO, and also for its
725 inner loop, if it exists.
726 Examples for scalar cycles:
728 Example1: reduction:
730 loop1:
731 for (i=0; i<N; i++)
732 sum += a[i];
734 Example2: induction:
736 loop2:
737 for (i=0; i<N; i++)
738 a[i] = i; */
740 static void
741 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
743 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
745 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
747 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
748 Reductions in such inner-loop therefore have different properties than
749 the reductions in the nest that gets vectorized:
750 1. When vectorized, they are executed in the same order as in the original
751 scalar loop, so we can't change the order of computation when
752 vectorizing them.
753 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
754 current checks are too strict. */
756 if (loop->inner)
757 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
760 /* Transfer group and reduction information from STMT_INFO to its
761 pattern stmt. */
763 static void
764 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
766 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
767 stmt_vec_info stmtp;
768 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
769 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
770 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
773 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
774 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
775 == STMT_VINFO_DEF_TYPE (stmt_info));
776 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
777 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
778 if (stmt_info)
779 REDUC_GROUP_NEXT_ELEMENT (stmtp)
780 = STMT_VINFO_RELATED_STMT (stmt_info);
782 while (stmt_info);
785 /* Fixup scalar cycles that now have their stmts detected as patterns. */
787 static void
788 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
790 stmt_vec_info first;
791 unsigned i;
793 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
795 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
796 while (next)
798 if ((STMT_VINFO_IN_PATTERN_P (next)
799 != STMT_VINFO_IN_PATTERN_P (first))
800 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
801 break;
802 next = REDUC_GROUP_NEXT_ELEMENT (next);
804 /* If all reduction chain members are well-formed patterns, adjust
805 the group to group the pattern stmts instead. */
806 if (! next
807 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
809 if (STMT_VINFO_IN_PATTERN_P (first))
811 vect_fixup_reduc_chain (first);
812 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
813 = STMT_VINFO_RELATED_STMT (first);
816 /* If not all stmts in the chain are patterns, or if we failed
817 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
818 it as a regular reduction instead. */
819 else
821 stmt_vec_info vinfo = first;
822 stmt_vec_info last = NULL;
823 while (vinfo)
825 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
826 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
827 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
828 last = vinfo;
829 vinfo = next;
831 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
832 = vect_internal_def;
833 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
834 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
835 --i;
840 /* Function vect_get_loop_niters.
842 Determine how many iterations the loop is executed and place it
843 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
844 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
845 niter information holds in ASSUMPTIONS.
847 Return the loop exit condition. */
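/* Illustration of the intended semantics: for a loop whose latch runs
   exactly 99 times, NUMBER_OF_ITERATIONSM1 is 99 and NUMBER_OF_ITERATIONS
   (the number of header executions) is 100, assuming the niter analysis
   below succeeds.  */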
850 static gcond *
851 vect_get_loop_niters (class loop *loop, tree *assumptions,
852 tree *number_of_iterations, tree *number_of_iterationsm1)
854 edge exit = single_exit (loop);
855 class tree_niter_desc niter_desc;
856 tree niter_assumptions, niter, may_be_zero;
857 gcond *cond = get_loop_exit_condition (loop);
859 *assumptions = boolean_true_node;
860 *number_of_iterationsm1 = chrec_dont_know;
861 *number_of_iterations = chrec_dont_know;
862 DUMP_VECT_SCOPE ("get_loop_niters");
864 if (!exit)
865 return cond;
867 may_be_zero = NULL_TREE;
868 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
869 || chrec_contains_undetermined (niter_desc.niter))
870 return cond;
872 niter_assumptions = niter_desc.assumptions;
873 may_be_zero = niter_desc.may_be_zero;
874 niter = niter_desc.niter;
876 if (may_be_zero && integer_zerop (may_be_zero))
877 may_be_zero = NULL_TREE;
879 if (may_be_zero)
881 if (COMPARISON_CLASS_P (may_be_zero))
883 /* Try to combine may_be_zero with assumptions, this can simplify
884 computation of niter expression. */
885 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
886 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
887 niter_assumptions,
888 fold_build1 (TRUTH_NOT_EXPR,
889 boolean_type_node,
890 may_be_zero));
891 else
892 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
893 build_int_cst (TREE_TYPE (niter), 0),
894 rewrite_to_non_trapping_overflow (niter));
896 may_be_zero = NULL_TREE;
898 else if (integer_nonzerop (may_be_zero))
900 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
901 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
902 return cond;
904 else
905 return cond;
908 *assumptions = niter_assumptions;
909 *number_of_iterationsm1 = niter;
911 /* We want the number of loop header executions which is the number
912 of latch executions plus one.
913 ??? For UINT_MAX latch executions this number overflows to zero
914 for loops like do { n++; } while (n != 0); */
915 if (niter && !chrec_contains_undetermined (niter))
916 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
917 build_int_cst (TREE_TYPE (niter), 1));
918 *number_of_iterations = niter;
920 return cond;
923 /* Function bb_in_loop_p
925 Used as predicate for dfs order traversal of the loop bbs. */
927 static bool
928 bb_in_loop_p (const_basic_block bb, const void *data)
930 const class loop *const loop = (const class loop *)data;
931 if (flow_bb_inside_loop_p (loop, bb))
932 return true;
933 return false;
937 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
938 stmt_vec_info structs for all the stmts in LOOP_IN. */
940 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
941 : vec_info (vec_info::loop, shared),
942 loop (loop_in),
943 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
944 num_itersm1 (NULL_TREE),
945 num_iters (NULL_TREE),
946 num_iters_unchanged (NULL_TREE),
947 num_iters_assumptions (NULL_TREE),
948 vector_costs (nullptr),
949 scalar_costs (nullptr),
950 th (0),
951 versioning_threshold (0),
952 vectorization_factor (0),
953 main_loop_edge (nullptr),
954 skip_main_loop_edge (nullptr),
955 skip_this_loop_edge (nullptr),
956 reusable_accumulators (),
957 suggested_unroll_factor (1),
958 max_vectorization_factor (0),
959 mask_skip_niters (NULL_TREE),
960 rgroup_compare_type (NULL_TREE),
961 simd_if_cond (NULL_TREE),
962 unaligned_dr (NULL),
963 peeling_for_alignment (0),
964 ptr_mask (0),
965 ivexpr_map (NULL),
966 scan_map (NULL),
967 slp_unrolling_factor (1),
968 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
969 vectorizable (false),
970 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
971 using_partial_vectors_p (false),
972 epil_using_partial_vectors_p (false),
973 partial_load_store_bias (0),
974 peeling_for_gaps (false),
975 peeling_for_niter (false),
976 no_data_dependencies (false),
977 has_mask_store (false),
978 scalar_loop_scaling (profile_probability::uninitialized ()),
979 scalar_loop (NULL),
980 orig_loop_info (NULL)
982 /* CHECKME: We want to visit all BBs before their successors (except for
983 latch blocks, for which this assertion wouldn't hold). In the simple
984 case of the loop forms we allow, a dfs order of the BBs would be the same
985 as reversed postorder traversal, so we are safe. */
987 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
988 bbs, loop->num_nodes, loop);
989 gcc_assert (nbbs == loop->num_nodes);
991 for (unsigned int i = 0; i < nbbs; i++)
993 basic_block bb = bbs[i];
994 gimple_stmt_iterator si;
996 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
998 gimple *phi = gsi_stmt (si);
999 gimple_set_uid (phi, 0);
1000 add_stmt (phi);
1003 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1005 gimple *stmt = gsi_stmt (si);
1006 gimple_set_uid (stmt, 0);
1007 if (is_gimple_debug (stmt))
1008 continue;
1009 add_stmt (stmt);
1010 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1011 third argument is the #pragma omp simd if (x) condition: when it is 0,
1012 the loop shouldn't be vectorized; when it is a non-zero constant, it
1013 should be vectorized normally; otherwise the loop is versioned, with the
1014 vectorized version taken if the condition is non-zero at runtime. */
1015 if (loop_in->simduid
1016 && is_gimple_call (stmt)
1017 && gimple_call_internal_p (stmt)
1018 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1019 && gimple_call_num_args (stmt) >= 3
1020 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1021 && (loop_in->simduid
1022 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1024 tree arg = gimple_call_arg (stmt, 2);
1025 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1026 simd_if_cond = arg;
1027 else
1028 gcc_assert (integer_nonzerop (arg));
1033 epilogue_vinfos.create (6);
1036 /* Free all levels of rgroup CONTROLS. */
1038 void
1039 release_vec_loop_controls (vec<rgroup_controls> *controls)
1041 rgroup_controls *rgc;
1042 unsigned int i;
1043 FOR_EACH_VEC_ELT (*controls, i, rgc)
1044 rgc->controls.release ();
1045 controls->release ();
1048 /* Free all memory used by the _loop_vec_info, as well as all the
1049 stmt_vec_info structs of all the stmts in the loop. */
1051 _loop_vec_info::~_loop_vec_info ()
1053 free (bbs);
1055 release_vec_loop_controls (&masks);
1056 release_vec_loop_controls (&lens);
1057 delete ivexpr_map;
1058 delete scan_map;
1059 epilogue_vinfos.release ();
1060 delete scalar_costs;
1061 delete vector_costs;
1063 /* When we release an epilogue vinfo that we do not intend to use
1064 avoid clearing AUX of the main loop which should continue to
1065 point to the main loop vinfo since otherwise we'll leak that. */
1066 if (loop->aux == this)
1067 loop->aux = NULL;
1070 /* Return an invariant or register for EXPR and emit necessary
1071 computations in the LOOP_VINFO loop preheader. */
1073 tree
1074 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1076 if (is_gimple_reg (expr)
1077 || is_gimple_min_invariant (expr))
1078 return expr;
1080 if (! loop_vinfo->ivexpr_map)
1081 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1082 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1083 if (! cached)
1085 gimple_seq stmts = NULL;
1086 cached = force_gimple_operand (unshare_expr (expr),
1087 &stmts, true, NULL_TREE);
1088 if (stmts)
1090 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1091 gsi_insert_seq_on_edge_immediate (e, stmts);
1094 return cached;
1097 /* Return true if we can use CMP_TYPE as the comparison type to produce
1098 all masks required to mask LOOP_VINFO. */
1100 static bool
1101 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1103 rgroup_controls *rgm;
1104 unsigned int i;
1105 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1106 if (rgm->type != NULL_TREE
1107 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1108 cmp_type, rgm->type,
1109 OPTIMIZE_FOR_SPEED))
1110 return false;
1111 return true;
1114 /* Calculate the maximum number of scalars per iteration for every
1115 rgroup in LOOP_VINFO. */
1117 static unsigned int
1118 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1120 unsigned int res = 1;
1121 unsigned int i;
1122 rgroup_controls *rgm;
1123 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1124 res = MAX (res, rgm->max_nscalars_per_iter);
1125 return res;
1128 /* Calculate the minimum precision necessary to represent:
1130 MAX_NITERS * FACTOR
1132 as an unsigned integer, where MAX_NITERS is the maximum number of
1133 loop header iterations for the original scalar form of LOOP_VINFO. */
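/* Worked example with illustrative numbers: if the scalar loop runs at
   most 1000 header iterations and FACTOR is 4, then MAX_NITERS * FACTOR
   = 4000, which needs 12 bits (2^12 = 4096), so 12 is returned.  */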
1135 static unsigned
1136 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1138 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1140 /* Get the maximum number of iterations that is representable
1141 in the counter type. */
1142 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1143 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1145 /* Get a more refined estimate for the number of iterations. */
1146 widest_int max_back_edges;
1147 if (max_loop_iterations (loop, &max_back_edges))
1148 max_ni = wi::smin (max_ni, max_back_edges + 1);
1150 /* Work out how many bits we need to represent the limit. */
1151 return wi::min_precision (max_ni * factor, UNSIGNED);
1154 /* True if the loop needs peeling or partial vectors when vectorized. */
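/* For instance (illustrative numbers): with a known iteration count of
   100, no peeling for alignment or gaps, and a vectorization factor of
   16, 100 is not a multiple of 16, so this returns true; with a count
   of 96 it would return false.  */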
1156 static bool
1157 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1159 unsigned HOST_WIDE_INT const_vf;
1160 HOST_WIDE_INT max_niter
1161 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1163 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1164 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1165 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1166 (loop_vinfo));
1168 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1169 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1171 /* Work out the (constant) number of iterations that need to be
1172 peeled for reasons other than niters. */
1173 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1174 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1175 peel_niter += 1;
1176 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1177 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1178 return true;
1180 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1181 /* ??? When peeling for gaps but not alignment, we could
1182 try to check whether the (variable) niters is known to be
1183 VF * N + 1. That's something of a niche case though. */
1184 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1185 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1186 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1187 < (unsigned) exact_log2 (const_vf))
1188 /* In case of versioning, check if the maximum number of
1189 iterations is greater than th. If they are identical,
1190 the epilogue is unnecessary. */
1191 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1192 || ((unsigned HOST_WIDE_INT) max_niter
1193 > (th / const_vf) * const_vf))))
1194 return true;
1196 return false;
1199 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1200 whether we can actually generate the masks required. Return true if so,
1201 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1203 static bool
1204 vect_verify_full_masking (loop_vec_info loop_vinfo)
1206 unsigned int min_ni_width;
1207 unsigned int max_nscalars_per_iter
1208 = vect_get_max_nscalars_per_iter (loop_vinfo);
1210 /* Use a normal loop if there are no statements that need masking.
1211 This only happens in rare degenerate cases: it means that the loop
1212 has no loads, no stores, and no live-out values. */
1213 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1214 return false;
1216 /* Work out how many bits we need to represent the limit. */
1217 min_ni_width
1218 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1220 /* Find a scalar mode for which WHILE_ULT is supported. */
1221 opt_scalar_int_mode cmp_mode_iter;
1222 tree cmp_type = NULL_TREE;
1223 tree iv_type = NULL_TREE;
1224 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1225 unsigned int iv_precision = UINT_MAX;
1227 if (iv_limit != -1)
1228 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1229 UNSIGNED);
1231 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1233 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1234 if (cmp_bits >= min_ni_width
1235 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1237 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1238 if (this_type
1239 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1241 /* Although we could stop as soon as we find a valid mode,
1242 there are at least two reasons why that's not always the
1243 best choice:
1245 - An IV that's Pmode or wider is more likely to be reusable
1246 in address calculations than an IV that's narrower than
1247 Pmode.
1249 - Doing the comparison in IV_PRECISION or wider allows
1250 a natural 0-based IV, whereas using a narrower comparison
1251 type requires mitigations against wrap-around.
1253 Conversely, if the IV limit is variable, doing the comparison
1254 in a wider type than the original type can introduce
1255 unnecessary extensions, so picking the widest valid mode
1256 is not always a good choice either.
1258 Here we prefer the first IV type that's Pmode or wider,
1259 and the first comparison type that's IV_PRECISION or wider.
1260 (The comparison type must be no wider than the IV type,
1261 to avoid extensions in the vector loop.)
1263 ??? We might want to try continuing beyond Pmode for ILP32
1264 targets if CMP_BITS < IV_PRECISION. */
1265 iv_type = this_type;
1266 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1267 cmp_type = this_type;
1268 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1269 break;
1274 if (!cmp_type)
1275 return false;
1277 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1278 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1279 return true;
1282 /* Check whether we can use vector accesses with length, based on precision
1283 comparison. So far, to keep it simple, we only allow the case where the
1284 precision of the target-supported length is larger than the precision
1285 required by the loop niters. */
1287 static bool
1288 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1290 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1291 return false;
1293 machine_mode len_load_mode = get_len_load_store_mode
1294 (loop_vinfo->vector_mode, true).require ();
1295 machine_mode len_store_mode = get_len_load_store_mode
1296 (loop_vinfo->vector_mode, false).require ();
1298 signed char partial_load_bias = internal_len_load_store_bias
1299 (IFN_LEN_LOAD, len_load_mode);
1301 signed char partial_store_bias = internal_len_load_store_bias
1302 (IFN_LEN_STORE, len_store_mode);
1304 gcc_assert (partial_load_bias == partial_store_bias);
1306 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1307 return false;
1309 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1310 len_loads with a length of zero. In order to avoid that we prohibit
1311 more than one loop length here. */
1312 if (partial_load_bias == -1
1313 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1314 return false;
1316 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1318 unsigned int max_nitems_per_iter = 1;
1319 unsigned int i;
1320 rgroup_controls *rgl;
1321 /* Find the maximum number of items per iteration for every rgroup. */
1322 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1324 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1325 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1328 /* Work out how many bits we need to represent the length limit. */
1329 unsigned int min_ni_prec
1330 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1332 /* Now use the maximum of the precisions below for one suitable IV type:
1333 - the IV's natural precision
1334 - the precision needed to hold: the maximum number of scalar
1335 iterations multiplied by the scale factor (min_ni_prec above)
1336 - the Pmode precision
1338 If min_ni_prec is less than the precision of the current niters,
1339 we prefer to still use the niters type. Prefer to use a Pmode or
1340 wider IV to avoid narrow conversions. */
1342 unsigned int ni_prec
1343 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1344 min_ni_prec = MAX (min_ni_prec, ni_prec);
1345 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1347 tree iv_type = NULL_TREE;
1348 opt_scalar_int_mode tmode_iter;
1349 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1351 scalar_mode tmode = tmode_iter.require ();
1352 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1354 /* ??? Do we really want to construct one IV whose precision exceeds
1355 BITS_PER_WORD? */
1356 if (tbits > BITS_PER_WORD)
1357 break;
1359 /* Find the first available standard integral type. */
1360 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1362 iv_type = build_nonstandard_integer_type (tbits, true);
1363 break;
1367 if (!iv_type)
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "can't vectorize with length-based partial vectors"
1372 " because there is no suitable iv type.\n");
1373 return false;
1376 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1379 return true;
1382 /* Calculate the cost of one scalar iteration of the loop. */
1383 static void
1384 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1386 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1387 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1388 int nbbs = loop->num_nodes, factor;
1389 int innerloop_iters, i;
1391 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1393 /* Gather costs for statements in the scalar loop. */
1395 /* FORNOW. */
1396 innerloop_iters = 1;
1397 if (loop->inner)
1398 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1400 for (i = 0; i < nbbs; i++)
1402 gimple_stmt_iterator si;
1403 basic_block bb = bbs[i];
1405 if (bb->loop_father == loop->inner)
1406 factor = innerloop_iters;
1407 else
1408 factor = 1;
1410 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1412 gimple *stmt = gsi_stmt (si);
1413 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1415 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1416 continue;
1418 /* Skip stmts that are not vectorized inside the loop. */
1419 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1420 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1421 && (!STMT_VINFO_LIVE_P (vstmt_info)
1422 || !VECTORIZABLE_CYCLE_DEF
1423 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1424 continue;
1426 vect_cost_for_stmt kind;
1427 if (STMT_VINFO_DATA_REF (stmt_info))
1429 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1430 kind = scalar_load;
1431 else
1432 kind = scalar_store;
1434 else if (vect_nop_conversion_p (stmt_info))
1435 continue;
1436 else
1437 kind = scalar_stmt;
1439 /* We are using vect_prologue here to avoid scaling twice
1440 by the inner loop factor. */
1441 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1442 factor, kind, stmt_info, 0, vect_prologue);
1446 /* Now accumulate cost. */
1447 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1448 add_stmt_costs (loop_vinfo->scalar_costs,
1449 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1450 loop_vinfo->scalar_costs->finish_cost (nullptr);
1454 /* Function vect_analyze_loop_form.
1456 Verify that certain CFG restrictions hold, including:
1457 - the loop has a pre-header
1458 - the loop has a single entry and exit
1459 - the loop exit condition is simple enough
1460 - the number of iterations can be analyzed, i.e., a countable loop. The
1461 niter could be analyzed under some assumptions. */
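/* Illustrative examples (not taken from the testsuite): a countable
   inner-most loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i];

   satisfies these restrictions, whereas something like

     while (p)
       p = p->next;

   is rejected because the number of iterations cannot be computed.  */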
1463 opt_result
1464 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1466 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1468 /* Different restrictions apply when we are considering an inner-most loop,
1469 vs. an outer (nested) loop.
1470 (FORNOW. May want to relax some of these restrictions in the future). */
1472 info->inner_loop_cond = NULL;
1473 if (!loop->inner)
1475 /* Inner-most loop. We currently require that the number of BBs is
1476 exactly 2 (the header and latch). Vectorizable inner-most loops
1477 look like this:
1479 (pre-header)
1481 header <--------+
1482 | | |
1483 | +--> latch --+
1485 (exit-bb) */
1487 if (loop->num_nodes != 2)
1488 return opt_result::failure_at (vect_location,
1489 "not vectorized:"
1490 " control flow in loop.\n");
1492 if (empty_block_p (loop->header))
1493 return opt_result::failure_at (vect_location,
1494 "not vectorized: empty loop.\n");
1496 else
1498 class loop *innerloop = loop->inner;
1499 edge entryedge;
1501 /* Nested loop. We currently require that the loop is doubly-nested,
1502 contains a single inner loop, and the number of BBs is exactly 5.
1503 Vectorizable outer-loops look like this:
1505 (pre-header)
1507 header <---+
1509 inner-loop |
1511 tail ------+
1513 (exit-bb)
1515 The inner-loop has the properties expected of inner-most loops
1516 as described above. */
1518 if ((loop->inner)->inner || (loop->inner)->next)
1519 return opt_result::failure_at (vect_location,
1520 "not vectorized:"
1521 " multiple nested loops.\n");
1523 if (loop->num_nodes != 5)
1524 return opt_result::failure_at (vect_location,
1525 "not vectorized:"
1526 " control flow in loop.\n");
1528 entryedge = loop_preheader_edge (innerloop);
1529 if (entryedge->src != loop->header
1530 || !single_exit (innerloop)
1531 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1532 return opt_result::failure_at (vect_location,
1533 "not vectorized:"
1534 " unsupported outerloop form.\n");
1536 /* Analyze the inner-loop. */
1537 vect_loop_form_info inner;
1538 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1539 if (!res)
1541 if (dump_enabled_p ())
1542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1543 "not vectorized: Bad inner loop.\n");
1544 return res;
1547 /* Don't support analyzing niter under assumptions for inner
1548 loop. */
1549 if (!integer_onep (inner.assumptions))
1550 return opt_result::failure_at (vect_location,
1551 "not vectorized: Bad inner loop.\n");
1553 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1554 return opt_result::failure_at (vect_location,
1555 "not vectorized: inner-loop count not"
1556 " invariant.\n");
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_NOTE, vect_location,
1560 "Considering outer-loop vectorization.\n");
1561 info->inner_loop_cond = inner.loop_cond;
1564 if (!single_exit (loop))
1565 return opt_result::failure_at (vect_location,
1566 "not vectorized: multiple exits.\n");
1567 if (EDGE_COUNT (loop->header->preds) != 2)
1568 return opt_result::failure_at (vect_location,
1569 "not vectorized:"
1570 " too many incoming edges.\n");
1572 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1573 that the loop is represented as a do-while (with a proper if-guard
1574 before the loop if needed), where the loop header contains all the
1575 executable statements, and the latch is empty. */
1576 if (!empty_block_p (loop->latch)
1577 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1578 return opt_result::failure_at (vect_location,
1579 "not vectorized: latch block not empty.\n");
1581 /* Make sure the exit is not abnormal. */
1582 edge e = single_exit (loop);
1583 if (e->flags & EDGE_ABNORMAL)
1584 return opt_result::failure_at (vect_location,
1585 "not vectorized:"
1586 " abnormal loop exit edge.\n");
1588 info->loop_cond
1589 = vect_get_loop_niters (loop, &info->assumptions,
1590 &info->number_of_iterations,
1591 &info->number_of_iterationsm1);
1592 if (!info->loop_cond)
1593 return opt_result::failure_at
1594 (vect_location,
1595 "not vectorized: complicated exit condition.\n");
1597 if (integer_zerop (info->assumptions)
1598 || !info->number_of_iterations
1599 || chrec_contains_undetermined (info->number_of_iterations))
1600 return opt_result::failure_at
1601 (info->loop_cond,
1602 "not vectorized: number of iterations cannot be computed.\n");
1604 if (integer_zerop (info->number_of_iterations))
1605 return opt_result::failure_at
1606 (info->loop_cond,
1607 "not vectorized: number of iterations = 0.\n");
1609 if (!(tree_fits_shwi_p (info->number_of_iterations)
1610 && tree_to_shwi (info->number_of_iterations) > 0))
1612 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location,
1615 "Symbolic number of iterations is ");
1616 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1617 dump_printf (MSG_NOTE, "\n");
1621 return opt_result::success ();
1624 /* Create a loop_vec_info for LOOP with SHARED and the
1625 vect_analyze_loop_form result. */
1627 loop_vec_info
1628 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1629 const vect_loop_form_info *info,
1630 loop_vec_info main_loop_info)
1632 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1633 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1634 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1635 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1636 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1637 /* Also record the assumptions for versioning. */
1638 if (!integer_onep (info->assumptions) && !main_loop_info)
1639 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1641 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1642 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1643 if (info->inner_loop_cond)
1645 stmt_vec_info inner_loop_cond_info
1646 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1647 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1648 /* If we have an estimate on the number of iterations of the inner
1649 loop use that to limit the scale for costing, otherwise use
1650 --param vect-inner-loop-cost-factor literally. */
1651 widest_int nit;
1652 if (estimated_stmt_executions (loop->inner, &nit))
1653 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1654 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1657 return loop_vinfo;
1662 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1663 statements, update the vectorization factor. */
1665 static void
1666 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1668 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1669 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1670 int nbbs = loop->num_nodes;
1671 poly_uint64 vectorization_factor;
1672 int i;
1674 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1676 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1677 gcc_assert (known_ne (vectorization_factor, 0U));
1679 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1680 vectorization factor of the loop is the unrolling factor required by
1681 the SLP instances. If that unrolling factor is 1, we say that we
1682 perform pure SLP on the loop - cross-iteration parallelism is not
1683 exploited. */
1684 bool only_slp_in_loop = true;
1685 for (i = 0; i < nbbs; i++)
1687 basic_block bb = bbs[i];
1688 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1689 gsi_next (&si))
1691 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1692 if (!stmt_info)
1693 continue;
1694 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1695 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1696 && !PURE_SLP_STMT (stmt_info))
1697 /* STMT needs both SLP and loop-based vectorization. */
1698 only_slp_in_loop = false;
1700 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1701 gsi_next (&si))
1703 if (is_gimple_debug (gsi_stmt (si)))
1704 continue;
1705 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1706 stmt_info = vect_stmt_to_vectorize (stmt_info);
1707 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1708 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1709 && !PURE_SLP_STMT (stmt_info))
1710 /* STMT needs both SLP and loop-based vectorization. */
1711 only_slp_in_loop = false;
1715 if (only_slp_in_loop)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_NOTE, vect_location,
1719 "Loop contains only SLP stmts\n");
1720 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1722 else
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "Loop contains SLP and non-SLP stmts\n");
1727 /* Both the vectorization factor and unroll factor have the form
1728 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1729 so they must have a common multiple. */
1730 vectorization_factor
1731 = force_common_multiple (vectorization_factor,
1732 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1735 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1736 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "Updating vectorization factor to ");
1740 dump_dec (MSG_NOTE, vectorization_factor);
1741 dump_printf (MSG_NOTE, ".\n");
1745 /* Return true if STMT_INFO describes a double reduction phi and if
1746 the other phi in the reduction is also relevant for vectorization.
1747 This rejects cases such as:
1749 outer1:
1750 x_1 = PHI <x_3(outer2), ...>;
1753 inner:
1754 x_2 = ...;
1757 outer2:
1758 x_3 = PHI <x_2(inner)>;
1760 if nothing in x_2 or elsewhere makes x_1 relevant. */
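/* Editorial example (an illustrative sketch, not from the original source):
   a double reduction of this shape typically comes from a nested
   accumulation such as

     int sum = 0;
     for (int i = 0; i < n; i++)       <-- outer loop (outer1/outer2)
       for (int j = 0; j < m; j++)     <-- inner loop
         sum += a[i][j];

   where x_1 is the outer-loop phi for SUM and x_3 is the loop-closed phi
   receiving the inner loop's result.  */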
1762 static bool
1763 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1765 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1766 return false;
1768 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1771 /* Function vect_analyze_loop_operations.
1773 Scan the loop stmts and make sure they are all vectorizable. */
1775 static opt_result
1776 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1778 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1779 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1780 int nbbs = loop->num_nodes;
1781 int i;
1782 stmt_vec_info stmt_info;
1783 bool need_to_vectorize = false;
1784 bool ok;
1786 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1788 auto_vec<stmt_info_for_cost> cost_vec;
1790 for (i = 0; i < nbbs; i++)
1792 basic_block bb = bbs[i];
1794 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1795 gsi_next (&si))
1797 gphi *phi = si.phi ();
1798 ok = true;
1800 stmt_info = loop_vinfo->lookup_stmt (phi);
1801 if (dump_enabled_p ())
1802 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1803 (gimple *) phi);
1804 if (virtual_operand_p (gimple_phi_result (phi)))
1805 continue;
1807 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1808 (i.e., a phi in the tail of the outer-loop). */
1809 if (! is_loop_header_bb_p (bb))
1811 /* FORNOW: we currently don't support the case that these phis
1812 are not used in the outer loop (unless it is a double reduction,
1813 i.e., this phi is a vect_reduction_def), because this case
1814 would require actually doing something here. */
1815 if (STMT_VINFO_LIVE_P (stmt_info)
1816 && !vect_active_double_reduction_p (stmt_info))
1817 return opt_result::failure_at (phi,
1818 "Unsupported loop-closed phi"
1819 " in outer-loop.\n");
1821 /* If PHI is used in the outer loop, we check that its operand
1822 is defined in the inner loop. */
1823 if (STMT_VINFO_RELEVANT_P (stmt_info))
1825 tree phi_op;
1827 if (gimple_phi_num_args (phi) != 1)
1828 return opt_result::failure_at (phi, "unsupported phi");
1830 phi_op = PHI_ARG_DEF (phi, 0);
1831 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1832 if (!op_def_info)
1833 return opt_result::failure_at (phi, "unsupported phi\n");
1835 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1836 && (STMT_VINFO_RELEVANT (op_def_info)
1837 != vect_used_in_outer_by_reduction))
1838 return opt_result::failure_at (phi, "unsupported phi\n");
1840 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1841 || (STMT_VINFO_DEF_TYPE (stmt_info)
1842 == vect_double_reduction_def))
1843 && !vectorizable_lc_phi (loop_vinfo,
1844 stmt_info, NULL, NULL))
1845 return opt_result::failure_at (phi, "unsupported phi\n");
1848 continue;
1851 gcc_assert (stmt_info);
1853 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1854 || STMT_VINFO_LIVE_P (stmt_info))
1855 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
1856 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
1857 /* A scalar-dependence cycle that we don't support. */
1858 return opt_result::failure_at (phi,
1859 "not vectorized:"
1860 " scalar dependence cycle.\n");
1862 if (STMT_VINFO_RELEVANT_P (stmt_info))
1864 need_to_vectorize = true;
1865 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1866 && ! PURE_SLP_STMT (stmt_info))
1867 ok = vectorizable_induction (loop_vinfo,
1868 stmt_info, NULL, NULL,
1869 &cost_vec);
1870 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1871 || (STMT_VINFO_DEF_TYPE (stmt_info)
1872 == vect_double_reduction_def)
1873 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1874 && ! PURE_SLP_STMT (stmt_info))
1875 ok = vectorizable_reduction (loop_vinfo,
1876 stmt_info, NULL, NULL, &cost_vec);
1877 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
1878 == vect_first_order_recurrence)
1879 && ! PURE_SLP_STMT (stmt_info))
1880 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
1881 &cost_vec);
1884 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1885 if (ok
1886 && STMT_VINFO_LIVE_P (stmt_info)
1887 && !PURE_SLP_STMT (stmt_info))
1888 ok = vectorizable_live_operation (loop_vinfo,
1889 stmt_info, NULL, NULL, NULL,
1890 -1, false, &cost_vec);
1892 if (!ok)
1893 return opt_result::failure_at (phi,
1894 "not vectorized: relevant phi not "
1895 "supported: %G",
1896 static_cast <gimple *> (phi));
1899 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1900 gsi_next (&si))
1902 gimple *stmt = gsi_stmt (si);
1903 if (!gimple_clobber_p (stmt)
1904 && !is_gimple_debug (stmt))
1906 opt_result res
1907 = vect_analyze_stmt (loop_vinfo,
1908 loop_vinfo->lookup_stmt (stmt),
1909 &need_to_vectorize,
1910 NULL, NULL, &cost_vec);
1911 if (!res)
1912 return res;
1915 } /* bbs */
1917 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1919 /* All operations in the loop are either irrelevant (they deal with loop
1920 control, or are dead), or are only used outside the loop and can be moved
1921 out of the loop (e.g. invariants, inductions). The loop can be
1922 optimized away by scalar optimizations. We're better off not
1923 touching this loop. */
1924 if (!need_to_vectorize)
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_NOTE, vect_location,
1928 "All the computation can be taken out of the loop.\n");
1929 return opt_result::failure_at
1930 (vect_location,
1931 "not vectorized: redundant loop. no profit to vectorize.\n");
1934 return opt_result::success ();
1937 /* Return true if we know that the iteration count is smaller than the
1938 vectorization factor. Return false if it isn't, or if we can't be sure
1939 either way. */
1941 static bool
1942 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1944 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1946 HOST_WIDE_INT max_niter;
1947 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1948 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1949 else
1950 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1952 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1953 return true;
1955 return false;
1958 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1959 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1960 definitely no, or -1 if it's worth retrying. */
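/* Editorial note (a paraphrase, not part of the original source): the
   tri-state result is consumed by vect_analyze_loop_2 below roughly as

     res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
     if (res < 0)
       goto again;      <-- worth retrying, e.g. with SLP forced off
     if (!res)
       return opt_result::failure_at (...);   <-- definitely not worthwhile
     ... res == 1: costing says vectorization is worthwhile ...  */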
1962 static int
1963 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1964 unsigned *suggested_unroll_factor)
1966 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1967 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1969 /* Only loops that can handle partially-populated vectors can have iteration
1970 counts less than the vectorization factor. */
1971 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1973 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1977 "not vectorized: iteration count smaller than "
1978 "vectorization factor.\n");
1979 return 0;
1983 /* If using the "very cheap" model, reject cases in which we'd keep
1984 a copy of the scalar code (even if we might be able to vectorize it). */
1985 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1986 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1987 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1988 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 "some scalar iterations would need to be peeled\n");
1993 return 0;
1996 int min_profitable_iters, min_profitable_estimate;
1997 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1998 &min_profitable_estimate,
1999 suggested_unroll_factor);
2001 if (min_profitable_iters < 0)
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005 "not vectorized: vectorization not profitable.\n");
2006 if (dump_enabled_p ())
2007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2008 "not vectorized: vector version will never be "
2009 "profitable.\n");
2010 return -1;
2013 int min_scalar_loop_bound = (param_min_vect_loop_bound
2014 * assumed_vf);
2016 /* Use the cost model only if it is more conservative than the
2017 user-specified threshold. */
2018 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2019 min_profitable_iters);
2021 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2023 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2024 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2028 "not vectorized: vectorization not profitable.\n");
2029 if (dump_enabled_p ())
2030 dump_printf_loc (MSG_NOTE, vect_location,
2031 "not vectorized: iteration count smaller than user "
2032 "specified loop bound parameter or minimum profitable "
2033 "iterations (whichever is more conservative).\n");
2034 return 0;
2037 /* The static profitability threshold min_profitable_estimate includes
2038 the cost of having to check at runtime whether the scalar loop
2039 should be used instead. If it turns out that we don't need or want
2040 such a check, the threshold we should use for the static estimate
2041 is simply the point at which the vector loop becomes more profitable
2042 than the scalar loop. */
2043 if (min_profitable_estimate > min_profitable_iters
2044 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2045 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2046 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2047 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2049 if (dump_enabled_p ())
2050 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2051 " choice between the scalar and vector loops\n");
2052 min_profitable_estimate = min_profitable_iters;
2055 /* If the vector loop needs multiple iterations to be beneficial then
2056 things are probably too close to call, and the conservative thing
2057 would be to stick with the scalar code. */
2058 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2059 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "one iteration of the vector loop would be"
2064 " more expensive than the equivalent number of"
2065 " iterations of the scalar loop\n");
2066 return 0;
2069 HOST_WIDE_INT estimated_niter;
2071 /* If we are vectorizing an epilogue then we know the maximum number of
2072 scalar iterations it will cover is at least one lower than the
2073 vectorization factor of the main loop. */
2074 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2075 estimated_niter
2076 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2077 else
2079 estimated_niter = estimated_stmt_executions_int (loop);
2080 if (estimated_niter == -1)
2081 estimated_niter = likely_max_stmt_executions_int (loop);
2083 if (estimated_niter != -1
2084 && ((unsigned HOST_WIDE_INT) estimated_niter
2085 < MAX (th, (unsigned) min_profitable_estimate)))
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089 "not vectorized: estimated iteration count too "
2090 "small.\n");
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "not vectorized: estimated iteration count smaller "
2094 "than specified loop bound parameter or minimum "
2095 "profitable iterations (whichever is more "
2096 "conservative).\n");
2097 return -1;
2100 return 1;
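/* Editorial description, added for readability: gather the data references
   of all non-debug statements in LOOP, whose basic blocks are given in BBS,
   into *DATAREFS, and count those statements in *N_STMTS.  Fail if a
   reference cannot be analyzed (except for certain calls to simd-clone
   functions inside safelen loops) or if the number of datarefs exceeds
   param_loop_max_datarefs_for_datadeps.  */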
2103 static opt_result
2104 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2105 vec<data_reference_p> *datarefs,
2106 unsigned int *n_stmts)
2108 *n_stmts = 0;
2109 for (unsigned i = 0; i < loop->num_nodes; i++)
2110 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2111 !gsi_end_p (gsi); gsi_next (&gsi))
2113 gimple *stmt = gsi_stmt (gsi);
2114 if (is_gimple_debug (stmt))
2115 continue;
2116 ++(*n_stmts);
2117 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2118 NULL, 0);
2119 if (!res)
2121 if (is_gimple_call (stmt) && loop->safelen)
2123 tree fndecl = gimple_call_fndecl (stmt), op;
2124 if (fndecl != NULL_TREE)
2126 cgraph_node *node = cgraph_node::get (fndecl);
2127 if (node != NULL && node->simd_clones != NULL)
2129 unsigned int j, n = gimple_call_num_args (stmt);
2130 for (j = 0; j < n; j++)
2132 op = gimple_call_arg (stmt, j);
2133 if (DECL_P (op)
2134 || (REFERENCE_CLASS_P (op)
2135 && get_base_address (op)))
2136 break;
2138 op = gimple_call_lhs (stmt);
2139 /* Ignore #pragma omp declare simd functions
2140 if they don't have data references in the
2141 call stmt itself. */
2142 if (j == n
2143 && !(op
2144 && (DECL_P (op)
2145 || (REFERENCE_CLASS_P (op)
2146 && get_base_address (op)))))
2147 continue;
2151 return res;
2153 /* If dependence analysis will give up due to the limit on the
2154 number of datarefs, stop here and fail fatally. */
2155 if (datarefs->length ()
2156 > (unsigned)param_loop_max_datarefs_for_datadeps)
2157 return opt_result::failure_at (stmt, "exceeded param "
2158 "loop-max-datarefs-for-datadeps\n");
2160 return opt_result::success ();
2163 /* Look for SLP-only access groups and turn each individual access into its own
2164 group. */
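/* Editorial illustration (a sketch, not part of the original source):
   an interleaved group of four accesses a[4*i], a[4*i+1], a[4*i+2],
   a[4*i+3] that was built only for SLP is split so that each access
   becomes its own group leader with DR_GROUP_SIZE 1 and, for non-strided
   accesses, DR_GROUP_GAP 3 to account for the elements now skipped
   between consecutive accesses of the same dissolved member.  */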
2165 static void
2166 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2168 unsigned int i;
2169 struct data_reference *dr;
2171 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2173 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2174 FOR_EACH_VEC_ELT (datarefs, i, dr)
2176 gcc_assert (DR_REF (dr));
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2179 /* Check if the load is a part of an interleaving chain. */
2180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2182 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2183 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2184 unsigned int group_size = DR_GROUP_SIZE (first_element);
2186 /* Check for SLP-only groups. */
2187 if (!STMT_SLP_TYPE (stmt_info)
2188 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2190 /* Dissolve the group. */
2191 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2193 stmt_vec_info vinfo = first_element;
2194 while (vinfo)
2196 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2197 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2198 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2199 DR_GROUP_SIZE (vinfo) = 1;
2200 if (STMT_VINFO_STRIDED_P (first_element))
2201 DR_GROUP_GAP (vinfo) = 0;
2202 else
2203 DR_GROUP_GAP (vinfo) = group_size - 1;
2204 /* Duplicate and adjust the alignment info; it needs to
2205 be present on each group leader, see dr_misalignment. */
2206 if (vinfo != first_element)
2208 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2209 dr_info2->target_alignment = dr_info->target_alignment;
2210 int misalignment = dr_info->misalignment;
2211 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2213 HOST_WIDE_INT diff
2214 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2215 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2216 unsigned HOST_WIDE_INT align_c
2217 = dr_info->target_alignment.to_constant ();
2218 misalignment = (misalignment + diff) % align_c;
2220 dr_info2->misalignment = misalignment;
2222 vinfo = next;
2229 /* Determine if operating on full vectors for LOOP_VINFO might leave
2230 some scalar iterations still to do. If so, decide how we should
2231 handle those scalar iterations. The possibilities are:
2233 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2234 In this case:
2236 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2237 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2238 LOOP_VINFO_PEELING_FOR_NITER == false
2240 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2241 to handle the remaining scalar iterations. In this case:
2243 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2244 LOOP_VINFO_PEELING_FOR_NITER == true
2246 There are two choices:
2248 (2a) Consider vectorizing the epilogue loop at the same VF as the
2249 main loop, but using partial vectors instead of full vectors.
2250 In this case:
2252 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2254 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2255 In this case:
2257 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2259 When FOR_EPILOGUE_P is true, make this determination based on the
2260 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2261 based on the assumption that LOOP_VINFO is the main loop. The caller
2262 has made sure that the number of iterations is set appropriately for
2263 this value of FOR_EPILOGUE_P. */
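/* Editorial illustration (a sketch, not part of the original source):
   for a loop of 10 scalar iterations and a vectorization factor of 4,
   option (1) runs 3 vector iterations whose last one is only partially
   populated (masked or length-controlled), while option (2) runs 2
   full-vector iterations and leaves 2 scalar iterations for an epilogue
   loop, which may itself be vectorized as in (2a) or (2b).  */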
2265 opt_result
2266 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2267 bool for_epilogue_p)
2269 /* Determine whether there would be any scalar iterations left over. */
2270 bool need_peeling_or_partial_vectors_p
2271 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2273 /* Decide whether to vectorize the loop with partial vectors. */
2274 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2275 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2276 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2277 && need_peeling_or_partial_vectors_p)
2279 /* For partial-vector-usage=1, try to push the handling of partial
2280 vectors to the epilogue, with the main loop continuing to operate
2281 on full vectors.
2283 If we are unrolling we also do not want to use partial vectors. This
2284 is to avoid the overhead of generating multiple masks and also to
2285 avoid having to execute entire iterations of FALSE masked instructions
2286 when dealing with one or fewer full iterations.
2288 ??? We could then end up failing to use partial vectors if we
2289 decide to peel iterations into a prologue, and if the main loop
2290 then ends up processing fewer than VF iterations. */
2291 if ((param_vect_partial_vector_usage == 1
2292 || loop_vinfo->suggested_unroll_factor > 1)
2293 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2294 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2295 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2296 else
2297 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2300 if (dump_enabled_p ())
2302 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "operating on partial vectors%s.\n",
2305 for_epilogue_p ? " for epilogue loop" : "");
2306 else
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "operating only on full vectors%s.\n",
2309 for_epilogue_p ? " for epilogue loop" : "");
2312 if (for_epilogue_p)
2314 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2315 gcc_assert (orig_loop_vinfo);
2316 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2317 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2318 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2321 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2322 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2324 /* Check that the loop processes at least one full vector. */
2325 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2326 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2327 if (known_lt (wi::to_widest (scalar_niters), vf))
2328 return opt_result::failure_at (vect_location,
2329 "loop does not have enough iterations"
2330 " to support vectorization.\n");
2332 /* If we need to peel an extra epilogue iteration to handle data
2333 accesses with gaps, check that there are enough scalar iterations
2334 available.
2336 The check above is redundant with this one when peeling for gaps,
2337 but the distinction is useful for diagnostics. */
2338 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2340 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2341 return opt_result::failure_at (vect_location,
2342 "loop does not have enough iterations"
2343 " to support peeling for gaps.\n");
2346 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2347 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2348 && need_peeling_or_partial_vectors_p);
2350 return opt_result::success ();
2353 /* Function vect_analyze_loop_2.
2355 Apply a set of analyses on the loop specified by LOOP_VINFO; the different
2356 analyses will record information in some members of LOOP_VINFO. FATAL
2357 indicates whether some analysis hit a fatal error. If a non-NULL pointer
2358 SUGGESTED_UNROLL_FACTOR is provided, it is filled with the suggested
2359 unroll factor that was worked out, while a NULL pointer means that the
2360 previously suggested unroll factor is going to be applied.
2361 SLP_DONE_FOR_SUGGESTED_UF holds the SLP decision made when the suggested
2362 unroll factor was worked out. */
2363 static opt_result
2364 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2365 unsigned *suggested_unroll_factor,
2366 bool& slp_done_for_suggested_uf)
2368 opt_result ok = opt_result::success ();
2369 int res;
2370 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2371 poly_uint64 min_vf = 2;
2372 loop_vec_info orig_loop_vinfo = NULL;
2374 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2375 loop_vec_info of the first vectorized loop. */
2376 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2377 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2378 else
2379 orig_loop_vinfo = loop_vinfo;
2380 gcc_assert (orig_loop_vinfo);
2382 /* The first group of checks is independent of the vector size. */
2383 fatal = true;
2385 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2386 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2387 return opt_result::failure_at (vect_location,
2388 "not vectorized: simd if(0)\n");
2390 /* Find all data references in the loop (which correspond to vdefs/vuses)
2391 and analyze their evolution in the loop. */
2393 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2395 /* Gather the data references and count stmts in the loop. */
2396 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2398 opt_result res
2399 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2400 &LOOP_VINFO_DATAREFS (loop_vinfo),
2401 &LOOP_VINFO_N_STMTS (loop_vinfo));
2402 if (!res)
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406 "not vectorized: loop contains function "
2407 "calls or data references that cannot "
2408 "be analyzed\n");
2409 return res;
2411 loop_vinfo->shared->save_datarefs ();
2413 else
2414 loop_vinfo->shared->check_datarefs ();
2416 /* Analyze the data references and also adjust the minimal
2417 vectorization factor according to the loads and stores. */
2419 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2420 if (!ok)
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2424 "bad data references.\n");
2425 return ok;
2428 /* Check if we are applying unroll factor now. */
2429 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2430 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2432 /* If the SLP decision was false when the suggested unroll factor was
2433 worked out, and we are applying the suggested unroll factor, we can
2434 simply skip all SLP-related analyses this time. */
2435 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2437 /* Classify all cross-iteration scalar data-flow cycles.
2438 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2439 vect_analyze_scalar_cycles (loop_vinfo, slp);
2441 vect_pattern_recog (loop_vinfo);
2443 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2445 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2446 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2448 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2449 if (!ok)
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "bad data access.\n");
2454 return ok;
2457 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2459 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2460 if (!ok)
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "unexpected pattern.\n");
2465 return ok;
2468 /* The rest of the analysis below depends on the vector size in some way. */
2469 fatal = false;
2471 /* Analyze data dependences between the data-refs in the loop
2472 and adjust the maximum vectorization factor according to
2473 the dependences.
2474 FORNOW: fail at the first data dependence that we encounter. */
2476 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2477 if (!ok)
2479 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2481 "bad data dependence.\n");
2482 return ok;
2484 if (max_vf != MAX_VECTORIZATION_FACTOR
2485 && maybe_lt (max_vf, min_vf))
2486 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2487 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2489 ok = vect_determine_vectorization_factor (loop_vinfo);
2490 if (!ok)
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "can't determine vectorization factor.\n");
2495 return ok;
2497 if (max_vf != MAX_VECTORIZATION_FACTOR
2498 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2499 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2501 /* Compute the scalar iteration cost. */
2502 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2504 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2506 if (slp)
2508 /* Check the SLP opportunities in the loop, analyze and build
2509 SLP trees. */
2510 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2511 if (!ok)
2512 return ok;
2514 /* If there are any SLP instances mark them as pure_slp. */
2515 slp = vect_make_slp_decision (loop_vinfo);
2516 if (slp)
2518 /* Find stmts that need to be both vectorized and SLPed. */
2519 vect_detect_hybrid_slp (loop_vinfo);
2521 /* Update the vectorization factor based on the SLP decision. */
2522 vect_update_vf_for_slp (loop_vinfo);
2524 /* Optimize the SLP graph with the vectorization factor fixed. */
2525 vect_optimize_slp (loop_vinfo);
2527 /* Gather the loads reachable from the SLP graph entries. */
2528 vect_gather_slp_loads (loop_vinfo);
2532 bool saved_can_use_partial_vectors_p
2533 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2535 /* We don't expect to have to roll back to anything other than an empty
2536 set of rgroups. */
2537 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2539 /* This is the point where we can re-start analysis with SLP forced off. */
2540 start_over:
2542 /* Apply the suggested unrolling factor; this was determined by the backend
2543 during finish_cost the first time we ran the analysis for this
2544 vector mode. */
2545 if (applying_suggested_uf)
2546 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2548 /* Now the vectorization factor is final. */
2549 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2550 gcc_assert (known_ne (vectorization_factor, 0U));
2552 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2554 dump_printf_loc (MSG_NOTE, vect_location,
2555 "vectorization_factor = ");
2556 dump_dec (MSG_NOTE, vectorization_factor);
2557 dump_printf (MSG_NOTE, ", niters = %wd\n",
2558 LOOP_VINFO_INT_NITERS (loop_vinfo));
2561 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2563 /* Analyze the alignment of the data-refs in the loop.
2564 Fail if a data reference is found that cannot be vectorized. */
2566 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2567 if (!ok)
2569 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571 "bad data alignment.\n");
2572 return ok;
2575 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2576 It is important to call pruning after vect_analyze_data_ref_accesses,
2577 since we use grouping information gathered by interleaving analysis. */
2578 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2579 if (!ok)
2580 return ok;
2582 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2583 vectorization, since we do not want to add extra peeling or
2584 add versioning for alignment. */
2585 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2586 /* This pass will decide on using loop versioning and/or loop peeling in
2587 order to enhance the alignment of data references in the loop. */
2588 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2589 if (!ok)
2590 return ok;
2592 if (slp)
2594 /* Analyze operations in the SLP instances. Note this may
2595 remove unsupported SLP instances which makes the above
2596 SLP kind detection invalid. */
2597 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2598 vect_slp_analyze_operations (loop_vinfo);
2599 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2601 ok = opt_result::failure_at (vect_location,
2602 "unsupported SLP instances\n");
2603 goto again;
2606 /* Check whether any load in ALL SLP instances is possibly permuted. */
2607 slp_tree load_node, slp_root;
2608 unsigned i, x;
2609 slp_instance instance;
2610 bool can_use_lanes = true;
2611 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2613 slp_root = SLP_INSTANCE_TREE (instance);
2614 int group_size = SLP_TREE_LANES (slp_root);
2615 tree vectype = SLP_TREE_VECTYPE (slp_root);
2616 bool loads_permuted = false;
2617 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2619 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2620 continue;
2621 unsigned j;
2622 stmt_vec_info load_info;
2623 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2624 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2626 loads_permuted = true;
2627 break;
2631 /* If the loads and stores can be handled with load/store-lane
2632 instructions record it and move on to the next instance. */
2633 if (loads_permuted
2634 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2635 && vect_store_lanes_supported (vectype, group_size, false))
2637 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2639 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2640 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2641 /* Use SLP for strided accesses (or if we can't
2642 load-lanes). */
2643 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2644 || ! vect_load_lanes_supported
2645 (STMT_VINFO_VECTYPE (stmt_vinfo),
2646 DR_GROUP_SIZE (stmt_vinfo), false))
2647 break;
2650 can_use_lanes
2651 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2653 if (can_use_lanes && dump_enabled_p ())
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "SLP instance %p can use load/store-lanes\n",
2656 (void *) instance);
2658 else
2660 can_use_lanes = false;
2661 break;
2665 /* If all SLP instances can use load/store-lanes abort SLP and try again
2666 with SLP disabled. */
2667 if (can_use_lanes)
2669 ok = opt_result::failure_at (vect_location,
2670 "Built SLP cancelled: can use "
2671 "load/store-lanes\n");
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2674 "Built SLP cancelled: all SLP instances support "
2675 "load/store-lanes\n");
2676 goto again;
2680 /* Dissolve SLP-only groups. */
2681 vect_dissolve_slp_only_groups (loop_vinfo);
2683 /* Scan all the remaining operations in the loop that are not subject
2684 to SLP and make sure they are vectorizable. */
2685 ok = vect_analyze_loop_operations (loop_vinfo);
2686 if (!ok)
2688 if (dump_enabled_p ())
2689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2690 "bad operation or unsupported loop bound.\n");
2691 return ok;
2694 /* For now, we don't expect to mix both masking and length approaches for one
2695 loop; disable the use of partial vectors if both are recorded. */
2696 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2697 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2698 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "can't vectorize a loop with partial vectors"
2703 " because we don't expect to mix different"
2704 " approaches with partial vectors for the"
2705 " same loop.\n");
2706 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2709 /* If we still have the option of using partial vectors,
2710 check whether we can generate the necessary loop controls. */
2711 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2712 && !vect_verify_full_masking (loop_vinfo)
2713 && !vect_verify_loop_lens (loop_vinfo))
2714 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2716 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2717 to be able to handle fewer than VF scalars, or needs to have a lower VF
2718 than the main loop. */
2719 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2720 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2721 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2722 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2723 return opt_result::failure_at (vect_location,
2724 "Vectorization factor too high for"
2725 " epilogue loop.\n");
2727 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2728 assuming that the loop will be used as a main loop. We will redo
2729 this analysis later if we instead decide to use the loop as an
2730 epilogue loop. */
2731 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2732 if (!ok)
2733 return ok;
2735 /* Check the costings of the loop make vectorizing worthwhile. */
2736 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2737 if (res < 0)
2739 ok = opt_result::failure_at (vect_location,
2740 "Loop costings may not be worthwhile.\n");
2741 goto again;
2743 if (!res)
2744 return opt_result::failure_at (vect_location,
2745 "Loop costings not worthwhile.\n");
2747 /* If an epilogue loop is required make sure we can create one. */
2748 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2749 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2751 if (dump_enabled_p ())
2752 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2753 if (!vect_can_advance_ivs_p (loop_vinfo)
2754 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2755 single_exit (LOOP_VINFO_LOOP
2756 (loop_vinfo))))
2758 ok = opt_result::failure_at (vect_location,
2759 "not vectorized: can't create required "
2760 "epilog loop\n");
2761 goto again;
2765 /* During peeling, we need to check whether the number of loop iterations is
2766 enough for both the peeled prolog loop and the vector loop. This check
2767 can be merged with the threshold check of loop versioning, so
2768 increase the threshold for this case if necessary.
2770 If we are analyzing an epilogue we still want to check what its
2771 versioning threshold would be. If we decide to vectorize the epilogues we
2772 will want to use the lowest versioning threshold of all epilogues and main
2773 loop. This will enable us to enter a vectorized epilogue even when
2774 versioning the loop. We can't simply check whether the epilogue requires
2775 versioning though since we may have skipped some versioning checks when
2776 analyzing the epilogue. For instance, checks for alias versioning will be
2777 skipped when dealing with epilogues as we assume we already checked them
2778 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2779 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2781 poly_uint64 niters_th = 0;
2782 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2784 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2786 /* Niters for peeled prolog loop. */
2787 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2789 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2790 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2791 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2793 else
2794 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2797 /* Niters for at least one iteration of vectorized loop. */
2798 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2799 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2800 /* One additional iteration because of peeling for gap. */
2801 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2802 niters_th += 1;
2804 /* Use the same condition as vect_transform_loop to decide when to use
2805 the cost to determine a versioning threshold. */
2806 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2807 && ordered_p (th, niters_th))
2808 niters_th = ordered_max (poly_uint64 (th), niters_th);
2810 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2813 gcc_assert (known_eq (vectorization_factor,
2814 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2816 slp_done_for_suggested_uf = slp;
2818 /* Ok to vectorize! */
2819 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2820 return opt_result::success ();
2822 again:
2823 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2824 gcc_assert (!ok);
2826 /* Try again with SLP forced off but if we didn't do any SLP there is
2827 no point in re-trying. */
2828 if (!slp)
2829 return ok;
2831 /* If the SLP decision was true when the suggested unroll factor was
2832 worked out, and we are applying the suggested unroll factor, we don't
2833 need to re-try any more. */
2834 if (applying_suggested_uf && slp_done_for_suggested_uf)
2835 return ok;
2837 /* If there are reduction chains re-trying will fail anyway. */
2838 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2839 return ok;
2841 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2842 via interleaving or lane instructions. */
2843 slp_instance instance;
2844 slp_tree node;
2845 unsigned i, j;
2846 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2848 stmt_vec_info vinfo;
2849 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2850 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2851 continue;
2852 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2853 unsigned int size = DR_GROUP_SIZE (vinfo);
2854 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2855 if (! vect_store_lanes_supported (vectype, size, false)
2856 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2857 && ! vect_grouped_store_supported (vectype, size))
2858 return opt_result::failure_at (vinfo->stmt,
2859 "unsupported grouped store\n");
2860 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2862 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2863 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2864 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2865 size = DR_GROUP_SIZE (vinfo);
2866 vectype = STMT_VINFO_VECTYPE (vinfo);
2867 if (! vect_load_lanes_supported (vectype, size, false)
2868 && ! vect_grouped_load_supported (vectype, single_element_p,
2869 size))
2870 return opt_result::failure_at (vinfo->stmt,
2871 "unsupported grouped load\n");
2875 if (dump_enabled_p ())
2876 dump_printf_loc (MSG_NOTE, vect_location,
2877 "re-trying with SLP disabled\n");
2879 /* Roll back state appropriately. No SLP this time. */
2880 slp = false;
2881 /* Restore vectorization factor as it were without SLP. */
2882 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2883 /* Free the SLP instances. */
2884 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2885 vect_free_slp_instance (instance);
2886 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2887 /* Reset SLP type to loop_vect on all stmts. */
2888 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2890 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2891 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2892 !gsi_end_p (si); gsi_next (&si))
2894 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2895 STMT_SLP_TYPE (stmt_info) = loop_vect;
2896 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2897 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2899 /* vectorizable_reduction adjusts reduction stmt def-types,
2900 restore them to that of the PHI. */
2901 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2902 = STMT_VINFO_DEF_TYPE (stmt_info);
2903 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2904 (STMT_VINFO_REDUC_DEF (stmt_info)))
2905 = STMT_VINFO_DEF_TYPE (stmt_info);
2908 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2909 !gsi_end_p (si); gsi_next (&si))
2911 if (is_gimple_debug (gsi_stmt (si)))
2912 continue;
2913 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2914 STMT_SLP_TYPE (stmt_info) = loop_vect;
2915 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2917 stmt_vec_info pattern_stmt_info
2918 = STMT_VINFO_RELATED_STMT (stmt_info);
2919 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2920 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2922 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2923 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2924 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2925 !gsi_end_p (pi); gsi_next (&pi))
2926 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2927 = loop_vect;
2931 /* Free optimized alias test DDRS. */
2932 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2933 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2934 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2935 /* Reset target cost data. */
2936 delete loop_vinfo->vector_costs;
2937 loop_vinfo->vector_costs = nullptr;
2938 /* Reset accumulated rgroup information. */
2939 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2940 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2941 /* Reset assorted flags. */
2942 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2943 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2944 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2945 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2946 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2947 = saved_can_use_partial_vectors_p;
2949 goto start_over;
2952 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2953 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2954 OLD_LOOP_VINFO is better unless something specifically indicates
2955 otherwise.
2957 Note that this deliberately isn't a partial order. */
2959 static bool
2960 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2961 loop_vec_info old_loop_vinfo)
2963 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2964 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2966 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2967 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2969 /* Always prefer a VF of loop->simdlen over any other VF. */
2970 if (loop->simdlen)
2972 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2973 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2974 if (new_simdlen_p != old_simdlen_p)
2975 return new_simdlen_p;
2978 const auto *old_costs = old_loop_vinfo->vector_costs;
2979 const auto *new_costs = new_loop_vinfo->vector_costs;
2980 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2981 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2983 return new_costs->better_main_loop_than_p (old_costs);
2986 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2987 true if we should. */
2989 static bool
2990 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2991 loop_vec_info old_loop_vinfo)
2993 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2994 return false;
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_NOTE, vect_location,
2998 "***** Preferring vector mode %s to vector mode %s\n",
2999 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3000 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3001 return true;
3004 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3005 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3006 MODE_I to the next mode useful to analyze.
3007 Return the loop_vinfo on success and wrapped null on failure. */
3009 static opt_loop_vec_info
3010 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3011 const vect_loop_form_info *loop_form_info,
3012 loop_vec_info main_loop_vinfo,
3013 const vector_modes &vector_modes, unsigned &mode_i,
3014 machine_mode &autodetected_vector_mode,
3015 bool &fatal)
3017 loop_vec_info loop_vinfo
3018 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3020 machine_mode vector_mode = vector_modes[mode_i];
3021 loop_vinfo->vector_mode = vector_mode;
3022 unsigned int suggested_unroll_factor = 1;
3023 bool slp_done_for_suggested_uf;
3025 /* Run the main analysis. */
3026 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3027 &suggested_unroll_factor,
3028 slp_done_for_suggested_uf);
3029 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_NOTE, vect_location,
3031 "***** Analysis %s with vector mode %s\n",
3032 res ? "succeeded" : " failed",
3033 GET_MODE_NAME (loop_vinfo->vector_mode));
3035 if (!main_loop_vinfo && suggested_unroll_factor > 1)
3037 if (dump_enabled_p ())
3038 dump_printf_loc (MSG_NOTE, vect_location,
3039 "***** Re-trying analysis for unrolling"
3040 " with unroll factor %d and slp %s.\n",
3041 suggested_unroll_factor,
3042 slp_done_for_suggested_uf ? "on" : "off");
3043 loop_vec_info unroll_vinfo
3044 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3045 unroll_vinfo->vector_mode = vector_mode;
3046 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3047 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3048 slp_done_for_suggested_uf);
3049 if (new_res)
3051 delete loop_vinfo;
3052 loop_vinfo = unroll_vinfo;
3054 else
3055 delete unroll_vinfo;
3058 /* Remember the autodetected vector mode. */
3059 if (vector_mode == VOIDmode)
3060 autodetected_vector_mode = loop_vinfo->vector_mode;
3062 /* Advance mode_i, first skipping modes that would result in the
3063 same analysis result. */
3064 while (mode_i + 1 < vector_modes.length ()
3065 && vect_chooses_same_modes_p (loop_vinfo,
3066 vector_modes[mode_i + 1]))
3068 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_NOTE, vect_location,
3070 "***** The result for vector mode %s would"
3071 " be the same\n",
3072 GET_MODE_NAME (vector_modes[mode_i + 1]));
3073 mode_i += 1;
3075 if (mode_i + 1 < vector_modes.length ()
3076 && VECTOR_MODE_P (autodetected_vector_mode)
3077 && (related_vector_mode (vector_modes[mode_i + 1],
3078 GET_MODE_INNER (autodetected_vector_mode))
3079 == autodetected_vector_mode)
3080 && (related_vector_mode (autodetected_vector_mode,
3081 GET_MODE_INNER (vector_modes[mode_i + 1]))
3082 == vector_modes[mode_i + 1]))
3084 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "***** Skipping vector mode %s, which would"
3087 " repeat the analysis for %s\n",
3088 GET_MODE_NAME (vector_modes[mode_i + 1]),
3089 GET_MODE_NAME (autodetected_vector_mode));
3090 mode_i += 1;
3092 mode_i++;
3094 if (!res)
3096 delete loop_vinfo;
3097 if (fatal)
3098 gcc_checking_assert (main_loop_vinfo == NULL);
3099 return opt_loop_vec_info::propagate_failure (res);
3102 return opt_loop_vec_info::success (loop_vinfo);
3105 /* Function vect_analyze_loop.
3107 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3108 for it. The different analyses will record information in the
3109 loop_vec_info struct. */
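/* Editorial overview of the control flow that follows (a paraphrase,
   not part of the original source):

     check the loop nest and analyze the loop form;
     for each candidate vector mode, starting with the autodetected one:
       analyze the loop as a main loop via vect_analyze_loop_1,
       keeping the first success, or the cheapest one so far when
       VECT_COMPARE_COSTS is enabled, and preferring a mode that
       matches loop->simdlen when one was requested;
     if epilogue vectorization is enabled:
       re-run the mode loop with the chosen main loop as context,
       recording at most one epilogue loop_vec_info;
     return the chosen main loop_vec_info.  */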
3110 opt_loop_vec_info
3111 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3113 DUMP_VECT_SCOPE ("analyze_loop_nest");
3115 if (loop_outer (loop)
3116 && loop_vec_info_for_loop (loop_outer (loop))
3117 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3118 return opt_loop_vec_info::failure_at (vect_location,
3119 "outer-loop already vectorized.\n");
3121 if (!find_loop_nest (loop, &shared->loop_nest))
3122 return opt_loop_vec_info::failure_at
3123 (vect_location,
3124 "not vectorized: loop nest containing two or more consecutive inner"
3125 " loops cannot be vectorized\n");
3127 /* Analyze the loop form. */
3128 vect_loop_form_info loop_form_info;
3129 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3130 if (!res)
3132 if (dump_enabled_p ())
3133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3134 "bad loop form.\n");
3135 return opt_loop_vec_info::propagate_failure (res);
3137 if (!integer_onep (loop_form_info.assumptions))
3139 /* We consider vectorizing this loop by versioning it under
3140 some assumptions. In order to do this, we need to clear the
3141 existing information computed by the scev and niter analyzers. */
3142 scev_reset_htab ();
3143 free_numbers_of_iterations_estimates (loop);
3144 /* Also set flag for this loop so that following scev and niter
3145 analysis are done under the assumptions. */
3146 loop_constraint_set (loop, LOOP_C_FINITE);
3149 auto_vector_modes vector_modes;
3150 /* Autodetect first vector size we try. */
3151 vector_modes.safe_push (VOIDmode);
3152 unsigned int autovec_flags
3153 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3154 loop->simdlen != 0);
3155 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3156 && !unlimited_cost_model (loop));
3157 machine_mode autodetected_vector_mode = VOIDmode;
3158 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3159 unsigned int mode_i = 0;
3160 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3162 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3163 a mode has not been analyzed. */
3164 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3165 for (unsigned i = 0; i < vector_modes.length (); ++i)
3166 cached_vf_per_mode.safe_push (0);
3168 /* First determine the main loop vectorization mode, either the first
3169 one that works, starting with auto-detecting the vector mode and then
3170 following the target's order of preference, or the one with the
3171 lowest cost if pick_lowest_cost_p. */
3172 while (1)
3174 bool fatal;
3175 unsigned int last_mode_i = mode_i;
3176 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3177 failed. */
3178 cached_vf_per_mode[last_mode_i] = -1;
3179 opt_loop_vec_info loop_vinfo
3180 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3181 NULL, vector_modes, mode_i,
3182 autodetected_vector_mode, fatal);
3183 if (fatal)
3184 break;
3186 if (loop_vinfo)
3188 /* Analysis has been successful, so update the VF value. The
3189 VF should always be a multiple of unroll_factor and we want to
3190 capture the original VF here. */
3191 cached_vf_per_mode[last_mode_i]
3192 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3193 loop_vinfo->suggested_unroll_factor);
3194 /* Once we hit the desired simdlen for the first time,
3195 discard any previous attempts. */
3196 if (simdlen
3197 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3199 delete first_loop_vinfo;
3200 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3201 simdlen = 0;
3203 else if (pick_lowest_cost_p
3204 && first_loop_vinfo
3205 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3207 /* Pick loop_vinfo over first_loop_vinfo. */
3208 delete first_loop_vinfo;
3209 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3211 if (first_loop_vinfo == NULL)
3212 first_loop_vinfo = loop_vinfo;
3213 else
3215 delete loop_vinfo;
3216 loop_vinfo = opt_loop_vec_info::success (NULL);
3219 /* Commit to first_loop_vinfo if we have no reason to try
3220 alternatives. */
3221 if (!simdlen && !pick_lowest_cost_p)
3222 break;
3224 if (mode_i == vector_modes.length ()
3225 || autodetected_vector_mode == VOIDmode)
3226 break;
3228 /* Try the next biggest vector size. */
3229 if (dump_enabled_p ())
3230 dump_printf_loc (MSG_NOTE, vect_location,
3231 "***** Re-trying analysis with vector mode %s\n",
3232 GET_MODE_NAME (vector_modes[mode_i]));
3234 if (!first_loop_vinfo)
3235 return opt_loop_vec_info::propagate_failure (res);
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "***** Choosing vector mode %s\n",
3240 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3242 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3243 enabled, SIMDUID is not set, it is the innermost loop and we have
3244 either already found the loop's SIMDLEN or there was no SIMDLEN to
3245 begin with.
3246 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3247 bool vect_epilogues = (!simdlen
3248 && loop->inner == NULL
3249 && param_vect_epilogues_nomask
3250 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3251 && !loop->simduid);
3252 if (!vect_epilogues)
3253 return first_loop_vinfo;
3255 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3256 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3258 /* For epilogues start the analysis from the first mode. The motivation
3259 behind starting from the beginning comes from cases where the VECTOR_MODES
3260 array may contain length-agnostic and length-specific modes. Their
3261 ordering is not guaranteed, so we could end up picking a mode for the main
3262 loop that is after the epilogue's optimal mode. */
3263 vector_modes[0] = autodetected_vector_mode;
3264 mode_i = 0;
3266 bool supports_partial_vectors =
3267 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3268 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3270 while (1)
3272 /* If the target does not support partial vectors we can shorten the
3273 number of modes to analyze for the epilogue as we know we can't pick a
3274 mode that would lead to a VF at least as big as the
3275 FIRST_VINFO_VF. */
3276 if (!supports_partial_vectors
3277 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3279 mode_i++;
3280 if (mode_i == vector_modes.length ())
3281 break;
3282 continue;
3285 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE, vect_location,
3287 "***** Re-trying epilogue analysis with vector "
3288 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3290 bool fatal;
3291 opt_loop_vec_info loop_vinfo
3292 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3293 first_loop_vinfo,
3294 vector_modes, mode_i,
3295 autodetected_vector_mode, fatal);
3296 if (fatal)
3297 break;
3299 if (loop_vinfo)
3301 if (pick_lowest_cost_p)
3303 /* Keep trying to roll back vectorization attempts while the
3304 loop_vec_infos they produced were worse than this one. */
3305 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3306 while (!vinfos.is_empty ()
3307 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3309 gcc_assert (vect_epilogues);
3310 delete vinfos.pop ();
3313 /* For now only allow one epilogue loop. */
3314 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3316 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3317 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3318 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3319 || maybe_ne (lowest_th, 0U));
3320 /* Keep track of the known smallest versioning
3321 threshold. */
3322 if (ordered_p (lowest_th, th))
3323 lowest_th = ordered_min (lowest_th, th);
3325 else
3327 delete loop_vinfo;
3328 loop_vinfo = opt_loop_vec_info::success (NULL);
3331 /* For now only allow one epilogue loop, but allow
3332 pick_lowest_cost_p to replace it, so commit to the
3333 first epilogue if we have no reason to try alternatives. */
3334 if (!pick_lowest_cost_p)
3335 break;
3338 if (mode_i == vector_modes.length ())
3339 break;
3343 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3345 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3346 if (dump_enabled_p ())
3347 dump_printf_loc (MSG_NOTE, vect_location,
3348 "***** Choosing epilogue vector mode %s\n",
3349 GET_MODE_NAME
3350 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3353 return first_loop_vinfo;
3356 /* Return true if there is an in-order reduction function for CODE, storing
3357 it in *REDUC_FN if so. */
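/* Editorial example (not from the original source): an in-order reduction
   arises for a floating-point accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += x[i];

   compiled without -fassociative-math, where the additions must keep
   their original left-to-right order; IFN_FOLD_LEFT_PLUS performs that
   ordered reduction over a vector of elements.  */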
3359 static bool
3360 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3362 if (code == PLUS_EXPR)
3364 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3365 return true;
3367 return false;
3370 /* Function reduction_fn_for_scalar_code
3372 Input:
3373 CODE - tree_code of a reduction operation.
3375 Output:
3376 REDUC_FN - the corresponding internal function to be used to reduce the
3377 vector of partial results into a single scalar result, or IFN_LAST
3378 if the operation is a supported reduction operation, but does not have
3379 such an internal function.
3381 Return FALSE if CODE currently cannot be vectorized as a reduction. */
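/* Editorial note (a sketch, not part of the original source): for a MAX
   reduction such as

     for (int i = 0; i < n; i++)
       m = m > a[i] ? m : a[i];

   IFN_REDUC_MAX reduces the vector of partial maxima to the final scalar.
   MULT_EXPR and MINUS_EXPR return IFN_LAST: they are still vectorizable
   as reductions, but the final value has to be computed by other means
   (e.g. element extracts and shuffles) rather than a single reduction
   instruction.  */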
3383 bool
3384 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3386 if (code.is_tree_code ())
3387 switch (tree_code (code))
3389 case MAX_EXPR:
3390 *reduc_fn = IFN_REDUC_MAX;
3391 return true;
3393 case MIN_EXPR:
3394 *reduc_fn = IFN_REDUC_MIN;
3395 return true;
3397 case PLUS_EXPR:
3398 *reduc_fn = IFN_REDUC_PLUS;
3399 return true;
3401 case BIT_AND_EXPR:
3402 *reduc_fn = IFN_REDUC_AND;
3403 return true;
3405 case BIT_IOR_EXPR:
3406 *reduc_fn = IFN_REDUC_IOR;
3407 return true;
3409 case BIT_XOR_EXPR:
3410 *reduc_fn = IFN_REDUC_XOR;
3411 return true;
3413 case MULT_EXPR:
3414 case MINUS_EXPR:
3415 *reduc_fn = IFN_LAST;
3416 return true;
3418 default:
3419 return false;
3421 else
3422 switch (combined_fn (code))
3424 CASE_CFN_FMAX:
3425 *reduc_fn = IFN_REDUC_FMAX;
3426 return true;
3428 CASE_CFN_FMIN:
3429 *reduc_fn = IFN_REDUC_FMIN;
3430 return true;
3432 default:
3433 return false;
3437 /* If there is a neutral value X such that a reduction would not be affected
3438 by the introduction of additional X elements, return that X, otherwise
3439 return null. CODE is the code of the reduction and SCALAR_TYPE is the type
3440 of the scalar elements. If the reduction has just a single initial value
3441 then INITIAL_VALUE is that value, otherwise it is null. */
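/* Hypothetical examples of the values returned below: for PLUS the neutral
   element is 0, for MULT it is 1, for BIT_AND it is all-ones, while for
   MIN/MAX no single constant works for every input, so the (single) initial
   value itself is used.  E.g. padding a sum to {s1, s2, X, X} only leaves
   the result unchanged when X == 0.  */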
3443 tree
3444 neutral_op_for_reduction (tree scalar_type, code_helper code,
3445 tree initial_value)
3447 if (code.is_tree_code ())
3448 switch (tree_code (code))
3450 case WIDEN_SUM_EXPR:
3451 case DOT_PROD_EXPR:
3452 case SAD_EXPR:
3453 case PLUS_EXPR:
3454 case MINUS_EXPR:
3455 case BIT_IOR_EXPR:
3456 case BIT_XOR_EXPR:
3457 return build_zero_cst (scalar_type);
3459 case MULT_EXPR:
3460 return build_one_cst (scalar_type);
3462 case BIT_AND_EXPR:
3463 return build_all_ones_cst (scalar_type);
3465 case MAX_EXPR:
3466 case MIN_EXPR:
3467 return initial_value;
3469 default:
3470 return NULL_TREE;
3472 else
3473 switch (combined_fn (code))
3475 CASE_CFN_FMIN:
3476 CASE_CFN_FMAX:
3477 return initial_value;
3479 default:
3480 return NULL_TREE;
3484 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3485 STMT is printed with a message MSG. */
3487 static void
3488 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3490 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3493 /* Return true if we need an in-order reduction for operation CODE
3494 on type TYPE, i.e. if it is not safe to reassociate the
3495 operation. */
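/* A small worked example of why this matters (hypothetical values): with
   32-bit floats,

     (1e20f + -1e20f) + 1.0f == 1.0f
     1e20f + (-1e20f + 1.0f) == 0.0f

   so reassociating a float summation can change the result; unless
   -fassociative-math is in effect such reductions must be done in order.  */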
3497 bool
3498 needs_fold_left_reduction_p (tree type, code_helper code)
3500 /* CHECKME: check for !flag_finite_math_only too? */
3501 if (SCALAR_FLOAT_TYPE_P (type))
3503 if (code.is_tree_code ())
3504 switch (tree_code (code))
3506 case MIN_EXPR:
3507 case MAX_EXPR:
3508 return false;
3510 default:
3511 return !flag_associative_math;
3513 else
3514 switch (combined_fn (code))
3516 CASE_CFN_FMIN:
3517 CASE_CFN_FMAX:
3518 return false;
3520 default:
3521 return !flag_associative_math;
3525 if (INTEGRAL_TYPE_P (type))
3526 return (!code.is_tree_code ()
3527 || !operation_no_trapping_overflow (type, tree_code (code)));
3529 if (SAT_FIXED_POINT_TYPE_P (type))
3530 return true;
3532 return false;
3535 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3536 has a handled computation expression. Store the main reduction
3537 operation in *CODE. */
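/* An illustrative example (SSA names made up for the illustration): for

     sum_1 = PHI <sum_0(preheader), sum_3(latch)>
     sum_2 = sum_1 + a;
     sum_3 = sum_2 + b;

   the path walked from the latch argument sum_3 back to the PHI result is
   { sum_3, sum_2, sum_1 } and *CODE is set to PLUS_EXPR.  A MINUS_EXPR step
   is canonicalized to PLUS_EXPR below, with the local NEG flag tracking
   whether the running value is negated each iteration.  */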
3539 static bool
3540 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3541 tree loop_arg, code_helper *code,
3542 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3544 auto_bitmap visited;
3545 tree lookfor = PHI_RESULT (phi);
3546 ssa_op_iter curri;
3547 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3548 while (USE_FROM_PTR (curr) != loop_arg)
3549 curr = op_iter_next_use (&curri);
3550 curri.i = curri.numops;
3553 path.safe_push (std::make_pair (curri, curr));
3554 tree use = USE_FROM_PTR (curr);
3555 if (use == lookfor)
3556 break;
3557 gimple *def = SSA_NAME_DEF_STMT (use);
3558 if (gimple_nop_p (def)
3559 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3561 pop:
3564 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3565 curri = x.first;
3566 curr = x.second;
3568 curr = op_iter_next_use (&curri);
3569 /* Skip already visited or non-SSA operands (from iterating
3570 over PHI args). */
3571 while (curr != NULL_USE_OPERAND_P
3572 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3573 || ! bitmap_set_bit (visited,
3574 SSA_NAME_VERSION
3575 (USE_FROM_PTR (curr)))));
3577 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3578 if (curr == NULL_USE_OPERAND_P)
3579 break;
3581 else
3583 if (gimple_code (def) == GIMPLE_PHI)
3584 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3585 else
3586 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3587 while (curr != NULL_USE_OPERAND_P
3588 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3589 || ! bitmap_set_bit (visited,
3590 SSA_NAME_VERSION
3591 (USE_FROM_PTR (curr)))))
3592 curr = op_iter_next_use (&curri);
3593 if (curr == NULL_USE_OPERAND_P)
3594 goto pop;
3597 while (1);
3598 if (dump_file && (dump_flags & TDF_DETAILS))
3600 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3601 unsigned i;
3602 std::pair<ssa_op_iter, use_operand_p> *x;
3603 FOR_EACH_VEC_ELT (path, i, x)
3604 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3605 dump_printf (MSG_NOTE, "\n");
3608 /* Check whether the reduction path detected is valid. */
3609 bool fail = path.length () == 0;
3610 bool neg = false;
3611 int sign = -1;
3612 *code = ERROR_MARK;
3613 for (unsigned i = 1; i < path.length (); ++i)
3615 gimple *use_stmt = USE_STMT (path[i].second);
3616 gimple_match_op op;
3617 if (!gimple_extract_op (use_stmt, &op))
3619 fail = true;
3620 break;
3622 unsigned int opi = op.num_ops;
3623 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3625 /* The following makes sure we can compute the operand index
3626 easily, and it mostly disallows chaining via COND_EXPR condition
3627 operands. */
3628 for (opi = 0; opi < op.num_ops; ++opi)
3629 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3630 break;
3632 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3634 for (opi = 0; opi < op.num_ops; ++opi)
3635 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3636 break;
3638 if (opi == op.num_ops)
3640 fail = true;
3641 break;
3643 op.code = canonicalize_code (op.code, op.type);
3644 if (op.code == MINUS_EXPR)
3646 op.code = PLUS_EXPR;
3647 /* Track whether we negate the reduction value each iteration. */
3648 if (op.ops[1] == op.ops[opi])
3649 neg = ! neg;
3651 if (CONVERT_EXPR_CODE_P (op.code)
3652 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3654 else if (*code == ERROR_MARK)
3656 *code = op.code;
3657 sign = TYPE_SIGN (op.type);
3659 else if (op.code != *code)
3661 fail = true;
3662 break;
3664 else if ((op.code == MIN_EXPR
3665 || op.code == MAX_EXPR)
3666 && sign != TYPE_SIGN (op.type))
3668 fail = true;
3669 break;
3671 /* Check that the op is used in a single stmt only. For the
3672 value-preserving tail stmts and the last stmt allow out-of-loop uses.
3673 ??? We could relax this and handle arbitrary live stmts by
3674 forcing a scalar epilogue for example. */
3675 imm_use_iterator imm_iter;
3676 gimple *op_use_stmt;
3677 unsigned cnt = 0;
3678 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3679 if (!is_gimple_debug (op_use_stmt)
3680 && (*code != ERROR_MARK
3681 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3683 /* We want to allow x + x but not x < 1 ? x : 2. */
3684 if (is_gimple_assign (op_use_stmt)
3685 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3687 use_operand_p use_p;
3688 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3689 cnt++;
3691 else
3692 cnt++;
3694 if (cnt != 1)
3696 fail = true;
3697 break;
3700 return ! fail && ! neg && *code != ERROR_MARK;
3703 bool
3704 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3705 tree loop_arg, enum tree_code code)
3707 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3708 code_helper code_;
3709 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3710 && code_ == code);
3715 /* Function vect_is_simple_reduction
3717 (1) Detect a cross-iteration def-use cycle that represents a simple
3718 reduction computation. We look for the following pattern:
3720 loop_header:
3721 a1 = phi < a0, a2 >
3722 a3 = ...
3723 a2 = operation (a3, a1)
3727 a3 = ...
3728 loop_header:
3729 a1 = phi < a0, a2 >
3730 a2 = operation (a3, a1)
3732 such that:
3733 1. operation is commutative and associative and it is safe to
3734 change the order of the computation
3735 2. no uses for a2 in the loop (a2 is used out of the loop)
3736 3. no uses of a1 in the loop besides the reduction operation
3737 4. no uses of a1 outside the loop.
3739 Conditions 1,4 are tested here.
3740 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3742 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3743 nested cycles.
3745 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3746 reductions:
3748 a1 = phi < a0, a2 >
3749 inner loop (def of a3)
3750 a2 = phi < a3 >
3752 (4) Detect condition expressions, i.e.:
3753 for (int i = 0; i < N; i++)
3754 if (a[i] < val)
3755 ret_val = a[i];
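/* Illustrative source loops (hypothetical) matching the patterns above:
   a simple reduction as in (1),

     for (i = 0; i < N; i++)
       sum += a[i];

   and a reduction chain, where several statements of the same operation
   feed each other within one iteration,

     for (i = 0; i < N; i++)
       sum = sum + a[2*i] + a[2*i+1];

   the latter is recorded via *REDUC_CHAIN_P and
   LOOP_VINFO_REDUCTION_CHAINS further below.  */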
3759 static stmt_vec_info
3760 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3761 bool *double_reduc, bool *reduc_chain_p, bool slp)
3763 gphi *phi = as_a <gphi *> (phi_info->stmt);
3764 gimple *phi_use_stmt = NULL;
3765 imm_use_iterator imm_iter;
3766 use_operand_p use_p;
3768 *double_reduc = false;
3769 *reduc_chain_p = false;
3770 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3772 tree phi_name = PHI_RESULT (phi);
3773 /* ??? If there are no uses of the PHI result the inner loop reduction
3774 won't be detected as possibly double-reduction by vectorizable_reduction
3775 because that tries to walk the PHI arg from the preheader edge which
3776 can be constant. See PR60382. */
3777 if (has_zero_uses (phi_name))
3778 return NULL;
3779 class loop *loop = (gimple_bb (phi))->loop_father;
3780 unsigned nphi_def_loop_uses = 0;
3781 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3783 gimple *use_stmt = USE_STMT (use_p);
3784 if (is_gimple_debug (use_stmt))
3785 continue;
3787 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3789 if (dump_enabled_p ())
3790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3791 "intermediate value used outside loop.\n");
3793 return NULL;
3796 nphi_def_loop_uses++;
3797 phi_use_stmt = use_stmt;
3800 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3801 if (TREE_CODE (latch_def) != SSA_NAME)
3803 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3805 "reduction: not ssa_name: %T\n", latch_def);
3806 return NULL;
3809 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3810 if (!def_stmt_info
3811 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3812 return NULL;
3814 bool nested_in_vect_loop
3815 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3816 unsigned nlatch_def_loop_uses = 0;
3817 auto_vec<gphi *, 3> lcphis;
3818 bool inner_loop_of_double_reduc = false;
3819 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3821 gimple *use_stmt = USE_STMT (use_p);
3822 if (is_gimple_debug (use_stmt))
3823 continue;
3824 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3825 nlatch_def_loop_uses++;
3826 else
3828 /* We can have more than one loop-closed PHI. */
3829 lcphis.safe_push (as_a <gphi *> (use_stmt));
3830 if (nested_in_vect_loop
3831 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3832 == vect_double_reduction_def))
3833 inner_loop_of_double_reduc = true;
3837 /* If we are vectorizing an inner reduction we are executing that
3838 in the original order only if we are not dealing with a
3839 double reduction. */
3840 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3842 if (dump_enabled_p ())
3843 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3844 "detected nested cycle: ");
3845 return def_stmt_info;
3848 /* When the inner loop of a double reduction ends up with more than
3849 one loop-closed PHI we have failed to classify alternate such
3850 PHIs as double reductions, leading to wrong code. See PR103237. */
3851 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3853 if (dump_enabled_p ())
3854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3855 "unhandle double reduction\n");
3856 return NULL;
3859 /* If this isn't a nested cycle or if the nested cycle reduction value
3860 is used outside of the inner loop we cannot handle uses of the reduction
3861 value. */
3862 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3866 "reduction used in loop.\n");
3867 return NULL;
3870 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3871 defined in the inner loop. */
3872 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3874 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3875 if (gimple_phi_num_args (def_stmt) != 1
3876 || TREE_CODE (op1) != SSA_NAME)
3878 if (dump_enabled_p ())
3879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3880 "unsupported phi node definition.\n");
3882 return NULL;
3885 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3886 if (gimple_bb (def1)
3887 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3888 && loop->inner
3889 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3890 && (is_gimple_assign (def1) || is_gimple_call (def1))
3891 && is_a <gphi *> (phi_use_stmt)
3892 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3894 if (dump_enabled_p ())
3895 report_vect_op (MSG_NOTE, def_stmt,
3896 "detected double reduction: ");
3898 *double_reduc = true;
3899 return def_stmt_info;
3902 return NULL;
3905 /* Look for the expression computing latch_def from the loop PHI result. */
3906 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3907 code_helper code;
3908 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3909 path))
3911 STMT_VINFO_REDUC_CODE (phi_info) = code;
3912 if (code == COND_EXPR && !nested_in_vect_loop)
3913 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3915 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3916 reduction chain for which the additional restriction is that
3917 all operations in the chain are the same. */
3918 auto_vec<stmt_vec_info, 8> reduc_chain;
3919 unsigned i;
3920 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3921 for (i = path.length () - 1; i >= 1; --i)
3923 gimple *stmt = USE_STMT (path[i].second);
3924 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3925 gimple_match_op op;
3926 if (!gimple_extract_op (stmt, &op))
3927 gcc_unreachable ();
3928 if (gassign *assign = dyn_cast<gassign *> (stmt))
3929 STMT_VINFO_REDUC_IDX (stmt_info)
3930 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3931 else
3933 gcall *call = as_a<gcall *> (stmt);
3934 STMT_VINFO_REDUC_IDX (stmt_info)
3935 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3937 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3938 && (i == 1 || i == path.length () - 1));
3939 if ((op.code != code && !leading_conversion)
3940 /* We can only handle the final value in epilogue
3941 generation for reduction chains. */
3942 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3943 is_slp_reduc = false;
3944 /* For reduction chains we support trailing/leading
3945 conversions. We do not store those in the actual chain. */
3946 if (leading_conversion)
3947 continue;
3948 reduc_chain.safe_push (stmt_info);
3950 if (slp && is_slp_reduc && reduc_chain.length () > 1)
3952 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3954 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3955 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3957 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3958 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3960 /* Save the chain for further analysis in SLP detection. */
3961 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3962 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3964 *reduc_chain_p = true;
3965 if (dump_enabled_p ())
3966 dump_printf_loc (MSG_NOTE, vect_location,
3967 "reduction: detected reduction chain\n");
3969 else if (dump_enabled_p ())
3970 dump_printf_loc (MSG_NOTE, vect_location,
3971 "reduction: detected reduction\n");
3973 return def_stmt_info;
3976 if (dump_enabled_p ())
3977 dump_printf_loc (MSG_NOTE, vect_location,
3978 "reduction: unknown pattern\n");
3980 return NULL;
3983 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3984 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3985 or -1 if not known. */
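/* A worked example with hypothetical numbers: for 100 known scalar
   iterations, an assumed VF of 8 and 3 peeled prologue iterations we get
   (100 - 3) % 8 == 1 epilogue iteration; with an unknown iteration count
   or unknown prologue peeling the estimate falls back to VF / 2 == 4.  */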
3987 static int
3988 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3990 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3991 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3993 if (dump_enabled_p ())
3994 dump_printf_loc (MSG_NOTE, vect_location,
3995 "cost model: epilogue peel iters set to vf/2 "
3996 "because loop iterations are unknown .\n");
3997 return assumed_vf / 2;
3999 else
4001 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4002 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4003 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4004 /* If we need to peel for gaps but the epilogue would otherwise need
4005 no iterations, we have to peel VF iterations. */
4006 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4007 peel_iters_epilogue = assumed_vf;
4008 return peel_iters_epilogue;
4012 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4013 int
4014 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4015 int *peel_iters_epilogue,
4016 stmt_vector_for_cost *scalar_cost_vec,
4017 stmt_vector_for_cost *prologue_cost_vec,
4018 stmt_vector_for_cost *epilogue_cost_vec)
4020 int retval = 0;
4022 *peel_iters_epilogue
4023 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4025 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4027 /* If peeled iterations are known but the number of scalar loop
4028 iterations is unknown, count a taken branch per peeled loop. */
4029 if (peel_iters_prologue > 0)
4030 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4031 vect_prologue);
4032 if (*peel_iters_epilogue > 0)
4033 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4034 vect_epilogue);
4037 stmt_info_for_cost *si;
4038 int j;
4039 if (peel_iters_prologue)
4040 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4041 retval += record_stmt_cost (prologue_cost_vec,
4042 si->count * peel_iters_prologue,
4043 si->kind, si->stmt_info, si->misalign,
4044 vect_prologue);
4045 if (*peel_iters_epilogue)
4046 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4047 retval += record_stmt_cost (epilogue_cost_vec,
4048 si->count * *peel_iters_epilogue,
4049 si->kind, si->stmt_info, si->misalign,
4050 vect_epilogue);
4052 return retval;
4055 /* Function vect_estimate_min_profitable_iters
4057 Return the number of iterations required for the vector version of the
4058 loop to be profitable relative to the cost of the scalar version of the
4059 loop.
4061 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4062 of iterations for vectorization. -1 value means loop vectorization
4063 is not profitable. This returned value may be used for dynamic
4064 profitability check.
4066 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4067 for static check against estimated number of iterations. */
4069 static void
4070 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4071 int *ret_min_profitable_niters,
4072 int *ret_min_profitable_estimate,
4073 unsigned *suggested_unroll_factor)
4075 int min_profitable_iters;
4076 int min_profitable_estimate;
4077 int peel_iters_prologue;
4078 int peel_iters_epilogue;
4079 unsigned vec_inside_cost = 0;
4080 int vec_outside_cost = 0;
4081 unsigned vec_prologue_cost = 0;
4082 unsigned vec_epilogue_cost = 0;
4083 int scalar_single_iter_cost = 0;
4084 int scalar_outside_cost = 0;
4085 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4086 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4087 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4089 /* Cost model disabled. */
4090 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4092 if (dump_enabled_p ())
4093 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4094 *ret_min_profitable_niters = 0;
4095 *ret_min_profitable_estimate = 0;
4096 return;
4099 /* Requires loop versioning tests to handle misalignment. */
4100 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4102 /* FIXME: Make cost depend on complexity of individual check. */
4103 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4104 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4105 if (dump_enabled_p ())
4106 dump_printf (MSG_NOTE,
4107 "cost model: Adding cost of checks for loop "
4108 "versioning to treat misalignment.\n");
4111 /* Requires loop versioning with alias checks. */
4112 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4114 /* FIXME: Make cost depend on complexity of individual check. */
4115 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4116 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4117 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4118 if (len)
4119 /* Count LEN - 1 ANDs and LEN comparisons. */
4120 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4121 scalar_stmt, vect_prologue);
4122 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4123 if (len)
4125 /* Count LEN - 1 ANDs and LEN comparisons. */
4126 unsigned int nstmts = len * 2 - 1;
4127 /* +1 for each bias that needs adding. */
4128 for (unsigned int i = 0; i < len; ++i)
4129 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4130 nstmts += 1;
4131 (void) add_stmt_cost (target_cost_data, nstmts,
4132 scalar_stmt, vect_prologue);
4134 if (dump_enabled_p ())
4135 dump_printf (MSG_NOTE,
4136 "cost model: Adding cost of checks for loop "
4137 "versioning aliasing.\n");
4140 /* Requires loop versioning with niter checks. */
4141 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4143 /* FIXME: Make cost depend on complexity of individual check. */
4144 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4145 NULL, NULL, NULL_TREE, 0, vect_prologue);
4146 if (dump_enabled_p ())
4147 dump_printf (MSG_NOTE,
4148 "cost model: Adding cost of checks for loop "
4149 "versioning niters.\n");
4152 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4153 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4154 vect_prologue);
4156 /* Count statements in scalar loop. Using this as scalar cost for a single
4157 iteration for now.
4159 TODO: Add outer loop support.
4161 TODO: Consider assigning different costs to different scalar
4162 statements. */
4164 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4166 /* Add additional cost for the peeled instructions in prologue and epilogue
4167 loop. (For fully-masked loops there will be no peeling.)
4169 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4170 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4172 TODO: Build an expression that represents peel_iters for prologue and
4173 epilogue to be used in a run-time test. */
4175 bool prologue_need_br_taken_cost = false;
4176 bool prologue_need_br_not_taken_cost = false;
4178 /* Calculate peel_iters_prologue. */
4179 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4180 peel_iters_prologue = 0;
4181 else if (npeel < 0)
4183 peel_iters_prologue = assumed_vf / 2;
4184 if (dump_enabled_p ())
4185 dump_printf (MSG_NOTE, "cost model: "
4186 "prologue peel iters set to vf/2.\n");
4188 /* If peeled iterations are unknown, count a taken branch and a not taken
4189 branch per peeled loop. Even if scalar loop iterations are known,
4190 vector iterations are not known since peeled prologue iterations are
4191 not known. Hence guards remain the same. */
4192 prologue_need_br_taken_cost = true;
4193 prologue_need_br_not_taken_cost = true;
4195 else
4197 peel_iters_prologue = npeel;
4198 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4199 /* If peeled iterations are known but the number of scalar loop
4200 iterations is unknown, count a taken branch per peeled loop. */
4201 prologue_need_br_taken_cost = true;
4204 bool epilogue_need_br_taken_cost = false;
4205 bool epilogue_need_br_not_taken_cost = false;
4207 /* Calculate peel_iters_epilogue. */
4208 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4209 /* We need to peel exactly one iteration for gaps. */
4210 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4211 else if (npeel < 0)
4213 /* If the peeling for alignment is unknown, the loop bound of the
4214 main loop becomes unknown. */
4215 peel_iters_epilogue = assumed_vf / 2;
4216 if (dump_enabled_p ())
4217 dump_printf (MSG_NOTE, "cost model: "
4218 "epilogue peel iters set to vf/2 because "
4219 "peeling for alignment is unknown.\n");
4221 /* See the same reason above in peel_iters_prologue calculation. */
4222 epilogue_need_br_taken_cost = true;
4223 epilogue_need_br_not_taken_cost = true;
4225 else
4227 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4228 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4229 /* If peeled iterations are known but the number of scalar loop
4230 iterations is unknown, count a taken branch per peeled loop. */
4231 epilogue_need_br_taken_cost = true;
4234 stmt_info_for_cost *si;
4235 int j;
4236 /* Add costs associated with peel_iters_prologue. */
4237 if (peel_iters_prologue)
4238 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4240 (void) add_stmt_cost (target_cost_data,
4241 si->count * peel_iters_prologue, si->kind,
4242 si->stmt_info, si->node, si->vectype,
4243 si->misalign, vect_prologue);
4246 /* Add costs associated with peel_iters_epilogue. */
4247 if (peel_iters_epilogue)
4248 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4250 (void) add_stmt_cost (target_cost_data,
4251 si->count * peel_iters_epilogue, si->kind,
4252 si->stmt_info, si->node, si->vectype,
4253 si->misalign, vect_epilogue);
4256 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4258 if (prologue_need_br_taken_cost)
4259 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4260 vect_prologue);
4262 if (prologue_need_br_not_taken_cost)
4263 (void) add_stmt_cost (target_cost_data, 1,
4264 cond_branch_not_taken, vect_prologue);
4266 if (epilogue_need_br_taken_cost)
4267 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4268 vect_epilogue);
4270 if (epilogue_need_br_not_taken_cost)
4271 (void) add_stmt_cost (target_cost_data, 1,
4272 cond_branch_not_taken, vect_epilogue);
4274 /* Take care of special costs for rgroup controls of partial vectors. */
4275 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4277 /* Calculate how many masks we need to generate. */
4278 unsigned int num_masks = 0;
4279 rgroup_controls *rgm;
4280 unsigned int num_vectors_m1;
4281 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4282 if (rgm->type)
4283 num_masks += num_vectors_m1 + 1;
4284 gcc_assert (num_masks > 0);
4286 /* In the worst case, we need to generate each mask in the prologue
4287 and in the loop body. One of the loop body mask instructions
4288 replaces the comparison in the scalar loop, and since we don't
4289 count the scalar comparison against the scalar body, we shouldn't
4290 count that vector instruction against the vector body either.
4292 Sometimes we can use unpacks instead of generating prologue
4293 masks and sometimes the prologue mask will fold to a constant,
4294 so the actual prologue cost might be smaller. However, it's
4295 simpler and safer to use the worst-case cost; if this ends up
4296 being the tie-breaker between vectorizing or not, then it's
4297 probably better not to vectorize. */
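/* For example (hypothetical rgroup layout): one rgroup needing a single
   mask vector and another needing two gives num_masks == 3, so three mask
   computations are costed in the prologue and 3 - 1 == 2 in the body,
   since one body mask replaces the scalar loop's comparison.  */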
4298 (void) add_stmt_cost (target_cost_data, num_masks,
4299 vector_stmt, NULL, NULL, NULL_TREE, 0,
4300 vect_prologue);
4301 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4302 vector_stmt, NULL, NULL, NULL_TREE, 0,
4303 vect_body);
4305 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4307 /* Referring to the functions vect_set_loop_condition_partial_vectors
4308 and vect_set_loop_controls_directly, we need to generate each
4309 length in the prologue and in the loop body if required. Although
4310 there are some possible optimizations, we consider the worst case
4311 here. */
4313 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4314 signed char partial_load_store_bias
4315 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4316 bool need_iterate_p
4317 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4318 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4320 /* Calculate how many statements to be added. */
4321 unsigned int prologue_stmts = 0;
4322 unsigned int body_stmts = 0;
4324 rgroup_controls *rgc;
4325 unsigned int num_vectors_m1;
4326 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4327 if (rgc->type)
4329 /* May need one SHIFT for nitems_total computation. */
4330 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4331 if (nitems != 1 && !niters_known_p)
4332 prologue_stmts += 1;
4334 /* May need one MAX and one MINUS for wrap around. */
4335 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4336 prologue_stmts += 2;
4338 /* Need one MAX and one MINUS for each batch limit except for
4339 the first one. */
4340 prologue_stmts += num_vectors_m1 * 2;
4342 unsigned int num_vectors = num_vectors_m1 + 1;
4344 /* Need to set up lengths in prologue, only one MIN required
4345 for each since start index is zero. */
4346 prologue_stmts += num_vectors;
4348 /* If we have a non-zero partial load bias, we need one PLUS
4349 to adjust the load length. */
4350 if (partial_load_store_bias != 0)
4351 body_stmts += 1;
4353 /* Each may need two MINs and one MINUS to update lengths in body
4354 for next iteration. */
4355 if (need_iterate_p)
4356 body_stmts += 3 * num_vectors;
4359 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4360 scalar_stmt, vect_prologue);
4361 (void) add_stmt_cost (target_cost_data, body_stmts,
4362 scalar_stmt, vect_body);
4365 /* FORNOW: The scalar outside cost is incremented in one of the
4366 following ways:
4368 1. The vectorizer checks for alignment and aliasing and generates
4369 a condition that allows dynamic vectorization. A cost model
4370 check is ANDED with the versioning condition. Hence scalar code
4371 path now has the added cost of the versioning check.
4373 if (cost > th & versioning_check)
4374 jmp to vector code
4376 Hence run-time scalar is incremented by not-taken branch cost.
4378 2. The vectorizer then checks if a prologue is required. If the
4379 cost model check was not done before during versioning, it has to
4380 be done before the prologue check.
4382 if (cost <= th)
4383 prologue = scalar_iters
4384 if (prologue == 0)
4385 jmp to vector code
4386 else
4387 execute prologue
4388 if (prologue == num_iters)
4389 go to exit
4391 Hence the run-time scalar cost is incremented by a taken branch,
4392 plus a not-taken branch, plus a taken branch cost.
4394 3. The vectorizer then checks if an epilogue is required. If the
4395 cost model check was not done before during prologue check, it
4396 has to be done with the epilogue check.
4398 if (prologue == 0)
4399 jmp to vector code
4400 else
4401 execute prologue
4402 if (prologue == num_iters)
4403 go to exit
4404 vector code:
4405 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4406 jmp to epilogue
4408 Hence the run-time scalar cost should be incremented by 2 taken
4409 branches.
4411 TODO: The back end may reorder the BBs differently and reverse
4412 conditions/branch directions. Change the estimates below to
4413 something more reasonable. */
4415 /* If the number of iterations is known and we do not do versioning, we can
4416 decide whether to vectorize at compile time. Hence the scalar version
4417 does not carry cost model guard costs. */
4418 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4419 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4421 /* Cost model check occurs at versioning. */
4422 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4423 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4424 else
4426 /* Cost model check occurs at prologue generation. */
4427 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4428 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4429 + vect_get_stmt_cost (cond_branch_not_taken);
4430 /* Cost model check occurs at epilogue generation. */
4431 else
4432 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4436 /* Complete the target-specific cost calculations. */
4437 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4438 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4439 suggested_unroll_factor);
4441 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4442 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4443 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4444 *suggested_unroll_factor,
4445 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4447 if (dump_enabled_p ())
4448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4449 "can't unroll as unrolled vectorization factor larger"
4450 " than maximum vectorization factor: "
4451 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4452 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4453 *suggested_unroll_factor = 1;
4456 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4458 if (dump_enabled_p ())
4460 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4461 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4462 vec_inside_cost);
4463 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4464 vec_prologue_cost);
4465 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4466 vec_epilogue_cost);
4467 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4468 scalar_single_iter_cost);
4469 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4470 scalar_outside_cost);
4471 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4472 vec_outside_cost);
4473 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4474 peel_iters_prologue);
4475 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4476 peel_iters_epilogue);
4479 /* Calculate number of iterations required to make the vector version
4480 profitable, relative to the loop bodies only. The following condition
4481 must hold true:
4482 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4483 where
4484 SIC = scalar iteration cost, VIC = vector iteration cost,
4485 VOC = vector outside cost, VF = vectorization factor,
4486 NPEEL = prologue iterations + epilogue iterations,
4487 SOC = scalar outside cost for run time cost model check. */
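/* A worked instance with hypothetical costs: SIC = 1, VIC = 2, VF = 4,
   VOC = 20, SOC = 4 and NPEEL = 2 give a per-vector-iteration saving of
   SIC * VF - VIC = 2, and the condition

     1 * niters + 4 > 2 * ((niters - 2) / 4) + 20

   starts to hold around
   ((VOC - SOC) * VF - VIC * NPEEL) / (SIC * VF - VIC) = 60 / 2 = 30
   scalar iterations; the code below then applies the rounding and
   minimum-VF adjustments.  */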
4489 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4490 - vec_inside_cost);
4491 if (saving_per_viter <= 0)
4493 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4494 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4495 "vectorization did not happen for a simd loop");
4497 if (dump_enabled_p ())
4498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4499 "cost model: the vector iteration cost = %d "
4500 "divided by the scalar iteration cost = %d "
4501 "is greater or equal to the vectorization factor = %d"
4502 ".\n",
4503 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4504 *ret_min_profitable_niters = -1;
4505 *ret_min_profitable_estimate = -1;
4506 return;
4509 /* ??? The "if" arm is written to handle all cases; see below for what
4510 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4511 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4513 /* Rewriting the condition above in terms of the number of
4514 vector iterations (vniters) rather than the number of
4515 scalar iterations (niters) gives:
4517 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4519 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4521 For integer N, X and Y when X > 0:
4523 N * X > Y <==> N >= (Y /[floor] X) + 1. */
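/* E.g. X = 3, Y = 7: N * 3 > 7 first holds at N = 3, and
   (7 /[floor] 3) + 1 == 3 as well.  */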
4524 int outside_overhead = (vec_outside_cost
4525 - scalar_single_iter_cost * peel_iters_prologue
4526 - scalar_single_iter_cost * peel_iters_epilogue
4527 - scalar_outside_cost);
4528 /* We're only interested in cases that require at least one
4529 vector iteration. */
4530 int min_vec_niters = 1;
4531 if (outside_overhead > 0)
4532 min_vec_niters = outside_overhead / saving_per_viter + 1;
4534 if (dump_enabled_p ())
4535 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4536 min_vec_niters);
4538 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4540 /* Now that we know the minimum number of vector iterations,
4541 find the minimum niters for which the scalar cost is larger:
4543 SIC * niters > VIC * vniters + VOC - SOC
4545 We know that the minimum niters is no more than
4546 vniters * VF + NPEEL, but it might be (and often is) less
4547 than that if a partial vector iteration is cheaper than the
4548 equivalent scalar code. */
4549 int threshold = (vec_inside_cost * min_vec_niters
4550 + vec_outside_cost
4551 - scalar_outside_cost);
4552 if (threshold <= 0)
4553 min_profitable_iters = 1;
4554 else
4555 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4557 else
4558 /* Convert the number of vector iterations into a number of
4559 scalar iterations. */
4560 min_profitable_iters = (min_vec_niters * assumed_vf
4561 + peel_iters_prologue
4562 + peel_iters_epilogue);
4564 else
4566 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4567 * assumed_vf
4568 - vec_inside_cost * peel_iters_prologue
4569 - vec_inside_cost * peel_iters_epilogue);
4570 if (min_profitable_iters <= 0)
4571 min_profitable_iters = 0;
4572 else
4574 min_profitable_iters /= saving_per_viter;
4576 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4577 <= (((int) vec_inside_cost * min_profitable_iters)
4578 + (((int) vec_outside_cost - scalar_outside_cost)
4579 * assumed_vf)))
4580 min_profitable_iters++;
4584 if (dump_enabled_p ())
4585 dump_printf (MSG_NOTE,
4586 " Calculated minimum iters for profitability: %d\n",
4587 min_profitable_iters);
4589 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4590 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4591 /* We want the vectorized loop to execute at least once. */
4592 min_profitable_iters = assumed_vf + peel_iters_prologue;
4593 else if (min_profitable_iters < peel_iters_prologue)
4594 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4595 vectorized loop executes at least once. */
4596 min_profitable_iters = peel_iters_prologue;
4598 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_NOTE, vect_location,
4600 " Runtime profitability threshold = %d\n",
4601 min_profitable_iters);
4603 *ret_min_profitable_niters = min_profitable_iters;
4605 /* Calculate number of iterations required to make the vector version
4606 profitable, relative to the loop bodies only.
4608 The non-vectorized variant costs SIC * niters and it must win over the
4609 vector variant on the expected loop trip count. The following condition must hold true:
4610 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4612 if (vec_outside_cost <= 0)
4613 min_profitable_estimate = 0;
4614 /* ??? This "else if" arm is written to handle all cases; see below for
4615 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4616 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4618 /* This is a repeat of the code above, but with + SOC rather
4619 than - SOC. */
4620 int outside_overhead = (vec_outside_cost
4621 - scalar_single_iter_cost * peel_iters_prologue
4622 - scalar_single_iter_cost * peel_iters_epilogue
4623 + scalar_outside_cost);
4624 int min_vec_niters = 1;
4625 if (outside_overhead > 0)
4626 min_vec_niters = outside_overhead / saving_per_viter + 1;
4628 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4630 int threshold = (vec_inside_cost * min_vec_niters
4631 + vec_outside_cost
4632 + scalar_outside_cost);
4633 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4635 else
4636 min_profitable_estimate = (min_vec_niters * assumed_vf
4637 + peel_iters_prologue
4638 + peel_iters_epilogue);
4640 else
4642 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4643 * assumed_vf
4644 - vec_inside_cost * peel_iters_prologue
4645 - vec_inside_cost * peel_iters_epilogue)
4646 / ((scalar_single_iter_cost * assumed_vf)
4647 - vec_inside_cost);
4649 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4650 if (dump_enabled_p ())
4651 dump_printf_loc (MSG_NOTE, vect_location,
4652 " Static estimate profitability threshold = %d\n",
4653 min_profitable_estimate);
4655 *ret_min_profitable_estimate = min_profitable_estimate;
4658 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4659 vector elements (not bits) for a vector with NELT elements. */
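/* A hypothetical instance: OFFSET == 2 and NELT == 8 encode the stepped
   selector {2, 3, 4, ...}, i.e. {2, 3, 4, 5, 6, 7, 8, 9}, shifting the
   input down by two elements; indices 8 and 9 fall into the second
   vec_perm operand, which vec_perm_indices accounts for.  */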
4660 static void
4661 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4662 vec_perm_builder *sel)
4664 /* The encoding is a single stepped pattern. Any wrap-around is handled
4665 by vec_perm_indices. */
4666 sel->new_vector (nelt, 1, 3);
4667 for (unsigned int i = 0; i < 3; i++)
4668 sel->quick_push (i + offset);
4671 /* Checks whether the target supports whole-vector shifts for vectors of mode
4672 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4673 it supports vec_perm_const with masks for all necessary shift amounts. */
4674 static bool
4675 have_whole_vector_shift (machine_mode mode)
4677 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4678 return true;
4680 /* Variable-length vectors should be handled via the optab. */
4681 unsigned int nelt;
4682 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4683 return false;
4685 vec_perm_builder sel;
4686 vec_perm_indices indices;
4687 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4689 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4690 indices.new_vector (sel, 2, nelt);
4691 if (!can_vec_perm_const_p (mode, mode, indices, false))
4692 return false;
4694 return true;
4697 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4698 multiplication operands have differing signs and (b) we intend
4699 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4700 See vect_emulate_mixed_dot_prod for the actual sequence used. */
4702 static bool
4703 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4704 stmt_vec_info stmt_info)
4706 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4707 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4708 return false;
4710 tree rhs1 = gimple_assign_rhs1 (assign);
4711 tree rhs2 = gimple_assign_rhs2 (assign);
4712 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4713 return false;
4715 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4716 gcc_assert (reduc_info->is_reduc_info);
4717 return !directly_supported_p (DOT_PROD_EXPR,
4718 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4719 optab_vector_mixed_sign);
4722 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4723 functions. Design better to avoid maintenance issues. */
4725 /* Function vect_model_reduction_cost.
4727 Models cost for a reduction operation, including the vector ops
4728 generated within the strip-mine loop in some cases, the initial
4729 definition before the loop, and the epilogue code that must be generated. */
4731 static void
4732 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4733 stmt_vec_info stmt_info, internal_fn reduc_fn,
4734 vect_reduction_type reduction_type,
4735 int ncopies, stmt_vector_for_cost *cost_vec)
4737 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4738 tree vectype;
4739 machine_mode mode;
4740 class loop *loop = NULL;
4742 if (loop_vinfo)
4743 loop = LOOP_VINFO_LOOP (loop_vinfo);
4745 /* Condition reductions generate two reductions in the loop. */
4746 if (reduction_type == COND_REDUCTION)
4747 ncopies *= 2;
4749 vectype = STMT_VINFO_VECTYPE (stmt_info);
4750 mode = TYPE_MODE (vectype);
4751 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4753 gimple_match_op op;
4754 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4755 gcc_unreachable ();
4757 bool emulated_mixed_dot_prod
4758 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4759 if (reduction_type == EXTRACT_LAST_REDUCTION)
4760 /* No extra instructions are needed in the prologue. The loop body
4761 operations are costed in vectorizable_condition. */
4762 inside_cost = 0;
4763 else if (reduction_type == FOLD_LEFT_REDUCTION)
4765 /* No extra instructions needed in the prologue. */
4766 prologue_cost = 0;
4768 if (reduc_fn != IFN_LAST)
4769 /* Count one reduction-like operation per vector. */
4770 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4771 stmt_info, 0, vect_body);
4772 else
4774 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4775 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4776 inside_cost = record_stmt_cost (cost_vec, nelements,
4777 vec_to_scalar, stmt_info, 0,
4778 vect_body);
4779 inside_cost += record_stmt_cost (cost_vec, nelements,
4780 scalar_stmt, stmt_info, 0,
4781 vect_body);
4784 else
4786 /* Add in the cost of the initial definitions. */
4787 int prologue_stmts;
4788 if (reduction_type == COND_REDUCTION)
4789 /* For cond reductions we have four vectors: initial index, step,
4790 initial result of the data reduction, initial value of the index
4791 reduction. */
4792 prologue_stmts = 4;
4793 else if (emulated_mixed_dot_prod)
4794 /* We need the initial reduction value and two invariants:
4795 one that contains the minimum signed value and one that
4796 contains half of its negative. */
4797 prologue_stmts = 3;
4798 else
4799 prologue_stmts = 1;
4800 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4801 scalar_to_vec, stmt_info, 0,
4802 vect_prologue);
4805 /* Determine cost of epilogue code.
4807 We have a reduction operator that will reduce the vector in one statement.
4808 Also requires scalar extract. */
4810 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4812 if (reduc_fn != IFN_LAST)
4814 if (reduction_type == COND_REDUCTION)
4816 /* An EQ stmt and a COND_EXPR stmt. */
4817 epilogue_cost += record_stmt_cost (cost_vec, 2,
4818 vector_stmt, stmt_info, 0,
4819 vect_epilogue);
4820 /* Reduction of the max index and a reduction of the found
4821 values. */
4822 epilogue_cost += record_stmt_cost (cost_vec, 2,
4823 vec_to_scalar, stmt_info, 0,
4824 vect_epilogue);
4825 /* A broadcast of the max value. */
4826 epilogue_cost += record_stmt_cost (cost_vec, 1,
4827 scalar_to_vec, stmt_info, 0,
4828 vect_epilogue);
4830 else
4832 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4833 stmt_info, 0, vect_epilogue);
4834 epilogue_cost += record_stmt_cost (cost_vec, 1,
4835 vec_to_scalar, stmt_info, 0,
4836 vect_epilogue);
4839 else if (reduction_type == COND_REDUCTION)
4841 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4842 /* Extraction of scalar elements. */
4843 epilogue_cost += record_stmt_cost (cost_vec,
4844 2 * estimated_nunits,
4845 vec_to_scalar, stmt_info, 0,
4846 vect_epilogue);
4847 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4848 epilogue_cost += record_stmt_cost (cost_vec,
4849 2 * estimated_nunits - 3,
4850 scalar_stmt, stmt_info, 0,
4851 vect_epilogue);
4853 else if (reduction_type == EXTRACT_LAST_REDUCTION
4854 || reduction_type == FOLD_LEFT_REDUCTION)
4855 /* No extra instructions needed in the epilogue. */
4857 else
4859 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4860 tree bitsize = TYPE_SIZE (op.type);
4861 int element_bitsize = tree_to_uhwi (bitsize);
4862 int nelements = vec_size_in_bits / element_bitsize;
4864 if (op.code == COND_EXPR)
4865 op.code = MAX_EXPR;
4867 /* We have a whole vector shift available. */
4868 if (VECTOR_MODE_P (mode)
4869 && directly_supported_p (op.code, vectype)
4870 && have_whole_vector_shift (mode))
4872 /* Final reduction via vector shifts and the reduction operator.
4873 Also requires scalar extract. */
4874 epilogue_cost += record_stmt_cost (cost_vec,
4875 exact_log2 (nelements) * 2,
4876 vector_stmt, stmt_info, 0,
4877 vect_epilogue);
4878 epilogue_cost += record_stmt_cost (cost_vec, 1,
4879 vec_to_scalar, stmt_info, 0,
4880 vect_epilogue);
4882 else
4883 /* Use extracts and reduction op for final reduction. For N
4884 elements, we have N extracts and N-1 reduction ops. */
4885 epilogue_cost += record_stmt_cost (cost_vec,
4886 nelements + nelements - 1,
4887 vector_stmt, stmt_info, 0,
4888 vect_epilogue);
4892 if (dump_enabled_p ())
4893 dump_printf (MSG_NOTE,
4894 "vect_model_reduction_cost: inside_cost = %d, "
4895 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4896 prologue_cost, epilogue_cost);
4899 /* SEQ is a sequence of instructions that initialize the reduction
4900 described by REDUC_INFO. Emit them in the appropriate place. */
4902 static void
4903 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4904 stmt_vec_info reduc_info, gimple *seq)
4906 if (reduc_info->reused_accumulator)
4908 /* When reusing an accumulator from the main loop, we only need
4909 initialization instructions if the main loop can be skipped.
4910 In that case, emit the initialization instructions at the end
4911 of the guard block that does the skip. */
4912 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4913 gcc_assert (skip_edge);
4914 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4915 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4917 else
4919 /* The normal case: emit the initialization instructions on the
4920 preheader edge. */
4921 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4922 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4926 /* Function get_initial_def_for_reduction
4928 Input:
4929 REDUC_INFO - the info_for_reduction
4930 INIT_VAL - the initial value of the reduction variable
4931 NEUTRAL_OP - a value that has no effect on the reduction, as per
4932 neutral_op_for_reduction
4934 Output:
4935 Return a vector variable, initialized according to the reduction that
4936 REDUC_INFO describes. This vector will be used as the initial value
4937 of the vector of partial results.
4939 The value we need is a vector in which element 0 has value INIT_VAL
4940 and every other element has value NEUTRAL_OP. */
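/* An illustrative case (lane count hypothetical): for a PLUS reduction
   with initial value s and four lanes the result is {s, 0, 0, 0}, so that
   reducing the vector of partial sums at the end yields s plus the sum of
   all elements; when INIT_VAL already equals NEUTRAL_OP the code below
   simply splats that value.  */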
4942 static tree
4943 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4944 stmt_vec_info reduc_info,
4945 tree init_val, tree neutral_op)
4947 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4948 tree scalar_type = TREE_TYPE (init_val);
4949 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4950 tree init_def;
4951 gimple_seq stmts = NULL;
4953 gcc_assert (vectype);
4955 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4956 || SCALAR_FLOAT_TYPE_P (scalar_type));
4958 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4959 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4961 if (operand_equal_p (init_val, neutral_op))
4963 /* If both elements are equal then the vector described above is
4964 just a splat. */
4965 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4966 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4968 else
4970 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4971 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4972 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4974 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4975 element 0. */
4976 init_def = gimple_build_vector_from_val (&stmts, vectype,
4977 neutral_op);
4978 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4979 vectype, init_def, init_val);
4981 else
4983 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4984 tree_vector_builder elts (vectype, 1, 2);
4985 elts.quick_push (init_val);
4986 elts.quick_push (neutral_op);
4987 init_def = gimple_build_vector (&stmts, &elts);
4991 if (stmts)
4992 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4993 return init_def;
4996 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4997 which performs a reduction involving GROUP_SIZE scalar statements.
4998 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4999 is nonnull, introducing extra elements of that value will not change the
5000 result. */
5002 static void
5003 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5004 stmt_vec_info reduc_info,
5005 vec<tree> *vec_oprnds,
5006 unsigned int number_of_vectors,
5007 unsigned int group_size, tree neutral_op)
5009 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5010 unsigned HOST_WIDE_INT nunits;
5011 unsigned j, number_of_places_left_in_vector;
5012 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5013 unsigned int i;
5015 gcc_assert (group_size == initial_values.length () || neutral_op);
5017 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5018 created vectors. It is greater than 1 if unrolling is performed.
5020 For example, we have two scalar operands, s1 and s2 (e.g., group of
5021 strided accesses of size two), while NUNITS is four (i.e., four scalars
5022 of this type can be packed in a vector). The output vector will contain
5023 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5024 will be 2).
5026 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5027 vectors containing the operands.
5029 For example, NUNITS is four as before, and the group size is 8
5030 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5031 {s5, s6, s7, s8}. */
5033 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5034 nunits = group_size;
5036 number_of_places_left_in_vector = nunits;
5037 bool constant_p = true;
5038 tree_vector_builder elts (vector_type, nunits, 1);
5039 elts.quick_grow (nunits);
5040 gimple_seq ctor_seq = NULL;
5041 for (j = 0; j < nunits * number_of_vectors; ++j)
5043 tree op;
5044 i = j % group_size;
5046 /* Get the def before the loop. In a reduction chain we have only
5047 one initial value; otherwise we have as many as there are PHIs in the group. */
5048 if (i >= initial_values.length () || (j > i && neutral_op))
5049 op = neutral_op;
5050 else
5051 op = initial_values[i];
5053 /* Create 'vect_ = {op0,op1,...,opn}'. */
5054 number_of_places_left_in_vector--;
5055 elts[nunits - number_of_places_left_in_vector - 1] = op;
5056 if (!CONSTANT_CLASS_P (op))
5057 constant_p = false;
5059 if (number_of_places_left_in_vector == 0)
5061 tree init;
5062 if (constant_p && !neutral_op
5063 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5064 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5065 /* Build the vector directly from ELTS. */
5066 init = gimple_build_vector (&ctor_seq, &elts);
5067 else if (neutral_op)
5069 /* Build a vector of the neutral value and shift the
5070 other elements into place. */
5071 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5072 neutral_op);
5073 int k = nunits;
5074 while (k > 0 && elts[k - 1] == neutral_op)
5075 k -= 1;
5076 while (k > 0)
5078 k -= 1;
5079 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5080 vector_type, init, elts[k]);
5083 else
5085 /* First time round, duplicate ELTS to fill the
5086 required number of vectors. */
5087 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5088 elts, number_of_vectors, *vec_oprnds);
5089 break;
5091 vec_oprnds->quick_push (init);
5093 number_of_places_left_in_vector = nunits;
5094 elts.new_vector (vector_type, nunits, 1);
5095 elts.quick_grow (nunits);
5096 constant_p = true;
5099 if (ctor_seq != NULL)
5100 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5103 /* For a statement STMT_INFO taking part in a reduction operation return
5104 the stmt_vec_info the meta information is stored on. */
5106 stmt_vec_info
5107 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5109 stmt_info = vect_orig_stmt (stmt_info);
5110 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5111 if (!is_a <gphi *> (stmt_info->stmt)
5112 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5113 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5114 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5115 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5117 if (gimple_phi_num_args (phi) == 1)
5118 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5120 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5122 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5123 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5124 stmt_info = info;
5126 return stmt_info;
5129 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5130 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5131 return false. */
5133 static bool
5134 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5135 stmt_vec_info reduc_info)
5137 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5138 if (!main_loop_vinfo)
5139 return false;
5141 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5142 return false;
5144 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5145 auto_vec<tree, 16> main_loop_results (num_phis);
5146 auto_vec<tree, 16> initial_values (num_phis);
5147 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5149 /* The epilogue loop can be entered either from the main loop or
5150 from an earlier guard block. */
5151 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5152 for (tree incoming_value : reduc_info->reduc_initial_values)
5154 /* Look for:
5156 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5157 INITIAL_VALUE(guard block)>. */
5158 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5160 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5161 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5163 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5164 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5166 main_loop_results.quick_push (from_main_loop);
5167 initial_values.quick_push (from_skip);
5170 else
5171 /* The main loop dominates the epilogue loop. */
5172 main_loop_results.splice (reduc_info->reduc_initial_values);
5174 /* See if the main loop has the kind of accumulator we need. */
5175 vect_reusable_accumulator *accumulator
5176 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5177 if (!accumulator
5178 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5179 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5180 accumulator->reduc_info->reduc_scalar_results.begin ()))
5181 return false;
5183 /* Handle the case where we can reduce wider vectors to narrower ones. */
5184 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5185 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5186 unsigned HOST_WIDE_INT m;
5187 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5188 TYPE_VECTOR_SUBPARTS (vectype), &m))
5189 return false;
5190 /* Check the intermediate vector types and operations are available. */
5191 tree prev_vectype = old_vectype;
5192 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5193 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5195 intermediate_nunits = exact_div (intermediate_nunits, 2);
5196 tree intermediate_vectype = get_related_vectype_for_scalar_type
5197 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5198 if (!intermediate_vectype
5199 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5200 intermediate_vectype)
5201 || !can_vec_extract (TYPE_MODE (prev_vectype),
5202 TYPE_MODE (intermediate_vectype)))
5203 return false;
5204 prev_vectype = intermediate_vectype;
5207 /* Non-SLP reductions might apply an adjustment after the reduction
5208 operation, in order to simplify the initialization of the accumulator.
5209 If the epilogue loop carries on from where the main loop left off,
5210 it should apply the same adjustment to the final reduction result.
5212 If the epilogue loop can also be entered directly (rather than via
5213 the main loop), we need to be able to handle that case in the same way,
5214 with the same adjustment. (In principle we could add a PHI node
5215 to select the correct adjustment, but in practice that shouldn't be
5216 necessary.) */
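/* Illustrative example (an editorial sketch, not from the original
   comment): for a reduction like res = 10 + sum of a[i], the main loop
   typically starts its vector accumulator at the neutral value {0, ...}
   and records 10 as the epilogue adjustment added to the final scalar.
   The checks below make sure the epilogue loop can keep using that same
   scheme: start from the main loop's accumulator and apply the same
   adjustment once at the end.  */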
5217 tree main_adjustment
5218 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5219 if (loop_vinfo->main_loop_edge && main_adjustment)
5221 gcc_assert (num_phis == 1);
5222 tree initial_value = initial_values[0];
5223 /* Check that we can use INITIAL_VALUE as the adjustment and
5224 initialize the accumulator with a neutral value instead. */
5225 if (!operand_equal_p (initial_value, main_adjustment))
5226 return false;
5227 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5228 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5229 code, initial_value);
5231 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5232 reduc_info->reduc_initial_values.truncate (0);
5233 reduc_info->reduc_initial_values.splice (initial_values);
5234 reduc_info->reused_accumulator = accumulator;
5235 return true;
5238 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5239 CODE, emitting the stmts to SEQ. Returns a vector def of VECTYPE. */
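/* Editorial sketch of the halving step (not part of the original
   comment): with CODE = PLUS_EXPR, reducing a V8SI VEC_DEF to V4SI
   extracts the low and high V4SI halves and adds them; the loop below
   repeats this halving until the requested VECTYPE width is reached.  */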
5241 static tree
5242 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5243 gimple_seq *seq)
5245 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5246 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5247 tree stype = TREE_TYPE (vectype);
5248 tree new_temp = vec_def;
5249 while (nunits > nunits1)
5251 nunits /= 2;
5252 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5253 stype, nunits);
5254 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5256 /* The target has to make sure we support lowpart/highpart
5257 extraction, either via direct vector extract or through
5258 an integer mode punning. */
5259 tree dst1, dst2;
5260 gimple *epilog_stmt;
5261 if (convert_optab_handler (vec_extract_optab,
5262 TYPE_MODE (TREE_TYPE (new_temp)),
5263 TYPE_MODE (vectype1))
5264 != CODE_FOR_nothing)
5266 /* Extract sub-vectors directly once vec_extract becomes
5267 a conversion optab. */
5268 dst1 = make_ssa_name (vectype1);
5269 epilog_stmt
5270 = gimple_build_assign (dst1, BIT_FIELD_REF,
5271 build3 (BIT_FIELD_REF, vectype1,
5272 new_temp, TYPE_SIZE (vectype1),
5273 bitsize_int (0)));
5274 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5275 dst2 = make_ssa_name (vectype1);
5276 epilog_stmt
5277 = gimple_build_assign (dst2, BIT_FIELD_REF,
5278 build3 (BIT_FIELD_REF, vectype1,
5279 new_temp, TYPE_SIZE (vectype1),
5280 bitsize_int (bitsize)));
5281 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5283 else
5285 /* Extract via punning to an appropriately sized integer mode
5286 vector. */
5287 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5288 tree etype = build_vector_type (eltype, 2);
5289 gcc_assert (convert_optab_handler (vec_extract_optab,
5290 TYPE_MODE (etype),
5291 TYPE_MODE (eltype))
5292 != CODE_FOR_nothing);
5293 tree tem = make_ssa_name (etype);
5294 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5295 build1 (VIEW_CONVERT_EXPR,
5296 etype, new_temp));
5297 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5298 new_temp = tem;
5299 tem = make_ssa_name (eltype);
5300 epilog_stmt
5301 = gimple_build_assign (tem, BIT_FIELD_REF,
5302 build3 (BIT_FIELD_REF, eltype,
5303 new_temp, TYPE_SIZE (eltype),
5304 bitsize_int (0)));
5305 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5306 dst1 = make_ssa_name (vectype1);
5307 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5308 build1 (VIEW_CONVERT_EXPR,
5309 vectype1, tem));
5310 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5311 tem = make_ssa_name (eltype);
5312 epilog_stmt
5313 = gimple_build_assign (tem, BIT_FIELD_REF,
5314 build3 (BIT_FIELD_REF, eltype,
5315 new_temp, TYPE_SIZE (eltype),
5316 bitsize_int (bitsize)));
5317 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5318 dst2 = make_ssa_name (vectype1);
5319 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5320 build1 (VIEW_CONVERT_EXPR,
5321 vectype1, tem));
5322 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5325 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5328 return new_temp;
5331 /* Function vect_create_epilog_for_reduction
5333 Create code at the loop-epilog to finalize the result of a reduction
5334 computation.
5336 STMT_INFO is the scalar reduction stmt that is being vectorized.
5337 SLP_NODE is an SLP node containing a group of reduction statements. The
5338 first one in this group is STMT_INFO.
5339 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5340 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5341 (counting from 0)
5343 This function:
5344 1. Completes the reduction def-use cycles.
5345 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5346 by calling the function specified by REDUC_FN if available, or by
5347 other means (whole-vector shifts or a scalar loop).
5348 The function also creates a new phi node at the loop exit to preserve
5349 loop-closed form, as illustrated below.
5351 The flow at the entry to this function:
5353 loop:
5354 vec_def = phi <vec_init, null> # REDUCTION_PHI
5355 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5356 s_loop = scalar_stmt # (scalar) STMT_INFO
5357 loop_exit:
5358 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5359 use <s_out0>
5360 use <s_out0>
5362 The above is transformed by this function into:
5364 loop:
5365 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5366 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5367 s_loop = scalar_stmt # (scalar) STMT_INFO
5368 loop_exit:
5369 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5370 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5371 v_out2 = reduce <v_out1>
5372 s_out3 = extract_field <v_out2, 0>
5373 s_out4 = adjust_result <s_out3>
5374 use <s_out4>
5375 use <s_out4>
5378 static void
5379 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5380 stmt_vec_info stmt_info,
5381 slp_tree slp_node,
5382 slp_instance slp_node_instance)
5384 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5385 gcc_assert (reduc_info->is_reduc_info);
5386 /* For double reductions we need to get at the inner loop reduction
5387 stmt which has the meta info attached. Our stmt_info is that of the
5388 loop-closed PHI of the inner loop which we remember as
5389 def for the reduction PHI generation. */
5390 bool double_reduc = false;
5391 stmt_vec_info rdef_info = stmt_info;
5392 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5394 gcc_assert (!slp_node);
5395 double_reduc = true;
5396 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5397 (stmt_info->stmt, 0));
5398 stmt_info = vect_stmt_to_vectorize (stmt_info);
5400 gphi *reduc_def_stmt
5401 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5402 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5403 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5404 tree vectype;
5405 machine_mode mode;
5406 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5407 basic_block exit_bb;
5408 tree scalar_dest;
5409 tree scalar_type;
5410 gimple *new_phi = NULL, *phi;
5411 gimple_stmt_iterator exit_gsi;
5412 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5413 gimple *epilog_stmt = NULL;
5414 gimple *exit_phi;
5415 tree bitsize;
5416 tree def;
5417 tree orig_name, scalar_result;
5418 imm_use_iterator imm_iter, phi_imm_iter;
5419 use_operand_p use_p, phi_use_p;
5420 gimple *use_stmt;
5421 auto_vec<tree> reduc_inputs;
5422 int j, i;
5423 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5424 unsigned int group_size = 1, k;
5425 auto_vec<gimple *> phis;
5426 /* SLP reduction without reduction chain, e.g.,
5427 # a1 = phi <a2, a0>
5428 # b1 = phi <b2, b0>
5429 a2 = operation (a1)
5430 b2 = operation (b1) */
5431 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5432 bool direct_slp_reduc;
5433 tree induction_index = NULL_TREE;
5435 if (slp_node)
5436 group_size = SLP_TREE_LANES (slp_node);
5438 if (nested_in_vect_loop_p (loop, stmt_info))
5440 outer_loop = loop;
5441 loop = loop->inner;
5442 gcc_assert (!slp_node && double_reduc);
5445 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5446 gcc_assert (vectype);
5447 mode = TYPE_MODE (vectype);
5449 tree induc_val = NULL_TREE;
5450 tree adjustment_def = NULL;
5451 if (slp_node)
5453 else
5455 /* Optimize: for induction condition reduction, if we can't use zero
5456 for induc_val, use initial_def. */
5457 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5458 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5459 else if (double_reduc)
5461 else
5462 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5465 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5466 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5467 if (slp_reduc)
5468 /* All statements produce live-out values. */
5469 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5470 else if (slp_node)
5472 /* The last statement in the reduction chain produces the live-out
5473 value. Note that SLP optimization can shuffle scalar stmts to
5474 optimize permutations, so we have to search for the last stmt. */
5475 for (k = 0; k < group_size; ++k)
5476 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5478 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5479 break;
5483 unsigned vec_num;
5484 int ncopies;
5485 if (slp_node)
5487 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5488 ncopies = 1;
5490 else
5492 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5493 vec_num = 1;
5494 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5497 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5498 which is updated with the current index of the loop for every match of
5499 the original loop's cond_expr (VEC_STMT). This results in a vector
5500 containing the last time the condition passed for that vector lane.
5501 The first match will be a 1 to allow 0 to be used for non-matching
5502 indexes. If there are no matches at all then the vector will be all
5503 zeroes.
5505 PR92772: This algorithm is broken for architectures that support
5506 masked vectors, but do not provide fold_extract_last. */
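/* Editorial worked example, assuming a vectorization factor of 4: if the
   condition matches in scalar iterations 1 and 6 (0-based), the index
   vector evolves as
     {0,0,0,0} -> {0,2,0,0}   after the first vector iteration
     {0,2,0,0} -> {0,2,7,0}   after the second vector iteration
   so a MAX reduction over it yields 7, the 1-based index of the last
   match, which is later used to pick the matching data lane.  */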
5507 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5509 auto_vec<std::pair<tree, bool>, 2> ccompares;
5510 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5511 cond_info = vect_stmt_to_vectorize (cond_info);
5512 while (cond_info != reduc_info)
5514 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5516 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5517 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5518 ccompares.safe_push
5519 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5520 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5522 cond_info
5523 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5524 1 + STMT_VINFO_REDUC_IDX
5525 (cond_info)));
5526 cond_info = vect_stmt_to_vectorize (cond_info);
5528 gcc_assert (ccompares.length () != 0);
5530 tree indx_before_incr, indx_after_incr;
5531 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5532 int scalar_precision
5533 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5534 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5535 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5536 (TYPE_MODE (vectype), cr_index_scalar_type,
5537 TYPE_VECTOR_SUBPARTS (vectype));
5539 /* First we create a simple vector induction variable which starts
5540 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5541 vector size (STEP). */
5543 /* Create a {1,2,3,...} vector. */
5544 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5546 /* Create a vector of the step value. */
5547 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5548 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5550 /* Create an induction variable. */
5551 gimple_stmt_iterator incr_gsi;
5552 bool insert_after;
5553 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5554 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5555 insert_after, &indx_before_incr, &indx_after_incr);
5557 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5558 filled with zeros (VEC_ZERO). */
5560 /* Create a vector of 0s. */
5561 tree zero = build_zero_cst (cr_index_scalar_type);
5562 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5564 /* Create a vector phi node. */
5565 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5566 new_phi = create_phi_node (new_phi_tree, loop->header);
5567 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5568 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5570 /* Now take the condition from the loop's original cond_exprs
5571 and produce a new cond_expr (INDEX_COND_EXPR) which for
5572 every match uses values from the induction variable
5573 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
5574 (NEW_PHI_TREE).
5575 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5576 the new cond_expr (INDEX_COND_EXPR). */
5577 gimple_seq stmts = NULL;
5578 for (int i = ccompares.length () - 1; i != -1; --i)
5580 tree ccompare = ccompares[i].first;
5581 if (ccompares[i].second)
5582 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5583 cr_index_vector_type,
5584 ccompare,
5585 indx_before_incr, new_phi_tree);
5586 else
5587 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5588 cr_index_vector_type,
5589 ccompare,
5590 new_phi_tree, indx_before_incr);
5592 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5594 /* Update the phi with the vec cond. */
5595 induction_index = new_phi_tree;
5596 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5597 loop_latch_edge (loop), UNKNOWN_LOCATION);
5600 /* 2. Create epilog code.
5601 The reduction epilog code operates across the elements of the vector
5602 of partial results computed by the vectorized loop.
5603 The reduction epilog code consists of:
5605 step 1: compute the scalar result in a vector (v_out2)
5606 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5607 step 3: adjust the scalar result (s_out3) if needed.
5609 Step 1 can be accomplished using one of the following three schemes:
5610 (scheme 1) using reduc_fn, if available.
5611 (scheme 2) using whole-vector shifts, if available.
5612 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5613 combined.
5615 The overall epilog code looks like this:
5617 s_out0 = phi <s_loop> # original EXIT_PHI
5618 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5619 v_out2 = reduce <v_out1> # step 1
5620 s_out3 = extract_field <v_out2, 0> # step 2
5621 s_out4 = adjust_result <s_out3> # step 3
5623 (step 3 is optional, and steps 1 and 2 may be combined).
5624 Lastly, the uses of s_out0 are replaced by s_out4. */
5627 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5628 v_out1 = phi <VECT_DEF>
5629 Store them in NEW_PHIS. */
5630 if (double_reduc)
5631 loop = outer_loop;
5632 exit_bb = single_exit (loop)->dest;
5633 exit_gsi = gsi_after_labels (exit_bb);
5634 reduc_inputs.create (slp_node ? vec_num : ncopies);
5635 for (unsigned i = 0; i < vec_num; i++)
5637 gimple_seq stmts = NULL;
5638 if (slp_node)
5639 def = vect_get_slp_vect_def (slp_node, i);
5640 else
5641 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5642 for (j = 0; j < ncopies; j++)
5644 tree new_def = copy_ssa_name (def);
5645 phi = create_phi_node (new_def, exit_bb);
5646 if (j)
5647 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5648 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5649 new_def = gimple_convert (&stmts, vectype, new_def);
5650 reduc_inputs.quick_push (new_def);
5652 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5655 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5656 (i.e. when reduc_fn is not available) and in the final adjustment
5657 code (if needed). Also get the original scalar reduction variable as
5658 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5659 represents a reduction pattern), the tree-code and scalar-def are
5660 taken from the original stmt that the pattern-stmt (STMT) replaces.
5661 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5662 are taken from STMT. */
5664 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5665 if (orig_stmt_info != stmt_info)
5667 /* Reduction pattern */
5668 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5669 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5672 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5673 scalar_type = TREE_TYPE (scalar_dest);
5674 scalar_results.truncate (0);
5675 scalar_results.reserve_exact (group_size);
5676 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5677 bitsize = TYPE_SIZE (scalar_type);
5679 /* True if we should implement SLP_REDUC using native reduction operations
5680 instead of scalar operations. */
5681 direct_slp_reduc = (reduc_fn != IFN_LAST
5682 && slp_reduc
5683 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5685 /* In case of a reduction chain, e.g.,
5686 # a1 = phi <a3, a0>
5687 a2 = operation (a1)
5688 a3 = operation (a2),
5690 we may end up with more than one vector result. Here we reduce them
5691 to one vector.
5693 The same is true for a SLP reduction, e.g.,
5694 # a1 = phi <a2, a0>
5695 # b1 = phi <b2, b0>
5696 a2 = operation (a1)
5697 b2 = operation (b1),
5699 where we can end up with more than one vector as well. We can
5700 easily accumulate vectors when the number of vector elements is
5701 a multiple of the SLP group size.
5703 The same is true if we couldn't use a single def-use cycle. */
5704 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5705 || direct_slp_reduc
5706 || (slp_reduc
5707 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
5708 || ncopies > 1)
5710 gimple_seq stmts = NULL;
5711 tree single_input = reduc_inputs[0];
5712 for (k = 1; k < reduc_inputs.length (); k++)
5713 single_input = gimple_build (&stmts, code, vectype,
5714 single_input, reduc_inputs[k]);
5715 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5717 reduc_inputs.truncate (0);
5718 reduc_inputs.safe_push (single_input);
5721 tree orig_reduc_input = reduc_inputs[0];
5723 /* If this loop is an epilogue loop that can be skipped after the
5724 main loop, we can only share a reduction operation between the
5725 main loop and the epilogue if we put it at the target of the
5726 skip edge.
5728 We can still reuse accumulators if this check fails. Doing so has
5729 the minor(?) benefit of making the epilogue loop's scalar result
5730 independent of the main loop's scalar result. */
5731 bool unify_with_main_loop_p = false;
5732 if (reduc_info->reused_accumulator
5733 && loop_vinfo->skip_this_loop_edge
5734 && single_succ_p (exit_bb)
5735 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5737 unify_with_main_loop_p = true;
5739 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5740 reduc_inputs[0] = make_ssa_name (vectype);
5741 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5742 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5743 UNKNOWN_LOCATION);
5744 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5745 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5746 exit_gsi = gsi_after_labels (reduc_block);
5749 /* Shouldn't be used beyond this point. */
5750 exit_bb = nullptr;
5752 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5753 && reduc_fn != IFN_LAST)
5755 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5756 various data values where the condition matched and another vector
5757 (INDUCTION_INDEX) containing all the indexes of those matches. We
5758 need to extract the last matching index (which will be the index with
5759 highest value) and use this to index into the data vector.
5760 For the case where there were no matches, the data vector will contain
5761 all default values and the index vector will be all zeros. */
5763 /* Get various versions of the type of the vector of indexes. */
5764 tree index_vec_type = TREE_TYPE (induction_index);
5765 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5766 tree index_scalar_type = TREE_TYPE (index_vec_type);
5767 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5769 /* Get an unsigned integer version of the type of the data vector. */
5770 int scalar_precision
5771 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5772 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5773 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5774 vectype);
5776 /* First we need to create a vector (ZERO_VEC) of zeros and another
5777 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5778 can create using a MAX reduction and then expanding.
5779 In the case where the loop never made any matches, the max index will
5780 be zero. */
5782 /* Vector of {0, 0, 0,...}. */
5783 tree zero_vec = build_zero_cst (vectype);
5785 /* Find maximum value from the vector of found indexes. */
5786 tree max_index = make_ssa_name (index_scalar_type);
5787 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5788 1, induction_index);
5789 gimple_call_set_lhs (max_index_stmt, max_index);
5790 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5792 /* Vector of {max_index, max_index, max_index,...}. */
5793 tree max_index_vec = make_ssa_name (index_vec_type);
5794 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5795 max_index);
5796 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5797 max_index_vec_rhs);
5798 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5800 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5801 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5802 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5803 otherwise. Only one value should match, resulting in a vector
5804 (VEC_COND) with one data value and the rest zeros.
5805 In the case where the loop never made any matches, every index will
5806 match, resulting in a vector with all data values (which will all be
5807 the default value). */
5809 /* Compare the max index vector to the vector of found indexes to find
5810 the position of the max value. */
5811 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5812 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5813 induction_index,
5814 max_index_vec);
5815 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5817 /* Use the compare to choose either values from the data vector or
5818 zero. */
5819 tree vec_cond = make_ssa_name (vectype);
5820 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5821 vec_compare,
5822 reduc_inputs[0],
5823 zero_vec);
5824 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5826 /* Finally we need to extract the data value from the vector (VEC_COND)
5827 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5828 reduction, but because this doesn't exist, we can use a MAX reduction
5829 instead. The data value might be signed or a float, so we need to cast
5830 it to an unsigned integer type first.
5831 In the case where the loop never made any matches, the data values are
5832 all identical, and so will reduce down correctly. */
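/* Editorial sketch (illustrative values only): if VEC_COND is
   {0, 0, d, 0}, with d the data value of the single matching lane, then
   viewing it as an unsigned vector and taking a MAX reduction recovers
   the bit pattern of d, since all other lanes are zero; converting back
   to SCALAR_TYPE then yields d itself.  */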
5834 /* Make the matched data values unsigned. */
5835 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5836 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5837 vec_cond);
5838 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5839 VIEW_CONVERT_EXPR,
5840 vec_cond_cast_rhs);
5841 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5843 /* Reduce down to a scalar value. */
5844 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5845 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5846 1, vec_cond_cast);
5847 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5848 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5850 /* Convert the reduced value back to the result type and set as the
5851 result. */
5852 gimple_seq stmts = NULL;
5853 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5854 data_reduc);
5855 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5856 scalar_results.safe_push (new_temp);
5858 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5859 && reduc_fn == IFN_LAST)
5861 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5863 idx_val = induction_index[0];
5864 val = data_reduc[0];
5865 for (i = 1; i < nelts; ++i)
5866 if (induction_index[i] > idx_val)
5867 val = data_reduc[i], idx_val = induction_index[i];
5868 return val; */
5870 tree data_eltype = TREE_TYPE (vectype);
5871 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5872 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5873 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5874 /* Enforced by vectorizable_reduction, which ensures we have target
5875 support before allowing a conditional reduction on variable-length
5876 vectors. */
5877 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5878 tree idx_val = NULL_TREE, val = NULL_TREE;
5879 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5881 tree old_idx_val = idx_val;
5882 tree old_val = val;
5883 idx_val = make_ssa_name (idx_eltype);
5884 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5885 build3 (BIT_FIELD_REF, idx_eltype,
5886 induction_index,
5887 bitsize_int (el_size),
5888 bitsize_int (off)));
5889 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5890 val = make_ssa_name (data_eltype);
5891 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5892 build3 (BIT_FIELD_REF,
5893 data_eltype,
5894 reduc_inputs[0],
5895 bitsize_int (el_size),
5896 bitsize_int (off)));
5897 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5898 if (off != 0)
5900 tree new_idx_val = idx_val;
5901 if (off != v_size - el_size)
5903 new_idx_val = make_ssa_name (idx_eltype);
5904 epilog_stmt = gimple_build_assign (new_idx_val,
5905 MAX_EXPR, idx_val,
5906 old_idx_val);
5907 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5909 tree cond = make_ssa_name (boolean_type_node);
5910 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5911 idx_val, old_idx_val);
5912 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5913 tree new_val = make_ssa_name (data_eltype);
5914 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5915 cond, val, old_val);
5916 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5917 idx_val = new_idx_val;
5918 val = new_val;
5921 /* Convert the reduced value back to the result type and set as the
5922 result. */
5923 gimple_seq stmts = NULL;
5924 val = gimple_convert (&stmts, scalar_type, val);
5925 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5926 scalar_results.safe_push (val);
5929 /* 2.3 Create the reduction code, using one of the three schemes described
5930 above. In SLP we simply need to extract all the elements from the
5931 vector (without reducing them), so we use scalar shifts. */
5932 else if (reduc_fn != IFN_LAST && !slp_reduc)
5934 tree tmp;
5935 tree vec_elem_type;
5937 /* Case 1: Create:
5938 v_out2 = reduc_expr <v_out1> */
5940 if (dump_enabled_p ())
5941 dump_printf_loc (MSG_NOTE, vect_location,
5942 "Reduce using direct vector reduction.\n");
5944 gimple_seq stmts = NULL;
5945 vec_elem_type = TREE_TYPE (vectype);
5946 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5947 vec_elem_type, reduc_inputs[0]);
5948 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5949 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5951 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5952 && induc_val)
5954 /* Earlier we set the initial value to be a vector of induc_val
5955 values. Check the result, and if it is induc_val then replace it
5956 with the original initial value, unless induc_val is
5957 already the same as initial_def. */
5958 tree zcompare = make_ssa_name (boolean_type_node);
5959 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5960 new_temp, induc_val);
5961 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5962 tree initial_def = reduc_info->reduc_initial_values[0];
5963 tmp = make_ssa_name (new_scalar_dest);
5964 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5965 initial_def, new_temp);
5966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5967 new_temp = tmp;
5970 scalar_results.safe_push (new_temp);
5972 else if (direct_slp_reduc)
5974 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5975 with the elements for other SLP statements replaced with the
5976 neutral value. We can then do a normal reduction on each vector. */
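/* Editorial example, assuming REDUC_GROUP_SIZE == 2 and a single input
   vector {a0, b0, a1, b1}: the loop below forms the two vectors
     {a0, id, a1, id}   and   {id, b0, id, b1}
   (where id is the neutral value) and applies REDUC_FN to each of them
   to obtain the two scalar results.  */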
5978 /* Enforced by vectorizable_reduction. */
5979 gcc_assert (reduc_inputs.length () == 1);
5980 gcc_assert (pow2p_hwi (group_size));
5982 gimple_seq seq = NULL;
5984 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5985 and the same element size as VECTYPE. */
5986 tree index = build_index_vector (vectype, 0, 1);
5987 tree index_type = TREE_TYPE (index);
5988 tree index_elt_type = TREE_TYPE (index_type);
5989 tree mask_type = truth_type_for (index_type);
5991 /* Create a vector that, for each element, identifies which of
5992 the REDUC_GROUP_SIZE results should use it. */
5993 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5994 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5995 build_vector_from_val (index_type, index_mask));
5997 /* Get a neutral vector value. This is simply a splat of the neutral
5998 scalar value if we have one, otherwise the initial scalar value
5999 is itself a neutral value. */
6000 tree vector_identity = NULL_TREE;
6001 tree neutral_op = NULL_TREE;
6002 if (slp_node)
6004 tree initial_value = NULL_TREE;
6005 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6006 initial_value = reduc_info->reduc_initial_values[0];
6007 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6008 initial_value);
6010 if (neutral_op)
6011 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6012 neutral_op);
6013 for (unsigned int i = 0; i < group_size; ++i)
6015 /* If there's no universal neutral value, we can use the
6016 initial scalar value from the original PHI. This is used
6017 for MIN and MAX reductions, for example. */
6018 if (!neutral_op)
6020 tree scalar_value = reduc_info->reduc_initial_values[i];
6021 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6022 scalar_value);
6023 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6024 scalar_value);
6027 /* Calculate the equivalent of:
6029 sel[j] = (index[j] == i);
6031 which selects the elements of REDUC_INPUTS[0] that should
6032 be included in the result. */
6033 tree compare_val = build_int_cst (index_elt_type, i);
6034 compare_val = build_vector_from_val (index_type, compare_val);
6035 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6036 index, compare_val);
6038 /* Calculate the equivalent of:
6040 vec = sel ? reduc_inputs[0] : vector_identity;
6042 VEC is now suitable for a full vector reduction. */
6043 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6044 sel, reduc_inputs[0], vector_identity);
6046 /* Do the reduction and convert it to the appropriate type. */
6047 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6048 TREE_TYPE (vectype), vec);
6049 scalar = gimple_convert (&seq, scalar_type, scalar);
6050 scalar_results.safe_push (scalar);
6052 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6054 else
6056 bool reduce_with_shift;
6057 tree vec_temp;
6059 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6061 /* See if the target wants to do the final (shift) reduction
6062 in a vector mode of smaller size and first reduce upper/lower
6063 halves against each other. */
6064 enum machine_mode mode1 = mode;
6065 tree stype = TREE_TYPE (vectype);
6066 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6067 unsigned nunits1 = nunits;
6068 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6069 && reduc_inputs.length () == 1)
6071 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6072 /* For SLP reductions we have to make sure lanes match up, but
6073 since we're doing an individual-element final reduction, reducing
6074 the vector width here is even more important.
6075 ??? We can also separate lanes with permutes; for the common
6076 case of a power-of-two group size, odd/even extracts would work. */
6077 if (slp_reduc && nunits != nunits1)
6079 nunits1 = least_common_multiple (nunits1, group_size);
6080 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6083 if (!slp_reduc
6084 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6085 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6087 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6088 stype, nunits1);
6089 reduce_with_shift = have_whole_vector_shift (mode1);
6090 if (!VECTOR_MODE_P (mode1)
6091 || !directly_supported_p (code, vectype1))
6092 reduce_with_shift = false;
6094 /* First reduce the vector to the desired vector size we should
6095 do shift reduction on by combining upper and lower halves. */
6096 gimple_seq stmts = NULL;
6097 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6098 code, &stmts);
6099 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6100 reduc_inputs[0] = new_temp;
6102 if (reduce_with_shift && !slp_reduc)
6104 int element_bitsize = tree_to_uhwi (bitsize);
6105 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6106 for variable-length vectors and also requires direct target support
6107 for loop reductions. */
6108 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6109 int nelements = vec_size_in_bits / element_bitsize;
6110 vec_perm_builder sel;
6111 vec_perm_indices indices;
6113 int elt_offset;
6115 tree zero_vec = build_zero_cst (vectype1);
6116 /* Case 2: Create:
6117 for (offset = nelements/2; offset >= 1; offset/=2)
6119 Create: va' = vec_shift <va, offset>
6120 Create: va = vop <va, va'>
6121 } */
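/* Editorial trace for a 4-element vector {a,b,c,d} and CODE = PLUS_EXPR
   (lanes marked _ are don't-care):
     offset 2:  va' = {c, d, _, _}      va = {a+c, b+d, _, _}
     offset 1:  va' = {b+d, _, _, _}    va = {a+b+c+d, _, _, _}
   so the final scalar result ends up in element 0, which is what the
   BIT_FIELD_REF extraction below reads.  */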
6123 tree rhs;
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_NOTE, vect_location,
6127 "Reduce using vector shifts\n");
6129 gimple_seq stmts = NULL;
6130 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6131 for (elt_offset = nelements / 2;
6132 elt_offset >= 1;
6133 elt_offset /= 2)
6135 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6136 indices.new_vector (sel, 2, nelements);
6137 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6138 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6139 new_temp, zero_vec, mask);
6140 new_temp = gimple_build (&stmts, code,
6141 vectype1, new_name, new_temp);
6143 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6145 /* 2.4 Extract the final scalar result. Create:
6146 s_out3 = extract_field <v_out2, bitpos> */
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_NOTE, vect_location,
6150 "extract scalar result\n");
6152 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6153 bitsize, bitsize_zero_node);
6154 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6155 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6156 gimple_assign_set_lhs (epilog_stmt, new_temp);
6157 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6158 scalar_results.safe_push (new_temp);
6160 else
6162 /* Case 3: Create:
6163 s = extract_field <v_out2, 0>
6164 for (offset = element_size;
6165 offset < vector_size;
6166 offset += element_size;)
6168 Create: s' = extract_field <v_out2, offset>
6169 Create: s = op <s, s'> // For non SLP cases
6170 } */
6172 if (dump_enabled_p ())
6173 dump_printf_loc (MSG_NOTE, vect_location,
6174 "Reduce using scalar code.\n");
6176 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6177 int element_bitsize = tree_to_uhwi (bitsize);
6178 tree compute_type = TREE_TYPE (vectype);
6179 gimple_seq stmts = NULL;
6180 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6182 int bit_offset;
6183 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6184 vec_temp, bitsize, bitsize_zero_node);
6186 /* In SLP we don't need to apply the reduction operation, so we just
6187 collect the s' values in SCALAR_RESULTS. */
6188 if (slp_reduc)
6189 scalar_results.safe_push (new_temp);
6191 for (bit_offset = element_bitsize;
6192 bit_offset < vec_size_in_bits;
6193 bit_offset += element_bitsize)
6195 tree bitpos = bitsize_int (bit_offset);
6196 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6197 compute_type, vec_temp,
6198 bitsize, bitpos);
6199 if (slp_reduc)
6201 /* In SLP we don't need to apply the reduction operation, so
6202 we just collect the s' values in SCALAR_RESULTS. */
6203 new_temp = new_name;
6204 scalar_results.safe_push (new_name);
6206 else
6207 new_temp = gimple_build (&stmts, code, compute_type,
6208 new_name, new_temp);
6212 /* The only case where we need to reduce scalar results in SLP is
6213 unrolling. If the size of SCALAR_RESULTS is greater than
6214 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6215 REDUC_GROUP_SIZE. */
6216 if (slp_reduc)
6218 tree res, first_res, new_res;
6220 /* Reduce multiple scalar results in case of SLP unrolling. */
6221 for (j = group_size; scalar_results.iterate (j, &res);
6222 j++)
6224 first_res = scalar_results[j % group_size];
6225 new_res = gimple_build (&stmts, code, compute_type,
6226 first_res, res);
6227 scalar_results[j % group_size] = new_res;
6229 scalar_results.truncate (group_size);
6230 for (k = 0; k < group_size; k++)
6231 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6232 scalar_results[k]);
6234 else
6236 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6237 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6238 scalar_results.safe_push (new_temp);
6241 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6244 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6245 && induc_val)
6247 /* Earlier we set the initial value to be a vector of induc_val
6248 values. Check the result, and if it is induc_val then replace it
6249 with the original initial value, unless induc_val is
6250 already the same as initial_def. */
6251 tree zcompare = make_ssa_name (boolean_type_node);
6252 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6253 induc_val);
6254 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6255 tree initial_def = reduc_info->reduc_initial_values[0];
6256 tree tmp = make_ssa_name (new_scalar_dest);
6257 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6258 initial_def, new_temp);
6259 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6260 scalar_results[0] = tmp;
6264 /* 2.5 Adjust the final result by the initial value of the reduction
6265 variable. (When such adjustment is not needed, then
6266 'adjustment_def' is zero). For example, if code is PLUS we create:
6267 new_temp = loop_exit_def + adjustment_def */
6269 if (adjustment_def)
6271 gcc_assert (!slp_reduc);
6272 gimple_seq stmts = NULL;
6273 if (double_reduc)
6275 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6276 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6277 new_temp = gimple_build (&stmts, code, vectype,
6278 reduc_inputs[0], adjustment_def);
6280 else
6282 new_temp = scalar_results[0];
6283 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6284 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6285 new_temp = gimple_build (&stmts, code, scalar_type,
6286 new_temp, adjustment_def);
6289 epilog_stmt = gimple_seq_last_stmt (stmts);
6290 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6291 scalar_results[0] = new_temp;
6294 /* Record this operation if it could be reused by the epilogue loop. */
6295 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6296 && reduc_inputs.length () == 1)
6297 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6298 { orig_reduc_input, reduc_info });
6300 if (double_reduc)
6301 loop = outer_loop;
6303 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6304 phis with new adjusted scalar results, i.e., replace use <s_out0>
6305 with use <s_out4>.
6307 Transform:
6308 loop_exit:
6309 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6310 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6311 v_out2 = reduce <v_out1>
6312 s_out3 = extract_field <v_out2, 0>
6313 s_out4 = adjust_result <s_out3>
6314 use <s_out0>
6315 use <s_out0>
6317 into:
6319 loop_exit:
6320 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6321 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6322 v_out2 = reduce <v_out1>
6323 s_out3 = extract_field <v_out2, 0>
6324 s_out4 = adjust_result <s_out3>
6325 use <s_out4>
6326 use <s_out4> */
6328 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6329 for (k = 0; k < live_out_stmts.size (); k++)
6331 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6332 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6334 phis.create (3);
6335 /* Find the loop-closed-use at the loop exit of the original scalar
6336 result. (The reduction result is expected to have two immediate uses,
6337 one at the latch block, and one at the loop exit). For double
6338 reductions we are looking for exit phis of the outer loop. */
6339 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6341 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6343 if (!is_gimple_debug (USE_STMT (use_p)))
6344 phis.safe_push (USE_STMT (use_p));
6346 else
6348 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6350 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6352 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6354 if (!flow_bb_inside_loop_p (loop,
6355 gimple_bb (USE_STMT (phi_use_p)))
6356 && !is_gimple_debug (USE_STMT (phi_use_p)))
6357 phis.safe_push (USE_STMT (phi_use_p));
6363 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6365 /* Replace the uses: */
6366 orig_name = PHI_RESULT (exit_phi);
6368 /* Look for a single use at the target of the skip edge. */
6369 if (unify_with_main_loop_p)
6371 use_operand_p use_p;
6372 gimple *user;
6373 if (!single_imm_use (orig_name, &use_p, &user))
6374 gcc_unreachable ();
6375 orig_name = gimple_get_lhs (user);
6378 scalar_result = scalar_results[k];
6379 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6381 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6382 SET_USE (use_p, scalar_result);
6383 update_stmt (use_stmt);
6387 phis.release ();
6391 /* Return a vector of type VECTYPE that is equal to the vector select
6392 operation "MASK ? VEC : IDENTITY". Insert the select statements
6393 before GSI. */
6395 static tree
6396 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6397 tree vec, tree identity)
6399 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6400 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6401 mask, vec, identity);
6402 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6403 return cond;
6406 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6407 order, starting with LHS. Insert the extraction statements before GSI and
6408 associate the new scalar SSA names with variable SCALAR_DEST.
6409 Return the SSA name for the result. */
6411 static tree
6412 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6413 tree_code code, tree lhs, tree vector_rhs)
6415 tree vectype = TREE_TYPE (vector_rhs);
6416 tree scalar_type = TREE_TYPE (vectype);
6417 tree bitsize = TYPE_SIZE (scalar_type);
6418 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6419 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6421 for (unsigned HOST_WIDE_INT bit_offset = 0;
6422 bit_offset < vec_size_in_bits;
6423 bit_offset += element_bitsize)
6425 tree bitpos = bitsize_int (bit_offset);
6426 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6427 bitsize, bitpos);
6429 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6430 rhs = make_ssa_name (scalar_dest, stmt);
6431 gimple_assign_set_lhs (stmt, rhs);
6432 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6434 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6435 tree new_name = make_ssa_name (scalar_dest, stmt);
6436 gimple_assign_set_lhs (stmt, new_name);
6437 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6438 lhs = new_name;
6440 return lhs;
6443 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6444 type of the vector input. */
6446 static internal_fn
6447 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6449 internal_fn mask_reduc_fn;
6451 switch (reduc_fn)
6453 case IFN_FOLD_LEFT_PLUS:
6454 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6455 break;
6457 default:
6458 return IFN_LAST;
6461 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6462 OPTIMIZE_FOR_SPEED))
6463 return mask_reduc_fn;
6464 return IFN_LAST;
6467 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6468 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6469 statement. CODE is the operation performed by STMT_INFO and OPS are
6470 its scalar operands. REDUC_INDEX is the index of the operand in
6471 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6472 implements in-order reduction, or IFN_LAST if we should open-code it.
6473 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6474 that should be used to control the operation in a fully-masked loop. */
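/* Editorial sketch (assumptions, not from the original comment): for an
   in-order PLUS reduction of a 4-element vector v into accumulator acc,
   the generated sequence is equivalent to the scalar code

     acc = (((acc + v[0]) + v[1]) + v[2]) + v[3];

   i.e. the strict left-to-right association of the original scalar loop
   is preserved, which matters for non-reassociable FP reductions.  */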
6476 static bool
6477 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6478 stmt_vec_info stmt_info,
6479 gimple_stmt_iterator *gsi,
6480 gimple **vec_stmt, slp_tree slp_node,
6481 gimple *reduc_def_stmt,
6482 tree_code code, internal_fn reduc_fn,
6483 tree ops[3], tree vectype_in,
6484 int reduc_index, vec_loop_masks *masks)
6486 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6487 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6488 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6490 int ncopies;
6491 if (slp_node)
6492 ncopies = 1;
6493 else
6494 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6496 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6497 gcc_assert (ncopies == 1);
6498 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6500 if (slp_node)
6501 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6502 TYPE_VECTOR_SUBPARTS (vectype_in)));
6504 tree op0 = ops[1 - reduc_index];
6506 int group_size = 1;
6507 stmt_vec_info scalar_dest_def_info;
6508 auto_vec<tree> vec_oprnds0;
6509 if (slp_node)
6511 auto_vec<vec<tree> > vec_defs (2);
6512 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6513 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6514 vec_defs[0].release ();
6515 vec_defs[1].release ();
6516 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6517 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6519 else
6521 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6522 op0, &vec_oprnds0);
6523 scalar_dest_def_info = stmt_info;
6526 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6527 tree scalar_type = TREE_TYPE (scalar_dest);
6528 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6530 int vec_num = vec_oprnds0.length ();
6531 gcc_assert (vec_num == 1 || slp_node);
6532 tree vec_elem_type = TREE_TYPE (vectype_out);
6533 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6535 tree vector_identity = NULL_TREE;
6536 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6537 vector_identity = build_zero_cst (vectype_out);
6539 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6540 int i;
6541 tree def0;
6542 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6544 gimple *new_stmt;
6545 tree mask = NULL_TREE;
6546 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6547 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6549 /* Handle MINUS by adding the negative. */
6550 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6552 tree negated = make_ssa_name (vectype_out);
6553 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6554 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6555 def0 = negated;
6558 if (mask && mask_reduc_fn == IFN_LAST)
6559 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6560 vector_identity);
6562 /* On the first iteration the input is simply the scalar phi
6563 result, and for subsequent iterations it is the output of
6564 the preceding operation. */
6565 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6567 if (mask && mask_reduc_fn != IFN_LAST)
6568 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6569 def0, mask);
6570 else
6571 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6572 def0);
6573 /* For chained SLP reductions the output of the previous reduction
6574 operation serves as the input of the next. For the final statement
6575 the output cannot be a temporary - we reuse the original
6576 scalar destination of the last statement. */
6577 if (i != vec_num - 1)
6579 gimple_set_lhs (new_stmt, scalar_dest_var);
6580 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6581 gimple_set_lhs (new_stmt, reduc_var);
6584 else
6586 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6587 reduc_var, def0);
6588 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6589 /* Remove the statement, so that we can use the same code paths
6590 as for statements that we've just created. */
6591 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6592 gsi_remove (&tmp_gsi, true);
6595 if (i == vec_num - 1)
6597 gimple_set_lhs (new_stmt, scalar_dest);
6598 vect_finish_replace_stmt (loop_vinfo,
6599 scalar_dest_def_info,
6600 new_stmt);
6602 else
6603 vect_finish_stmt_generation (loop_vinfo,
6604 scalar_dest_def_info,
6605 new_stmt, gsi);
6607 if (slp_node)
6608 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6609 else
6611 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6612 *vec_stmt = new_stmt;
6616 return true;
6619 /* Function is_nonwrapping_integer_induction.
6621 Check that STMT_VINFO (which is part of loop LOOP) is an induction
6622 that increments and does not cause overflow. */
6624 static bool
6625 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6627 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6628 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6629 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6630 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6631 widest_int ni, max_loop_value, lhs_max;
6632 wi::overflow_type overflow = wi::OVF_NONE;
6634 /* Make sure the loop is integer based. */
6635 if (TREE_CODE (base) != INTEGER_CST
6636 || TREE_CODE (step) != INTEGER_CST)
6637 return false;
6639 /* Check that the max size of the loop will not wrap. */
6641 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6642 return true;
6644 if (! max_stmt_executions (loop, &ni))
6645 return false;
6647 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6648 &overflow);
6649 if (overflow)
6650 return false;
6652 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6653 TYPE_SIGN (lhs_type), &overflow);
6654 if (overflow)
6655 return false;
6657 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6658 <= TYPE_PRECISION (lhs_type));
6661 /* Check if masking can be supported by inserting a conditional expression.
6662 CODE is the code for the operation. COND_FN is the conditional internal
6663 function, if it exists. VECTYPE_IN is the type of the vector input. */
6664 static bool
6665 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6666 tree vectype_in)
6668 if (cond_fn != IFN_LAST
6669 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6670 OPTIMIZE_FOR_SPEED))
6671 return false;
6673 if (code.is_tree_code ())
6674 switch (tree_code (code))
6676 case DOT_PROD_EXPR:
6677 case SAD_EXPR:
6678 return true;
6680 default:
6681 break;
6683 return false;
6686 /* Insert a conditional expression to enable masked vectorization. CODE is the
6687 code for the operation. VOP is the array of operands. MASK is the loop
6688 mask. GSI is a statement iterator used to place the new conditional
6689 expression. */
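/* As an illustrative sketch, per element of the narrow input vector the
   selects built below behave like (op0, op1 and acc standing for VOP[0],
   VOP[1] and the accumulator VOP[2]):

     DOT_PROD_EXPR:  acc += op0 * (mask ? op1 : 0);
     SAD_EXPR:       acc += ABS (op0 - (mask ? op1 : op0));

   so elements with a zero mask bit contribute nothing to the
   reduction.  */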
6690 static void
6691 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6692 gimple_stmt_iterator *gsi)
6694 switch (tree_code (code))
6696 case DOT_PROD_EXPR:
6698 tree vectype = TREE_TYPE (vop[1]);
6699 tree zero = build_zero_cst (vectype);
6700 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6701 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6702 mask, vop[1], zero);
6703 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6704 vop[1] = masked_op1;
6705 break;
6708 case SAD_EXPR:
6710 tree vectype = TREE_TYPE (vop[1]);
6711 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6712 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6713 mask, vop[1], vop[0]);
6714 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6715 vop[1] = masked_op1;
6716 break;
6719 default:
6720 gcc_unreachable ();
6724 /* Function vectorizable_reduction.
6726 Check if STMT_INFO performs a reduction operation that can be vectorized.
6727 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6728 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6729 Return true if STMT_INFO is vectorizable in this way.
6731 This function also handles reduction idioms (patterns) that have been
6732 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6733 may be of this form:
6734 X = pattern_expr (arg0, arg1, ..., X)
6735 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6736 sequence that had been detected and replaced by the pattern-stmt
6737 (STMT_INFO).
6739 This function also handles reduction of condition expressions, for example:
6740 for (int i = 0; i < N; i++)
6741 if (a[i] < value)
6742 last = a[i];
6743 This is handled by vectorising the loop and creating an additional vector
6744 containing the loop indexes for which "a[i] < value" was true. In the
6745 function epilogue this is reduced to a single max value and then used to
6746 index into the vector of results.
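For example (an illustrative sketch of the scheme): each vector lane
additionally tracks the largest (1-based) iteration index at which its
condition was true, or 0 if it never was; the epilogue reduces this
index vector with a maximum and uses the result to pick the
corresponding element from the vector of results.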
6748 In some cases of reduction patterns, the type of the reduction variable X is
6749 different than the type of the other arguments of STMT_INFO.
6750 In such cases, the vectype that is used when transforming STMT_INFO into
6751 a vector stmt is different than the vectype that is used to determine the
6752 vectorization factor, because it consists of a different number of elements
6753 than the actual number of elements that are being operated upon in parallel.
6755 For example, consider an accumulation of shorts into an int accumulator.
6756 On some targets it's possible to vectorize this pattern operating on 8
6757 shorts at a time (hence, the vectype for purposes of determining the
6758 vectorization factor should be V8HI); on the other hand, the vectype that
6759 is used to create the vector form is actually V4SI (the type of the result).
6761 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6762 indicates what is the actual level of parallelism (V8HI in the example), so
6763 that the right vectorization factor would be derived. This vectype
6764 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6765 be used to create the vectorized stmt. The right vectype for the vectorized
6766 stmt is obtained from the type of the result X:
6767 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6769 This means that, contrary to "regular" reductions (or "regular" stmts in
6770 general), the following equation:
6771 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6772 does *NOT* necessarily hold for reduction patterns. */
6774 bool
6775 vectorizable_reduction (loop_vec_info loop_vinfo,
6776 stmt_vec_info stmt_info, slp_tree slp_node,
6777 slp_instance slp_node_instance,
6778 stmt_vector_for_cost *cost_vec)
6780 tree vectype_in = NULL_TREE;
6781 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6782 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6783 stmt_vec_info cond_stmt_vinfo = NULL;
6784 int i;
6785 int ncopies;
6786 bool single_defuse_cycle = false;
6787 bool nested_cycle = false;
6788 bool double_reduc = false;
6789 int vec_num;
6790 tree tem;
6791 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6792 tree cond_reduc_val = NULL_TREE;
6794 /* Make sure it was already recognized as a reduction computation. */
6795 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6796 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6797 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6798 return false;
6800 /* The stmt we store reduction analysis meta on. */
6801 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6802 reduc_info->is_reduc_info = true;
6804 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6806 if (is_a <gphi *> (stmt_info->stmt))
6808 if (slp_node)
6810 /* We eventually need to set a vector type on invariant
6811 arguments. */
6812 unsigned j;
6813 slp_tree child;
6814 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6815 if (!vect_maybe_update_slp_op_vectype
6816 (child, SLP_TREE_VECTYPE (slp_node)))
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820 "incompatible vector types for "
6821 "invariants\n");
6822 return false;
6825 /* Analysis for double-reduction is done on the outer
6826 loop PHI, nested cycles have no further restrictions. */
6827 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6829 else
6830 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6831 return true;
6834 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6835 stmt_vec_info phi_info = stmt_info;
6836 if (!is_a <gphi *> (stmt_info->stmt))
6838 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6839 return true;
6841 if (slp_node)
6843 slp_node_instance->reduc_phis = slp_node;
6844 /* ??? We're leaving slp_node to point to the PHIs, we only
6845 need it to get at the number of vector stmts which wasn't
6846 yet initialized for the instance root. */
6848 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6850 use_operand_p use_p;
6851 gimple *use_stmt;
6852 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6853 &use_p, &use_stmt);
6854 gcc_assert (res);
6855 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6858 /* PHIs should not participate in patterns. */
6859 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6860 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6862 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6863 and compute the reduction chain length. Discover the real
6864 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6865 tree reduc_def
6866 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6867 loop_latch_edge
6868 (gimple_bb (reduc_def_phi)->loop_father));
6869 unsigned reduc_chain_length = 0;
6870 bool only_slp_reduc_chain = true;
6871 stmt_info = NULL;
6872 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6873 while (reduc_def != PHI_RESULT (reduc_def_phi))
6875 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6876 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6877 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "reduction chain broken by patterns.\n");
6882 return false;
6884 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6885 only_slp_reduc_chain = false;
6886 /* For epilogue generation live members of the chain need
6887 to point back to the PHI via their original stmt for
6888 info_for_reduction to work. For SLP we need to look at
6889 all lanes here - even though we only will vectorize from
6890 the SLP node with live lane zero the other live lanes also
6891 need to be identified as part of a reduction to be able
6892 to skip code generation for them. */
6893 if (slp_for_stmt_info)
6895 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6896 if (STMT_VINFO_LIVE_P (s))
6897 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6899 else if (STMT_VINFO_LIVE_P (vdef))
6900 STMT_VINFO_REDUC_DEF (def) = phi_info;
6901 gimple_match_op op;
6902 if (!gimple_extract_op (vdef->stmt, &op))
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "reduction chain includes unsupported"
6907 " statement type.\n");
6908 return false;
6910 if (CONVERT_EXPR_CODE_P (op.code))
6912 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6914 if (dump_enabled_p ())
6915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6916 "conversion in the reduction chain.\n");
6917 return false;
6920 else if (!stmt_info)
6921 /* First non-conversion stmt. */
6922 stmt_info = vdef;
6923 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6924 reduc_chain_length++;
6925 if (!stmt_info && slp_node)
6926 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6928 /* PHIs should not participate in patterns. */
6929 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6931 if (nested_in_vect_loop_p (loop, stmt_info))
6933 loop = loop->inner;
6934 nested_cycle = true;
6937 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6938 element. */
6939 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6941 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6942 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6944 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6945 gcc_assert (slp_node
6946 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6948 /* 1. Is vectorizable reduction? */
6949 /* Not supportable if the reduction variable is used in the loop, unless
6950 it's a reduction chain. */
6951 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6952 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6953 return false;
6955 /* Reductions that are not used even in an enclosing outer-loop,
6956 are expected to be "live" (used out of the loop). */
6957 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6958 && !STMT_VINFO_LIVE_P (stmt_info))
6959 return false;
6961 /* 2. Has this been recognized as a reduction pattern?
6963 Check if STMT represents a pattern that has been recognized
6964 in earlier analysis stages. For stmts that represent a pattern,
6965 the STMT_VINFO_RELATED_STMT field records the last stmt in
6966 the original sequence that constitutes the pattern. */
6968 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6969 if (orig_stmt_info)
6971 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6972 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6975 /* 3. Check the operands of the operation. The first operands are defined
6976 inside the loop body. The last operand is the reduction variable,
6977 which is defined by the loop-header-phi. */
6979 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6980 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6981 gimple_match_op op;
6982 if (!gimple_extract_op (stmt_info->stmt, &op))
6983 gcc_unreachable ();
6984 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6985 || op.code == WIDEN_SUM_EXPR
6986 || op.code == SAD_EXPR);
6988 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6989 && !SCALAR_FLOAT_TYPE_P (op.type))
6990 return false;
6992 /* Do not try to vectorize bit-precision reductions. */
6993 if (!type_has_mode_precision_p (op.type))
6994 return false;
6996 /* For lane-reducing ops we're reducing the number of reduction PHIs
6997 which means the only use of that may be in the lane-reducing operation. */
6998 if (lane_reduc_code_p
6999 && reduc_chain_length != 1
7000 && !only_slp_reduc_chain)
7002 if (dump_enabled_p ())
7003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7004 "lane-reducing reduction with extra stmts.\n");
7005 return false;
7008 /* All uses but the last are expected to be defined in the loop.
7009 The last use is the reduction variable. In case of nested cycle this
7010 assumption is not true: we use reduc_index to record the index of the
7011 reduction variable. */
7012 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7013 /* We need to skip an extra operand for COND_EXPRs with embedded
7014 comparison. */
7015 unsigned opno_adjust = 0;
7016 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7017 opno_adjust = 1;
7018 for (i = 0; i < (int) op.num_ops; i++)
7020 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7021 if (i == 0 && op.code == COND_EXPR)
7022 continue;
7024 stmt_vec_info def_stmt_info;
7025 enum vect_def_type dt;
7026 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7027 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7028 &tem, &def_stmt_info))
7030 if (dump_enabled_p ())
7031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7032 "use not simple.\n");
7033 return false;
7035 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7036 continue;
7038 /* There should be only one cycle def in the stmt, the one
7039 leading to reduc_def. */
7040 if (VECTORIZABLE_CYCLE_DEF (dt))
7041 return false;
7043 /* To properly compute ncopies we are interested in the widest
7044 non-reduction input type in case we're looking at a widening
7045 accumulation that we later handle in vect_transform_reduction. */
7046 if (lane_reduc_code_p
7047 && tem
7048 && (!vectype_in
7049 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7050 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
7051 vectype_in = tem;
7053 if (op.code == COND_EXPR)
7055 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7056 if (dt == vect_constant_def)
7058 cond_reduc_dt = dt;
7059 cond_reduc_val = op.ops[i];
7061 if (dt == vect_induction_def
7062 && def_stmt_info
7063 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7065 cond_reduc_dt = dt;
7066 cond_stmt_vinfo = def_stmt_info;
7070 if (!vectype_in)
7071 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7072 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7074 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7075 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7076 /* If we have a condition reduction, see if we can simplify it further. */
7077 if (v_reduc_type == COND_REDUCTION)
7079 if (slp_node)
7080 return false;
7082 /* When the condition uses the reduction value in the condition, fail. */
7083 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7085 if (dump_enabled_p ())
7086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7087 "condition depends on previous iteration\n");
7088 return false;
7091 if (reduc_chain_length == 1
7092 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7093 vectype_in, OPTIMIZE_FOR_SPEED))
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097 "optimizing condition reduction with"
7098 " FOLD_EXTRACT_LAST.\n");
7099 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7101 else if (cond_reduc_dt == vect_induction_def)
7103 tree base
7104 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7105 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7107 gcc_assert (TREE_CODE (base) == INTEGER_CST
7108 && TREE_CODE (step) == INTEGER_CST);
7109 cond_reduc_val = NULL_TREE;
7110 enum tree_code cond_reduc_op_code = ERROR_MARK;
7111 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7112 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7114 /* Find a suitable value: below base for MAX_EXPR, above base for
7115 MIN_EXPR; punt for now if base is the minimum value of the type
7116 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
7117 else if (tree_int_cst_sgn (step) == -1)
7119 cond_reduc_op_code = MIN_EXPR;
7120 if (tree_int_cst_sgn (base) == -1)
7121 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7122 else if (tree_int_cst_lt (base,
7123 TYPE_MAX_VALUE (TREE_TYPE (base))))
7124 cond_reduc_val
7125 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7127 else
7129 cond_reduc_op_code = MAX_EXPR;
7130 if (tree_int_cst_sgn (base) == 1)
7131 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7132 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7133 base))
7134 cond_reduc_val
7135 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7137 if (cond_reduc_val)
7139 if (dump_enabled_p ())
7140 dump_printf_loc (MSG_NOTE, vect_location,
7141 "condition expression based on "
7142 "integer induction.\n");
7143 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7144 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7145 = cond_reduc_val;
7146 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7149 else if (cond_reduc_dt == vect_constant_def)
7151 enum vect_def_type cond_initial_dt;
7152 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7153 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7154 if (cond_initial_dt == vect_constant_def
7155 && types_compatible_p (TREE_TYPE (cond_initial_val),
7156 TREE_TYPE (cond_reduc_val)))
7158 tree e = fold_binary (LE_EXPR, boolean_type_node,
7159 cond_initial_val, cond_reduc_val);
7160 if (e && (integer_onep (e) || integer_zerop (e)))
7162 if (dump_enabled_p ())
7163 dump_printf_loc (MSG_NOTE, vect_location,
7164 "condition expression based on "
7165 "compile time constant.\n");
7166 /* Record reduction code at analysis stage. */
7167 STMT_VINFO_REDUC_CODE (reduc_info)
7168 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7169 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7175 if (STMT_VINFO_LIVE_P (phi_info))
7176 return false;
7178 if (slp_node)
7179 ncopies = 1;
7180 else
7181 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7183 gcc_assert (ncopies >= 1);
7185 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7187 if (nested_cycle)
7189 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7190 == vect_double_reduction_def);
7191 double_reduc = true;
7194 /* 4.2. Check support for the epilog operation.
7196 If STMT represents a reduction pattern, then the type of the
7197 reduction variable may be different than the type of the rest
7198 of the arguments. For example, consider the case of accumulation
7199 of shorts into an int accumulator. The original code:
7200 S1: int_a = (int) short_a;
7201 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7203 was replaced with:
7204 STMT: int_acc = widen_sum <short_a, int_acc>
7206 This means that:
7207 1. The tree-code that is used to create the vector operation in the
7208 epilog code (that reduces the partial results) is not the
7209 tree-code of STMT, but is rather the tree-code of the original
7210 stmt from the pattern that STMT is replacing. I.e, in the example
7211 above we want to use 'widen_sum' in the loop, but 'plus' in the
7212 epilog.
7213 2. The type (mode) we use to check available target support
7214 for the vector operation to be created in the *epilog*, is
7215 determined by the type of the reduction variable (in the example
7216 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
7217 However the type (mode) we use to check available target support
7218 for the vector operation to be created *inside the loop*, is
7219 determined by the type of the other arguments to STMT (in the
7220 example we'd check this: optab_handler (widen_sum_optab,
7221 vect_short_mode)).
7223 This is contrary to "regular" reductions, in which the types of all
7224 the arguments are the same as the type of the reduction variable.
7225 For "regular" reductions we can therefore use the same vector type
7226 (and also the same tree-code) when generating the epilog code and
7227 when generating the code inside the loop. */
7229 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7230 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7232 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7233 if (reduction_type == TREE_CODE_REDUCTION)
7235 /* Check whether it's ok to change the order of the computation.
7236 Generally, when vectorizing a reduction we change the order of the
7237 computation. This may change the behavior of the program in some
7238 cases, so we need to check that this is ok. One exception is when
7239 vectorizing an outer-loop: the inner-loop is executed sequentially,
7240 and therefore vectorizing reductions in the inner-loop during
7241 outer-loop vectorization is safe. Likewise when we are vectorizing
7242 a series of reductions using SLP and the VF is one, the reductions
7243 are performed in scalar order. */
7244 if (slp_node
7245 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7246 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7248 else if (needs_fold_left_reduction_p (op.type, orig_code))
7250 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7251 is not directly used in stmt. */
7252 if (!only_slp_reduc_chain
7253 && reduc_chain_length != 1)
7255 if (dump_enabled_p ())
7256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7257 "in-order reduction chain without SLP.\n");
7258 return false;
7260 STMT_VINFO_REDUC_TYPE (reduc_info)
7261 = reduction_type = FOLD_LEFT_REDUCTION;
7263 else if (!commutative_binary_op_p (orig_code, op.type)
7264 || !associative_binary_op_p (orig_code, op.type))
7266 if (dump_enabled_p ())
7267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7268 "reduction: not commutative/associative");
7269 return false;
7273 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7274 && ncopies > 1)
7276 if (dump_enabled_p ())
7277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7278 "multiple types in double reduction or condition "
7279 "reduction or fold-left reduction.\n");
7280 return false;
7283 internal_fn reduc_fn = IFN_LAST;
7284 if (reduction_type == TREE_CODE_REDUCTION
7285 || reduction_type == FOLD_LEFT_REDUCTION
7286 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7287 || reduction_type == CONST_COND_REDUCTION)
7289 if (reduction_type == FOLD_LEFT_REDUCTION
7290 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7291 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7293 if (reduc_fn != IFN_LAST
7294 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7295 OPTIMIZE_FOR_SPEED))
7297 if (dump_enabled_p ())
7298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299 "reduc op not supported by target.\n");
7301 reduc_fn = IFN_LAST;
7304 else
7306 if (!nested_cycle || double_reduc)
7308 if (dump_enabled_p ())
7309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7310 "no reduc code for scalar code.\n");
7312 return false;
7316 else if (reduction_type == COND_REDUCTION)
7318 int scalar_precision
7319 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7320 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7321 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7322 vectype_out);
7324 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7325 OPTIMIZE_FOR_SPEED))
7326 reduc_fn = IFN_REDUC_MAX;
7328 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7330 if (reduction_type != EXTRACT_LAST_REDUCTION
7331 && (!nested_cycle || double_reduc)
7332 && reduc_fn == IFN_LAST
7333 && !nunits_out.is_constant ())
7335 if (dump_enabled_p ())
7336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7337 "missing target support for reduction on"
7338 " variable-length vectors.\n");
7339 return false;
7342 /* For SLP reductions, see if there is a neutral value we can use. */
7343 tree neutral_op = NULL_TREE;
7344 if (slp_node)
7346 tree initial_value = NULL_TREE;
7347 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7348 initial_value = vect_phi_initial_value (reduc_def_phi);
7349 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7350 orig_code, initial_value);
7353 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7355 /* We can't support in-order reductions of code such as this:
7357 for (int i = 0; i < n1; ++i)
7358 for (int j = 0; j < n2; ++j)
7359 l += a[j];
7361 since GCC effectively transforms the loop when vectorizing:
7363 for (int i = 0; i < n1 / VF; ++i)
7364 for (int j = 0; j < n2; ++j)
7365 for (int k = 0; k < VF; ++k)
7366 l += a[j];
7368 which is a reassociation of the original operation. */
7369 if (dump_enabled_p ())
7370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371 "in-order double reduction not supported.\n");
7373 return false;
7376 if (reduction_type == FOLD_LEFT_REDUCTION
7377 && slp_node
7378 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7380 /* We cannot use in-order reductions in this case because there is
7381 an implicit reassociation of the operations involved. */
7382 if (dump_enabled_p ())
7383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7384 "in-order unchained SLP reductions not supported.\n");
7385 return false;
7388 /* For double reductions, and for SLP reductions with a neutral value,
7389 we construct a variable-length initial vector by loading a vector
7390 full of the neutral value and then shift-and-inserting the start
7391 values into the low-numbered elements. */
7392 if ((double_reduc || neutral_op)
7393 && !nunits_out.is_constant ()
7394 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7395 vectype_out, OPTIMIZE_FOR_SPEED))
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399 "reduction on variable-length vectors requires"
7400 " target support for a vector-shift-and-insert"
7401 " operation.\n");
7402 return false;
7405 /* Check extra constraints for variable-length unchained SLP reductions. */
7406 if (STMT_SLP_TYPE (stmt_info)
7407 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7408 && !nunits_out.is_constant ())
7410 /* We checked above that we could build the initial vector when
7411 there's a neutral element value. Check here for the case in
7412 which each SLP statement has its own initial value and in which
7413 that value needs to be repeated for every instance of the
7414 statement within the initial vector. */
7415 unsigned int group_size = SLP_TREE_LANES (slp_node);
7416 if (!neutral_op
7417 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7418 TREE_TYPE (vectype_out)))
7420 if (dump_enabled_p ())
7421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7422 "unsupported form of SLP reduction for"
7423 " variable-length vectors: cannot build"
7424 " initial vector.\n");
7425 return false;
7427 /* The epilogue code relies on the number of elements being a multiple
7428 of the group size. The duplicate-and-interleave approach to setting
7429 up the initial vector does too. */
7430 if (!multiple_p (nunits_out, group_size))
7432 if (dump_enabled_p ())
7433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7434 "unsupported form of SLP reduction for"
7435 " variable-length vectors: the vector size"
7436 " is not a multiple of the number of results.\n");
7437 return false;
7441 if (reduction_type == COND_REDUCTION)
7443 widest_int ni;
7445 if (! max_loop_iterations (loop, &ni))
7447 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_NOTE, vect_location,
7449 "loop count not known, cannot create cond "
7450 "reduction.\n");
7451 return false;
7453 /* Convert backedges to iterations. */
7454 ni += 1;
7456 /* The additional index will be the same type as the condition. Check
7457 that the loop iteration count fits into this type less one (because
7458 we'll use up the zero slot for when there are no matches). */
7459 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7460 if (wi::geu_p (ni, wi::to_widest (max_index)))
7462 if (dump_enabled_p ())
7463 dump_printf_loc (MSG_NOTE, vect_location,
7464 "loop size is greater than data size.\n");
7465 return false;
7469 /* In case the vectorization factor (VF) is bigger than the number
7470 of elements that we can fit in a vectype (nunits), we have to generate
7471 more than one vector stmt, i.e. we need to "unroll" the
7472 vector stmt by a factor of VF/nunits. For more details see documentation
7473 in vectorizable_operation. */
7475 /* If the reduction is used in an outer loop we need to generate
7476 VF intermediate results, like so (e.g. for ncopies=2):
7477 r0 = phi (init, r0)
7478 r1 = phi (init, r1)
7479 r0 = x0 + r0;
7480 r1 = x1 + r1;
7481 (i.e. we generate VF results in 2 registers).
7482 In this case we have a separate def-use cycle for each copy, and therefore
7483 for each copy we get the vector def for the reduction variable from the
7484 respective phi node created for this copy.
7486 Otherwise (the reduction is unused in the loop nest), we can combine
7487 together intermediate results, like so (e.g. for ncopies=2):
7488 r = phi (init, r)
7489 r = x0 + r;
7490 r = x1 + r;
7491 (i.e. we generate VF/2 results in a single register).
7492 In this case for each copy we get the vector def for the reduction variable
7493 from the vectorized reduction operation generated in the previous iteration.
7495 This only works when we see both the reduction PHI and its only consumer
7496 in vectorizable_reduction and there are no intermediate stmts
7497 participating. When unrolling we want each unrolled iteration to have its
7498 own reduction accumulator since one of the main goals of unrolling a
7499 reduction is to reduce the aggregate loop-carried latency. */
7500 if (ncopies > 1
7501 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7502 && reduc_chain_length == 1
7503 && loop_vinfo->suggested_unroll_factor == 1)
7504 single_defuse_cycle = true;
7506 if (single_defuse_cycle || lane_reduc_code_p)
7508 gcc_assert (op.code != COND_EXPR);
7510 /* 4. Supportable by target? */
7511 bool ok = true;
7513 /* 4.1. check support for the operation in the loop
7515 This isn't necessary for the lane reduction codes, since they
7516 can only be produced by pattern matching, and it's up to the
7517 pattern matcher to test for support. The main reason for
7518 specifically skipping this step is to avoid rechecking whether
7519 mixed-sign dot-products can be implemented using signed
7520 dot-products. */
7521 machine_mode vec_mode = TYPE_MODE (vectype_in);
7522 if (!lane_reduc_code_p
7523 && !directly_supported_p (op.code, vectype_in, optab_vector))
7525 if (dump_enabled_p ())
7526 dump_printf (MSG_NOTE, "op not supported by target.\n");
7527 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7528 || !vect_can_vectorize_without_simd_p (op.code))
7529 ok = false;
7530 else
7531 if (dump_enabled_p ())
7532 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7535 if (vect_emulated_vector_p (vectype_in)
7536 && !vect_can_vectorize_without_simd_p (op.code))
7538 if (dump_enabled_p ())
7539 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7540 return false;
7543 /* lane-reducing operations have to go through vect_transform_reduction.
7544 For the other cases try without the single cycle optimization. */
7545 if (!ok)
7547 if (lane_reduc_code_p)
7548 return false;
7549 else
7550 single_defuse_cycle = false;
7553 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7555 /* If the reduction stmt is one of the patterns that have lane
7556 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7557 if ((ncopies > 1 && ! single_defuse_cycle)
7558 && lane_reduc_code_p)
7560 if (dump_enabled_p ())
7561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7562 "multi def-use cycle not possible for lane-reducing "
7563 "reduction operation\n");
7564 return false;
7567 if (slp_node
7568 && !(!single_defuse_cycle
7569 && !lane_reduc_code_p
7570 && reduction_type != FOLD_LEFT_REDUCTION))
7571 for (i = 0; i < (int) op.num_ops; i++)
7572 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 "incompatible vector types for invariants\n");
7577 return false;
7580 if (slp_node)
7581 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7582 else
7583 vec_num = 1;
7585 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7586 reduction_type, ncopies, cost_vec);
7587 /* Cost the reduction op inside the loop if transformed via
7588 vect_transform_reduction. Otherwise this is costed by the
7589 separate vectorizable_* routines. */
7590 if (single_defuse_cycle || lane_reduc_code_p)
7592 int factor = 1;
7593 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7594 /* Three dot-products and a subtraction. */
7595 factor = 4;
7596 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7597 stmt_info, 0, vect_body);
7600 if (dump_enabled_p ()
7601 && reduction_type == FOLD_LEFT_REDUCTION)
7602 dump_printf_loc (MSG_NOTE, vect_location,
7603 "using an in-order (fold-left) reduction.\n");
7604 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7605 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7606 reductions go through their own vectorizable_* routines. */
7607 if (!single_defuse_cycle
7608 && !lane_reduc_code_p
7609 && reduction_type != FOLD_LEFT_REDUCTION)
7611 stmt_vec_info tem
7612 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7613 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7615 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7616 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7618 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7619 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7621 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7623 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7624 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7626 if (reduction_type != FOLD_LEFT_REDUCTION
7627 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7628 && (cond_fn == IFN_LAST
7629 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7630 OPTIMIZE_FOR_SPEED)))
7632 if (dump_enabled_p ())
7633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7634 "can't operate on partial vectors because"
7635 " no conditional operation is available.\n");
7636 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7638 else if (reduction_type == FOLD_LEFT_REDUCTION
7639 && reduc_fn == IFN_LAST
7640 && !expand_vec_cond_expr_p (vectype_in,
7641 truth_type_for (vectype_in),
7642 SSA_NAME))
7644 if (dump_enabled_p ())
7645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646 "can't operate on partial vectors because"
7647 " no conditional operation is available.\n");
7648 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7650 else
7651 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7652 vectype_in, NULL);
7654 return true;
7657 /* STMT_INFO is a dot-product reduction whose multiplication operands
7658 have different signs. Emit a sequence to emulate the operation
7659 using a series of signed DOT_PROD_EXPRs and return the last
7660 statement generated. VEC_DEST is the result of the vector operation
7661 and VOP lists its inputs. */
7663 static gassign *
7664 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7665 gimple_stmt_iterator *gsi, tree vec_dest,
7666 tree vop[3])
7668 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7669 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7670 tree narrow_elttype = TREE_TYPE (narrow_vectype);
7671 gimple *new_stmt;
7673 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
7674 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7675 std::swap (vop[0], vop[1]);
7677 /* Convert all inputs to signed types. */
7678 for (int i = 0; i < 3; ++i)
7679 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7681 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7682 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7683 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7684 vop[i] = tmp;
7687 /* In the comments below we assume 8-bit inputs for simplicity,
7688 but the approach works for any full integer type. */
7690 /* Create a vector of -128. */
7691 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7692 tree min_narrow = build_vector_from_val (narrow_vectype,
7693 min_narrow_elttype);
7695 /* Create a vector of 64. */
7696 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7697 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7698 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7700 /* Emit: SUB_RES = VOP[0] - 128. */
7701 tree sub_res = make_ssa_name (narrow_vectype);
7702 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7703 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7705 /* Emit:
7707 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7708 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7709 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7711 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7712 Doing the two 64 * y steps first allows more time to compute x. */
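/* For example, with x = 200 and y = -3 the identity gives
   (200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600,
   which matches 200 * -3.  */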
7713 tree stage1 = make_ssa_name (wide_vectype);
7714 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7715 vop[1], half_narrow, vop[2]);
7716 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7718 tree stage2 = make_ssa_name (wide_vectype);
7719 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7720 vop[1], half_narrow, stage1);
7721 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7723 tree stage3 = make_ssa_name (wide_vectype);
7724 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7725 sub_res, vop[1], stage2);
7726 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7728 /* Convert STAGE3 to the reduction type. */
7729 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7732 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7733 value. */
7735 bool
7736 vect_transform_reduction (loop_vec_info loop_vinfo,
7737 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7738 gimple **vec_stmt, slp_tree slp_node)
7740 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7741 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7742 int i;
7743 int ncopies;
7744 int vec_num;
7746 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7747 gcc_assert (reduc_info->is_reduc_info);
7749 if (nested_in_vect_loop_p (loop, stmt_info))
7751 loop = loop->inner;
7752 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7755 gimple_match_op op;
7756 if (!gimple_extract_op (stmt_info->stmt, &op))
7757 gcc_unreachable ();
7759 /* All uses but the last are expected to be defined in the loop.
7760 The last use is the reduction variable. In case of nested cycle this
7761 assumption is not true: we use reduc_index to record the index of the
7762 reduction variable. */
7763 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7764 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7765 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7766 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7768 if (slp_node)
7770 ncopies = 1;
7771 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7773 else
7775 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7776 vec_num = 1;
7779 code_helper code = canonicalize_code (op.code, op.type);
7780 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
7781 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7782 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7784 /* Transform. */
7785 tree new_temp = NULL_TREE;
7786 auto_vec<tree> vec_oprnds0;
7787 auto_vec<tree> vec_oprnds1;
7788 auto_vec<tree> vec_oprnds2;
7789 tree def0;
7791 if (dump_enabled_p ())
7792 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7794 /* FORNOW: Multiple types are not supported for condition. */
7795 if (code == COND_EXPR)
7796 gcc_assert (ncopies == 1);
7798 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7800 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7801 if (reduction_type == FOLD_LEFT_REDUCTION)
7803 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7804 gcc_assert (code.is_tree_code ());
7805 return vectorize_fold_left_reduction
7806 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
7807 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
7810 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7811 gcc_assert (single_defuse_cycle
7812 || code == DOT_PROD_EXPR
7813 || code == WIDEN_SUM_EXPR
7814 || code == SAD_EXPR);
7816 /* Create the destination vector */
7817 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
7818 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7820 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7821 single_defuse_cycle && reduc_index == 0
7822 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7823 single_defuse_cycle && reduc_index == 1
7824 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7825 op.num_ops == 3
7826 && !(single_defuse_cycle && reduc_index == 2)
7827 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7828 if (single_defuse_cycle)
7830 gcc_assert (!slp_node);
7831 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7832 op.ops[reduc_index],
7833 reduc_index == 0 ? &vec_oprnds0
7834 : (reduc_index == 1 ? &vec_oprnds1
7835 : &vec_oprnds2));
7838 bool emulated_mixed_dot_prod
7839 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
7840 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7842 gimple *new_stmt;
7843 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7844 if (masked_loop_p && !mask_by_cond_expr)
7846 /* No conditional ifns have been defined for dot-product yet. */
7847 gcc_assert (code != DOT_PROD_EXPR);
7849 /* Make sure that the reduction accumulator is vop[0]. */
7850 if (reduc_index == 1)
7852 gcc_assert (commutative_binary_op_p (code, op.type));
7853 std::swap (vop[0], vop[1]);
7855 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7856 vectype_in, i);
7857 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7858 vop[0], vop[1], vop[0]);
7859 new_temp = make_ssa_name (vec_dest, call);
7860 gimple_call_set_lhs (call, new_temp);
7861 gimple_call_set_nothrow (call, true);
7862 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7863 new_stmt = call;
7865 else
7867 if (op.num_ops == 3)
7868 vop[2] = vec_oprnds2[i];
7870 if (masked_loop_p && mask_by_cond_expr)
7872 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7873 vectype_in, i);
7874 build_vect_cond_expr (code, vop, mask, gsi);
7877 if (emulated_mixed_dot_prod)
7878 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7879 vec_dest, vop);
7880 else if (code.is_internal_fn ())
7881 new_stmt = gimple_build_call_internal (internal_fn (code),
7882 op.num_ops,
7883 vop[0], vop[1], vop[2]);
7884 else
7885 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
7886 vop[0], vop[1], vop[2]);
7887 new_temp = make_ssa_name (vec_dest, new_stmt);
7888 gimple_set_lhs (new_stmt, new_temp);
7889 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7892 if (slp_node)
7893 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7894 else if (single_defuse_cycle
7895 && i < ncopies - 1)
7897 if (reduc_index == 0)
7898 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7899 else if (reduc_index == 1)
7900 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7901 else if (reduc_index == 2)
7902 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7904 else
7905 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7908 if (!slp_node)
7909 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7911 return true;
7914 /* Transform phase of a cycle PHI. */
7916 bool
7917 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7918 stmt_vec_info stmt_info, gimple **vec_stmt,
7919 slp_tree slp_node, slp_instance slp_node_instance)
7921 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7923 int i;
7924 int ncopies;
7925 int j;
7926 bool nested_cycle = false;
7927 int vec_num;
7929 if (nested_in_vect_loop_p (loop, stmt_info))
7931 loop = loop->inner;
7932 nested_cycle = true;
7935 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7936 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7937 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7938 gcc_assert (reduc_info->is_reduc_info);
7940 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7941 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7942 /* Leave the scalar phi in place. */
7943 return true;
7945 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7946 /* For a nested cycle we do not fill the above. */
7947 if (!vectype_in)
7948 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7949 gcc_assert (vectype_in);
7951 if (slp_node)
7953 /* The size vect_schedule_slp_instance computes is off for us. */
7954 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7955 * SLP_TREE_LANES (slp_node), vectype_in);
7956 ncopies = 1;
7958 else
7960 vec_num = 1;
7961 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7964 /* Check whether we should use a single PHI node and accumulate
7965 vectors to one before the backedge. */
7966 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7967 ncopies = 1;
7969 /* Create the destination vector */
7970 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7971 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7972 vectype_out);
7974 /* Get the loop-entry arguments. */
7975 tree vec_initial_def = NULL_TREE;
7976 auto_vec<tree> vec_initial_defs;
7977 if (slp_node)
7979 vec_initial_defs.reserve (vec_num);
7980 if (nested_cycle)
7982 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7983 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7984 &vec_initial_defs);
7986 else
7988 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7989 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7990 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7992 unsigned int num_phis = stmts.length ();
7993 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7994 num_phis = 1;
7995 initial_values.reserve (num_phis);
7996 for (unsigned int i = 0; i < num_phis; ++i)
7998 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7999 initial_values.quick_push (vect_phi_initial_value (this_phi));
8001 if (vec_num == 1)
8002 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8003 if (!initial_values.is_empty ())
8005 tree initial_value
8006 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8007 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8008 tree neutral_op
8009 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8010 code, initial_value);
8011 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8012 &vec_initial_defs, vec_num,
8013 stmts.length (), neutral_op);
8017 else
8019 /* Get at the scalar def before the loop, that defines the initial
8020 value of the reduction variable. */
8021 tree initial_def = vect_phi_initial_value (phi);
8022 reduc_info->reduc_initial_values.safe_push (initial_def);
8023 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8024 and we can't use zero for induc_val, use initial_def. Similarly
8025 for REDUC_MIN and initial_def larger than the base. */
8026 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8028 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8029 if (TREE_CODE (initial_def) == INTEGER_CST
8030 && !integer_zerop (induc_val)
8031 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8032 && tree_int_cst_lt (initial_def, induc_val))
8033 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8034 && tree_int_cst_lt (induc_val, initial_def))))
8036 induc_val = initial_def;
8037 /* Communicate that we used the initial_def to epilogue
8038 generation. */
8039 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8041 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8043 else if (nested_cycle)
8045 /* Do not use an adjustment def as that case is not supported
8046 correctly if ncopies is not one. */
8047 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8048 ncopies, initial_def,
8049 &vec_initial_defs);
8051 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8052 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8053 /* Fill the initial vector with the initial scalar value. */
8054 vec_initial_def
8055 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8056 initial_def, initial_def);
8057 else
8059 if (ncopies == 1)
8060 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8061 if (!reduc_info->reduc_initial_values.is_empty ())
8063 initial_def = reduc_info->reduc_initial_values[0];
8064 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8065 tree neutral_op
8066 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8067 code, initial_def);
8068 gcc_assert (neutral_op);
8069 /* Try to simplify the vector initialization by applying an
8070 adjustment after the reduction has been performed. */
8071 if (!reduc_info->reused_accumulator
8072 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8073 && !operand_equal_p (neutral_op, initial_def))
8075 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8076 = initial_def;
8077 initial_def = neutral_op;
8079 vec_initial_def
8080 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8081 initial_def, neutral_op);
8086 if (vec_initial_def)
8088 vec_initial_defs.create (ncopies);
8089 for (i = 0; i < ncopies; ++i)
8090 vec_initial_defs.quick_push (vec_initial_def);
8093 if (auto *accumulator = reduc_info->reused_accumulator)
8095 tree def = accumulator->reduc_input;
8096 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8098 unsigned int nreduc;
8099 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8100 (TREE_TYPE (def)),
8101 TYPE_VECTOR_SUBPARTS (vectype_out),
8102 &nreduc);
8103 gcc_assert (res);
8104 gimple_seq stmts = NULL;
8105 /* Reduce the single vector to a smaller one. */
8106 if (nreduc != 1)
8108 /* Perform the reduction in the appropriate type. */
8109 tree rvectype = vectype_out;
8110 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8111 TREE_TYPE (TREE_TYPE (def))))
8112 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8113 TYPE_VECTOR_SUBPARTS
8114 (vectype_out));
8115 def = vect_create_partial_epilog (def, rvectype,
8116 STMT_VINFO_REDUC_CODE
8117 (reduc_info),
8118 &stmts);
8120 /* The epilogue loop might use a different vector mode, like
8121 VNx2DI vs. V2DI. */
8122 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8124 tree reduc_type = build_vector_type_for_mode
8125 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8126 def = gimple_convert (&stmts, reduc_type, def);
8128 /* Adjust the input so we pick up the partially reduced value
8129 for the skip edge in vect_create_epilog_for_reduction. */
8130 accumulator->reduc_input = def;
8131 /* And the reduction could be carried out using a different sign. */
8132 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8133 def = gimple_convert (&stmts, vectype_out, def);
8134 if (loop_vinfo->main_loop_edge)
8136 /* While we'd like to insert on the edge this will split
8137 blocks and disturb bookkeeping, we also will eventually
8138 need this on the skip edge. Rely on sinking to
8139 fixup optimal placement and insert in the pred. */
8140 gimple_stmt_iterator gsi
8141 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8142 /* Insert before a cond that eventually skips the
8143 epilogue. */
8144 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8145 gsi_prev (&gsi);
8146 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8148 else
8149 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8150 stmts);
8152 if (loop_vinfo->main_loop_edge)
8153 vec_initial_defs[0]
8154 = vect_get_main_loop_result (loop_vinfo, def,
8155 vec_initial_defs[0]);
8156 else
8157 vec_initial_defs.safe_push (def);
8160 /* Generate the reduction PHIs upfront. */
8161 for (i = 0; i < vec_num; i++)
8163 tree vec_init_def = vec_initial_defs[i];
8164 for (j = 0; j < ncopies; j++)
8166 /* Create the reduction-phi that defines the reduction
8167 operand. */
8168 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8170 /* Set the loop-entry arg of the reduction-phi. */
8171 if (j != 0 && nested_cycle)
8172 vec_init_def = vec_initial_defs[j];
8173 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8174 UNKNOWN_LOCATION);
8176 /* The loop-latch arg is set in epilogue processing. */
8178 if (slp_node)
8179 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8180 else
8182 if (j == 0)
8183 *vec_stmt = new_phi;
8184 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8189 return true;
8192 /* Vectorizes LC PHIs. */
8194 bool
8195 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8196 stmt_vec_info stmt_info, gimple **vec_stmt,
8197 slp_tree slp_node)
8199 if (!loop_vinfo
8200 || !is_a <gphi *> (stmt_info->stmt)
8201 || gimple_phi_num_args (stmt_info->stmt) != 1)
8202 return false;
8204 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8205 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8206 return false;
8208 if (!vec_stmt) /* transformation not required. */
8210 /* Deal with copies from externs or constants that are disguised as
8211 loop-closed PHI nodes (PR97886). */
8212 if (slp_node
8213 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8214 SLP_TREE_VECTYPE (slp_node)))
8216 if (dump_enabled_p ())
8217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8218 "incompatible vector types for invariants\n");
8219 return false;
8221 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8222 return true;
8225 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8226 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8227 basic_block bb = gimple_bb (stmt_info->stmt);
8228 edge e = single_pred_edge (bb);
8229 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8230 auto_vec<tree> vec_oprnds;
8231 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8232 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8233 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8234 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8236 /* Create the vectorized LC PHI node. */
8237 gphi *new_phi = create_phi_node (vec_dest, bb);
8238 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8239 if (slp_node)
8240 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8241 else
8242 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8244 if (!slp_node)
8245 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8247 return true;
8250 /* Vectorizes PHIs. */
8252 bool
8253 vectorizable_phi (vec_info *,
8254 stmt_vec_info stmt_info, gimple **vec_stmt,
8255 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8257 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8258 return false;
8260 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8261 return false;
8263 tree vectype = SLP_TREE_VECTYPE (slp_node);
8265 if (!vec_stmt) /* transformation not required. */
8267 slp_tree child;
8268 unsigned i;
8269 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8270 if (!child)
8272 if (dump_enabled_p ())
8273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8274 "PHI node with unvectorized backedge def\n");
8275 return false;
8277 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8279 if (dump_enabled_p ())
8280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8281 "incompatible vector types for invariants\n");
8282 return false;
8284 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8285 && !useless_type_conversion_p (vectype,
8286 SLP_TREE_VECTYPE (child)))
8288 /* With bools we can have mask and non-mask precision vectors
8289 or different non-mask precisions. While pattern recog is
8290 supposed to guarantee consistency here, bugs in it can cause
8291 mismatches (PR103489 and PR103800 for example).
8292 Deal with them here instead of ICEing later. */
8293 if (dump_enabled_p ())
8294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8295 "incompatible vector type setup from "
8296 "bool pattern detection\n");
8297 return false;
8300 /* For single-argument PHIs assume coalescing which means zero cost
8301 for the scalar and the vector PHIs. This avoids artificially
8302 favoring the vector path (but may pessimize it in some cases). */
8303 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8304 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8305 vector_stmt, stmt_info, vectype, 0, vect_body);
8306 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8307 return true;
8310 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8311 basic_block bb = gimple_bb (stmt_info->stmt);
8312 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8313 auto_vec<gphi *> new_phis;
8314 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8316 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8318 /* Skip not yet vectorized defs. */
8319 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8320 && SLP_TREE_VEC_STMTS (child).is_empty ())
8321 continue;
8323 auto_vec<tree> vec_oprnds;
8324 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8325 if (!new_phis.exists ())
8327 new_phis.create (vec_oprnds.length ());
8328 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8330 /* Create the vectorized PHI node. */
8331 new_phis.quick_push (create_phi_node (vec_dest, bb));
8332 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8335 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8336 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8337 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8339 /* We should have at least one already vectorized child. */
8340 gcc_assert (new_phis.exists ());
8342 return true;
8345 /* Vectorizes first order recurrences. An overview of the transformation
8346 is described below. Suppose we have the following loop.
8348 int t = 0;
8349 for (int i = 0; i < n; ++i)
8351 b[i] = a[i] - t;
8352 t = a[i];
8355 There is a first-order recurrence on 't'. For this loop, the scalar IR
8356 looks (simplified) like:
8358 scalar.preheader:
8359 init = 0;
8361 scalar.body:
8362 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8363 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8364 _1 = a[i]
8365 b[i] = _1 - _2
8366 if (i < n) goto scalar.body
8368 In this example, _2 is a recurrence because its value depends on the
8369 previous iteration. We vectorize this as (VF = 4)
8371 vector.preheader:
8372 vect_init = vect_cst(..., ..., ..., 0)
8374 vector.body
8375 i = PHI <0(vector.preheader), i+4(vector.body)>
8376 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8377 vect_2 = a[i, i+1, i+2, i+3];
8378 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8379 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8380 if (..) goto vector.body
8382 In this function, vectorizable_recurr, we code generate both the
8383 vector PHI node and the permute since those together compute the
8384 vectorized value of the scalar PHI. We do not yet have the
8385 backedge value to fill in there nor into the vec_perm. Those
8386 are filled in maybe_set_vectorized_backedge_value and
8387 vect_schedule_scc.
8389 TODO: Since the scalar loop does not have a use of the recurrence
8390 outside of the loop the natural way to implement peeling via
8391 vectorizing the live value doesn't work. For now peeling of loops
8392 with a recurrence is not implemented. For SLP the supported cases
8393 are restricted to those requiring a single vector recurrence PHI. */
8395 bool
8396 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8397 gimple **vec_stmt, slp_tree slp_node,
8398 stmt_vector_for_cost *cost_vec)
8400 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8401 return false;
8403 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8405 /* So far we only support first-order recurrence auto-vectorization. */
8406 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8407 return false;
8409 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8410 unsigned ncopies;
8411 if (slp_node)
8412 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8413 else
8414 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8415 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8416 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8417 /* We need to be able to make progress with a single vector. */
8418 if (maybe_gt (dist * 2, nunits))
8420 if (dump_enabled_p ())
8421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8422 "first order recurrence exceeds half of "
8423 "a vector\n");
8424 return false;
8427 /* First-order recurrence autovectorization needs to handle permutation
8428 with indices = [nunits-dist, nunits-dist+1, nunits-dist+2, ...]. */
8429 vec_perm_builder sel (nunits, 1, 3);
8430 for (int i = 0; i < 3; ++i)
8431 sel.quick_push (nunits - dist + i);
8432 vec_perm_indices indices (sel, 2, nunits);
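/* Illustrative example (values chosen arbitrarily): with nunits == 8 and
   dist == 2 (an SLP node with two lanes) the encoded selector starts at
   nunits - dist == 6 and steps by 1, i.e. { 6, 7, 8, 9, 10, 11, 12, 13 },
   so the permute takes the last two lanes of the previous vector followed
   by the first six lanes of the current one. */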
8434 if (!vec_stmt) /* transformation not required. */
8436 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8437 indices))
8438 return false;
8440 if (slp_node)
8442 /* We eventually need to set a vector type on invariant
8443 arguments. */
8444 unsigned j;
8445 slp_tree child;
8446 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8447 if (!vect_maybe_update_slp_op_vectype
8448 (child, SLP_TREE_VECTYPE (slp_node)))
8450 if (dump_enabled_p ())
8451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8452 "incompatible vector types for "
8453 "invariants\n");
8454 return false;
8457 /* The recurrence costs the initialization vector and one permute
8458 for each copy. */
8459 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8460 stmt_info, 0, vect_prologue);
8461 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8462 stmt_info, 0, vect_body);
8463 if (dump_enabled_p ())
8464 dump_printf_loc (MSG_NOTE, vect_location,
8465 "vectorizable_recurr: inside_cost = %d, "
8466 "prologue_cost = %d .\n", inside_cost,
8467 prologue_cost);
8469 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8470 return true;
8473 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8474 basic_block bb = gimple_bb (phi);
8475 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8476 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8478 gimple_seq stmts = NULL;
8479 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8480 gsi_insert_seq_on_edge_immediate (pe, stmts);
8482 tree vec_init = build_vector_from_val (vectype, preheader);
8483 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8485 /* Create the vectorized first-order PHI node. */
8486 tree vec_dest = vect_get_new_vect_var (vectype,
8487 vect_simple_var, "vec_recur_");
8488 gphi *new_phi = create_phi_node (vec_dest, bb);
8489 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8491 /* Insert the shuffles for the first-order recurrence autovectorization:
8492 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8493 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8495 /* Insert the required permute after the latch definition. The
8496 second and later operands are tentative and will be updated when we have
8497 vectorized the latch definition. */
8498 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8499 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8500 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8501 gsi_next (&gsi2);
8503 for (unsigned i = 0; i < ncopies; ++i)
8505 vec_dest = make_ssa_name (vectype);
8506 gassign *vperm
8507 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8508 i == 0 ? gimple_phi_result (new_phi) : NULL,
8509 NULL, perm);
8510 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8512 if (slp_node)
8513 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8514 else
8515 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8518 if (!slp_node)
8519 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8520 return true;
8523 /* Return true if VECTYPE represents a vector that requires lowering
8524 by the vector lowering pass. */
8526 bool
8527 vect_emulated_vector_p (tree vectype)
8529 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8530 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8531 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8534 /* Return true if we can emulate CODE on an integer mode representation
8535 of a vector. */
8537 bool
8538 vect_can_vectorize_without_simd_p (tree_code code)
8540 switch (code)
8542 case PLUS_EXPR:
8543 case MINUS_EXPR:
8544 case NEGATE_EXPR:
8545 case BIT_AND_EXPR:
8546 case BIT_IOR_EXPR:
8547 case BIT_XOR_EXPR:
8548 case BIT_NOT_EXPR:
8549 return true;
8551 default:
8552 return false;
8556 /* Likewise, but taking a code_helper. */
8558 bool
8559 vect_can_vectorize_without_simd_p (code_helper code)
8561 return (code.is_tree_code ()
8562 && vect_can_vectorize_without_simd_p (tree_code (code)));
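/* For illustration (a sketch, not tied to a particular target): a bit-wise
   AND of two V4QImode vectors can be carried out as a single AND on the
   SImode integers holding their bytes; the codes accepted above are the
   ones the generic vector lowering pass knows how to emulate on such an
   integer-mode representation. */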
8565 /* Create vector init for vectorized iv. */
8566 static tree
8567 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8568 tree step_expr, poly_uint64 nunits,
8569 tree vectype,
8570 enum vect_induction_op_type induction_type)
8572 unsigned HOST_WIDE_INT const_nunits;
8573 tree vec_shift, vec_init, new_name;
8574 unsigned i;
8575 tree itype = TREE_TYPE (vectype);
8577 /* iv_loop is the loop to be vectorized. Create the vector of initial
8578 values from X = init_expr and S = step_expr according to induction_type. */
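/* Illustrative examples (arbitrary small values, nunits == 4):
   vect_step_op_shr with X = 64, S = 1:
     vec_init = [64, 64, 64, 64] >> [0, 1, 2, 3] = [64, 32, 16, 8]
   vect_step_op_mul with X = 3, S = 2:
     vec_init = [3, 3, 3, 3] * [1, 2, 4, 8] = [3, 6, 12, 24]
   vect_step_op_neg with X = 5:
     vec_init = [5, -5, 5, -5]. */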
8579 new_name = gimple_convert (stmts, itype, init_expr);
8580 switch (induction_type)
8582 case vect_step_op_shr:
8583 case vect_step_op_shl:
8584 /* Build the Initial value from shift_expr. */
8585 vec_init = gimple_build_vector_from_val (stmts,
8586 vectype,
8587 new_name);
8588 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8589 build_zero_cst (itype), step_expr);
8590 vec_init = gimple_build (stmts,
8591 (induction_type == vect_step_op_shr
8592 ? RSHIFT_EXPR : LSHIFT_EXPR),
8593 vectype, vec_init, vec_shift);
8594 break;
8596 case vect_step_op_neg:
8598 vec_init = gimple_build_vector_from_val (stmts,
8599 vectype,
8600 new_name);
8601 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8602 vectype, vec_init);
8603 /* The encoding has 2 interleaved stepped patterns. */
8604 vec_perm_builder sel (nunits, 2, 3);
8605 sel.quick_grow (6);
8606 for (i = 0; i < 3; i++)
8608 sel[2 * i] = i;
8609 sel[2 * i + 1] = i + nunits;
8611 vec_perm_indices indices (sel, 2, nunits);
8612 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8613 fail when vec_init is a const vector. In that situation the vec_perm is
8614 not really needed. */
8615 tree perm_mask_even
8616 = vect_gen_perm_mask_any (vectype, indices);
8617 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8618 vectype,
8619 vec_init, vec_neg,
8620 perm_mask_even);
8622 break;
8624 case vect_step_op_mul:
8626 /* Use unsigned mult to avoid undefined behavior on signed overflow. */
8627 gcc_assert (nunits.is_constant (&const_nunits));
8628 tree utype = unsigned_type_for (itype);
8629 tree uvectype = build_vector_type (utype,
8630 TYPE_VECTOR_SUBPARTS (vectype));
8631 new_name = gimple_convert (stmts, utype, new_name);
8632 vec_init = gimple_build_vector_from_val (stmts,
8633 uvectype,
8634 new_name);
8635 tree_vector_builder elts (uvectype, const_nunits, 1);
8636 tree elt_step = build_one_cst (utype);
8638 elts.quick_push (elt_step);
8639 for (i = 1; i < const_nunits; i++)
8641 /* Create: elt_step_i = elt_step_(i-1) * step_expr, i.e. pow (S, i). */
8642 elt_step = gimple_build (stmts, MULT_EXPR,
8643 utype, elt_step, step_expr);
8644 elts.quick_push (elt_step);
8646 /* Create a vector of the step powers [1, S, pow (S, 2), ...,
8647 pow (S, nunits-1)]. */
8648 tree vec_mul = gimple_build_vector (stmts, &elts);
8649 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8650 vec_init, vec_mul);
8651 vec_init = gimple_convert (stmts, vectype, vec_init);
8653 break;
8655 default:
8656 gcc_unreachable ();
8659 return vec_init;
8662 /* Peel init_expr by skip_niters iterations for the given induction_type. */
8663 tree
8664 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8665 tree skip_niters, tree step_expr,
8666 enum vect_induction_op_type induction_type)
8668 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8669 tree type = TREE_TYPE (init_expr);
8670 unsigned prec = TYPE_PRECISION (type);
8671 switch (induction_type)
8673 case vect_step_op_neg:
8674 if (TREE_INT_CST_LOW (skip_niters) % 2)
8675 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8676 /* else no change. */
8677 break;
8679 case vect_step_op_shr:
8680 case vect_step_op_shl:
8681 skip_niters = gimple_convert (stmts, type, skip_niters);
8682 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8683 /* When the shift amount >= precision, we need to avoid undefined behavior.
8684 In the original loop there is no UB, and according to the semantics
8685 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr. */
8686 if (!tree_fits_uhwi_p (step_expr)
8687 || tree_to_uhwi (step_expr) >= prec)
8689 if (induction_type == vect_step_op_shl
8690 || TYPE_UNSIGNED (type))
8691 init_expr = build_zero_cst (type);
8692 else
8693 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8694 init_expr,
8695 wide_int_to_tree (type, prec - 1));
8697 else
8698 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8699 ? RSHIFT_EXPR : LSHIFT_EXPR),
8700 type, init_expr, step_expr);
8701 break;
8703 case vect_step_op_mul:
8705 tree utype = unsigned_type_for (type);
8706 init_expr = gimple_convert (stmts, utype, init_expr);
8707 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8708 wide_int begin = wi::to_wide (step_expr);
8709 for (unsigned i = 0; i != skipn - 1; i++)
8710 begin = wi::mul (begin, wi::to_wide (step_expr));
8711 tree mult_expr = wide_int_to_tree (utype, begin);
8712 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8713 init_expr = gimple_convert (stmts, type, init_expr);
8715 break;
8717 default:
8718 gcc_unreachable ();
8721 return init_expr;
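/* Illustrative examples of the peeling above (arbitrary values,
   SKIP_NITERS == 2):
   vect_step_op_neg: init_expr is unchanged (an even number of negations
     cancels out).
   vect_step_op_shr: init_expr >>= 2 * S, or the saturated value when the
     accumulated shift amount reaches the precision.
   vect_step_op_mul: with init_expr == 2 and S == 3, init_expr becomes
     2 * pow (3, 2) == 18. */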
8724 /* Create vector step for vectorized iv. */
8725 static tree
8726 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8727 poly_uint64 vf,
8728 enum vect_induction_op_type induction_type)
8730 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8731 tree new_name = NULL;
8732 /* Step should be pow (step, vf) for mult induction. */
8733 if (induction_type == vect_step_op_mul)
8735 gcc_assert (vf.is_constant ());
8736 wide_int begin = wi::to_wide (step_expr);
8738 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8739 begin = wi::mul (begin, wi::to_wide (step_expr));
8741 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8743 else if (induction_type == vect_step_op_neg)
8744 /* Do nothing. */
8746 else
8747 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8748 expr, step_expr);
8749 return new_name;
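/* Illustrative examples of the step above (arbitrary values, VF == 4):
   vect_step_op_shr/shl with S == 1: the per-copy step is 4 * 1 == 4.
   vect_step_op_mul with S == 3: the per-copy step is pow (3, 4) == 81.
   vect_step_op_neg: no step is needed, negating an even number of times
     is a no-op. */

/* Broadcast NEW_NAME into a vector of type VECTYPE and materialise it in
   the loop preheader; this is the per-copy step used by the iv update. */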
8752 static tree
8753 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8754 stmt_vec_info stmt_info,
8755 tree new_name, tree vectype,
8756 enum vect_induction_op_type induction_type)
8758 /* No step is needed for neg induction. */
8759 if (induction_type == vect_step_op_neg)
8760 return NULL;
8762 tree t = unshare_expr (new_name);
8763 gcc_assert (CONSTANT_CLASS_P (new_name)
8764 || TREE_CODE (new_name) == SSA_NAME);
8765 tree new_vec = build_vector_from_val (vectype, t);
8766 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8767 new_vec, vectype, NULL);
8768 return vec_step;
8771 /* Update the vectorized iv INDUC_DEF with VEC_STEP for INDUCTION_TYPE. */
8772 static tree
8773 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8774 tree induc_def, tree vec_step,
8775 enum vect_induction_op_type induction_type)
8777 tree vec_def = induc_def;
8778 switch (induction_type)
8780 case vect_step_op_mul:
8782 /* Use unsigned mult to avoid undefined behavior on signed overflow. */
8783 tree uvectype
8784 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8785 TYPE_VECTOR_SUBPARTS (vectype));
8786 vec_def = gimple_convert (stmts, uvectype, vec_def);
8787 vec_step = gimple_convert (stmts, uvectype, vec_step);
8788 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8789 vec_def, vec_step);
8790 vec_def = gimple_convert (stmts, vectype, vec_def);
8792 break;
8794 case vect_step_op_shr:
8795 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8796 vec_def, vec_step);
8797 break;
8799 case vect_step_op_shl:
8800 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8801 vec_def, vec_step);
8802 break;
8803 case vect_step_op_neg:
8804 vec_def = induc_def;
8805 /* Do nothing. */
8806 break;
8807 default:
8808 gcc_unreachable ();
8811 return vec_def;
8815 /* Function vectorizable_nonlinear_induction
8817 Check if STMT_INFO performs a nonlinear induction computation that can be
8818 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8819 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8820 basic block.
8821 Return true if STMT_INFO is vectorizable in this way. */
8823 static bool
8824 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8825 stmt_vec_info stmt_info,
8826 gimple **vec_stmt, slp_tree slp_node,
8827 stmt_vector_for_cost *cost_vec)
8829 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8830 unsigned ncopies;
8831 bool nested_in_vect_loop = false;
8832 class loop *iv_loop;
8833 tree vec_def;
8834 edge pe = loop_preheader_edge (loop);
8835 basic_block new_bb;
8836 tree vec_init, vec_step;
8837 tree new_name;
8838 gimple *new_stmt;
8839 gphi *induction_phi;
8840 tree induc_def, vec_dest;
8841 tree init_expr, step_expr;
8842 tree niters_skip;
8843 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8844 unsigned i;
8845 gimple_stmt_iterator si;
8847 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8849 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8850 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8851 enum vect_induction_op_type induction_type
8852 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8854 gcc_assert (induction_type > vect_step_op_add);
8856 if (slp_node)
8857 ncopies = 1;
8858 else
8859 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8860 gcc_assert (ncopies >= 1);
8862 /* FORNOW. Only handle nonlinear induction in the same loop. */
8863 if (nested_in_vect_loop_p (loop, stmt_info))
8865 if (dump_enabled_p ())
8866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8867 "nonlinear induction in nested loop.\n");
8868 return false;
8871 iv_loop = loop;
8872 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8874 /* TODO: Support SLP for nonlinear ivs. There should be a separate vector iv
8875 update for each iv and a permutation to generate the wanted vector iv. */
8876 if (slp_node)
8878 if (dump_enabled_p ())
8879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8880 "SLP induction not supported for nonlinear"
8881 " induction.\n");
8882 return false;
8885 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8887 if (dump_enabled_p ())
8888 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8889 "floating point nonlinear induction vectorization"
8890 " not supported.\n");
8891 return false;
8894 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8895 init_expr = vect_phi_initial_value (phi);
8896 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8897 && TREE_CODE (step_expr) == INTEGER_CST);
8898 /* step_expr should have the same type as init_expr; e.g. for uint64 a >> 1
8899 the step is int, but a vector<uint64> shift is used. */
8900 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8902 if (TREE_CODE (init_expr) == INTEGER_CST)
8903 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8904 else
8905 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8906 TREE_TYPE (init_expr)));
8908 switch (induction_type)
8910 case vect_step_op_neg:
8911 if (TREE_CODE (init_expr) != INTEGER_CST
8912 && TREE_CODE (init_expr) != REAL_CST)
8914 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8915 if (!directly_supported_p (NEGATE_EXPR, vectype))
8916 return false;
8918 /* The encoding has 2 interleaved stepped patterns. */
8919 vec_perm_builder sel (nunits, 2, 3);
8920 machine_mode mode = TYPE_MODE (vectype);
8921 sel.quick_grow (6);
8922 for (i = 0; i < 3; i++)
8924 sel[i * 2] = i;
8925 sel[i * 2 + 1] = i + nunits;
8927 vec_perm_indices indices (sel, 2, nunits);
8928 if (!can_vec_perm_const_p (mode, mode, indices))
8929 return false;
8931 break;
8933 case vect_step_op_mul:
8935 /* Check for backend support of MULT_EXPR. */
8936 if (!directly_supported_p (MULT_EXPR, vectype))
8937 return false;
8939 /* ??? How to construct the vector step for a variable-length vector:
8940 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
8941 if (!vf.is_constant ())
8942 return false;
8944 break;
8946 case vect_step_op_shr:
8947 /* Check for backend support of RSHIFT_EXPR. */
8948 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8949 return false;
8951 /* Don't shift more than the type precision to avoid undefined behavior. */
8952 if (!tree_fits_uhwi_p (step_expr)
8953 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8954 TYPE_PRECISION (TREE_TYPE (init_expr))))
8955 return false;
8956 break;
8958 case vect_step_op_shl:
8959 /* Check for backend support of LSHIFT_EXPR. */
8960 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
8961 return false;
8963 /* Don't shift more than the type precision to avoid undefined behavior. */
8964 if (!tree_fits_uhwi_p (step_expr)
8965 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8966 TYPE_PRECISION (TREE_TYPE (init_expr))))
8967 return false;
8969 break;
8971 default:
8972 gcc_unreachable ();
8975 if (!vec_stmt) /* transformation not required. */
8977 unsigned inside_cost = 0, prologue_cost = 0;
8978 /* loop cost for vec_loop. Neg induction doesn't have any
8979 inside_cost. */
8980 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8981 stmt_info, 0, vect_body);
8983 /* Neg induction does not have any inside_cost, so reset the
8984 inside_cost computed above. */
8985 if (induction_type == vect_step_op_neg)
8986 inside_cost = 0;
8988 /* prologue cost for vec_init and vec_step. */
8989 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8990 stmt_info, 0, vect_prologue);
8992 if (dump_enabled_p ())
8993 dump_printf_loc (MSG_NOTE, vect_location,
8994 "vect_model_induction_cost: inside_cost = %d, "
8995 "prologue_cost = %d. \n", inside_cost,
8996 prologue_cost);
8998 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8999 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9000 return true;
9003 /* Transform. */
9005 /* Compute a vector variable, initialized with the first VF values of
9006 the induction variable. E.g., for an iv with IV_PHI='X' and
9007 evolution S, for a vector of 4 units, we want to compute:
9008 [X, X + S, X + 2*S, X + 3*S]. */
9010 if (dump_enabled_p ())
9011 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9013 pe = loop_preheader_edge (iv_loop);
9014 /* Find the first insertion point in the BB. */
9015 basic_block bb = gimple_bb (phi);
9016 si = gsi_after_labels (bb);
9018 gimple_seq stmts = NULL;
9020 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9021 /* If we are using the loop mask to "peel" for alignment then we need
9022 to adjust the start value here. */
9023 if (niters_skip != NULL_TREE)
9024 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9025 step_expr, induction_type);
9027 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9028 step_expr, nunits, vectype,
9029 induction_type);
9030 if (stmts)
9032 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9033 gcc_assert (!new_bb);
9036 stmts = NULL;
9037 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9038 vf, induction_type);
9039 if (stmts)
9041 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9042 gcc_assert (!new_bb);
9045 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9046 new_name, vectype,
9047 induction_type);
9048 /* Create the following def-use cycle:
9049 loop prolog:
9050 vec_init = ...
9051 vec_step = ...
9052 loop:
9053 vec_iv = PHI <vec_init, vec_loop>
9055 STMT
9057 vec_loop = vec_iv <op> vec_step; (<op> given by induction_type) */
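/* Illustrative instance of the cycle above (vect_step_op_shr with
   X == 1024, S == 1, VF == nunits == 4; arbitrary values):
   loop prolog:
     vec_init = [1024, 512, 256, 128]
     vec_step = [4, 4, 4, 4]
   loop:
     vec_iv = PHI <vec_init, vec_loop>
     ...
     vec_loop = vec_iv >> vec_step;
   i.e. each lane advances by four scalar iterations per vector
   iteration. */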
9059 /* Create the induction-phi that defines the induction-operand. */
9060 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9061 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9062 induc_def = PHI_RESULT (induction_phi);
9064 /* Create the iv update inside the loop. */
9065 stmts = NULL;
9066 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9067 induc_def, vec_step,
9068 induction_type);
9070 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9071 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9073 /* Set the arguments of the phi node: */
9074 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9075 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9076 UNKNOWN_LOCATION);
9078 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9079 *vec_stmt = induction_phi;
9081 /* In case that vectorization factor (VF) is bigger than the number
9082 of elements that we can fit in a vectype (nunits), we have to generate
9083 more than one vector stmt - i.e - we need to "unroll" the
9084 vector stmt by a factor VF/nunits. For more details see documentation
9085 in vectorizable_operation. */
9087 if (ncopies > 1)
9089 stmts = NULL;
9090 /* FORNOW. This restriction should be relaxed. */
9091 gcc_assert (!nested_in_vect_loop);
9093 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9094 nunits, induction_type);
9096 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9097 new_name, vectype,
9098 induction_type);
9099 vec_def = induc_def;
9100 for (i = 1; i < ncopies; i++)
9102 /* vec_i = vec_prev + vec_step. */
9103 stmts = NULL;
9104 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9105 vec_def, vec_step,
9106 induction_type);
9107 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9108 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9109 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9113 if (dump_enabled_p ())
9114 dump_printf_loc (MSG_NOTE, vect_location,
9115 "transform induction: created def-use cycle: %G%G",
9116 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9118 return true;
9121 /* Function vectorizable_induction
9123 Check if STMT_INFO performs an induction computation that can be vectorized.
9124 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9125 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9126 Return true if STMT_INFO is vectorizable in this way. */
9128 bool
9129 vectorizable_induction (loop_vec_info loop_vinfo,
9130 stmt_vec_info stmt_info,
9131 gimple **vec_stmt, slp_tree slp_node,
9132 stmt_vector_for_cost *cost_vec)
9134 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9135 unsigned ncopies;
9136 bool nested_in_vect_loop = false;
9137 class loop *iv_loop;
9138 tree vec_def;
9139 edge pe = loop_preheader_edge (loop);
9140 basic_block new_bb;
9141 tree new_vec, vec_init, vec_step, t;
9142 tree new_name;
9143 gimple *new_stmt;
9144 gphi *induction_phi;
9145 tree induc_def, vec_dest;
9146 tree init_expr, step_expr;
9147 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9148 unsigned i;
9149 tree expr;
9150 gimple_stmt_iterator si;
9151 enum vect_induction_op_type induction_type
9152 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9154 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9155 if (!phi)
9156 return false;
9158 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9159 return false;
9161 /* Make sure it was recognized as induction computation. */
9162 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9163 return false;
9165 /* Handle nonlinear induction in a separate place. */
9166 if (induction_type != vect_step_op_add)
9167 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9168 vec_stmt, slp_node, cost_vec);
9170 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9171 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9173 if (slp_node)
9174 ncopies = 1;
9175 else
9176 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9177 gcc_assert (ncopies >= 1);
9179 /* FORNOW. These restrictions should be relaxed. */
9180 if (nested_in_vect_loop_p (loop, stmt_info))
9182 imm_use_iterator imm_iter;
9183 use_operand_p use_p;
9184 gimple *exit_phi;
9185 edge latch_e;
9186 tree loop_arg;
9188 if (ncopies > 1)
9190 if (dump_enabled_p ())
9191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9192 "multiple types in nested loop.\n");
9193 return false;
9196 exit_phi = NULL;
9197 latch_e = loop_latch_edge (loop->inner);
9198 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9199 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9201 gimple *use_stmt = USE_STMT (use_p);
9202 if (is_gimple_debug (use_stmt))
9203 continue;
9205 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9207 exit_phi = use_stmt;
9208 break;
9211 if (exit_phi)
9213 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9214 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9215 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9217 if (dump_enabled_p ())
9218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9219 "inner-loop induction only used outside "
9220 "of the outer vectorized loop.\n");
9221 return false;
9225 nested_in_vect_loop = true;
9226 iv_loop = loop->inner;
9228 else
9229 iv_loop = loop;
9230 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9232 if (slp_node && !nunits.is_constant ())
9234 /* The current SLP code creates the step value element-by-element. */
9235 if (dump_enabled_p ())
9236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9237 "SLP induction not supported for variable-length"
9238 " vectors.\n");
9239 return false;
9242 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9244 if (dump_enabled_p ())
9245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9246 "floating point induction vectorization disabled\n");
9247 return false;
9250 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9251 gcc_assert (step_expr != NULL_TREE);
9252 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9254 /* Check for backend support of PLUS/MINUS_EXPR. */
9255 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9256 || !directly_supported_p (MINUS_EXPR, step_vectype))
9257 return false;
9259 if (!vec_stmt) /* transformation not required. */
9261 unsigned inside_cost = 0, prologue_cost = 0;
9262 if (slp_node)
9264 /* We eventually need to set a vector type on invariant
9265 arguments. */
9266 unsigned j;
9267 slp_tree child;
9268 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9269 if (!vect_maybe_update_slp_op_vectype
9270 (child, SLP_TREE_VECTYPE (slp_node)))
9272 if (dump_enabled_p ())
9273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9274 "incompatible vector types for "
9275 "invariants\n");
9276 return false;
9278 /* loop cost for vec_loop. */
9279 inside_cost
9280 = record_stmt_cost (cost_vec,
9281 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9282 vector_stmt, stmt_info, 0, vect_body);
9283 /* prologue cost for vec_init (if not nested) and step. */
9284 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9285 scalar_to_vec,
9286 stmt_info, 0, vect_prologue);
9288 else /* if (!slp_node) */
9290 /* loop cost for vec_loop. */
9291 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9292 stmt_info, 0, vect_body);
9293 /* prologue cost for vec_init and vec_step. */
9294 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9295 stmt_info, 0, vect_prologue);
9297 if (dump_enabled_p ())
9298 dump_printf_loc (MSG_NOTE, vect_location,
9299 "vect_model_induction_cost: inside_cost = %d, "
9300 "prologue_cost = %d .\n", inside_cost,
9301 prologue_cost);
9303 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9304 DUMP_VECT_SCOPE ("vectorizable_induction");
9305 return true;
9308 /* Transform. */
9310 /* Compute a vector variable, initialized with the first VF values of
9311 the induction variable. E.g., for an iv with IV_PHI='X' and
9312 evolution S, for a vector of 4 units, we want to compute:
9313 [X, X + S, X + 2*S, X + 3*S]. */
9315 if (dump_enabled_p ())
9316 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9318 pe = loop_preheader_edge (iv_loop);
9319 /* Find the first insertion point in the BB. */
9320 basic_block bb = gimple_bb (phi);
9321 si = gsi_after_labels (bb);
9323 /* For SLP induction we have to generate several IVs as for example
9324 with group size 3 we need
9325 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9326 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
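/* A concrete (illustrative) instance of the above with group size 3,
   const_nunits == 4 and scalar IVs i0 = 0 step 1, i1 = 10 step 10,
   i2 = 100 step 100:
     [0, 10, 100, 1] [20, 200, 2, 30] [300, 3, 40, 400]
   where on the backedge every lane is advanced by 4 times its own step,
   since four group iterations are covered per vector iteration. */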
9327 if (slp_node)
9329 /* Enforced above. */
9330 unsigned int const_nunits = nunits.to_constant ();
9332 /* The initial values are vectorized, but any lanes > group_size
9333 need adjustment. */
9334 slp_tree init_node
9335 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9337 /* Gather steps. Since we do not vectorize inductions as
9338 cycles we have to reconstruct the step from SCEV data. */
9339 unsigned group_size = SLP_TREE_LANES (slp_node);
9340 tree *steps = XALLOCAVEC (tree, group_size);
9341 tree *inits = XALLOCAVEC (tree, group_size);
9342 stmt_vec_info phi_info;
9343 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9345 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9346 if (!init_node)
9347 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9348 pe->dest_idx);
9351 /* Now generate the IVs. */
9352 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9353 gcc_assert ((const_nunits * nvects) % group_size == 0);
9354 unsigned nivs;
9355 if (nested_in_vect_loop)
9356 nivs = nvects;
9357 else
9359 /* Compute the number of distinct IVs we need. First reduce
9360 group_size if it is a multiple of const_nunits so we get
9361 one IV for a group_size of 4 but const_nunits 2. */
9362 unsigned group_sizep = group_size;
9363 if (group_sizep % const_nunits == 0)
9364 group_sizep = group_sizep / const_nunits;
9365 nivs = least_common_multiple (group_sizep,
9366 const_nunits) / const_nunits;
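/* For instance (illustrative): group_size == 4 with const_nunits == 2
   gives group_sizep == 2 and nivs == 1, while group_size == 3 with
   const_nunits == 4 keeps group_sizep == 3 and needs
   nivs == least_common_multiple (3, 4) / 4 == 3 IVs. */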
9368 tree stept = TREE_TYPE (step_vectype);
9369 tree lupdate_mul = NULL_TREE;
9370 if (!nested_in_vect_loop)
9372 /* The number of iterations covered in one vector iteration. */
9373 unsigned lup_mul = (nvects * const_nunits) / group_size;
9374 lupdate_mul
9375 = build_vector_from_val (step_vectype,
9376 SCALAR_FLOAT_TYPE_P (stept)
9377 ? build_real_from_wide (stept, lup_mul,
9378 UNSIGNED)
9379 : build_int_cstu (stept, lup_mul));
9381 tree peel_mul = NULL_TREE;
9382 gimple_seq init_stmts = NULL;
9383 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9385 if (SCALAR_FLOAT_TYPE_P (stept))
9386 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9387 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9388 else
9389 peel_mul = gimple_convert (&init_stmts, stept,
9390 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9391 peel_mul = gimple_build_vector_from_val (&init_stmts,
9392 step_vectype, peel_mul);
9394 unsigned ivn;
9395 auto_vec<tree> vec_steps;
9396 for (ivn = 0; ivn < nivs; ++ivn)
9398 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9399 tree_vector_builder init_elts (vectype, const_nunits, 1);
9400 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9401 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9403 /* The scalar steps of the IVs. */
9404 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9405 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9406 step_elts.quick_push (elt);
9407 if (!init_node)
9409 /* The scalar inits of the IVs if not vectorized. */
9410 elt = inits[(ivn*const_nunits + eltn) % group_size];
9411 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9412 TREE_TYPE (elt)))
9413 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9414 TREE_TYPE (vectype), elt);
9415 init_elts.quick_push (elt);
9417 /* The number of steps to add to the initial values. */
9418 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9419 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9420 ? build_real_from_wide (stept,
9421 mul_elt, UNSIGNED)
9422 : build_int_cstu (stept, mul_elt));
9424 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9425 vec_steps.safe_push (vec_step);
9426 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9427 if (peel_mul)
9428 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9429 step_mul, peel_mul);
9430 if (!init_node)
9431 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9433 /* Create the induction-phi that defines the induction-operand. */
9434 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9435 "vec_iv_");
9436 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9437 induc_def = PHI_RESULT (induction_phi);
9439 /* Create the iv update inside the loop */
9440 tree up = vec_step;
9441 if (lupdate_mul)
9442 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9443 vec_step, lupdate_mul);
9444 gimple_seq stmts = NULL;
9445 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9446 vec_def = gimple_build (&stmts,
9447 PLUS_EXPR, step_vectype, vec_def, up);
9448 vec_def = gimple_convert (&stmts, vectype, vec_def);
9449 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9450 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9451 UNKNOWN_LOCATION);
9453 if (init_node)
9454 vec_init = vect_get_slp_vect_def (init_node, ivn);
9455 if (!nested_in_vect_loop
9456 && !integer_zerop (step_mul))
9458 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9459 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9460 vec_step, step_mul);
9461 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9462 vec_def, up);
9463 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9466 /* Set the arguments of the phi node: */
9467 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9469 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9471 if (!nested_in_vect_loop)
9473 /* Fill up to the number of vectors we need for the whole group. */
9474 nivs = least_common_multiple (group_size,
9475 const_nunits) / const_nunits;
9476 vec_steps.reserve (nivs-ivn);
9477 for (; ivn < nivs; ++ivn)
9479 SLP_TREE_VEC_STMTS (slp_node)
9480 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9481 vec_steps.quick_push (vec_steps[0]);
9485 /* Re-use IVs when we can. We are generating further vector
9486 stmts by adding VF' * stride to the IVs generated above. */
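/* For instance (illustrative): with group_size == const_nunits == 4 and
   two vector copies only one IV PHI is created above (nivs == 1) and the
   second vector is obtained from the first by adding each lane's scalar
   step once (vfp == 1). */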
9487 if (ivn < nvects)
9489 unsigned vfp
9490 = least_common_multiple (group_size, const_nunits) / group_size;
9491 tree lupdate_mul
9492 = build_vector_from_val (step_vectype,
9493 SCALAR_FLOAT_TYPE_P (stept)
9494 ? build_real_from_wide (stept,
9495 vfp, UNSIGNED)
9496 : build_int_cstu (stept, vfp));
9497 for (; ivn < nvects; ++ivn)
9499 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9500 tree def = gimple_get_lhs (iv);
9501 if (ivn < 2*nivs)
9502 vec_steps[ivn - nivs]
9503 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9504 vec_steps[ivn - nivs], lupdate_mul);
9505 gimple_seq stmts = NULL;
9506 def = gimple_convert (&stmts, step_vectype, def);
9507 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9508 def, vec_steps[ivn % nivs]);
9509 def = gimple_convert (&stmts, vectype, def);
9510 if (gimple_code (iv) == GIMPLE_PHI)
9511 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9512 else
9514 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9515 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9517 SLP_TREE_VEC_STMTS (slp_node)
9518 .quick_push (SSA_NAME_DEF_STMT (def));
9522 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9523 gcc_assert (!new_bb);
9525 return true;
9528 init_expr = vect_phi_initial_value (phi);
9530 gimple_seq stmts = NULL;
9531 if (!nested_in_vect_loop)
9533 /* Convert the initial value to the IV update type. */
9534 tree new_type = TREE_TYPE (step_expr);
9535 init_expr = gimple_convert (&stmts, new_type, init_expr);
9537 /* If we are using the loop mask to "peel" for alignment then we need
9538 to adjust the start value here. */
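/* For example (illustrative values): with skip_niters == 2 and
   step_expr == 3 the start value is adjusted to init_expr - 6, so that
   the lane corresponding to the first active iteration still sees the
   original initial value. */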
9539 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9540 if (skip_niters != NULL_TREE)
9542 if (FLOAT_TYPE_P (vectype))
9543 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9544 skip_niters);
9545 else
9546 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9547 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9548 skip_niters, step_expr);
9549 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9550 init_expr, skip_step);
9554 if (stmts)
9556 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9557 gcc_assert (!new_bb);
9560 /* Create the vector that holds the initial_value of the induction. */
9561 if (nested_in_vect_loop)
9563 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9564 been created during vectorization of previous stmts. We obtain it
9565 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9566 auto_vec<tree> vec_inits;
9567 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9568 init_expr, &vec_inits);
9569 vec_init = vec_inits[0];
9570 /* If the initial value is not of proper type, convert it. */
9571 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9573 new_stmt
9574 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9575 vect_simple_var,
9576 "vec_iv_"),
9577 VIEW_CONVERT_EXPR,
9578 build1 (VIEW_CONVERT_EXPR, vectype,
9579 vec_init));
9580 vec_init = gimple_assign_lhs (new_stmt);
9581 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9582 new_stmt);
9583 gcc_assert (!new_bb);
9586 else
9588 /* iv_loop is the loop to be vectorized. Create:
9589 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9590 stmts = NULL;
9591 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9593 unsigned HOST_WIDE_INT const_nunits;
9594 if (nunits.is_constant (&const_nunits))
9596 tree_vector_builder elts (step_vectype, const_nunits, 1);
9597 elts.quick_push (new_name);
9598 for (i = 1; i < const_nunits; i++)
9600 /* Create: new_name_i = new_name + step_expr */
9601 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9602 new_name, step_expr);
9603 elts.quick_push (new_name);
9605 /* Create a vector from [new_name_0, new_name_1, ...,
9606 new_name_nunits-1] */
9607 vec_init = gimple_build_vector (&stmts, &elts);
9609 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9610 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9611 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9612 new_name, step_expr);
9613 else
9615 /* Build:
9616 [base, base, base, ...]
9617 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9618 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9619 gcc_assert (flag_associative_math);
9620 tree index = build_index_vector (step_vectype, 0, 1);
9621 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9622 new_name);
9623 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9624 step_expr);
9625 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9626 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9627 vec_init, step_vec);
9628 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9629 vec_init, base_vec);
9631 vec_init = gimple_convert (&stmts, vectype, vec_init);
9633 if (stmts)
9635 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9636 gcc_assert (!new_bb);
9641 /* Create the vector that holds the step of the induction. */
9642 if (nested_in_vect_loop)
9643 /* iv_loop is nested in the loop to be vectorized. Generate:
9644 vec_step = [S, S, S, S] */
9645 new_name = step_expr;
9646 else
9648 /* iv_loop is the loop to be vectorized. Generate:
9649 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9650 gimple_seq seq = NULL;
9651 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9653 expr = build_int_cst (integer_type_node, vf);
9654 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9656 else
9657 expr = build_int_cst (TREE_TYPE (step_expr), vf);
9658 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9659 expr, step_expr);
9660 if (seq)
9662 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9663 gcc_assert (!new_bb);
9667 t = unshare_expr (new_name);
9668 gcc_assert (CONSTANT_CLASS_P (new_name)
9669 || TREE_CODE (new_name) == SSA_NAME);
9670 new_vec = build_vector_from_val (step_vectype, t);
9671 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9672 new_vec, step_vectype, NULL);
9675 /* Create the following def-use cycle:
9676 loop prolog:
9677 vec_init = ...
9678 vec_step = ...
9679 loop:
9680 vec_iv = PHI <vec_init, vec_loop>
9682 STMT
9684 vec_loop = vec_iv + vec_step; */
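/* Illustrative instance of the cycle above (X == 0, S == 1,
   VF == nunits == 4):
   loop prolog:
     vec_init = [0, 1, 2, 3]
     vec_step = [4, 4, 4, 4]
   loop:
     vec_iv = PHI <vec_init, vec_loop>
     ...
     vec_loop = vec_iv + vec_step;  -> [4, 5, 6, 7], [8, 9, 10, 11], ... */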
9686 /* Create the induction-phi that defines the induction-operand. */
9687 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9688 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9689 induc_def = PHI_RESULT (induction_phi);
9691 /* Create the iv update inside the loop */
9692 stmts = NULL;
9693 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9694 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9695 vec_def = gimple_convert (&stmts, vectype, vec_def);
9696 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9697 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9699 /* Set the arguments of the phi node: */
9700 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9701 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9702 UNKNOWN_LOCATION);
9704 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9705 *vec_stmt = induction_phi;
9707 /* In case that vectorization factor (VF) is bigger than the number
9708 of elements that we can fit in a vectype (nunits), we have to generate
9709 more than one vector stmt - i.e - we need to "unroll" the
9710 vector stmt by a factor VF/nunits. For more details see documentation
9711 in vectorizable_operation. */
9713 if (ncopies > 1)
9715 gimple_seq seq = NULL;
9716 /* FORNOW. This restriction should be relaxed. */
9717 gcc_assert (!nested_in_vect_loop);
9719 /* Create the vector that holds the step of the induction. */
9720 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9722 expr = build_int_cst (integer_type_node, nunits);
9723 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9725 else
9726 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9727 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9728 expr, step_expr);
9729 if (seq)
9731 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9732 gcc_assert (!new_bb);
9735 t = unshare_expr (new_name);
9736 gcc_assert (CONSTANT_CLASS_P (new_name)
9737 || TREE_CODE (new_name) == SSA_NAME);
9738 new_vec = build_vector_from_val (step_vectype, t);
9739 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9740 new_vec, step_vectype, NULL);
9742 vec_def = induc_def;
9743 for (i = 1; i < ncopies; i++)
9745 /* vec_i = vec_prev + vec_step */
9746 gimple_seq stmts = NULL;
9747 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9748 vec_def = gimple_build (&stmts,
9749 PLUS_EXPR, step_vectype, vec_def, vec_step);
9750 vec_def = gimple_convert (&stmts, vectype, vec_def);
9752 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9753 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9754 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9758 if (dump_enabled_p ())
9759 dump_printf_loc (MSG_NOTE, vect_location,
9760 "transform induction: created def-use cycle: %G%G",
9761 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9763 return true;
9766 /* Function vectorizable_live_operation.
9768 STMT_INFO computes a value that is used outside the loop. Check if
9769 it can be supported. */
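/* For illustration (a sketch, not taken from a testcase): in

     int last;
     for (int i = 0; i < n; ++i)
       {
         a[i] = b[i] + 1;
         last = a[i];
       }
     use (last);

   the scalar 'last' is live after the loop; after vectorization its final
   value has to be extracted from the last lane of the last vector of a[i]
   values (or via EXTRACT_LAST when the loop is fully masked). */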
9771 bool
9772 vectorizable_live_operation (vec_info *vinfo,
9773 stmt_vec_info stmt_info,
9774 gimple_stmt_iterator *gsi,
9775 slp_tree slp_node, slp_instance slp_node_instance,
9776 int slp_index, bool vec_stmt_p,
9777 stmt_vector_for_cost *cost_vec)
9779 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9780 imm_use_iterator imm_iter;
9781 tree lhs, lhs_type, bitsize;
9782 tree vectype = (slp_node
9783 ? SLP_TREE_VECTYPE (slp_node)
9784 : STMT_VINFO_VECTYPE (stmt_info));
9785 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9786 int ncopies;
9787 gimple *use_stmt;
9788 auto_vec<tree> vec_oprnds;
9789 int vec_entry = 0;
9790 poly_uint64 vec_index = 0;
9792 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9794 /* If a stmt of a reduction is live, vectorize it via
9795 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9796 validity so just trigger the transform here. */
9797 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9799 if (!vec_stmt_p)
9800 return true;
9801 if (slp_node)
9803 /* For reduction chains the meta-info is attached to
9804 the group leader. */
9805 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9806 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9807 /* For SLP reductions we vectorize the epilogue for
9808 all involved stmts together. */
9809 else if (slp_index != 0)
9810 return true;
9812 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9813 gcc_assert (reduc_info->is_reduc_info);
9814 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9815 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9816 return true;
9817 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9818 slp_node_instance);
9819 return true;
9822 /* If STMT is not relevant and it is a simple assignment and its inputs are
9823 invariant then it can remain in place, unvectorized. The original last
9824 scalar value that it computes will be used. */
9825 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9827 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9828 if (dump_enabled_p ())
9829 dump_printf_loc (MSG_NOTE, vect_location,
9830 "statement is simple and uses invariant. Leaving in "
9831 "place.\n");
9832 return true;
9835 if (slp_node)
9836 ncopies = 1;
9837 else
9838 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9840 if (slp_node)
9842 gcc_assert (slp_index >= 0);
9844 /* Get the last occurrence of the scalar index from the concatenation of
9845 all the slp vectors. Calculate which slp vector it is and the index
9846 within. */
9847 int num_scalar = SLP_TREE_LANES (slp_node);
9848 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9849 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9851 /* Calculate which vector contains the result, and which lane of
9852 that vector we need. */
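/* For example (illustrative): with 3 scalar lanes, 2 vector stmts and
   nunits == 4, pos == 2 * 4 - 3 + slp_index; for slp_index == 2 that is
   7, i.e. vec_entry == 1 and vec_index == 3, the last lane of the second
   vector. */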
9853 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9855 if (dump_enabled_p ())
9856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9857 "Cannot determine which vector holds the"
9858 " final result.\n");
9859 return false;
9863 if (!vec_stmt_p)
9865 /* No transformation required. */
9866 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9868 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9869 OPTIMIZE_FOR_SPEED))
9871 if (dump_enabled_p ())
9872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9873 "can't operate on partial vectors "
9874 "because the target doesn't support extract "
9875 "last reduction.\n");
9876 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9878 else if (slp_node)
9880 if (dump_enabled_p ())
9881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9882 "can't operate on partial vectors "
9883 "because an SLP statement is live after "
9884 "the loop.\n");
9885 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9887 else if (ncopies > 1)
9889 if (dump_enabled_p ())
9890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9891 "can't operate on partial vectors "
9892 "because ncopies is greater than 1.\n");
9893 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9895 else
9897 gcc_assert (ncopies == 1 && !slp_node);
9898 vect_record_loop_mask (loop_vinfo,
9899 &LOOP_VINFO_MASKS (loop_vinfo),
9900 1, vectype, NULL);
9903 /* ??? Enable for loop costing as well. */
9904 if (!loop_vinfo)
9905 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9906 0, vect_epilogue);
9907 return true;
9910 /* Use the lhs of the original scalar statement. */
9911 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9912 if (dump_enabled_p ())
9913 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9914 "stmt %G", stmt);
9916 lhs = gimple_get_lhs (stmt);
9917 lhs_type = TREE_TYPE (lhs);
9919 bitsize = vector_element_bits_tree (vectype);
9921 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9922 tree vec_lhs, bitstart;
9923 gimple *vec_stmt;
9924 if (slp_node)
9926 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9928 /* Get the correct slp vectorized stmt. */
9929 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9930 vec_lhs = gimple_get_lhs (vec_stmt);
9932 /* Get entry to use. */
9933 bitstart = bitsize_int (vec_index);
9934 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9936 else
9938 /* For multiple copies, get the last copy. */
9939 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9940 vec_lhs = gimple_get_lhs (vec_stmt);
9942 /* Get the last lane in the vector. */
9943 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9946 if (loop_vinfo)
9948 /* Ensure that VEC_LHS satisfies the loop-closed PHI requirement for the
9949 lane extraction stmts by inserting one PHI node for it. It looks like:
9950 loop;
9952 # lhs' = PHI <lhs>
9954 loop;
9956 # vec_lhs' = PHI <vec_lhs>
9957 new_tree = lane_extract <vec_lhs', ...>;
9958 lhs' = new_tree; */
9960 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9961 basic_block exit_bb = single_exit (loop)->dest;
9962 gcc_assert (single_pred_p (exit_bb));
9964 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
9965 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
9966 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
9968 gimple_seq stmts = NULL;
9969 tree new_tree;
9970 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9972 /* Emit:
9974 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
9976 where VEC_LHS is the vectorized live-out result and MASK is
9977 the loop mask for the final iteration. */
9978 gcc_assert (ncopies == 1 && !slp_node);
9979 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
9980 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
9981 1, vectype, 0);
9982 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
9983 mask, vec_lhs_phi);
9985 /* Convert the extracted vector element to the scalar type. */
9986 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
9988 else
9990 tree bftype = TREE_TYPE (vectype);
9991 if (VECTOR_BOOLEAN_TYPE_P (vectype))
9992 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
9993 new_tree = build3 (BIT_FIELD_REF, bftype,
9994 vec_lhs_phi, bitsize, bitstart);
9995 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
9996 &stmts, true, NULL_TREE);
9999 if (stmts)
10001 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10002 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10004 /* Remove the existing PHI for lhs and replace it with a copy from new_tree. */
10005 tree lhs_phi = NULL_TREE;
10006 gimple_stmt_iterator gsi;
10007 for (gsi = gsi_start_phis (exit_bb);
10008 !gsi_end_p (gsi); gsi_next (&gsi))
10010 gimple *phi = gsi_stmt (gsi);
10011 if ((gimple_phi_arg_def (phi, 0) == lhs))
10013 remove_phi_node (&gsi, false);
10014 lhs_phi = gimple_phi_result (phi);
10015 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10016 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10017 break;
10022 /* Replace uses of lhs with the newly computed result. If the use stmt is a
10023 single-arg PHI, just replace all uses of the PHI result. This is necessary
10024 because the LC SSA PHI defining lhs may come before the newly inserted stmt. */
10025 use_operand_p use_p;
10026 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10027 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10028 && !is_gimple_debug (use_stmt))
10030 if (gimple_code (use_stmt) == GIMPLE_PHI
10031 && gimple_phi_num_args (use_stmt) == 1)
10033 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10035 else
10037 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10038 SET_USE (use_p, new_tree);
10040 update_stmt (use_stmt);
10043 else
10045 /* For basic-block vectorization simply insert the lane-extraction. */
10046 tree bftype = TREE_TYPE (vectype);
10047 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10048 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10049 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10050 vec_lhs, bitsize, bitstart);
10051 gimple_seq stmts = NULL;
10052 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10053 &stmts, true, NULL_TREE);
10054 if (TREE_CODE (new_tree) == SSA_NAME
10055 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10056 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10057 if (is_a <gphi *> (vec_stmt))
10059 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10060 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10062 else
10064 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10065 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10068 /* Replace use of lhs with newly computed result. If the use stmt is a
10069 single arg PHI, just replace all uses of PHI result. It's necessary
10070 because lcssa PHI defining lhs may be before newly inserted stmt. */
10071 use_operand_p use_p;
10072 stmt_vec_info use_stmt_info;
10073 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10074 if (!is_gimple_debug (use_stmt)
10075 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10076 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10078 /* ??? This can happen when the live lane ends up being
10079 used in a vector construction code-generated by an
10080 external SLP node (and code-generation for that already
10081 happened). See gcc.dg/vect/bb-slp-47.c.
10082 Doing this is what would happen if that vector CTOR
10083 were not code-generated yet so it is not too bad.
10084 ??? In fact we'd likely want to avoid this situation
10085 in the first place. */
10086 if (TREE_CODE (new_tree) == SSA_NAME
10087 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10088 && gimple_code (use_stmt) != GIMPLE_PHI
10089 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10090 use_stmt))
10092 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10093 gcc_assert (code == CONSTRUCTOR
10094 || code == VIEW_CONVERT_EXPR
10095 || CONVERT_EXPR_CODE_P (code));
10096 if (dump_enabled_p ())
10097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10098 "Using original scalar computation for "
10099 "live lane because use preceeds vector "
10100 "def\n");
10101 continue;
10103 /* ??? It can also happen that we end up pulling a def into
10104 a loop where replacing out-of-loop uses would require
10105 a new LC SSA PHI node. Retain the original scalar in
10106 those cases as well. PR98064. */
10107 if (TREE_CODE (new_tree) == SSA_NAME
10108 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10109 && (gimple_bb (use_stmt)->loop_father
10110 != gimple_bb (vec_stmt)->loop_father)
10111 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10112 gimple_bb (use_stmt)->loop_father))
10114 if (dump_enabled_p ())
10115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10116 "Using original scalar computation for "
10117 "live lane because there is an out-of-loop "
10118 "definition for it\n");
10119 continue;
10121 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10122 SET_USE (use_p, new_tree);
10123 update_stmt (use_stmt);
10127 return true;
10130 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10132 static void
10133 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10135 ssa_op_iter op_iter;
10136 imm_use_iterator imm_iter;
10137 def_operand_p def_p;
10138 gimple *ustmt;
10140 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10142 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10144 basic_block bb;
10146 if (!is_gimple_debug (ustmt))
10147 continue;
10149 bb = gimple_bb (ustmt);
10151 if (!flow_bb_inside_loop_p (loop, bb))
10153 if (gimple_debug_bind_p (ustmt))
10155 if (dump_enabled_p ())
10156 dump_printf_loc (MSG_NOTE, vect_location,
10157 "killing debug use\n");
10159 gimple_debug_bind_reset_value (ustmt);
10160 update_stmt (ustmt);
10162 else
10163 gcc_unreachable ();
10169 /* Given loop represented by LOOP_VINFO, return true if computation of
10170 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10171 otherwise. */
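/* An illustrative case (not taken from the sources or testsuite): for

     unsigned char i = 0;
     do
       a[i] = 0;
     while (++i != 0);

   the loop runs 256 times, NITERSM1 is 255 in unsigned char, and NITERS
   (= NITERSM1 + 1), computed in the same type, wraps to 0; the function
   below must return false for such loops.  */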
10173 static bool
10174 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10176 /* Constant case. */
10177 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10179 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10180 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10182 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10183 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10184 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10185 return true;
10188 widest_int max;
10189 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10190 /* Check the upper bound of loop niters. */
10191 if (get_max_loop_iterations (loop, &max))
10193 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10194 signop sgn = TYPE_SIGN (type);
10195 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10196 if (max < type_max)
10197 return true;
10199 return false;
10202 /* Return a mask type with half the number of elements as OLD_TYPE,
10203 given that it should have mode NEW_MODE. */
10205 tree
10206 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10208 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10209 return build_truth_vector_type_for_mode (nunits, new_mode);
10212 /* Return a mask type with twice as many elements as OLD_TYPE,
10213 given that it should have mode NEW_MODE. */
10215 tree
10216 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10218 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10219 return build_truth_vector_type_for_mode (nunits, new_mode);
10222 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10223 contain a sequence of NVECTORS masks that each control a vector of type
10224 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10225 these vector masks with the vector version of SCALAR_MASK. */
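/* An illustrative sketch of the bookkeeping (hypothetical numbers): with a
   vectorization factor of 16 and an rgroup that needs NVECTORS == 2 masks
   for VECTYPE V8HI, the function below records

     nscalars_per_iter = (2 * 8) / 16 = 1

   in MASKS[NVECTORS - 1], growing MASKS first if it has fewer than NVECTORS
   entries.  A later call for the same NVECTORS with more scalars per
   iteration simply raises max_nscalars_per_iter (and refreshes the recorded
   mask type).  */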
10227 void
10228 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10229 unsigned int nvectors, tree vectype, tree scalar_mask)
10231 gcc_assert (nvectors != 0);
10232 if (masks->length () < nvectors)
10233 masks->safe_grow_cleared (nvectors, true);
10234 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10235 /* The number of scalars per iteration and the number of vectors are
10236 both compile-time constants. */
10237 unsigned int nscalars_per_iter
10238 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10239 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10241 if (scalar_mask)
10243 scalar_cond_masked_key cond (scalar_mask, nvectors);
10244 loop_vinfo->scalar_cond_masked_set.add (cond);
10247 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10249 rgm->max_nscalars_per_iter = nscalars_per_iter;
10250 rgm->type = truth_type_for (vectype);
10251 rgm->factor = 1;
10255 /* Given a complete set of masks MASKS, extract mask number INDEX
10256 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10257 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10259 See the comment above vec_loop_masks for more details about the mask
10260 arrangement. */
10262 tree
10263 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10264 unsigned int nvectors, tree vectype, unsigned int index)
10266 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10267 tree mask_type = rgm->type;
10269 /* Populate the rgroup's mask array, if this is the first time we've
10270 used it. */
10271 if (rgm->controls.is_empty ())
10273 rgm->controls.safe_grow_cleared (nvectors, true);
10274 for (unsigned int i = 0; i < nvectors; ++i)
10276 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10277 /* Provide a dummy definition until the real one is available. */
10278 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10279 rgm->controls[i] = mask;
10283 tree mask = rgm->controls[index];
10284 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10285 TYPE_VECTOR_SUBPARTS (vectype)))
10287 /* A loop mask for data type X can be reused for data type Y
10288 if X has N times more elements than Y and if Y's elements
10289 are N times bigger than X's. In this case each sequence
10290 of N elements in the loop mask will be all-zero or all-one.
10291 We can then view-convert the mask so that each sequence of
10292 N elements is replaced by a single element. */
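/* For instance (illustrative only): a mask recorded for a V8HI rgroup can
   control a V4SI rgroup; each adjacent pair of mask elements is known to be
   all-zero or all-one, so view-converting the 8-element mask to the
   4-element truth type of V4SI yields the mask required here.  */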
10293 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10294 TYPE_VECTOR_SUBPARTS (vectype)));
10295 gimple_seq seq = NULL;
10296 mask_type = truth_type_for (vectype);
10297 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10298 if (seq)
10299 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10301 return mask;
10304 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10305 lengths for controlling an operation on VECTYPE. The operation splits
10306 each element of VECTYPE into FACTOR separate subelements, measuring the
10307 length as a number of these subelements. */
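/* A small illustrative example (hypothetical numbers): a V4SI access that
   has to fall back to a byte-granular length would be recorded with
   VECTYPE == V4SI and FACTOR == 4, so the recorded length counts the 16
   byte-sized subelements of each vector rather than its 4 int elements.  */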
10309 void
10310 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10311 unsigned int nvectors, tree vectype, unsigned int factor)
10313 gcc_assert (nvectors != 0);
10314 if (lens->length () < nvectors)
10315 lens->safe_grow_cleared (nvectors, true);
10316 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10318 /* The number of scalars per iteration, the bytes occupied by a scalar and
10319 the number of vectors are all compile-time constants. */
10320 unsigned int nscalars_per_iter
10321 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10322 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10324 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10326 /* For now, we only support cases in which all loads and stores fall back
10327 to VnQI or none do. */
10328 gcc_assert (!rgl->max_nscalars_per_iter
10329 || (rgl->factor == 1 && factor == 1)
10330 || (rgl->max_nscalars_per_iter * rgl->factor
10331 == nscalars_per_iter * factor));
10332 rgl->max_nscalars_per_iter = nscalars_per_iter;
10333 rgl->type = vectype;
10334 rgl->factor = factor;
10338 /* Given a complete set of lengths LENS, extract length number INDEX for an
10339 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
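/* A note on the bias handling below (a sketch; target details vary): when
   the target reports a nonzero partial load/store bias, the control that a
   load or store actually consumes is not the raw length but a separate
   "adjusted" length that conceptually incorporates the bias,

     adjusted_loop_len = loop_len + bias   (bias is typically negative)

   hence the extra bias_adjusted_ctrl SSA name created for index 0 and
   returned in preference to the plain control.  */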
10341 tree
10342 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10343 unsigned int nvectors, unsigned int index)
10345 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10346 bool use_bias_adjusted_len =
10347 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10349 /* Populate the rgroup's len array, if this is the first time we've
10350 used it. */
10351 if (rgl->controls.is_empty ())
10353 rgl->controls.safe_grow_cleared (nvectors, true);
10354 for (unsigned int i = 0; i < nvectors; ++i)
10356 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10357 gcc_assert (len_type != NULL_TREE);
10359 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10361 /* Provide a dummy definition until the real one is available. */
10362 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10363 rgl->controls[i] = len;
10365 if (use_bias_adjusted_len)
10367 gcc_assert (i == 0);
10368 tree adjusted_len =
10369 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10370 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10371 rgl->bias_adjusted_ctrl = adjusted_len;
10376 if (use_bias_adjusted_len)
10377 return rgl->bias_adjusted_ctrl;
10378 else
10379 return rgl->controls[index];
10382 /* Scale profiling counters by estimation for LOOP which is vectorized
10383 by factor VF. */
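/* A worked example with made-up numbers: if the loop header count is 400
   and the preheader count is 4 (roughly 100 iterations per entry) and VF
   is 4, the unrolled estimate is about 24 latch iterations; the header
   count is then rescaled to roughly 4 * (24 + 1) = 100 and the exit edge
   probability becomes 1 / (24 + 1).  */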
10385 static void
10386 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10388 edge preheader = loop_preheader_edge (loop);
10389 /* Reduce loop iterations by the vectorization factor. */
10390 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10391 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10393 if (freq_h.nonzero_p ())
10395 profile_probability p;
10397 /* Avoid dropping loop body profile counter to 0 because of zero count
10398 in loop's preheader. */
10399 if (!(freq_e == profile_count::zero ()))
10400 freq_e = freq_e.force_nonzero ();
10401 p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10402 scale_loop_frequencies (loop, p);
10405 edge exit_e = single_exit (loop);
10406 exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10408 edge exit_l = single_pred_edge (loop->latch);
10409 profile_probability prob = exit_l->probability;
10410 exit_l->probability = exit_e->probability.invert ();
10411 if (prob.initialized_p () && exit_l->probability.initialized_p ())
10412 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10415 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10416 latch edge values originally defined by it. */
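/* Roughly (an illustrative sketch): for a relevant header PHI

     # x_1 = PHI <x_init (preheader), x_2 (latch)>
     # vect_x_1 = PHI <vect_x_init (preheader), ??? (latch)>

   whose scalar latch value x_2 is defined by DEF_STMT_INFO, this fills in
   the still-missing latch arguments of the vectorized PHIs with the lhs of
   the corresponding vectorized latch definitions (with special handling for
   first-order recurrences, which also feed a VEC_PERM_EXPR).  */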
10418 static void
10419 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10420 stmt_vec_info def_stmt_info)
10422 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10423 if (!def || TREE_CODE (def) != SSA_NAME)
10424 return;
10425 stmt_vec_info phi_info;
10426 imm_use_iterator iter;
10427 use_operand_p use_p;
10428 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10430 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10431 if (!phi)
10432 continue;
10433 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10434 && (phi_info = loop_vinfo->lookup_stmt (phi))
10435 && STMT_VINFO_RELEVANT_P (phi_info)))
10436 continue;
10437 loop_p loop = gimple_bb (phi)->loop_father;
10438 edge e = loop_latch_edge (loop);
10439 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10440 continue;
10442 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10443 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10444 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10446 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10447 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10448 gcc_assert (phi_defs.length () == latch_defs.length ());
10449 for (unsigned i = 0; i < phi_defs.length (); ++i)
10450 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10451 gimple_get_lhs (latch_defs[i]), e,
10452 gimple_phi_arg_location (phi, e->dest_idx));
10454 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10456 /* For first order recurrences we have to update both uses of
10457 the latch definition, the one in the PHI node and the one
10458 in the generated VEC_PERM_EXPR. */
10459 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10460 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10461 gcc_assert (phi_defs.length () == latch_defs.length ());
10462 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10463 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10464 for (unsigned i = 0; i < phi_defs.length (); ++i)
10466 gassign *perm = as_a <gassign *> (phi_defs[i]);
10467 if (i > 0)
10468 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10469 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10470 update_stmt (perm);
10472 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10473 gimple_phi_arg_location (phi, e->dest_idx));
10478 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10479 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10480 stmt_vec_info. */
10482 static bool
10483 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10484 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10486 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10487 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10489 if (dump_enabled_p ())
10490 dump_printf_loc (MSG_NOTE, vect_location,
10491 "------>vectorizing statement: %G", stmt_info->stmt);
10493 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10494 vect_loop_kill_debug_uses (loop, stmt_info);
10496 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10497 && !STMT_VINFO_LIVE_P (stmt_info))
10498 return false;
10500 if (STMT_VINFO_VECTYPE (stmt_info))
10502 poly_uint64 nunits
10503 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10504 if (!STMT_SLP_TYPE (stmt_info)
10505 && maybe_ne (nunits, vf)
10506 && dump_enabled_p ())
10507 /* For SLP, VF is set according to the unrolling factor, not to
10508 the vector size, hence this message is not valid for SLP. */
10509 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10512 /* Pure SLP statements have already been vectorized. We still need
10513 to apply loop vectorization to hybrid SLP statements. */
10514 if (PURE_SLP_STMT (stmt_info))
10515 return false;
10517 if (dump_enabled_p ())
10518 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10520 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10521 *seen_store = stmt_info;
10523 return true;
10526 /* Helper function to pass to simplify_replace_tree to enable replacing trees
10527 in the hash_map with their corresponding values. */
10529 static tree
10530 find_in_mapping (tree t, void *context)
10532 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10534 tree *value = mapping->get (t);
10535 return value ? *value : t;
10538 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10539 original loop that has now been vectorized.
10541 The inits of the data_references need to be advanced with the number of
10542 iterations of the main loop. This has been computed in vect_do_peeling and
10543 is stored in parameter ADVANCE. We first restore the data_references'
10544 initial offset with the values recorded in ORIG_DRS_INIT.
10546 Since the loop_vec_info of this EPILOGUE was constructed for the original
10547 loop, its stmt_vec_infos all point to the original statements. These need
10548 to be updated to point to their corresponding copies as well as the SSA_NAMES
10549 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10551 The data_references' connections also need to be updated. Their
10552 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10553 stmt_vec_infos, their statements need to point to their corresponding copy,
10554 if they are gather loads or scatter stores then their reference needs to be
10555 updated to point to its corresponding copy and finally we set
10556 'base_misaligned' to false as we have already peeled for alignment in the
10557 prologue of the main loop. */
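/* A schematic illustration (SSA names invented): if the main loop contains

     _5 = a[i_3];

   and its epilogue copy contains

     _15 = a[i_13];

   then MAPPING records _5 -> _15 (and likewise for the other defs), and the
   worklist pass below rewrites pattern def sequences and related stmts that
   still refer to the main-loop names.  */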
10559 static void
10560 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10562 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10563 auto_vec<gimple *> stmt_worklist;
10564 hash_map<tree,tree> mapping;
10565 gimple *orig_stmt, *new_stmt;
10566 gimple_stmt_iterator epilogue_gsi;
10567 gphi_iterator epilogue_phi_gsi;
10568 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10569 basic_block *epilogue_bbs = get_loop_body (epilogue);
10570 unsigned i;
10572 free (LOOP_VINFO_BBS (epilogue_vinfo));
10573 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10575 /* Advance data_reference's with the number of iterations of the previous
10576 loop and its prologue. */
10577 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10580 /* The EPILOGUE loop is a copy of the original loop so they share the same
10581 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10582 point to the copied statements. We also create a mapping of all LHS' in
10583 the original loop and all the LHS' in the EPILOGUE and create worklists to
10584 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
10585 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10587 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10588 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10590 new_stmt = epilogue_phi_gsi.phi ();
10592 gcc_assert (gimple_uid (new_stmt) > 0);
10593 stmt_vinfo
10594 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10596 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10597 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10599 mapping.put (gimple_phi_result (orig_stmt),
10600 gimple_phi_result (new_stmt));
10601 /* PHI nodes can not have patterns or related statements. */
10602 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10603 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10606 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10607 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10609 new_stmt = gsi_stmt (epilogue_gsi);
10610 if (is_gimple_debug (new_stmt))
10611 continue;
10613 gcc_assert (gimple_uid (new_stmt) > 0);
10614 stmt_vinfo
10615 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10617 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10618 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10620 if (tree old_lhs = gimple_get_lhs (orig_stmt))
10621 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10623 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10625 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10626 for (gimple_stmt_iterator gsi = gsi_start (seq);
10627 !gsi_end_p (gsi); gsi_next (&gsi))
10628 stmt_worklist.safe_push (gsi_stmt (gsi));
10631 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10632 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10634 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10635 stmt_worklist.safe_push (stmt);
10636 /* Set BB such that the assert in
10637 'get_initial_def_for_reduction' is able to determine that
10638 the BB of the related stmt is inside this loop. */
10639 gimple_set_bb (stmt,
10640 gimple_bb (new_stmt));
10641 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10642 gcc_assert (related_vinfo == NULL
10643 || related_vinfo == stmt_vinfo);
10648 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10649 using the original main loop and thus need to be updated to refer to the
10650 cloned variables used in the epilogue. */
10651 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10653 gimple *stmt = stmt_worklist[i];
10654 tree *new_op;
10656 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10658 tree op = gimple_op (stmt, j);
10659 if ((new_op = mapping.get(op)))
10660 gimple_set_op (stmt, j, *new_op);
10661 else
10663 /* PR92429: The last argument of simplify_replace_tree disables
10664 folding when replacing arguments. This is required as
10665 otherwise you might end up with different statements than the
10666 ones analyzed in vect_loop_analyze, leading to different
10667 vectorization. */
10668 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10669 &find_in_mapping, &mapping, false);
10670 gimple_set_op (stmt, j, op);
10675 struct data_reference *dr;
10676 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10677 FOR_EACH_VEC_ELT (datarefs, i, dr)
10679 orig_stmt = DR_STMT (dr);
10680 gcc_assert (gimple_uid (orig_stmt) > 0);
10681 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10682 /* Data references for gather loads and scatter stores do not use the
10683 updated offset we set using ADVANCE. Instead we have to make sure the
10684 references in the data references point to the corresponding copies of
10685 the originals in the epilogue. */
10686 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10687 == VMAT_GATHER_SCATTER)
10689 DR_REF (dr)
10690 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10691 &find_in_mapping, &mapping);
10692 DR_BASE_ADDRESS (dr)
10693 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10694 &find_in_mapping, &mapping);
10696 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10697 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10698 /* The vector size of the epilogue is smaller than that of the main loop,
10699 so the alignment requirement is either the same or lower. This means
10700 the dr will by definition be aligned. */
10701 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10704 epilogue_vinfo->shared->datarefs_copy.release ();
10705 epilogue_vinfo->shared->save_datarefs ();
10708 /* Function vect_transform_loop.
10710 The analysis phase has determined that the loop is vectorizable.
10711 Vectorize the loop - create vectorized stmts to replace the scalar
10712 stmts in the loop, and update the loop exit condition.
10713 Returns scalar epilogue loop if any. */
10715 class loop *
10716 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10718 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10719 class loop *epilogue = NULL;
10720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10721 int nbbs = loop->num_nodes;
10722 int i;
10723 tree niters_vector = NULL_TREE;
10724 tree step_vector = NULL_TREE;
10725 tree niters_vector_mult_vf = NULL_TREE;
10726 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10727 unsigned int lowest_vf = constant_lower_bound (vf);
10728 gimple *stmt;
10729 bool check_profitability = false;
10730 unsigned int th;
10732 DUMP_VECT_SCOPE ("vec_transform_loop");
10734 loop_vinfo->shared->check_datarefs ();
10736 /* Use the more conservative vectorization threshold. If the number
10737 of iterations is constant assume the cost check has been performed
10738 by our caller. If the threshold makes all loops profitable that
10739 run at least the (estimated) vectorization factor number of times
10740 checking is pointless, too. */
10741 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10742 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10744 if (dump_enabled_p ())
10745 dump_printf_loc (MSG_NOTE, vect_location,
10746 "Profitability threshold is %d loop iterations.\n",
10747 th);
10748 check_profitability = true;
10751 /* Make sure there exists a single-predecessor exit bb. Do this before
10752 versioning. */
10753 edge e = single_exit (loop);
10754 if (! single_pred_p (e->dest))
10756 split_loop_exit_edge (e, true);
10757 if (dump_enabled_p ())
10758 dump_printf (MSG_NOTE, "split exit edge\n");
10761 /* Version the loop first, if required, so the profitability check
10762 comes first. */
10764 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10766 class loop *sloop
10767 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10768 sloop->force_vectorize = false;
10769 check_profitability = false;
10772 /* Make sure there exists a single-predecessor exit bb also on the
10773 scalar loop copy. Do this after versioning but before peeling
10774 so CFG structure is fine for both scalar and if-converted loop
10775 to make slpeel_duplicate_current_defs_from_edges face matched
10776 loop closed PHI nodes on the exit. */
10777 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10779 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10780 if (! single_pred_p (e->dest))
10782 split_loop_exit_edge (e, true);
10783 if (dump_enabled_p ())
10784 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10788 tree niters = vect_build_loop_niters (loop_vinfo);
10789 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10790 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10791 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10792 tree advance;
10793 drs_init_vec orig_drs_init;
10795 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10796 &step_vector, &niters_vector_mult_vf, th,
10797 check_profitability, niters_no_overflow,
10798 &advance);
10800 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10801 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10802 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10803 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10805 if (niters_vector == NULL_TREE)
10807 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10808 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10809 && known_eq (lowest_vf, vf))
10811 niters_vector
10812 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10813 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10814 step_vector = build_one_cst (TREE_TYPE (niters));
10816 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10817 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10818 &step_vector, niters_no_overflow);
10819 else
10820 /* vect_do_peeling subtracted the number of peeled prologue
10821 iterations from LOOP_VINFO_NITERS. */
10822 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10823 &niters_vector, &step_vector,
10824 niters_no_overflow);
10827 /* 1) Make sure the loop header has exactly two entries
10828 2) Make sure we have a preheader basic block. */
10830 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10832 split_edge (loop_preheader_edge (loop));
10834 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10835 /* This will deal with any possible peeling. */
10836 vect_prepare_for_masked_peels (loop_vinfo);
10838 /* Schedule the SLP instances first, then handle loop vectorization
10839 below. */
10840 if (!loop_vinfo->slp_instances.is_empty ())
10842 DUMP_VECT_SCOPE ("scheduling SLP instances");
10843 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10846 /* FORNOW: the vectorizer supports only loops whose body consists
10847 of one basic block (header + empty latch). When the vectorizer
10848 supports more involved loop forms, the order in which the BBs are
10849 traversed needs to be reconsidered. */
10851 for (i = 0; i < nbbs; i++)
10853 basic_block bb = bbs[i];
10854 stmt_vec_info stmt_info;
10856 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10857 gsi_next (&si))
10859 gphi *phi = si.phi ();
10860 if (dump_enabled_p ())
10861 dump_printf_loc (MSG_NOTE, vect_location,
10862 "------>vectorizing phi: %G", (gimple *) phi);
10863 stmt_info = loop_vinfo->lookup_stmt (phi);
10864 if (!stmt_info)
10865 continue;
10867 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10868 vect_loop_kill_debug_uses (loop, stmt_info);
10870 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10871 && !STMT_VINFO_LIVE_P (stmt_info))
10872 continue;
10874 if (STMT_VINFO_VECTYPE (stmt_info)
10875 && (maybe_ne
10876 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10877 && dump_enabled_p ())
10878 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10880 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10881 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10882 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10883 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10884 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
10885 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10886 && ! PURE_SLP_STMT (stmt_info))
10888 if (dump_enabled_p ())
10889 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10890 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10894 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10895 gsi_next (&si))
10897 gphi *phi = si.phi ();
10898 stmt_info = loop_vinfo->lookup_stmt (phi);
10899 if (!stmt_info)
10900 continue;
10902 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10903 && !STMT_VINFO_LIVE_P (stmt_info))
10904 continue;
10906 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10907 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10908 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10909 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10910 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
10911 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
10912 && ! PURE_SLP_STMT (stmt_info))
10913 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10916 for (gimple_stmt_iterator si = gsi_start_bb (bb);
10917 !gsi_end_p (si);)
10919 stmt = gsi_stmt (si);
10920 /* During vectorization remove existing clobber stmts. */
10921 if (gimple_clobber_p (stmt))
10923 unlink_stmt_vdef (stmt);
10924 gsi_remove (&si, true);
10925 release_defs (stmt);
10927 else
10929 /* Ignore vector stmts created in the outer loop. */
10930 stmt_info = loop_vinfo->lookup_stmt (stmt);
10932 /* vector stmts created in the outer-loop during vectorization of
10933 stmts in an inner-loop may not have a stmt_info, and do not
10934 need to be vectorized. */
10935 stmt_vec_info seen_store = NULL;
10936 if (stmt_info)
10938 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10940 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10941 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10942 !gsi_end_p (subsi); gsi_next (&subsi))
10944 stmt_vec_info pat_stmt_info
10945 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10946 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10947 &si, &seen_store);
10949 stmt_vec_info pat_stmt_info
10950 = STMT_VINFO_RELATED_STMT (stmt_info);
10951 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10952 &si, &seen_store))
10953 maybe_set_vectorized_backedge_value (loop_vinfo,
10954 pat_stmt_info);
10956 else
10958 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
10959 &seen_store))
10960 maybe_set_vectorized_backedge_value (loop_vinfo,
10961 stmt_info);
10964 gsi_next (&si);
10965 if (seen_store)
10967 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
10968 /* Interleaving. The vectorization of the
10969 interleaving chain was completed - free
10970 all the stores in the chain. */
10971 vect_remove_stores (loop_vinfo,
10972 DR_GROUP_FIRST_ELEMENT (seen_store));
10973 else
10974 /* Free the attached stmt_vec_info and remove the stmt. */
10975 loop_vinfo->remove_stmt (stmt_info);
10980 /* Stub out scalar statements that must not survive vectorization.
10981 Doing this here helps with grouped statements, or statements that
10982 are involved in patterns. */
10983 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
10984 !gsi_end_p (gsi); gsi_next (&gsi))
10986 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
10987 if (!call || !gimple_call_internal_p (call))
10988 continue;
10989 internal_fn ifn = gimple_call_internal_fn (call);
10990 if (ifn == IFN_MASK_LOAD)
10992 tree lhs = gimple_get_lhs (call);
10993 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10995 tree zero = build_zero_cst (TREE_TYPE (lhs));
10996 gimple *new_stmt = gimple_build_assign (lhs, zero);
10997 gsi_replace (&gsi, new_stmt, true);
11000 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11002 tree lhs = gimple_get_lhs (call);
11003 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11005 tree else_arg
11006 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11007 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11008 gsi_replace (&gsi, new_stmt, true);
11012 } /* BBs in loop */
11014 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11015 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11016 if (integer_onep (step_vector))
11017 niters_no_overflow = true;
11018 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11019 niters_vector_mult_vf, !niters_no_overflow);
11021 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11022 scale_profile_for_vect_loop (loop, assumed_vf);
11024 /* True if the final iteration might not handle a full vector's
11025 worth of scalar iterations. */
11026 bool final_iter_may_be_partial
11027 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11028 /* The minimum number of iterations performed by the epilogue. This
11029 is 1 when peeling for gaps because we always need a final scalar
11030 iteration. */
11031 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11032 /* +1 to convert latch counts to loop iteration counts,
11033 -min_epilogue_iters to remove iterations that cannot be performed
11034 by the vector code. */
11035 int bias_for_lowest = 1 - min_epilogue_iters;
11036 int bias_for_assumed = bias_for_lowest;
11037 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11038 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11040 /* When the amount of peeling is known at compile time, the first
11041 iteration will have exactly alignment_npeels active elements.
11042 In the worst case it will have at least one. */
11043 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11044 bias_for_lowest += lowest_vf - min_first_active;
11045 bias_for_assumed += assumed_vf - min_first_active;
11047 /* In these calculations the "- 1" converts loop iteration counts
11048 back to latch counts. */
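/* For instance (illustrative numbers): with an upper bound of 17 latch
   iterations, lowest_vf == 4, no partial vectors and no peeling for gaps,
   bias_for_lowest is 1 and the new bound is

     udiv_floor (17 + 1, 4) - 1 = 4 - 1 = 3

   latch iterations of the vector loop.  */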
11049 if (loop->any_upper_bound)
11051 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11052 loop->nb_iterations_upper_bound
11053 = (final_iter_may_be_partial
11054 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11055 lowest_vf) - 1
11056 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11057 lowest_vf) - 1);
11058 if (main_vinfo
11059 /* Both peeling for alignment and peeling for gaps can end up
11060 with the scalar epilogue running for more than VF-1 iterations. */
11061 && !main_vinfo->peeling_for_alignment
11062 && !main_vinfo->peeling_for_gaps)
11064 unsigned int bound;
11065 poly_uint64 main_iters
11066 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11067 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11068 main_iters
11069 = upper_bound (main_iters,
11070 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11071 if (can_div_away_from_zero_p (main_iters,
11072 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11073 &bound))
11074 loop->nb_iterations_upper_bound
11075 = wi::umin ((widest_int) (bound - 1),
11076 loop->nb_iterations_upper_bound);
11079 if (loop->any_likely_upper_bound)
11080 loop->nb_iterations_likely_upper_bound
11081 = (final_iter_may_be_partial
11082 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11083 + bias_for_lowest, lowest_vf) - 1
11084 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11085 + bias_for_lowest, lowest_vf) - 1);
11086 if (loop->any_estimate)
11087 loop->nb_iterations_estimate
11088 = (final_iter_may_be_partial
11089 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11090 assumed_vf) - 1
11091 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11092 assumed_vf) - 1);
11094 if (dump_enabled_p ())
11096 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11098 dump_printf_loc (MSG_NOTE, vect_location,
11099 "LOOP VECTORIZED\n");
11100 if (loop->inner)
11101 dump_printf_loc (MSG_NOTE, vect_location,
11102 "OUTER LOOP VECTORIZED\n");
11103 dump_printf (MSG_NOTE, "\n");
11105 else
11106 dump_printf_loc (MSG_NOTE, vect_location,
11107 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11108 GET_MODE_NAME (loop_vinfo->vector_mode));
11111 /* Loops vectorized with a variable factor won't benefit from
11112 unrolling/peeling. */
11113 if (!vf.is_constant ())
11115 loop->unroll = 1;
11116 if (dump_enabled_p ())
11117 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11118 " variable-length vectorization factor\n");
11120 /* Free SLP instances here because otherwise stmt reference counting
11121 won't work. */
11122 slp_instance instance;
11123 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11124 vect_free_slp_instance (instance);
11125 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11126 /* Clear the safelen field since its value is invalid after vectorization:
11127 the vectorized loop can have loop-carried dependencies. */
11128 loop->safelen = 0;
11130 if (epilogue)
11132 update_epilogue_loop_vinfo (epilogue, advance);
11134 epilogue->simduid = loop->simduid;
11135 epilogue->force_vectorize = loop->force_vectorize;
11136 epilogue->dont_vectorize = false;
11139 return epilogue;
11142 /* The code below is trying to perform simple optimization - revert
11143 if-conversion for masked stores, i.e. if the mask of a store is zero
11144 do not perform it and all stored value producers also if possible.
11145 For example,
11146 for (i=0; i<n; i++)
11147 if (c[i])
11149 p1[i] += 1;
11150 p2[i] = p3[i] +2;
11152 this transformation will produce the following semi-hammock:
11154 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11156 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11157 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11158 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11159 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11160 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11161 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11165 void
11166 optimize_mask_stores (class loop *loop)
11168 basic_block *bbs = get_loop_body (loop);
11169 unsigned nbbs = loop->num_nodes;
11170 unsigned i;
11171 basic_block bb;
11172 class loop *bb_loop;
11173 gimple_stmt_iterator gsi;
11174 gimple *stmt;
11175 auto_vec<gimple *> worklist;
11176 auto_purge_vect_location sentinel;
11178 vect_location = find_loop_location (loop);
11179 /* Pick up all masked stores in loop if any. */
11180 for (i = 0; i < nbbs; i++)
11182 bb = bbs[i];
11183 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11184 gsi_next (&gsi))
11186 stmt = gsi_stmt (gsi);
11187 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11188 worklist.safe_push (stmt);
11192 free (bbs);
11193 if (worklist.is_empty ())
11194 return;
11196 /* Loop has masked stores. */
11197 while (!worklist.is_empty ())
11199 gimple *last, *last_store;
11200 edge e, efalse;
11201 tree mask;
11202 basic_block store_bb, join_bb;
11203 gimple_stmt_iterator gsi_to;
11204 tree vdef, new_vdef;
11205 gphi *phi;
11206 tree vectype;
11207 tree zero;
11209 last = worklist.pop ();
11210 mask = gimple_call_arg (last, 2);
11211 bb = gimple_bb (last);
11212 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
11213 the same loop as if_bb. It could be different from LOOP when a two-
11214 level loop nest is vectorized and the mask_store belongs to the inner
11215 one. */
11216 e = split_block (bb, last);
11217 bb_loop = bb->loop_father;
11218 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11219 join_bb = e->dest;
11220 store_bb = create_empty_bb (bb);
11221 add_bb_to_loop (store_bb, bb_loop);
11222 e->flags = EDGE_TRUE_VALUE;
11223 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11224 /* Put STORE_BB to likely part. */
11225 efalse->probability = profile_probability::unlikely ();
11226 store_bb->count = efalse->count ();
11227 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11228 if (dom_info_available_p (CDI_DOMINATORS))
11229 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11230 if (dump_enabled_p ())
11231 dump_printf_loc (MSG_NOTE, vect_location,
11232 "Create new block %d to sink mask stores.",
11233 store_bb->index);
11234 /* Create vector comparison with boolean result. */
11235 vectype = TREE_TYPE (mask);
11236 zero = build_zero_cst (vectype);
11237 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11238 gsi = gsi_last_bb (bb);
11239 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11240 /* Create new PHI node for vdef of the last masked store:
11241 .MEM_2 = VDEF <.MEM_1>
11242 will be converted to
11243 .MEM.3 = VDEF <.MEM_1>
11244 and new PHI node will be created in join bb
11245 .MEM_2 = PHI <.MEM_1, .MEM_3>
11247 vdef = gimple_vdef (last);
11248 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11249 gimple_set_vdef (last, new_vdef);
11250 phi = create_phi_node (vdef, join_bb);
11251 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11253 /* Put all masked stores with the same mask to STORE_BB if possible. */
11254 while (true)
11256 gimple_stmt_iterator gsi_from;
11257 gimple *stmt1 = NULL;
11259 /* Move masked store to STORE_BB. */
11260 last_store = last;
11261 gsi = gsi_for_stmt (last);
11262 gsi_from = gsi;
11263 /* Shift GSI to the previous stmt for further traversal. */
11264 gsi_prev (&gsi);
11265 gsi_to = gsi_start_bb (store_bb);
11266 gsi_move_before (&gsi_from, &gsi_to);
11267 /* Setup GSI_TO to the non-empty block start. */
11268 gsi_to = gsi_start_bb (store_bb);
11269 if (dump_enabled_p ())
11270 dump_printf_loc (MSG_NOTE, vect_location,
11271 "Move stmt to created bb\n%G", last);
11272 /* Move all stored value producers if possible. */
11273 while (!gsi_end_p (gsi))
11275 tree lhs;
11276 imm_use_iterator imm_iter;
11277 use_operand_p use_p;
11278 bool res;
11280 /* Skip debug statements. */
11281 if (is_gimple_debug (gsi_stmt (gsi)))
11283 gsi_prev (&gsi);
11284 continue;
11286 stmt1 = gsi_stmt (gsi);
11287 /* Do not consider statements writing to memory or having
11288 volatile operand. */
11289 if (gimple_vdef (stmt1)
11290 || gimple_has_volatile_ops (stmt1))
11291 break;
11292 gsi_from = gsi;
11293 gsi_prev (&gsi);
11294 lhs = gimple_get_lhs (stmt1);
11295 if (!lhs)
11296 break;
11298 /* LHS of vectorized stmt must be SSA_NAME. */
11299 if (TREE_CODE (lhs) != SSA_NAME)
11300 break;
11302 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11304 /* Remove dead scalar statement. */
11305 if (has_zero_uses (lhs))
11307 gsi_remove (&gsi_from, true);
11308 continue;
11312 /* Check that LHS does not have uses outside of STORE_BB. */
11313 res = true;
11314 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11316 gimple *use_stmt;
11317 use_stmt = USE_STMT (use_p);
11318 if (is_gimple_debug (use_stmt))
11319 continue;
11320 if (gimple_bb (use_stmt) != store_bb)
11322 res = false;
11323 break;
11326 if (!res)
11327 break;
11329 if (gimple_vuse (stmt1)
11330 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11331 break;
11333 /* Can move STMT1 to STORE_BB. */
11334 if (dump_enabled_p ())
11335 dump_printf_loc (MSG_NOTE, vect_location,
11336 "Move stmt to created bb\n%G", stmt1);
11337 gsi_move_before (&gsi_from, &gsi_to);
11338 /* Shift GSI_TO for further insertion. */
11339 gsi_prev (&gsi_to);
11341 /* Put other masked stores with the same mask to STORE_BB. */
11342 if (worklist.is_empty ()
11343 || gimple_call_arg (worklist.last (), 2) != mask
11344 || worklist.last () != stmt1)
11345 break;
11346 last = worklist.pop ();
11348 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11352 /* Decide whether it is possible to use a zero-based induction variable
11353 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11354 the value that the induction variable must be able to hold in order
11355 to ensure that the rgroups eventually have no active vector elements.
11356 Return -1 otherwise. */
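/* A worked example with made-up numbers: for a loop with at most 1003 latch
   iterations, VF == 4, MAX_VF == 4 and no skipped or peeled iterations, the
   code below computes

     iv_limit = (1003 & -4) + 4 = 1000 + 4 = 1004

   i.e. the IV must be able to reach one full vector iteration past the last
   vector-aligned boundary before all rgroups become inactive.  */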
11358 widest_int
11359 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11361 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11362 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11363 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11365 /* Calculate the value that the induction variable must be able
11366 to hit in order to ensure that we end the loop with an all-false mask.
11367 This involves adding the maximum number of inactive trailing scalar
11368 iterations. */
11369 widest_int iv_limit = -1;
11370 if (max_loop_iterations (loop, &iv_limit))
11372 if (niters_skip)
11374 /* Add the maximum number of skipped iterations to the
11375 maximum iteration count. */
11376 if (TREE_CODE (niters_skip) == INTEGER_CST)
11377 iv_limit += wi::to_widest (niters_skip);
11378 else
11379 iv_limit += max_vf - 1;
11381 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11382 /* Make a conservatively-correct assumption. */
11383 iv_limit += max_vf - 1;
11385 /* IV_LIMIT is the maximum number of latch iterations, which is also
11386 the maximum in-range IV value. Round this value down to the previous
11387 vector alignment boundary and then add an extra full iteration. */
11388 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11389 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11391 return iv_limit;
11394 /* For the given rgroup_controls RGC, check whether an induction variable
11395 would ever hit a value that produces a set of all-false masks or zero
11396 lengths before wrapping around. Return true if it's possible to wrap
11397 around before hitting the desirable value, otherwise return false. */
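/* Continuing the example above (hypothetical numbers): with an IV limit of
   1004, an rgroup where max_nscalars_per_iter * factor == 2 and a 32-bit
   compare type, the IV has to count up to 1004 * 2 = 2008, which needs only
   11 bits, so no wrap-around can occur and the function returns false.  */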
11399 bool
11400 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11402 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11404 if (iv_limit == -1)
11405 return true;
11407 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11408 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11409 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11411 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11412 return true;
11414 return false;