Suppress -fstack-protector warning on hppa.
[official-gcc.git] / gcc / tree-vect-loop.cc
blobaacbb12580c89a7a12b11cc096502d669a9e1e21
1 /* Loop Vectorization
2 Copyright (C) 2003-2022 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS which base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs, are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors, for now will need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations which vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
158 unsigned *);
159 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
160 bool *, bool *, bool);
162 /* Subroutine of vect_determine_vf_for_stmt that handles only one
163 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
164 may already be set for general statements (not just data refs). */
166 static opt_result
167 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
168 bool vectype_maybe_set_p,
169 poly_uint64 *vf)
171 gimple *stmt = stmt_info->stmt;
173 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
174 && !STMT_VINFO_LIVE_P (stmt_info))
175 || gimple_clobber_p (stmt))
177 if (dump_enabled_p ())
178 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
179 return opt_result::success ();
182 tree stmt_vectype, nunits_vectype;
183 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
184 &stmt_vectype,
185 &nunits_vectype);
186 if (!res)
187 return res;
189 if (stmt_vectype)
191 if (STMT_VINFO_VECTYPE (stmt_info))
192 /* The only case when a vectype had been already set is for stmts
193 that contain a data ref, or for "pattern-stmts" (stmts generated
194 by the vectorizer to represent/replace a certain idiom). */
195 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
196 || vectype_maybe_set_p)
197 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
205 return opt_result::success ();
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. Return true on success
211 or false if something prevented vectorization. */
213 static opt_result
214 vect_determine_vf_for_stmt (vec_info *vinfo,
215 stmt_vec_info stmt_info, poly_uint64 *vf)
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
221 if (!res)
222 return res;
224 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
225 && STMT_VINFO_RELATED_STMT (stmt_info))
227 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
228 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230 /* If a pattern statement has def stmts, analyze them too. */
231 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
232 !gsi_end_p (si); gsi_next (&si))
234 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_NOTE, vect_location,
237 "==> examining pattern def stmt: %G",
238 def_stmt_info->stmt);
239 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
240 if (!res)
241 return res;
244 if (dump_enabled_p ())
245 dump_printf_loc (MSG_NOTE, vect_location,
246 "==> examining pattern statement: %G",
247 stmt_info->stmt);
248 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
249 if (!res)
250 return res;
253 return opt_result::success ();
256 /* Function vect_determine_vectorization_factor
258 Determine the vectorization factor (VF). VF is the number of data elements
259 that are operated upon in parallel in a single iteration of the vectorized
260 loop. For example, when vectorizing a loop that operates on 4byte elements,
261 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
262 elements can fit in a single vector register.
264 We currently support vectorization of loops in which all types operated upon
265 are of the same size. Therefore this function currently sets VF according to
266 the size of the types operated upon, and fails if there are multiple sizes
267 in the loop.
269 VF is also the factor by which the loop iterations are strip-mined, e.g.:
270 original loop:
271 for (i=0; i<N; i++){
272 a[i] = b[i] + c[i];
275 vectorized loop:
276 for (i=0; i<N; i+=VF){
277 a[i:VF] = b[i:VF] + c[i:VF];
281 static opt_result
282 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
285 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
286 unsigned nbbs = loop->num_nodes;
287 poly_uint64 vectorization_factor = 1;
288 tree scalar_type = NULL_TREE;
289 gphi *phi;
290 tree vectype;
291 stmt_vec_info stmt_info;
292 unsigned i;
294 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296 for (i = 0; i < nbbs; i++)
298 basic_block bb = bbs[i];
300 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
301 gsi_next (&si))
303 phi = si.phi ();
304 stmt_info = loop_vinfo->lookup_stmt (phi);
305 if (dump_enabled_p ())
306 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
307 (gimple *) phi);
309 gcc_assert (stmt_info);
311 if (STMT_VINFO_RELEVANT_P (stmt_info)
312 || STMT_VINFO_LIVE_P (stmt_info))
314 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
315 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317 if (dump_enabled_p ())
318 dump_printf_loc (MSG_NOTE, vect_location,
319 "get vectype for scalar type: %T\n",
320 scalar_type);
322 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
323 if (!vectype)
324 return opt_result::failure_at (phi,
325 "not vectorized: unsupported "
326 "data-type %T\n",
327 scalar_type);
328 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330 if (dump_enabled_p ())
331 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
332 vectype);
334 if (dump_enabled_p ())
336 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
337 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
338 dump_printf (MSG_NOTE, "\n");
341 vect_update_max_nunits (&vectorization_factor, vectype);
345 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
346 gsi_next (&si))
348 if (is_gimple_debug (gsi_stmt (si)))
349 continue;
350 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
351 opt_result res
352 = vect_determine_vf_for_stmt (loop_vinfo,
353 stmt_info, &vectorization_factor);
354 if (!res)
355 return res;
359 /* TODO: Analyze cost. Decide if worth while to vectorize. */
360 if (dump_enabled_p ())
362 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
363 dump_dec (MSG_NOTE, vectorization_factor);
364 dump_printf (MSG_NOTE, "\n");
367 if (known_le (vectorization_factor, 1U))
368 return opt_result::failure_at (vect_location,
369 "not vectorized: unsupported data-type\n");
370 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
371 return opt_result::success ();
375 /* Function vect_is_simple_iv_evolution.
377 FORNOW: A simple evolution of an induction variables in the loop is
378 considered a polynomial evolution. */
380 static bool
381 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
382 tree * step)
384 tree init_expr;
385 tree step_expr;
386 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
387 basic_block bb;
389 /* When there is no evolution in this loop, the evolution function
390 is not "simple". */
391 if (evolution_part == NULL_TREE)
392 return false;
394 /* When the evolution is a polynomial of degree >= 2
395 the evolution function is not "simple". */
396 if (tree_is_chrec (evolution_part))
397 return false;
399 step_expr = evolution_part;
400 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402 if (dump_enabled_p ())
403 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
404 step_expr, init_expr);
406 *init = init_expr;
407 *step = step_expr;
409 if (TREE_CODE (step_expr) != INTEGER_CST
410 && (TREE_CODE (step_expr) != SSA_NAME
411 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
412 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
413 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
414 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
415 || !flag_associative_math)))
416 && (TREE_CODE (step_expr) != REAL_CST
417 || !flag_associative_math))
419 if (dump_enabled_p ())
420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
421 "step unknown.\n");
422 return false;
425 return true;
428 /* Function vect_is_nonlinear_iv_evolution
430 Only support nonlinear induction for integer type
431 1. neg
432 2. mul by constant
433 3. lshift/rshift by constant.
435 For neg induction, return a fake step as integer -1. */
436 static bool
437 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
438 gphi* loop_phi_node, tree *init, tree *step)
440 tree init_expr, ev_expr, result, op1, op2;
441 gimple* def;
443 if (gimple_phi_num_args (loop_phi_node) != 2)
444 return false;
446 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
447 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449 /* Support nonlinear induction only for integer type. */
450 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
451 return false;
453 *init = init_expr;
454 result = PHI_RESULT (loop_phi_node);
456 if (TREE_CODE (ev_expr) != SSA_NAME
457 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
458 || !is_gimple_assign (def))
459 return false;
461 enum tree_code t_code = gimple_assign_rhs_code (def);
462 switch (t_code)
464 case NEGATE_EXPR:
465 if (gimple_assign_rhs1 (def) != result)
466 return false;
467 *step = build_int_cst (TREE_TYPE (init_expr), -1);
468 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
469 break;
471 case RSHIFT_EXPR:
472 case LSHIFT_EXPR:
473 case MULT_EXPR:
474 op1 = gimple_assign_rhs1 (def);
475 op2 = gimple_assign_rhs2 (def);
476 if (TREE_CODE (op2) != INTEGER_CST
477 || op1 != result)
478 return false;
479 *step = op2;
480 if (t_code == LSHIFT_EXPR)
481 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
482 else if (t_code == RSHIFT_EXPR)
483 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
484 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
485 else
486 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
487 break;
489 default:
490 return false;
493 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
494 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496 return true;
499 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
500 what we are assuming is a double reduction. For example, given
501 a structure like this:
503 outer1:
504 x_1 = PHI <x_4(outer2), ...>;
507 inner:
508 x_2 = PHI <x_1(outer1), ...>;
510 x_3 = ...;
513 outer2:
514 x_4 = PHI <x_3(inner)>;
517 outer loop analysis would treat x_1 as a double reduction phi and
518 this function would then return true for x_2. */
520 static bool
521 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 use_operand_p use_p;
524 ssa_op_iter op_iter;
525 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
526 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
527 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
528 return true;
529 return false;
532 /* Returns true if Phi is a first-order recurrence. A first-order
533 recurrence is a non-reduction recurrence relation in which the value of
534 the recurrence in the current loop iteration equals a value defined in
535 the previous iteration. */
537 static bool
538 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
539 gphi *phi)
541 /* Ensure the loop latch definition is from within the loop. */
542 edge latch = loop_latch_edge (loop);
543 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
544 if (TREE_CODE (ldef) != SSA_NAME
545 || SSA_NAME_IS_DEFAULT_DEF (ldef)
546 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
547 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
548 return false;
550 tree def = gimple_phi_result (phi);
552 /* Ensure every use_stmt of the phi node is dominated by the latch
553 definition. */
554 imm_use_iterator imm_iter;
555 use_operand_p use_p;
556 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
557 if (!is_gimple_debug (USE_STMT (use_p))
558 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
559 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
560 USE_STMT (use_p))))
561 return false;
563 /* First-order recurrence autovectorization needs shuffle vector. */
564 tree scalar_type = TREE_TYPE (def);
565 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
566 if (!vectype)
567 return false;
569 return true;
572 /* Function vect_analyze_scalar_cycles_1.
574 Examine the cross iteration def-use cycles of scalar variables
575 in LOOP. LOOP_VINFO represents the loop that is now being
576 considered for vectorization (can be LOOP, or an outer-loop
577 enclosing LOOP). SLP indicates there will be some subsequent
578 slp analyses or not. */
580 static void
581 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
582 bool slp)
584 basic_block bb = loop->header;
585 tree init, step;
586 auto_vec<stmt_vec_info, 64> worklist;
587 gphi_iterator gsi;
588 bool double_reduc, reduc_chain;
590 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
592 /* First - identify all inductions. Reduction detection assumes that all the
593 inductions have been identified, therefore, this order must not be
594 changed. */
595 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
597 gphi *phi = gsi.phi ();
598 tree access_fn = NULL;
599 tree def = PHI_RESULT (phi);
600 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
602 if (dump_enabled_p ())
603 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
604 (gimple *) phi);
606 /* Skip virtual phi's. The data dependences that are associated with
607 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
608 if (virtual_operand_p (def))
609 continue;
611 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
613 /* Analyze the evolution function. */
614 access_fn = analyze_scalar_evolution (loop, def);
615 if (access_fn)
617 STRIP_NOPS (access_fn);
618 if (dump_enabled_p ())
619 dump_printf_loc (MSG_NOTE, vect_location,
620 "Access function of PHI: %T\n", access_fn);
621 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
622 = initial_condition_in_loop_num (access_fn, loop->num);
623 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
624 = evolution_part_in_loop_num (access_fn, loop->num);
627 if ((!access_fn
628 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
629 || !vect_is_simple_iv_evolution (loop->num, access_fn,
630 &init, &step)
631 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
632 && TREE_CODE (step) != INTEGER_CST))
633 /* Only handle nonlinear iv for same loop. */
634 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
635 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
636 phi, &init, &step)))
638 worklist.safe_push (stmt_vinfo);
639 continue;
642 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
643 != NULL_TREE);
644 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
648 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
652 /* Second - identify all reductions and nested cycles. */
653 while (worklist.length () > 0)
655 stmt_vec_info stmt_vinfo = worklist.pop ();
656 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
657 tree def = PHI_RESULT (phi);
659 if (dump_enabled_p ())
660 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
661 (gimple *) phi);
663 gcc_assert (!virtual_operand_p (def)
664 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
666 stmt_vec_info reduc_stmt_info
667 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
668 &reduc_chain, slp);
669 if (reduc_stmt_info)
671 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
672 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
673 if (double_reduc)
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_NOTE, vect_location,
677 "Detected double reduction.\n");
679 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
680 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
682 else
684 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_NOTE, vect_location,
688 "Detected vectorizable nested cycle.\n");
690 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
692 else
694 if (dump_enabled_p ())
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Detected reduction.\n");
698 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
699 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
700 /* Store the reduction cycles for possible vectorization in
701 loop-aware SLP if it was not detected as reduction
702 chain. */
703 if (! reduc_chain)
704 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
705 (reduc_stmt_info);
709 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
710 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
711 else
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Unknown def-use cycle pattern.\n");
719 /* Function vect_analyze_scalar_cycles.
721 Examine the cross iteration def-use cycles of scalar variables, by
722 analyzing the loop-header PHIs of scalar variables. Classify each
723 cycle as one of the following: invariant, induction, reduction, unknown.
724 We do that for the loop represented by LOOP_VINFO, and also to its
725 inner-loop, if exists.
726 Examples for scalar cycles:
728 Example1: reduction:
730 loop1:
731 for (i=0; i<N; i++)
732 sum += a[i];
734 Example2: induction:
736 loop2:
737 for (i=0; i<N; i++)
738 a[i] = i; */
740 static void
741 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
743 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
745 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
747 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
748 Reductions in such inner-loop therefore have different properties than
749 the reductions in the nest that gets vectorized:
750 1. When vectorized, they are executed in the same order as in the original
751 scalar loop, so we can't change the order of computation when
752 vectorizing them.
753 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
754 current checks are too strict. */
756 if (loop->inner)
757 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
760 /* Transfer group and reduction information from STMT_INFO to its
761 pattern stmt. */
763 static void
764 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
766 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
767 stmt_vec_info stmtp;
768 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
769 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
770 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
773 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
774 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
775 == STMT_VINFO_DEF_TYPE (stmt_info));
776 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
777 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
778 if (stmt_info)
779 REDUC_GROUP_NEXT_ELEMENT (stmtp)
780 = STMT_VINFO_RELATED_STMT (stmt_info);
782 while (stmt_info);
785 /* Fixup scalar cycles that now have their stmts detected as patterns. */
787 static void
788 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
790 stmt_vec_info first;
791 unsigned i;
793 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
795 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
796 while (next)
798 if ((STMT_VINFO_IN_PATTERN_P (next)
799 != STMT_VINFO_IN_PATTERN_P (first))
800 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
801 break;
802 next = REDUC_GROUP_NEXT_ELEMENT (next);
804 /* If all reduction chain members are well-formed patterns adjust
805 the group to group the pattern stmts instead. */
806 if (! next
807 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
809 if (STMT_VINFO_IN_PATTERN_P (first))
811 vect_fixup_reduc_chain (first);
812 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
813 = STMT_VINFO_RELATED_STMT (first);
816 /* If not all stmt in the chain are patterns or if we failed
817 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
818 it as regular reduction instead. */
819 else
821 stmt_vec_info vinfo = first;
822 stmt_vec_info last = NULL;
823 while (vinfo)
825 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
826 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
827 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
828 last = vinfo;
829 vinfo = next;
831 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
832 = vect_internal_def;
833 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
834 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
835 --i;
840 /* Function vect_get_loop_niters.
842 Determine how many iterations the loop is executed and place it
843 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
844 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
845 niter information holds in ASSUMPTIONS.
847 Return the loop exit condition. */
850 static gcond *
851 vect_get_loop_niters (class loop *loop, tree *assumptions,
852 tree *number_of_iterations, tree *number_of_iterationsm1)
854 edge exit = single_exit (loop);
855 class tree_niter_desc niter_desc;
856 tree niter_assumptions, niter, may_be_zero;
857 gcond *cond = get_loop_exit_condition (loop);
859 *assumptions = boolean_true_node;
860 *number_of_iterationsm1 = chrec_dont_know;
861 *number_of_iterations = chrec_dont_know;
862 DUMP_VECT_SCOPE ("get_loop_niters");
864 if (!exit)
865 return cond;
867 may_be_zero = NULL_TREE;
868 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
869 || chrec_contains_undetermined (niter_desc.niter))
870 return cond;
872 niter_assumptions = niter_desc.assumptions;
873 may_be_zero = niter_desc.may_be_zero;
874 niter = niter_desc.niter;
876 if (may_be_zero && integer_zerop (may_be_zero))
877 may_be_zero = NULL_TREE;
879 if (may_be_zero)
881 if (COMPARISON_CLASS_P (may_be_zero))
883 /* Try to combine may_be_zero with assumptions, this can simplify
884 computation of niter expression. */
885 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
886 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
887 niter_assumptions,
888 fold_build1 (TRUTH_NOT_EXPR,
889 boolean_type_node,
890 may_be_zero));
891 else
892 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
893 build_int_cst (TREE_TYPE (niter), 0),
894 rewrite_to_non_trapping_overflow (niter));
896 may_be_zero = NULL_TREE;
898 else if (integer_nonzerop (may_be_zero))
900 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
901 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
902 return cond;
904 else
905 return cond;
908 *assumptions = niter_assumptions;
909 *number_of_iterationsm1 = niter;
911 /* We want the number of loop header executions which is the number
912 of latch executions plus one.
913 ??? For UINT_MAX latch executions this number overflows to zero
914 for loops like do { n++; } while (n != 0); */
915 if (niter && !chrec_contains_undetermined (niter))
916 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
917 build_int_cst (TREE_TYPE (niter), 1));
918 *number_of_iterations = niter;
920 return cond;
923 /* Function bb_in_loop_p
925 Used as predicate for dfs order traversal of the loop bbs. */
927 static bool
928 bb_in_loop_p (const_basic_block bb, const void *data)
930 const class loop *const loop = (const class loop *)data;
931 if (flow_bb_inside_loop_p (loop, bb))
932 return true;
933 return false;
937 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
938 stmt_vec_info structs for all the stmts in LOOP_IN. */
940 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
941 : vec_info (vec_info::loop, shared),
942 loop (loop_in),
943 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
944 num_itersm1 (NULL_TREE),
945 num_iters (NULL_TREE),
946 num_iters_unchanged (NULL_TREE),
947 num_iters_assumptions (NULL_TREE),
948 vector_costs (nullptr),
949 scalar_costs (nullptr),
950 th (0),
951 versioning_threshold (0),
952 vectorization_factor (0),
953 main_loop_edge (nullptr),
954 skip_main_loop_edge (nullptr),
955 skip_this_loop_edge (nullptr),
956 reusable_accumulators (),
957 suggested_unroll_factor (1),
958 max_vectorization_factor (0),
959 mask_skip_niters (NULL_TREE),
960 rgroup_compare_type (NULL_TREE),
961 simd_if_cond (NULL_TREE),
962 unaligned_dr (NULL),
963 peeling_for_alignment (0),
964 ptr_mask (0),
965 ivexpr_map (NULL),
966 scan_map (NULL),
967 slp_unrolling_factor (1),
968 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
969 vectorizable (false),
970 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
971 using_partial_vectors_p (false),
972 epil_using_partial_vectors_p (false),
973 partial_load_store_bias (0),
974 peeling_for_gaps (false),
975 peeling_for_niter (false),
976 no_data_dependencies (false),
977 has_mask_store (false),
978 scalar_loop_scaling (profile_probability::uninitialized ()),
979 scalar_loop (NULL),
980 orig_loop_info (NULL)
982 /* CHECKME: We want to visit all BBs before their successors (except for
983 latch blocks, for which this assertion wouldn't hold). In the simple
984 case of the loop forms we allow, a dfs order of the BBs would the same
985 as reversed postorder traversal, so we are safe. */
987 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
988 bbs, loop->num_nodes, loop);
989 gcc_assert (nbbs == loop->num_nodes);
991 for (unsigned int i = 0; i < nbbs; i++)
993 basic_block bb = bbs[i];
994 gimple_stmt_iterator si;
996 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
998 gimple *phi = gsi_stmt (si);
999 gimple_set_uid (phi, 0);
1000 add_stmt (phi);
1003 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1005 gimple *stmt = gsi_stmt (si);
1006 gimple_set_uid (stmt, 0);
1007 if (is_gimple_debug (stmt))
1008 continue;
1009 add_stmt (stmt);
1010 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1011 third argument is the #pragma omp simd if (x) condition, when 0,
1012 loop shouldn't be vectorized, when non-zero constant, it should
1013 be vectorized normally, otherwise versioned with vectorized loop
1014 done if the condition is non-zero at runtime. */
1015 if (loop_in->simduid
1016 && is_gimple_call (stmt)
1017 && gimple_call_internal_p (stmt)
1018 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1019 && gimple_call_num_args (stmt) >= 3
1020 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1021 && (loop_in->simduid
1022 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1024 tree arg = gimple_call_arg (stmt, 2);
1025 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1026 simd_if_cond = arg;
1027 else
1028 gcc_assert (integer_nonzerop (arg));
1033 epilogue_vinfos.create (6);
1036 /* Free all levels of rgroup CONTROLS. */
1038 void
1039 release_vec_loop_controls (vec<rgroup_controls> *controls)
1041 rgroup_controls *rgc;
1042 unsigned int i;
1043 FOR_EACH_VEC_ELT (*controls, i, rgc)
1044 rgc->controls.release ();
1045 controls->release ();
1048 /* Free all memory used by the _loop_vec_info, as well as all the
1049 stmt_vec_info structs of all the stmts in the loop. */
1051 _loop_vec_info::~_loop_vec_info ()
1053 free (bbs);
1055 release_vec_loop_controls (&masks);
1056 release_vec_loop_controls (&lens);
1057 delete ivexpr_map;
1058 delete scan_map;
1059 epilogue_vinfos.release ();
1060 delete scalar_costs;
1061 delete vector_costs;
1063 /* When we release an epiloge vinfo that we do not intend to use
1064 avoid clearing AUX of the main loop which should continue to
1065 point to the main loop vinfo since otherwise we'll leak that. */
1066 if (loop->aux == this)
1067 loop->aux = NULL;
1070 /* Return an invariant or register for EXPR and emit necessary
1071 computations in the LOOP_VINFO loop preheader. */
1073 tree
1074 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1076 if (is_gimple_reg (expr)
1077 || is_gimple_min_invariant (expr))
1078 return expr;
1080 if (! loop_vinfo->ivexpr_map)
1081 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1082 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1083 if (! cached)
1085 gimple_seq stmts = NULL;
1086 cached = force_gimple_operand (unshare_expr (expr),
1087 &stmts, true, NULL_TREE);
1088 if (stmts)
1090 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1091 gsi_insert_seq_on_edge_immediate (e, stmts);
1094 return cached;
1097 /* Return true if we can use CMP_TYPE as the comparison type to produce
1098 all masks required to mask LOOP_VINFO. */
1100 static bool
1101 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1103 rgroup_controls *rgm;
1104 unsigned int i;
1105 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1106 if (rgm->type != NULL_TREE
1107 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1108 cmp_type, rgm->type,
1109 OPTIMIZE_FOR_SPEED))
1110 return false;
1111 return true;
1114 /* Calculate the maximum number of scalars per iteration for every
1115 rgroup in LOOP_VINFO. */
1117 static unsigned int
1118 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1120 unsigned int res = 1;
1121 unsigned int i;
1122 rgroup_controls *rgm;
1123 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1124 res = MAX (res, rgm->max_nscalars_per_iter);
1125 return res;
1128 /* Calculate the minimum precision necessary to represent:
1130 MAX_NITERS * FACTOR
1132 as an unsigned integer, where MAX_NITERS is the maximum number of
1133 loop header iterations for the original scalar form of LOOP_VINFO. */
1135 static unsigned
1136 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1138 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1140 /* Get the maximum number of iterations that is representable
1141 in the counter type. */
1142 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1143 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1145 /* Get a more refined estimate for the number of iterations. */
1146 widest_int max_back_edges;
1147 if (max_loop_iterations (loop, &max_back_edges))
1148 max_ni = wi::smin (max_ni, max_back_edges + 1);
1150 /* Work out how many bits we need to represent the limit. */
1151 return wi::min_precision (max_ni * factor, UNSIGNED);
1154 /* True if the loop needs peeling or partial vectors when vectorized. */
1156 static bool
1157 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1159 unsigned HOST_WIDE_INT const_vf;
1160 HOST_WIDE_INT max_niter
1161 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1163 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1164 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1165 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1166 (loop_vinfo));
1168 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1169 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1171 /* Work out the (constant) number of iterations that need to be
1172 peeled for reasons other than niters. */
1173 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1174 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1175 peel_niter += 1;
1176 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1177 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1178 return true;
1180 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1181 /* ??? When peeling for gaps but not alignment, we could
1182 try to check whether the (variable) niters is known to be
1183 VF * N + 1. That's something of a niche case though. */
1184 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1185 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1186 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1187 < (unsigned) exact_log2 (const_vf))
1188 /* In case of versioning, check if the maximum number of
1189 iterations is greater than th. If they are identical,
1190 the epilogue is unnecessary. */
1191 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1192 || ((unsigned HOST_WIDE_INT) max_niter
1193 > (th / const_vf) * const_vf))))
1194 return true;
1196 return false;
1199 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1200 whether we can actually generate the masks required. Return true if so,
1201 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1203 static bool
1204 vect_verify_full_masking (loop_vec_info loop_vinfo)
1206 unsigned int min_ni_width;
1207 unsigned int max_nscalars_per_iter
1208 = vect_get_max_nscalars_per_iter (loop_vinfo);
1210 /* Use a normal loop if there are no statements that need masking.
1211 This only happens in rare degenerate cases: it means that the loop
1212 has no loads, no stores, and no live-out values. */
1213 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1214 return false;
1216 /* Work out how many bits we need to represent the limit. */
1217 min_ni_width
1218 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1220 /* Find a scalar mode for which WHILE_ULT is supported. */
1221 opt_scalar_int_mode cmp_mode_iter;
1222 tree cmp_type = NULL_TREE;
1223 tree iv_type = NULL_TREE;
1224 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1225 unsigned int iv_precision = UINT_MAX;
1227 if (iv_limit != -1)
1228 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1229 UNSIGNED);
1231 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1233 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1234 if (cmp_bits >= min_ni_width
1235 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1237 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1238 if (this_type
1239 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1241 /* Although we could stop as soon as we find a valid mode,
1242 there are at least two reasons why that's not always the
1243 best choice:
1245 - An IV that's Pmode or wider is more likely to be reusable
1246 in address calculations than an IV that's narrower than
1247 Pmode.
1249 - Doing the comparison in IV_PRECISION or wider allows
1250 a natural 0-based IV, whereas using a narrower comparison
1251 type requires mitigations against wrap-around.
1253 Conversely, if the IV limit is variable, doing the comparison
1254 in a wider type than the original type can introduce
1255 unnecessary extensions, so picking the widest valid mode
1256 is not always a good choice either.
1258 Here we prefer the first IV type that's Pmode or wider,
1259 and the first comparison type that's IV_PRECISION or wider.
1260 (The comparison type must be no wider than the IV type,
1261 to avoid extensions in the vector loop.)
1263 ??? We might want to try continuing beyond Pmode for ILP32
1264 targets if CMP_BITS < IV_PRECISION. */
1265 iv_type = this_type;
1266 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1267 cmp_type = this_type;
1268 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1269 break;
1274 if (!cmp_type)
1275 return false;
1277 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1278 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1279 return true;
1282 /* Check whether we can use vector access with length based on precison
1283 comparison. So far, to keep it simple, we only allow the case that the
1284 precision of the target supported length is larger than the precision
1285 required by loop niters. */
1287 static bool
1288 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1290 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1291 return false;
1293 machine_mode len_load_mode = get_len_load_store_mode
1294 (loop_vinfo->vector_mode, true).require ();
1295 machine_mode len_store_mode = get_len_load_store_mode
1296 (loop_vinfo->vector_mode, false).require ();
1298 signed char partial_load_bias = internal_len_load_store_bias
1299 (IFN_LEN_LOAD, len_load_mode);
1301 signed char partial_store_bias = internal_len_load_store_bias
1302 (IFN_LEN_STORE, len_store_mode);
1304 gcc_assert (partial_load_bias == partial_store_bias);
1306 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1307 return false;
1309 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1310 len_loads with a length of zero. In order to avoid that we prohibit
1311 more than one loop length here. */
1312 if (partial_load_bias == -1
1313 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1314 return false;
1316 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1318 unsigned int max_nitems_per_iter = 1;
1319 unsigned int i;
1320 rgroup_controls *rgl;
1321 /* Find the maximum number of items per iteration for every rgroup. */
1322 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1324 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1325 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1328 /* Work out how many bits we need to represent the length limit. */
1329 unsigned int min_ni_prec
1330 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1332 /* Now use the maximum of below precisions for one suitable IV type:
1333 - the IV's natural precision
1334 - the precision needed to hold: the maximum number of scalar
1335 iterations multiplied by the scale factor (min_ni_prec above)
1336 - the Pmode precision
1338 If min_ni_prec is less than the precision of the current niters,
1339 we perfer to still use the niters type. Prefer to use Pmode and
1340 wider IV to avoid narrow conversions. */
1342 unsigned int ni_prec
1343 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1344 min_ni_prec = MAX (min_ni_prec, ni_prec);
1345 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1347 tree iv_type = NULL_TREE;
1348 opt_scalar_int_mode tmode_iter;
1349 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1351 scalar_mode tmode = tmode_iter.require ();
1352 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1354 /* ??? Do we really want to construct one IV whose precision exceeds
1355 BITS_PER_WORD? */
1356 if (tbits > BITS_PER_WORD)
1357 break;
1359 /* Find the first available standard integral type. */
1360 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1362 iv_type = build_nonstandard_integer_type (tbits, true);
1363 break;
1367 if (!iv_type)
1369 if (dump_enabled_p ())
1370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371 "can't vectorize with length-based partial vectors"
1372 " because there is no suitable iv type.\n");
1373 return false;
1376 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1377 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1379 return true;
1382 /* Calculate the cost of one scalar iteration of the loop. */
1383 static void
1384 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1386 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1387 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1388 int nbbs = loop->num_nodes, factor;
1389 int innerloop_iters, i;
1391 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1393 /* Gather costs for statements in the scalar loop. */
1395 /* FORNOW. */
1396 innerloop_iters = 1;
1397 if (loop->inner)
1398 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1400 for (i = 0; i < nbbs; i++)
1402 gimple_stmt_iterator si;
1403 basic_block bb = bbs[i];
1405 if (bb->loop_father == loop->inner)
1406 factor = innerloop_iters;
1407 else
1408 factor = 1;
1410 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1412 gimple *stmt = gsi_stmt (si);
1413 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1415 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1416 continue;
1418 /* Skip stmts that are not vectorized inside the loop. */
1419 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1420 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1421 && (!STMT_VINFO_LIVE_P (vstmt_info)
1422 || !VECTORIZABLE_CYCLE_DEF
1423 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1424 continue;
1426 vect_cost_for_stmt kind;
1427 if (STMT_VINFO_DATA_REF (stmt_info))
1429 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1430 kind = scalar_load;
1431 else
1432 kind = scalar_store;
1434 else if (vect_nop_conversion_p (stmt_info))
1435 continue;
1436 else
1437 kind = scalar_stmt;
1439 /* We are using vect_prologue here to avoid scaling twice
1440 by the inner loop factor. */
1441 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1442 factor, kind, stmt_info, 0, vect_prologue);
1446 /* Now accumulate cost. */
1447 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1448 add_stmt_costs (loop_vinfo->scalar_costs,
1449 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1450 loop_vinfo->scalar_costs->finish_cost (nullptr);
1454 /* Function vect_analyze_loop_form.
1456 Verify that certain CFG restrictions hold, including:
1457 - the loop has a pre-header
1458 - the loop has a single entry and exit
1459 - the loop exit condition is simple enough
1460 - the number of iterations can be analyzed, i.e, a countable loop. The
1461 niter could be analyzed under some assumptions. */
1463 opt_result
1464 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1466 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1468 /* Different restrictions apply when we are considering an inner-most loop,
1469 vs. an outer (nested) loop.
1470 (FORNOW. May want to relax some of these restrictions in the future). */
1472 info->inner_loop_cond = NULL;
1473 if (!loop->inner)
1475 /* Inner-most loop. We currently require that the number of BBs is
1476 exactly 2 (the header and latch). Vectorizable inner-most loops
1477 look like this:
1479 (pre-header)
1481 header <--------+
1482 | | |
1483 | +--> latch --+
1485 (exit-bb) */
1487 if (loop->num_nodes != 2)
1488 return opt_result::failure_at (vect_location,
1489 "not vectorized:"
1490 " control flow in loop.\n");
1492 if (empty_block_p (loop->header))
1493 return opt_result::failure_at (vect_location,
1494 "not vectorized: empty loop.\n");
1496 else
1498 class loop *innerloop = loop->inner;
1499 edge entryedge;
1501 /* Nested loop. We currently require that the loop is doubly-nested,
1502 contains a single inner loop, and the number of BBs is exactly 5.
1503 Vectorizable outer-loops look like this:
1505 (pre-header)
1507 header <---+
1509 inner-loop |
1511 tail ------+
1513 (exit-bb)
1515 The inner-loop has the properties expected of inner-most loops
1516 as described above. */
1518 if ((loop->inner)->inner || (loop->inner)->next)
1519 return opt_result::failure_at (vect_location,
1520 "not vectorized:"
1521 " multiple nested loops.\n");
1523 if (loop->num_nodes != 5)
1524 return opt_result::failure_at (vect_location,
1525 "not vectorized:"
1526 " control flow in loop.\n");
1528 entryedge = loop_preheader_edge (innerloop);
1529 if (entryedge->src != loop->header
1530 || !single_exit (innerloop)
1531 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1532 return opt_result::failure_at (vect_location,
1533 "not vectorized:"
1534 " unsupported outerloop form.\n");
1536 /* Analyze the inner-loop. */
1537 vect_loop_form_info inner;
1538 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1539 if (!res)
1541 if (dump_enabled_p ())
1542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1543 "not vectorized: Bad inner loop.\n");
1544 return res;
1547 /* Don't support analyzing niter under assumptions for inner
1548 loop. */
1549 if (!integer_onep (inner.assumptions))
1550 return opt_result::failure_at (vect_location,
1551 "not vectorized: Bad inner loop.\n");
1553 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1554 return opt_result::failure_at (vect_location,
1555 "not vectorized: inner-loop count not"
1556 " invariant.\n");
1558 if (dump_enabled_p ())
1559 dump_printf_loc (MSG_NOTE, vect_location,
1560 "Considering outer-loop vectorization.\n");
1561 info->inner_loop_cond = inner.loop_cond;
1564 if (!single_exit (loop))
1565 return opt_result::failure_at (vect_location,
1566 "not vectorized: multiple exits.\n");
1567 if (EDGE_COUNT (loop->header->preds) != 2)
1568 return opt_result::failure_at (vect_location,
1569 "not vectorized:"
1570 " too many incoming edges.\n");
1572 /* We assume that the loop exit condition is at the end of the loop. i.e,
1573 that the loop is represented as a do-while (with a proper if-guard
1574 before the loop if needed), where the loop header contains all the
1575 executable statements, and the latch is empty. */
1576 if (!empty_block_p (loop->latch)
1577 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1578 return opt_result::failure_at (vect_location,
1579 "not vectorized: latch block not empty.\n");
1581 /* Make sure the exit is not abnormal. */
1582 edge e = single_exit (loop);
1583 if (e->flags & EDGE_ABNORMAL)
1584 return opt_result::failure_at (vect_location,
1585 "not vectorized:"
1586 " abnormal loop exit edge.\n");
1588 info->loop_cond
1589 = vect_get_loop_niters (loop, &info->assumptions,
1590 &info->number_of_iterations,
1591 &info->number_of_iterationsm1);
1592 if (!info->loop_cond)
1593 return opt_result::failure_at
1594 (vect_location,
1595 "not vectorized: complicated exit condition.\n");
1597 if (integer_zerop (info->assumptions)
1598 || !info->number_of_iterations
1599 || chrec_contains_undetermined (info->number_of_iterations))
1600 return opt_result::failure_at
1601 (info->loop_cond,
1602 "not vectorized: number of iterations cannot be computed.\n");
1604 if (integer_zerop (info->number_of_iterations))
1605 return opt_result::failure_at
1606 (info->loop_cond,
1607 "not vectorized: number of iterations = 0.\n");
1609 if (!(tree_fits_shwi_p (info->number_of_iterations)
1610 && tree_to_shwi (info->number_of_iterations) > 0))
1612 if (dump_enabled_p ())
1614 dump_printf_loc (MSG_NOTE, vect_location,
1615 "Symbolic number of iterations is ");
1616 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1617 dump_printf (MSG_NOTE, "\n");
1621 return opt_result::success ();
1624 /* Create a loop_vec_info for LOOP with SHARED and the
1625 vect_analyze_loop_form result. */
1627 loop_vec_info
1628 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1629 const vect_loop_form_info *info,
1630 loop_vec_info main_loop_info)
1632 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1633 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1634 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1635 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1636 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1637 /* Also record the assumptions for versioning. */
1638 if (!integer_onep (info->assumptions) && !main_loop_info)
1639 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1641 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1642 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1643 if (info->inner_loop_cond)
1645 stmt_vec_info inner_loop_cond_info
1646 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1647 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1648 /* If we have an estimate on the number of iterations of the inner
1649 loop use that to limit the scale for costing, otherwise use
1650 --param vect-inner-loop-cost-factor literally. */
1651 widest_int nit;
1652 if (estimated_stmt_executions (loop->inner, &nit))
1653 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1654 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1657 return loop_vinfo;
1662 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1663 statements update the vectorization factor. */
1665 static void
1666 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1668 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1669 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1670 int nbbs = loop->num_nodes;
1671 poly_uint64 vectorization_factor;
1672 int i;
1674 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1676 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1677 gcc_assert (known_ne (vectorization_factor, 0U));
1679 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1680 vectorization factor of the loop is the unrolling factor required by
1681 the SLP instances. If that unrolling factor is 1, we say, that we
1682 perform pure SLP on loop - cross iteration parallelism is not
1683 exploited. */
1684 bool only_slp_in_loop = true;
1685 for (i = 0; i < nbbs; i++)
1687 basic_block bb = bbs[i];
1688 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1689 gsi_next (&si))
1691 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1692 if (!stmt_info)
1693 continue;
1694 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1695 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1696 && !PURE_SLP_STMT (stmt_info))
1697 /* STMT needs both SLP and loop-based vectorization. */
1698 only_slp_in_loop = false;
1700 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1701 gsi_next (&si))
1703 if (is_gimple_debug (gsi_stmt (si)))
1704 continue;
1705 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1706 stmt_info = vect_stmt_to_vectorize (stmt_info);
1707 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1708 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1709 && !PURE_SLP_STMT (stmt_info))
1710 /* STMT needs both SLP and loop-based vectorization. */
1711 only_slp_in_loop = false;
1715 if (only_slp_in_loop)
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_NOTE, vect_location,
1719 "Loop contains only SLP stmts\n");
1720 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1722 else
1724 if (dump_enabled_p ())
1725 dump_printf_loc (MSG_NOTE, vect_location,
1726 "Loop contains SLP and non-SLP stmts\n");
1727 /* Both the vectorization factor and unroll factor have the form
1728 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1729 so they must have a common multiple. */
1730 vectorization_factor
1731 = force_common_multiple (vectorization_factor,
1732 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1735 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1736 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "Updating vectorization factor to ");
1740 dump_dec (MSG_NOTE, vectorization_factor);
1741 dump_printf (MSG_NOTE, ".\n");
1745 /* Return true if STMT_INFO describes a double reduction phi and if
1746 the other phi in the reduction is also relevant for vectorization.
1747 This rejects cases such as:
1749 outer1:
1750 x_1 = PHI <x_3(outer2), ...>;
1753 inner:
1754 x_2 = ...;
1757 outer2:
1758 x_3 = PHI <x_2(inner)>;
1760 if nothing in x_2 or elsewhere makes x_1 relevant. */
1762 static bool
1763 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1765 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1766 return false;
1768 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1771 /* Function vect_analyze_loop_operations.
1773 Scan the loop stmts and make sure they are all vectorizable. */
1775 static opt_result
1776 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1778 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1779 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1780 int nbbs = loop->num_nodes;
1781 int i;
1782 stmt_vec_info stmt_info;
1783 bool need_to_vectorize = false;
1784 bool ok;
1786 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1788 auto_vec<stmt_info_for_cost> cost_vec;
1790 for (i = 0; i < nbbs; i++)
1792 basic_block bb = bbs[i];
1794 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1795 gsi_next (&si))
1797 gphi *phi = si.phi ();
1798 ok = true;
1800 stmt_info = loop_vinfo->lookup_stmt (phi);
1801 if (dump_enabled_p ())
1802 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1803 (gimple *) phi);
1804 if (virtual_operand_p (gimple_phi_result (phi)))
1805 continue;
1807 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1808 (i.e., a phi in the tail of the outer-loop). */
1809 if (! is_loop_header_bb_p (bb))
1811 /* FORNOW: we currently don't support the case that these phis
1812 are not used in the outerloop (unless it is double reduction,
1813 i.e., this phi is vect_reduction_def), cause this case
1814 requires to actually do something here. */
1815 if (STMT_VINFO_LIVE_P (stmt_info)
1816 && !vect_active_double_reduction_p (stmt_info))
1817 return opt_result::failure_at (phi,
1818 "Unsupported loop-closed phi"
1819 " in outer-loop.\n");
1821 /* If PHI is used in the outer loop, we check that its operand
1822 is defined in the inner loop. */
1823 if (STMT_VINFO_RELEVANT_P (stmt_info))
1825 tree phi_op;
1827 if (gimple_phi_num_args (phi) != 1)
1828 return opt_result::failure_at (phi, "unsupported phi");
1830 phi_op = PHI_ARG_DEF (phi, 0);
1831 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1832 if (!op_def_info)
1833 return opt_result::failure_at (phi, "unsupported phi\n");
1835 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1836 && (STMT_VINFO_RELEVANT (op_def_info)
1837 != vect_used_in_outer_by_reduction))
1838 return opt_result::failure_at (phi, "unsupported phi\n");
1840 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1841 || (STMT_VINFO_DEF_TYPE (stmt_info)
1842 == vect_double_reduction_def))
1843 && !vectorizable_lc_phi (loop_vinfo,
1844 stmt_info, NULL, NULL))
1845 return opt_result::failure_at (phi, "unsupported phi\n");
1848 continue;
1851 gcc_assert (stmt_info);
1853 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1854 || STMT_VINFO_LIVE_P (stmt_info))
1855 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
1856 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
1857 /* A scalar-dependence cycle that we don't support. */
1858 return opt_result::failure_at (phi,
1859 "not vectorized:"
1860 " scalar dependence cycle.\n");
1862 if (STMT_VINFO_RELEVANT_P (stmt_info))
1864 need_to_vectorize = true;
1865 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1866 && ! PURE_SLP_STMT (stmt_info))
1867 ok = vectorizable_induction (loop_vinfo,
1868 stmt_info, NULL, NULL,
1869 &cost_vec);
1870 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1871 || (STMT_VINFO_DEF_TYPE (stmt_info)
1872 == vect_double_reduction_def)
1873 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1874 && ! PURE_SLP_STMT (stmt_info))
1875 ok = vectorizable_reduction (loop_vinfo,
1876 stmt_info, NULL, NULL, &cost_vec);
1877 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
1878 == vect_first_order_recurrence)
1879 && ! PURE_SLP_STMT (stmt_info))
1880 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
1881 &cost_vec);
1884 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1885 if (ok
1886 && STMT_VINFO_LIVE_P (stmt_info)
1887 && !PURE_SLP_STMT (stmt_info))
1888 ok = vectorizable_live_operation (loop_vinfo,
1889 stmt_info, NULL, NULL, NULL,
1890 -1, false, &cost_vec);
1892 if (!ok)
1893 return opt_result::failure_at (phi,
1894 "not vectorized: relevant phi not "
1895 "supported: %G",
1896 static_cast <gimple *> (phi));
1899 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1900 gsi_next (&si))
1902 gimple *stmt = gsi_stmt (si);
1903 if (!gimple_clobber_p (stmt)
1904 && !is_gimple_debug (stmt))
1906 opt_result res
1907 = vect_analyze_stmt (loop_vinfo,
1908 loop_vinfo->lookup_stmt (stmt),
1909 &need_to_vectorize,
1910 NULL, NULL, &cost_vec);
1911 if (!res)
1912 return res;
1915 } /* bbs */
1917 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1919 /* All operations in the loop are either irrelevant (deal with loop
1920 control, or dead), or only used outside the loop and can be moved
1921 out of the loop (e.g. invariants, inductions). The loop can be
1922 optimized away by scalar optimizations. We're better off not
1923 touching this loop. */
1924 if (!need_to_vectorize)
1926 if (dump_enabled_p ())
1927 dump_printf_loc (MSG_NOTE, vect_location,
1928 "All the computation can be taken out of the loop.\n");
1929 return opt_result::failure_at
1930 (vect_location,
1931 "not vectorized: redundant loop. no profit to vectorize.\n");
1934 return opt_result::success ();
1937 /* Return true if we know that the iteration count is smaller than the
1938 vectorization factor. Return false if it isn't, or if we can't be sure
1939 either way. */
1941 static bool
1942 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1944 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1946 HOST_WIDE_INT max_niter;
1947 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1948 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1949 else
1950 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1952 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1953 return true;
1955 return false;
1958 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1959 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1960 definitely no, or -1 if it's worth retrying. */
1962 static int
1963 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
1964 unsigned *suggested_unroll_factor)
1966 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1967 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1969 /* Only loops that can handle partially-populated vectors can have iteration
1970 counts less than the vectorization factor. */
1971 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1973 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1975 if (dump_enabled_p ())
1976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1977 "not vectorized: iteration count smaller than "
1978 "vectorization factor.\n");
1979 return 0;
1983 /* If using the "very cheap" model. reject cases in which we'd keep
1984 a copy of the scalar code (even if we might be able to vectorize it). */
1985 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1986 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1987 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1988 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 "some scalar iterations would need to be peeled\n");
1993 return 0;
1996 int min_profitable_iters, min_profitable_estimate;
1997 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1998 &min_profitable_estimate,
1999 suggested_unroll_factor);
2001 if (min_profitable_iters < 0)
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005 "not vectorized: vectorization not profitable.\n");
2006 if (dump_enabled_p ())
2007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2008 "not vectorized: vector version will never be "
2009 "profitable.\n");
2010 return -1;
2013 int min_scalar_loop_bound = (param_min_vect_loop_bound
2014 * assumed_vf);
2016 /* Use the cost model only if it is more conservative than user specified
2017 threshold. */
2018 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2019 min_profitable_iters);
2021 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2023 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2024 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2028 "not vectorized: vectorization not profitable.\n");
2029 if (dump_enabled_p ())
2030 dump_printf_loc (MSG_NOTE, vect_location,
2031 "not vectorized: iteration count smaller than user "
2032 "specified loop bound parameter or minimum profitable "
2033 "iterations (whichever is more conservative).\n");
2034 return 0;
2037 /* The static profitablity threshold min_profitable_estimate includes
2038 the cost of having to check at runtime whether the scalar loop
2039 should be used instead. If it turns out that we don't need or want
2040 such a check, the threshold we should use for the static estimate
2041 is simply the point at which the vector loop becomes more profitable
2042 than the scalar loop. */
2043 if (min_profitable_estimate > min_profitable_iters
2044 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2045 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2046 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2047 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2049 if (dump_enabled_p ())
2050 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2051 " choice between the scalar and vector loops\n");
2052 min_profitable_estimate = min_profitable_iters;
2055 /* If the vector loop needs multiple iterations to be beneficial then
2056 things are probably too close to call, and the conservative thing
2057 would be to stick with the scalar code. */
2058 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2059 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "one iteration of the vector loop would be"
2064 " more expensive than the equivalent number of"
2065 " iterations of the scalar loop\n");
2066 return 0;
2069 HOST_WIDE_INT estimated_niter;
2071 /* If we are vectorizing an epilogue then we know the maximum number of
2072 scalar iterations it will cover is at least one lower than the
2073 vectorization factor of the main loop. */
2074 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2075 estimated_niter
2076 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2077 else
2079 estimated_niter = estimated_stmt_executions_int (loop);
2080 if (estimated_niter == -1)
2081 estimated_niter = likely_max_stmt_executions_int (loop);
2083 if (estimated_niter != -1
2084 && ((unsigned HOST_WIDE_INT) estimated_niter
2085 < MAX (th, (unsigned) min_profitable_estimate)))
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089 "not vectorized: estimated iteration count too "
2090 "small.\n");
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "not vectorized: estimated iteration count smaller "
2094 "than specified loop bound parameter or minimum "
2095 "profitable iterations (whichever is more "
2096 "conservative).\n");
2097 return -1;
2100 return 1;
2103 static opt_result
2104 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2105 vec<data_reference_p> *datarefs,
2106 unsigned int *n_stmts)
2108 *n_stmts = 0;
2109 for (unsigned i = 0; i < loop->num_nodes; i++)
2110 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2111 !gsi_end_p (gsi); gsi_next (&gsi))
2113 gimple *stmt = gsi_stmt (gsi);
2114 if (is_gimple_debug (stmt))
2115 continue;
2116 ++(*n_stmts);
2117 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2118 NULL, 0);
2119 if (!res)
2121 if (is_gimple_call (stmt) && loop->safelen)
2123 tree fndecl = gimple_call_fndecl (stmt), op;
2124 if (fndecl != NULL_TREE)
2126 cgraph_node *node = cgraph_node::get (fndecl);
2127 if (node != NULL && node->simd_clones != NULL)
2129 unsigned int j, n = gimple_call_num_args (stmt);
2130 for (j = 0; j < n; j++)
2132 op = gimple_call_arg (stmt, j);
2133 if (DECL_P (op)
2134 || (REFERENCE_CLASS_P (op)
2135 && get_base_address (op)))
2136 break;
2138 op = gimple_call_lhs (stmt);
2139 /* Ignore #pragma omp declare simd functions
2140 if they don't have data references in the
2141 call stmt itself. */
2142 if (j == n
2143 && !(op
2144 && (DECL_P (op)
2145 || (REFERENCE_CLASS_P (op)
2146 && get_base_address (op)))))
2147 continue;
2151 return res;
2153 /* If dependence analysis will give up due to the limit on the
2154 number of datarefs stop here and fail fatally. */
2155 if (datarefs->length ()
2156 > (unsigned)param_loop_max_datarefs_for_datadeps)
2157 return opt_result::failure_at (stmt, "exceeded param "
2158 "loop-max-datarefs-for-datadeps\n");
2160 return opt_result::success ();
2163 /* Look for SLP-only access groups and turn each individual access into its own
2164 group. */
2165 static void
2166 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2168 unsigned int i;
2169 struct data_reference *dr;
2171 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2173 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2174 FOR_EACH_VEC_ELT (datarefs, i, dr)
2176 gcc_assert (DR_REF (dr));
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2179 /* Check if the load is a part of an interleaving chain. */
2180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2182 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2183 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2184 unsigned int group_size = DR_GROUP_SIZE (first_element);
2186 /* Check if SLP-only groups. */
2187 if (!STMT_SLP_TYPE (stmt_info)
2188 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2190 /* Dissolve the group. */
2191 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2193 stmt_vec_info vinfo = first_element;
2194 while (vinfo)
2196 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2197 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2198 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2199 DR_GROUP_SIZE (vinfo) = 1;
2200 if (STMT_VINFO_STRIDED_P (first_element))
2201 DR_GROUP_GAP (vinfo) = 0;
2202 else
2203 DR_GROUP_GAP (vinfo) = group_size - 1;
2204 /* Duplicate and adjust alignment info, it needs to
2205 be present on each group leader, see dr_misalignment. */
2206 if (vinfo != first_element)
2208 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2209 dr_info2->target_alignment = dr_info->target_alignment;
2210 int misalignment = dr_info->misalignment;
2211 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2213 HOST_WIDE_INT diff
2214 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2215 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2216 unsigned HOST_WIDE_INT align_c
2217 = dr_info->target_alignment.to_constant ();
2218 misalignment = (misalignment + diff) % align_c;
2220 dr_info2->misalignment = misalignment;
2222 vinfo = next;
2229 /* Determine if operating on full vectors for LOOP_VINFO might leave
2230 some scalar iterations still to do. If so, decide how we should
2231 handle those scalar iterations. The possibilities are:
2233 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2234 In this case:
2236 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2237 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2238 LOOP_VINFO_PEELING_FOR_NITER == false
2240 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2241 to handle the remaining scalar iterations. In this case:
2243 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2244 LOOP_VINFO_PEELING_FOR_NITER == true
2246 There are two choices:
2248 (2a) Consider vectorizing the epilogue loop at the same VF as the
2249 main loop, but using partial vectors instead of full vectors.
2250 In this case:
2252 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2254 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2255 In this case:
2257 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2259 When FOR_EPILOGUE_P is true, make this determination based on the
2260 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2261 based on the assumption that LOOP_VINFO is the main loop. The caller
2262 has made sure that the number of iterations is set appropriately for
2263 this value of FOR_EPILOGUE_P. */
2265 opt_result
2266 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2267 bool for_epilogue_p)
2269 /* Determine whether there would be any scalar iterations left over. */
2270 bool need_peeling_or_partial_vectors_p
2271 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2273 /* Decide whether to vectorize the loop with partial vectors. */
2274 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2275 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2276 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2277 && need_peeling_or_partial_vectors_p)
2279 /* For partial-vector-usage=1, try to push the handling of partial
2280 vectors to the epilogue, with the main loop continuing to operate
2281 on full vectors.
2283 If we are unrolling we also do not want to use partial vectors. This
2284 is to avoid the overhead of generating multiple masks and also to
2285 avoid having to execute entire iterations of FALSE masked instructions
2286 when dealing with one or less full iterations.
2288 ??? We could then end up failing to use partial vectors if we
2289 decide to peel iterations into a prologue, and if the main loop
2290 then ends up processing fewer than VF iterations. */
2291 if ((param_vect_partial_vector_usage == 1
2292 || loop_vinfo->suggested_unroll_factor > 1)
2293 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2294 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2295 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2296 else
2297 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2300 if (dump_enabled_p ())
2302 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "operating on partial vectors%s.\n",
2305 for_epilogue_p ? " for epilogue loop" : "");
2306 else
2307 dump_printf_loc (MSG_NOTE, vect_location,
2308 "operating only on full vectors%s.\n",
2309 for_epilogue_p ? " for epilogue loop" : "");
2312 if (for_epilogue_p)
2314 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2315 gcc_assert (orig_loop_vinfo);
2316 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2317 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2318 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2321 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2322 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2324 /* Check that the loop processes at least one full vector. */
2325 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2326 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2327 if (known_lt (wi::to_widest (scalar_niters), vf))
2328 return opt_result::failure_at (vect_location,
2329 "loop does not have enough iterations"
2330 " to support vectorization.\n");
2332 /* If we need to peel an extra epilogue iteration to handle data
2333 accesses with gaps, check that there are enough scalar iterations
2334 available.
2336 The check above is redundant with this one when peeling for gaps,
2337 but the distinction is useful for diagnostics. */
2338 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2340 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2341 return opt_result::failure_at (vect_location,
2342 "loop does not have enough iterations"
2343 " to support peeling for gaps.\n");
2346 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2347 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2348 && need_peeling_or_partial_vectors_p);
2350 return opt_result::success ();
2353 /* Function vect_analyze_loop_2.
2355 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2356 analyses will record information in some members of LOOP_VINFO. FATAL
2357 indicates if some analysis meets fatal error. If one non-NULL pointer
2358 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2359 worked out suggested unroll factor, while one NULL pointer shows it's
2360 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2361 is to hold the slp decision when the suggested unroll factor is worked
2362 out. */
2363 static opt_result
2364 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2365 unsigned *suggested_unroll_factor,
2366 bool& slp_done_for_suggested_uf)
2368 opt_result ok = opt_result::success ();
2369 int res;
2370 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2371 poly_uint64 min_vf = 2;
2372 loop_vec_info orig_loop_vinfo = NULL;
2374 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2375 loop_vec_info of the first vectorized loop. */
2376 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2377 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2378 else
2379 orig_loop_vinfo = loop_vinfo;
2380 gcc_assert (orig_loop_vinfo);
2382 /* The first group of checks is independent of the vector size. */
2383 fatal = true;
2385 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2386 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2387 return opt_result::failure_at (vect_location,
2388 "not vectorized: simd if(0)\n");
2390 /* Find all data references in the loop (which correspond to vdefs/vuses)
2391 and analyze their evolution in the loop. */
2393 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2395 /* Gather the data references and count stmts in the loop. */
2396 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2398 opt_result res
2399 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2400 &LOOP_VINFO_DATAREFS (loop_vinfo),
2401 &LOOP_VINFO_N_STMTS (loop_vinfo));
2402 if (!res)
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406 "not vectorized: loop contains function "
2407 "calls or data references that cannot "
2408 "be analyzed\n");
2409 return res;
2411 loop_vinfo->shared->save_datarefs ();
2413 else
2414 loop_vinfo->shared->check_datarefs ();
2416 /* Analyze the data references and also adjust the minimal
2417 vectorization factor according to the loads and stores. */
2419 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2420 if (!ok)
2422 if (dump_enabled_p ())
2423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2424 "bad data references.\n");
2425 return ok;
2428 /* Check if we are applying unroll factor now. */
2429 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2430 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2432 /* If the slp decision is false when suggested unroll factor is worked
2433 out, and we are applying suggested unroll factor, we can simply skip
2434 all slp related analyses this time. */
2435 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2437 /* Classify all cross-iteration scalar data-flow cycles.
2438 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2439 vect_analyze_scalar_cycles (loop_vinfo, slp);
2441 vect_pattern_recog (loop_vinfo);
2443 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2445 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2446 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2448 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2449 if (!ok)
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "bad data access.\n");
2454 return ok;
2457 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2459 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2460 if (!ok)
2462 if (dump_enabled_p ())
2463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2464 "unexpected pattern.\n");
2465 return ok;
2468 /* While the rest of the analysis below depends on it in some way. */
2469 fatal = false;
2471 /* Analyze data dependences between the data-refs in the loop
2472 and adjust the maximum vectorization factor according to
2473 the dependences.
2474 FORNOW: fail at the first data dependence that we encounter. */
2476 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2477 if (!ok)
2479 if (dump_enabled_p ())
2480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2481 "bad data dependence.\n");
2482 return ok;
2484 if (max_vf != MAX_VECTORIZATION_FACTOR
2485 && maybe_lt (max_vf, min_vf))
2486 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2487 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2489 ok = vect_determine_vectorization_factor (loop_vinfo);
2490 if (!ok)
2492 if (dump_enabled_p ())
2493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2494 "can't determine vectorization factor.\n");
2495 return ok;
2497 if (max_vf != MAX_VECTORIZATION_FACTOR
2498 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2499 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2501 /* Compute the scalar iteration cost. */
2502 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2504 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2506 if (slp)
2508 /* Check the SLP opportunities in the loop, analyze and build
2509 SLP trees. */
2510 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2511 if (!ok)
2512 return ok;
2514 /* If there are any SLP instances mark them as pure_slp. */
2515 slp = vect_make_slp_decision (loop_vinfo);
2516 if (slp)
2518 /* Find stmts that need to be both vectorized and SLPed. */
2519 vect_detect_hybrid_slp (loop_vinfo);
2521 /* Update the vectorization factor based on the SLP decision. */
2522 vect_update_vf_for_slp (loop_vinfo);
2524 /* Optimize the SLP graph with the vectorization factor fixed. */
2525 vect_optimize_slp (loop_vinfo);
2527 /* Gather the loads reachable from the SLP graph entries. */
2528 vect_gather_slp_loads (loop_vinfo);
2532 bool saved_can_use_partial_vectors_p
2533 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2535 /* We don't expect to have to roll back to anything other than an empty
2536 set of rgroups. */
2537 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2539 /* This is the point where we can re-start analysis with SLP forced off. */
2540 start_over:
2542 /* Apply the suggested unrolling factor, this was determined by the backend
2543 during finish_cost the first time we ran the analyzis for this
2544 vector mode. */
2545 if (applying_suggested_uf)
2546 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2548 /* Now the vectorization factor is final. */
2549 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2550 gcc_assert (known_ne (vectorization_factor, 0U));
2552 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2554 dump_printf_loc (MSG_NOTE, vect_location,
2555 "vectorization_factor = ");
2556 dump_dec (MSG_NOTE, vectorization_factor);
2557 dump_printf (MSG_NOTE, ", niters = %wd\n",
2558 LOOP_VINFO_INT_NITERS (loop_vinfo));
2561 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2563 /* Analyze the alignment of the data-refs in the loop.
2564 Fail if a data reference is found that cannot be vectorized. */
2566 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2567 if (!ok)
2569 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571 "bad data alignment.\n");
2572 return ok;
2575 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2576 It is important to call pruning after vect_analyze_data_ref_accesses,
2577 since we use grouping information gathered by interleaving analysis. */
2578 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2579 if (!ok)
2580 return ok;
2582 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2583 vectorization, since we do not want to add extra peeling or
2584 add versioning for alignment. */
2585 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2586 /* This pass will decide on using loop versioning and/or loop peeling in
2587 order to enhance the alignment of data references in the loop. */
2588 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2589 if (!ok)
2590 return ok;
2592 if (slp)
2594 /* Analyze operations in the SLP instances. Note this may
2595 remove unsupported SLP instances which makes the above
2596 SLP kind detection invalid. */
2597 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2598 vect_slp_analyze_operations (loop_vinfo);
2599 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2601 ok = opt_result::failure_at (vect_location,
2602 "unsupported SLP instances\n");
2603 goto again;
2606 /* Check whether any load in ALL SLP instances is possibly permuted. */
2607 slp_tree load_node, slp_root;
2608 unsigned i, x;
2609 slp_instance instance;
2610 bool can_use_lanes = true;
2611 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2613 slp_root = SLP_INSTANCE_TREE (instance);
2614 int group_size = SLP_TREE_LANES (slp_root);
2615 tree vectype = SLP_TREE_VECTYPE (slp_root);
2616 bool loads_permuted = false;
2617 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2619 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2620 continue;
2621 unsigned j;
2622 stmt_vec_info load_info;
2623 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2624 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2626 loads_permuted = true;
2627 break;
2631 /* If the loads and stores can be handled with load/store-lane
2632 instructions record it and move on to the next instance. */
2633 if (loads_permuted
2634 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2635 && vect_store_lanes_supported (vectype, group_size, false))
2637 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2639 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2640 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2641 /* Use SLP for strided accesses (or if we can't
2642 load-lanes). */
2643 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2644 || ! vect_load_lanes_supported
2645 (STMT_VINFO_VECTYPE (stmt_vinfo),
2646 DR_GROUP_SIZE (stmt_vinfo), false))
2647 break;
2650 can_use_lanes
2651 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2653 if (can_use_lanes && dump_enabled_p ())
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "SLP instance %p can use load/store-lanes\n",
2656 (void *) instance);
2658 else
2660 can_use_lanes = false;
2661 break;
2665 /* If all SLP instances can use load/store-lanes abort SLP and try again
2666 with SLP disabled. */
2667 if (can_use_lanes)
2669 ok = opt_result::failure_at (vect_location,
2670 "Built SLP cancelled: can use "
2671 "load/store-lanes\n");
2672 if (dump_enabled_p ())
2673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2674 "Built SLP cancelled: all SLP instances support "
2675 "load/store-lanes\n");
2676 goto again;
2680 /* Dissolve SLP-only groups. */
2681 vect_dissolve_slp_only_groups (loop_vinfo);
2683 /* Scan all the remaining operations in the loop that are not subject
2684 to SLP and make sure they are vectorizable. */
2685 ok = vect_analyze_loop_operations (loop_vinfo);
2686 if (!ok)
2688 if (dump_enabled_p ())
2689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2690 "bad operation or unsupported loop bound.\n");
2691 return ok;
2694 /* For now, we don't expect to mix both masking and length approaches for one
2695 loop, disable it if both are recorded. */
2696 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2697 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2698 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "can't vectorize a loop with partial vectors"
2703 " because we don't expect to mix different"
2704 " approaches with partial vectors for the"
2705 " same loop.\n");
2706 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2709 /* If we still have the option of using partial vectors,
2710 check whether we can generate the necessary loop controls. */
2711 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2712 && !vect_verify_full_masking (loop_vinfo)
2713 && !vect_verify_loop_lens (loop_vinfo))
2714 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2716 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2717 to be able to handle fewer than VF scalars, or needs to have a lower VF
2718 than the main loop. */
2719 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2720 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2721 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2722 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2723 return opt_result::failure_at (vect_location,
2724 "Vectorization factor too high for"
2725 " epilogue loop.\n");
2727 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2728 assuming that the loop will be used as a main loop. We will redo
2729 this analysis later if we instead decide to use the loop as an
2730 epilogue loop. */
2731 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2732 if (!ok)
2733 return ok;
2735 /* Check the costings of the loop make vectorizing worthwhile. */
2736 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
2737 if (res < 0)
2739 ok = opt_result::failure_at (vect_location,
2740 "Loop costings may not be worthwhile.\n");
2741 goto again;
2743 if (!res)
2744 return opt_result::failure_at (vect_location,
2745 "Loop costings not worthwhile.\n");
2747 /* If an epilogue loop is required make sure we can create one. */
2748 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2749 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2751 if (dump_enabled_p ())
2752 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2753 if (!vect_can_advance_ivs_p (loop_vinfo)
2754 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2755 single_exit (LOOP_VINFO_LOOP
2756 (loop_vinfo))))
2758 ok = opt_result::failure_at (vect_location,
2759 "not vectorized: can't create required "
2760 "epilog loop\n");
2761 goto again;
2765 /* During peeling, we need to check if number of loop iterations is
2766 enough for both peeled prolog loop and vector loop. This check
2767 can be merged along with threshold check of loop versioning, so
2768 increase threshold for this case if necessary.
2770 If we are analyzing an epilogue we still want to check what its
2771 versioning threshold would be. If we decide to vectorize the epilogues we
2772 will want to use the lowest versioning threshold of all epilogues and main
2773 loop. This will enable us to enter a vectorized epilogue even when
2774 versioning the loop. We can't simply check whether the epilogue requires
2775 versioning though since we may have skipped some versioning checks when
2776 analyzing the epilogue. For instance, checks for alias versioning will be
2777 skipped when dealing with epilogues as we assume we already checked them
2778 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2779 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2781 poly_uint64 niters_th = 0;
2782 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2784 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2786 /* Niters for peeled prolog loop. */
2787 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2789 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2790 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2791 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2793 else
2794 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2797 /* Niters for at least one iteration of vectorized loop. */
2798 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2799 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2800 /* One additional iteration because of peeling for gap. */
2801 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2802 niters_th += 1;
2804 /* Use the same condition as vect_transform_loop to decide when to use
2805 the cost to determine a versioning threshold. */
2806 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2807 && ordered_p (th, niters_th))
2808 niters_th = ordered_max (poly_uint64 (th), niters_th);
2810 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2813 gcc_assert (known_eq (vectorization_factor,
2814 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2816 slp_done_for_suggested_uf = slp;
2818 /* Ok to vectorize! */
2819 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2820 return opt_result::success ();
2822 again:
2823 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2824 gcc_assert (!ok);
2826 /* Try again with SLP forced off but if we didn't do any SLP there is
2827 no point in re-trying. */
2828 if (!slp)
2829 return ok;
2831 /* If the slp decision is true when suggested unroll factor is worked
2832 out, and we are applying suggested unroll factor, we don't need to
2833 re-try any more. */
2834 if (applying_suggested_uf && slp_done_for_suggested_uf)
2835 return ok;
2837 /* If there are reduction chains re-trying will fail anyway. */
2838 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2839 return ok;
2841 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2842 via interleaving or lane instructions. */
2843 slp_instance instance;
2844 slp_tree node;
2845 unsigned i, j;
2846 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2848 stmt_vec_info vinfo;
2849 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2850 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2851 continue;
2852 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2853 unsigned int size = DR_GROUP_SIZE (vinfo);
2854 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2855 if (! vect_store_lanes_supported (vectype, size, false)
2856 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2857 && ! vect_grouped_store_supported (vectype, size))
2858 return opt_result::failure_at (vinfo->stmt,
2859 "unsupported grouped store\n");
2860 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2862 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2863 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2864 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2865 size = DR_GROUP_SIZE (vinfo);
2866 vectype = STMT_VINFO_VECTYPE (vinfo);
2867 if (! vect_load_lanes_supported (vectype, size, false)
2868 && ! vect_grouped_load_supported (vectype, single_element_p,
2869 size))
2870 return opt_result::failure_at (vinfo->stmt,
2871 "unsupported grouped load\n");
2875 if (dump_enabled_p ())
2876 dump_printf_loc (MSG_NOTE, vect_location,
2877 "re-trying with SLP disabled\n");
2879 /* Roll back state appropriately. No SLP this time. */
2880 slp = false;
2881 /* Restore vectorization factor as it were without SLP. */
2882 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2883 /* Free the SLP instances. */
2884 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2885 vect_free_slp_instance (instance);
2886 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2887 /* Reset SLP type to loop_vect on all stmts. */
2888 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2890 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2891 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2892 !gsi_end_p (si); gsi_next (&si))
2894 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2895 STMT_SLP_TYPE (stmt_info) = loop_vect;
2896 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2897 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2899 /* vectorizable_reduction adjusts reduction stmt def-types,
2900 restore them to that of the PHI. */
2901 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2902 = STMT_VINFO_DEF_TYPE (stmt_info);
2903 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2904 (STMT_VINFO_REDUC_DEF (stmt_info)))
2905 = STMT_VINFO_DEF_TYPE (stmt_info);
2908 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2909 !gsi_end_p (si); gsi_next (&si))
2911 if (is_gimple_debug (gsi_stmt (si)))
2912 continue;
2913 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2914 STMT_SLP_TYPE (stmt_info) = loop_vect;
2915 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2917 stmt_vec_info pattern_stmt_info
2918 = STMT_VINFO_RELATED_STMT (stmt_info);
2919 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2920 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2922 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2923 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2924 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2925 !gsi_end_p (pi); gsi_next (&pi))
2926 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2927 = loop_vect;
2931 /* Free optimized alias test DDRS. */
2932 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2933 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2934 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2935 /* Reset target cost data. */
2936 delete loop_vinfo->vector_costs;
2937 loop_vinfo->vector_costs = nullptr;
2938 /* Reset accumulated rgroup information. */
2939 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2940 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2941 /* Reset assorted flags. */
2942 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2943 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2944 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2945 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2946 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2947 = saved_can_use_partial_vectors_p;
2949 goto start_over;
2952 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2953 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2954 OLD_LOOP_VINFO is better unless something specifically indicates
2955 otherwise.
2957 Note that this deliberately isn't a partial order. */
2959 static bool
2960 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2961 loop_vec_info old_loop_vinfo)
2963 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2964 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2966 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2967 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2969 /* Always prefer a VF of loop->simdlen over any other VF. */
2970 if (loop->simdlen)
2972 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2973 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2974 if (new_simdlen_p != old_simdlen_p)
2975 return new_simdlen_p;
2978 const auto *old_costs = old_loop_vinfo->vector_costs;
2979 const auto *new_costs = new_loop_vinfo->vector_costs;
2980 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2981 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2983 return new_costs->better_main_loop_than_p (old_costs);
2986 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2987 true if we should. */
2989 static bool
2990 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2991 loop_vec_info old_loop_vinfo)
2993 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2994 return false;
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_NOTE, vect_location,
2998 "***** Preferring vector mode %s to vector mode %s\n",
2999 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3000 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3001 return true;
3004 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3005 not NULL. Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3006 MODE_I to the next mode useful to analyze.
3007 Return the loop_vinfo on success and wrapped null on failure. */
3009 static opt_loop_vec_info
3010 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3011 const vect_loop_form_info *loop_form_info,
3012 loop_vec_info main_loop_vinfo,
3013 const vector_modes &vector_modes, unsigned &mode_i,
3014 machine_mode &autodetected_vector_mode,
3015 bool &fatal)
3017 loop_vec_info loop_vinfo
3018 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3020 machine_mode vector_mode = vector_modes[mode_i];
3021 loop_vinfo->vector_mode = vector_mode;
3022 unsigned int suggested_unroll_factor = 1;
3023 bool slp_done_for_suggested_uf;
3025 /* Run the main analysis. */
3026 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3027 &suggested_unroll_factor,
3028 slp_done_for_suggested_uf);
3029 if (dump_enabled_p ())
3030 dump_printf_loc (MSG_NOTE, vect_location,
3031 "***** Analysis %s with vector mode %s\n",
3032 res ? "succeeded" : " failed",
3033 GET_MODE_NAME (loop_vinfo->vector_mode));
3035 if (!main_loop_vinfo && suggested_unroll_factor > 1)
3037 if (dump_enabled_p ())
3038 dump_printf_loc (MSG_NOTE, vect_location,
3039 "***** Re-trying analysis for unrolling"
3040 " with unroll factor %d and slp %s.\n",
3041 suggested_unroll_factor,
3042 slp_done_for_suggested_uf ? "on" : "off");
3043 loop_vec_info unroll_vinfo
3044 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3045 unroll_vinfo->vector_mode = vector_mode;
3046 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3047 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3048 slp_done_for_suggested_uf);
3049 if (new_res)
3051 delete loop_vinfo;
3052 loop_vinfo = unroll_vinfo;
3054 else
3055 delete unroll_vinfo;
3058 /* Remember the autodetected vector mode. */
3059 if (vector_mode == VOIDmode)
3060 autodetected_vector_mode = loop_vinfo->vector_mode;
3062 /* Advance mode_i, first skipping modes that would result in the
3063 same analysis result. */
3064 while (mode_i + 1 < vector_modes.length ()
3065 && vect_chooses_same_modes_p (loop_vinfo,
3066 vector_modes[mode_i + 1]))
3068 if (dump_enabled_p ())
3069 dump_printf_loc (MSG_NOTE, vect_location,
3070 "***** The result for vector mode %s would"
3071 " be the same\n",
3072 GET_MODE_NAME (vector_modes[mode_i + 1]));
3073 mode_i += 1;
3075 if (mode_i + 1 < vector_modes.length ()
3076 && VECTOR_MODE_P (autodetected_vector_mode)
3077 && (related_vector_mode (vector_modes[mode_i + 1],
3078 GET_MODE_INNER (autodetected_vector_mode))
3079 == autodetected_vector_mode)
3080 && (related_vector_mode (autodetected_vector_mode,
3081 GET_MODE_INNER (vector_modes[mode_i + 1]))
3082 == vector_modes[mode_i + 1]))
3084 if (dump_enabled_p ())
3085 dump_printf_loc (MSG_NOTE, vect_location,
3086 "***** Skipping vector mode %s, which would"
3087 " repeat the analysis for %s\n",
3088 GET_MODE_NAME (vector_modes[mode_i + 1]),
3089 GET_MODE_NAME (autodetected_vector_mode));
3090 mode_i += 1;
3092 mode_i++;
3094 if (!res)
3096 delete loop_vinfo;
3097 if (fatal)
3098 gcc_checking_assert (main_loop_vinfo == NULL);
3099 return opt_loop_vec_info::propagate_failure (res);
3102 return opt_loop_vec_info::success (loop_vinfo);
3105 /* Function vect_analyze_loop.
3107 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3108 for it. The different analyses will record information in the
3109 loop_vec_info struct. */
3110 opt_loop_vec_info
3111 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3113 DUMP_VECT_SCOPE ("analyze_loop_nest");
3115 if (loop_outer (loop)
3116 && loop_vec_info_for_loop (loop_outer (loop))
3117 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3118 return opt_loop_vec_info::failure_at (vect_location,
3119 "outer-loop already vectorized.\n");
3121 if (!find_loop_nest (loop, &shared->loop_nest))
3122 return opt_loop_vec_info::failure_at
3123 (vect_location,
3124 "not vectorized: loop nest containing two or more consecutive inner"
3125 " loops cannot be vectorized\n");
3127 /* Analyze the loop form. */
3128 vect_loop_form_info loop_form_info;
3129 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3130 if (!res)
3132 if (dump_enabled_p ())
3133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3134 "bad loop form.\n");
3135 return opt_loop_vec_info::propagate_failure (res);
3137 if (!integer_onep (loop_form_info.assumptions))
3139 /* We consider to vectorize this loop by versioning it under
3140 some assumptions. In order to do this, we need to clear
3141 existing information computed by scev and niter analyzer. */
3142 scev_reset_htab ();
3143 free_numbers_of_iterations_estimates (loop);
3144 /* Also set flag for this loop so that following scev and niter
3145 analysis are done under the assumptions. */
3146 loop_constraint_set (loop, LOOP_C_FINITE);
3149 auto_vector_modes vector_modes;
3150 /* Autodetect first vector size we try. */
3151 vector_modes.safe_push (VOIDmode);
3152 unsigned int autovec_flags
3153 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3154 loop->simdlen != 0);
3155 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3156 && !unlimited_cost_model (loop));
3157 machine_mode autodetected_vector_mode = VOIDmode;
3158 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3159 unsigned int mode_i = 0;
3160 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3162 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3163 a mode has not been analyzed. */
3164 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3165 for (unsigned i = 0; i < vector_modes.length (); ++i)
3166 cached_vf_per_mode.safe_push (0);
3168 /* First determine the main loop vectorization mode, either the first
3169 one that works, starting with auto-detecting the vector mode and then
3170 following the targets order of preference, or the one with the
3171 lowest cost if pick_lowest_cost_p. */
3172 while (1)
3174 bool fatal;
3175 unsigned int last_mode_i = mode_i;
3176 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3177 failed. */
3178 cached_vf_per_mode[last_mode_i] = -1;
3179 opt_loop_vec_info loop_vinfo
3180 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3181 NULL, vector_modes, mode_i,
3182 autodetected_vector_mode, fatal);
3183 if (fatal)
3184 break;
3186 if (loop_vinfo)
3188 /* Analyzis has been successful so update the VF value. The
3189 VF should always be a multiple of unroll_factor and we want to
3190 capture the original VF here. */
3191 cached_vf_per_mode[last_mode_i]
3192 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3193 loop_vinfo->suggested_unroll_factor);
3194 /* Once we hit the desired simdlen for the first time,
3195 discard any previous attempts. */
3196 if (simdlen
3197 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3199 delete first_loop_vinfo;
3200 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3201 simdlen = 0;
3203 else if (pick_lowest_cost_p
3204 && first_loop_vinfo
3205 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3207 /* Pick loop_vinfo over first_loop_vinfo. */
3208 delete first_loop_vinfo;
3209 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3211 if (first_loop_vinfo == NULL)
3212 first_loop_vinfo = loop_vinfo;
3213 else
3215 delete loop_vinfo;
3216 loop_vinfo = opt_loop_vec_info::success (NULL);
3219 /* Commit to first_loop_vinfo if we have no reason to try
3220 alternatives. */
3221 if (!simdlen && !pick_lowest_cost_p)
3222 break;
3224 if (mode_i == vector_modes.length ()
3225 || autodetected_vector_mode == VOIDmode)
3226 break;
3228 /* Try the next biggest vector size. */
3229 if (dump_enabled_p ())
3230 dump_printf_loc (MSG_NOTE, vect_location,
3231 "***** Re-trying analysis with vector mode %s\n",
3232 GET_MODE_NAME (vector_modes[mode_i]));
3234 if (!first_loop_vinfo)
3235 return opt_loop_vec_info::propagate_failure (res);
3237 if (dump_enabled_p ())
3238 dump_printf_loc (MSG_NOTE, vect_location,
3239 "***** Choosing vector mode %s\n",
3240 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3242 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3243 enabled, SIMDUID is not set, it is the innermost loop and we have
3244 either already found the loop's SIMDLEN or there was no SIMDLEN to
3245 begin with.
3246 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3247 bool vect_epilogues = (!simdlen
3248 && loop->inner == NULL
3249 && param_vect_epilogues_nomask
3250 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3251 && !loop->simduid);
3252 if (!vect_epilogues)
3253 return first_loop_vinfo;
3255 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3256 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3258 /* For epilogues start the analysis from the first mode. The motivation
3259 behind starting from the beginning comes from cases where the VECTOR_MODES
3260 array may contain length-agnostic and length-specific modes. Their
3261 ordering is not guaranteed, so we could end up picking a mode for the main
3262 loop that is after the epilogue's optimal mode. */
3263 vector_modes[0] = autodetected_vector_mode;
3264 mode_i = 0;
3266 bool supports_partial_vectors =
3267 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3268 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3270 while (1)
3272 /* If the target does not support partial vectors we can shorten the
3273 number of modes to analyze for the epilogue as we know we can't pick a
3274 mode that would lead to a VF at least as big as the
3275 FIRST_VINFO_VF. */
3276 if (!supports_partial_vectors
3277 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3279 mode_i++;
3280 if (mode_i == vector_modes.length ())
3281 break;
3282 continue;
3285 if (dump_enabled_p ())
3286 dump_printf_loc (MSG_NOTE, vect_location,
3287 "***** Re-trying epilogue analysis with vector "
3288 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3290 bool fatal;
3291 opt_loop_vec_info loop_vinfo
3292 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3293 first_loop_vinfo,
3294 vector_modes, mode_i,
3295 autodetected_vector_mode, fatal);
3296 if (fatal)
3297 break;
3299 if (loop_vinfo)
3301 if (pick_lowest_cost_p)
3303 /* Keep trying to roll back vectorization attempts while the
3304 loop_vec_infos they produced were worse than this one. */
3305 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3306 while (!vinfos.is_empty ()
3307 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3309 gcc_assert (vect_epilogues);
3310 delete vinfos.pop ();
3313 /* For now only allow one epilogue loop. */
3314 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3316 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3317 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3318 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3319 || maybe_ne (lowest_th, 0U));
3320 /* Keep track of the known smallest versioning
3321 threshold. */
3322 if (ordered_p (lowest_th, th))
3323 lowest_th = ordered_min (lowest_th, th);
3325 else
3327 delete loop_vinfo;
3328 loop_vinfo = opt_loop_vec_info::success (NULL);
3331 /* For now only allow one epilogue loop, but allow
3332 pick_lowest_cost_p to replace it, so commit to the
3333 first epilogue if we have no reason to try alternatives. */
3334 if (!pick_lowest_cost_p)
3335 break;
3338 if (mode_i == vector_modes.length ())
3339 break;
3343 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3345 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3346 if (dump_enabled_p ())
3347 dump_printf_loc (MSG_NOTE, vect_location,
3348 "***** Choosing epilogue vector mode %s\n",
3349 GET_MODE_NAME
3350 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3353 return first_loop_vinfo;
3356 /* Return true if there is an in-order reduction function for CODE, storing
3357 it in *REDUC_FN if so. */
3359 static bool
3360 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3362 if (code == PLUS_EXPR)
3364 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3365 return true;
3367 return false;
3370 /* Function reduction_fn_for_scalar_code
3372 Input:
3373 CODE - tree_code of a reduction operations.
3375 Output:
3376 REDUC_FN - the corresponding internal function to be used to reduce the
3377 vector of partial results into a single scalar result, or IFN_LAST
3378 if the operation is a supported reduction operation, but does not have
3379 such an internal function.
3381 Return FALSE if CODE currently cannot be vectorized as reduction. */
3383 bool
3384 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3386 if (code.is_tree_code ())
3387 switch (tree_code (code))
3389 case MAX_EXPR:
3390 *reduc_fn = IFN_REDUC_MAX;
3391 return true;
3393 case MIN_EXPR:
3394 *reduc_fn = IFN_REDUC_MIN;
3395 return true;
3397 case PLUS_EXPR:
3398 *reduc_fn = IFN_REDUC_PLUS;
3399 return true;
3401 case BIT_AND_EXPR:
3402 *reduc_fn = IFN_REDUC_AND;
3403 return true;
3405 case BIT_IOR_EXPR:
3406 *reduc_fn = IFN_REDUC_IOR;
3407 return true;
3409 case BIT_XOR_EXPR:
3410 *reduc_fn = IFN_REDUC_XOR;
3411 return true;
3413 case MULT_EXPR:
3414 case MINUS_EXPR:
3415 *reduc_fn = IFN_LAST;
3416 return true;
3418 default:
3419 return false;
3421 else
3422 switch (combined_fn (code))
3424 CASE_CFN_FMAX:
3425 *reduc_fn = IFN_REDUC_FMAX;
3426 return true;
3428 CASE_CFN_FMIN:
3429 *reduc_fn = IFN_REDUC_FMIN;
3430 return true;
3432 default:
3433 return false;
3437 /* If there is a neutral value X such that a reduction would not be affected
3438 by the introduction of additional X elements, return that X, otherwise
3439 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3440 of the scalar elements. If the reduction has just a single initial value
3441 then INITIAL_VALUE is that value, otherwise it is null. */
3443 tree
3444 neutral_op_for_reduction (tree scalar_type, code_helper code,
3445 tree initial_value)
3447 if (code.is_tree_code ())
3448 switch (tree_code (code))
3450 case WIDEN_SUM_EXPR:
3451 case DOT_PROD_EXPR:
3452 case SAD_EXPR:
3453 case PLUS_EXPR:
3454 case MINUS_EXPR:
3455 case BIT_IOR_EXPR:
3456 case BIT_XOR_EXPR:
3457 return build_zero_cst (scalar_type);
3459 case MULT_EXPR:
3460 return build_one_cst (scalar_type);
3462 case BIT_AND_EXPR:
3463 return build_all_ones_cst (scalar_type);
3465 case MAX_EXPR:
3466 case MIN_EXPR:
3467 return initial_value;
3469 default:
3470 return NULL_TREE;
3472 else
3473 switch (combined_fn (code))
3475 CASE_CFN_FMIN:
3476 CASE_CFN_FMAX:
3477 return initial_value;
3479 default:
3480 return NULL_TREE;
3484 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3485 STMT is printed with a message MSG. */
3487 static void
3488 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3490 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3493 /* Return true if we need an in-order reduction for operation CODE
3494 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3495 overflow must wrap. */
3497 bool
3498 needs_fold_left_reduction_p (tree type, code_helper code)
3500 /* CHECKME: check for !flag_finite_math_only too? */
3501 if (SCALAR_FLOAT_TYPE_P (type))
3503 if (code.is_tree_code ())
3504 switch (tree_code (code))
3506 case MIN_EXPR:
3507 case MAX_EXPR:
3508 return false;
3510 default:
3511 return !flag_associative_math;
3513 else
3514 switch (combined_fn (code))
3516 CASE_CFN_FMIN:
3517 CASE_CFN_FMAX:
3518 return false;
3520 default:
3521 return !flag_associative_math;
3525 if (INTEGRAL_TYPE_P (type))
3526 return (!code.is_tree_code ()
3527 || !operation_no_trapping_overflow (type, tree_code (code)));
3529 if (SAT_FIXED_POINT_TYPE_P (type))
3530 return true;
3532 return false;
3535 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3536 has a handled computation expression. Store the main reduction
3537 operation in *CODE. */
3539 static bool
3540 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3541 tree loop_arg, code_helper *code,
3542 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3544 auto_bitmap visited;
3545 tree lookfor = PHI_RESULT (phi);
3546 ssa_op_iter curri;
3547 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3548 while (USE_FROM_PTR (curr) != loop_arg)
3549 curr = op_iter_next_use (&curri);
3550 curri.i = curri.numops;
3553 path.safe_push (std::make_pair (curri, curr));
3554 tree use = USE_FROM_PTR (curr);
3555 if (use == lookfor)
3556 break;
3557 gimple *def = SSA_NAME_DEF_STMT (use);
3558 if (gimple_nop_p (def)
3559 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3561 pop:
3564 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3565 curri = x.first;
3566 curr = x.second;
3568 curr = op_iter_next_use (&curri);
3569 /* Skip already visited or non-SSA operands (from iterating
3570 over PHI args). */
3571 while (curr != NULL_USE_OPERAND_P
3572 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3573 || ! bitmap_set_bit (visited,
3574 SSA_NAME_VERSION
3575 (USE_FROM_PTR (curr)))));
3577 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3578 if (curr == NULL_USE_OPERAND_P)
3579 break;
3581 else
3583 if (gimple_code (def) == GIMPLE_PHI)
3584 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3585 else
3586 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3587 while (curr != NULL_USE_OPERAND_P
3588 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3589 || ! bitmap_set_bit (visited,
3590 SSA_NAME_VERSION
3591 (USE_FROM_PTR (curr)))))
3592 curr = op_iter_next_use (&curri);
3593 if (curr == NULL_USE_OPERAND_P)
3594 goto pop;
3597 while (1);
3598 if (dump_file && (dump_flags & TDF_DETAILS))
3600 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3601 unsigned i;
3602 std::pair<ssa_op_iter, use_operand_p> *x;
3603 FOR_EACH_VEC_ELT (path, i, x)
3604 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3605 dump_printf (MSG_NOTE, "\n");
3608 /* Check whether the reduction path detected is valid. */
3609 bool fail = path.length () == 0;
3610 bool neg = false;
3611 int sign = -1;
3612 *code = ERROR_MARK;
3613 for (unsigned i = 1; i < path.length (); ++i)
3615 gimple *use_stmt = USE_STMT (path[i].second);
3616 gimple_match_op op;
3617 if (!gimple_extract_op (use_stmt, &op))
3619 fail = true;
3620 break;
3622 unsigned int opi = op.num_ops;
3623 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3625 /* The following make sure we can compute the operand index
3626 easily plus it mostly disallows chaining via COND_EXPR condition
3627 operands. */
3628 for (opi = 0; opi < op.num_ops; ++opi)
3629 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3630 break;
3632 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3634 for (opi = 0; opi < op.num_ops; ++opi)
3635 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3636 break;
3638 if (opi == op.num_ops)
3640 fail = true;
3641 break;
3643 op.code = canonicalize_code (op.code, op.type);
3644 if (op.code == MINUS_EXPR)
3646 op.code = PLUS_EXPR;
3647 /* Track whether we negate the reduction value each iteration. */
3648 if (op.ops[1] == op.ops[opi])
3649 neg = ! neg;
3651 if (CONVERT_EXPR_CODE_P (op.code)
3652 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3654 else if (*code == ERROR_MARK)
3656 *code = op.code;
3657 sign = TYPE_SIGN (op.type);
3659 else if (op.code != *code)
3661 fail = true;
3662 break;
3664 else if ((op.code == MIN_EXPR
3665 || op.code == MAX_EXPR)
3666 && sign != TYPE_SIGN (op.type))
3668 fail = true;
3669 break;
3671 /* Check there's only a single stmt the op is used on. For the
3672 not value-changing tail and the last stmt allow out-of-loop uses.
3673 ??? We could relax this and handle arbitrary live stmts by
3674 forcing a scalar epilogue for example. */
3675 imm_use_iterator imm_iter;
3676 gimple *op_use_stmt;
3677 unsigned cnt = 0;
3678 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3679 if (!is_gimple_debug (op_use_stmt)
3680 && (*code != ERROR_MARK
3681 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3683 /* We want to allow x + x but not x < 1 ? x : 2. */
3684 if (is_gimple_assign (op_use_stmt)
3685 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3687 use_operand_p use_p;
3688 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3689 cnt++;
3691 else
3692 cnt++;
3694 if (cnt != 1)
3696 fail = true;
3697 break;
3700 return ! fail && ! neg && *code != ERROR_MARK;
3703 bool
3704 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3705 tree loop_arg, enum tree_code code)
3707 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3708 code_helper code_;
3709 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3710 && code_ == code);
3715 /* Function vect_is_simple_reduction
3717 (1) Detect a cross-iteration def-use cycle that represents a simple
3718 reduction computation. We look for the following pattern:
3720 loop_header:
3721 a1 = phi < a0, a2 >
3722 a3 = ...
3723 a2 = operation (a3, a1)
3727 a3 = ...
3728 loop_header:
3729 a1 = phi < a0, a2 >
3730 a2 = operation (a3, a1)
3732 such that:
3733 1. operation is commutative and associative and it is safe to
3734 change the order of the computation
3735 2. no uses for a2 in the loop (a2 is used out of the loop)
3736 3. no uses of a1 in the loop besides the reduction operation
3737 4. no uses of a1 outside the loop.
3739 Conditions 1,4 are tested here.
3740 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3742 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3743 nested cycles.
3745 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3746 reductions:
3748 a1 = phi < a0, a2 >
3749 inner loop (def of a3)
3750 a2 = phi < a3 >
3752 (4) Detect condition expressions, ie:
3753 for (int i = 0; i < N; i++)
3754 if (a[i] < val)
3755 ret_val = a[i];
3759 static stmt_vec_info
3760 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3761 bool *double_reduc, bool *reduc_chain_p, bool slp)
3763 gphi *phi = as_a <gphi *> (phi_info->stmt);
3764 gimple *phi_use_stmt = NULL;
3765 imm_use_iterator imm_iter;
3766 use_operand_p use_p;
3768 *double_reduc = false;
3769 *reduc_chain_p = false;
3770 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3772 tree phi_name = PHI_RESULT (phi);
3773 /* ??? If there are no uses of the PHI result the inner loop reduction
3774 won't be detected as possibly double-reduction by vectorizable_reduction
3775 because that tries to walk the PHI arg from the preheader edge which
3776 can be constant. See PR60382. */
3777 if (has_zero_uses (phi_name))
3778 return NULL;
3779 class loop *loop = (gimple_bb (phi))->loop_father;
3780 unsigned nphi_def_loop_uses = 0;
3781 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3783 gimple *use_stmt = USE_STMT (use_p);
3784 if (is_gimple_debug (use_stmt))
3785 continue;
3787 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3789 if (dump_enabled_p ())
3790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3791 "intermediate value used outside loop.\n");
3793 return NULL;
3796 nphi_def_loop_uses++;
3797 phi_use_stmt = use_stmt;
3800 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3801 if (TREE_CODE (latch_def) != SSA_NAME)
3803 if (dump_enabled_p ())
3804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3805 "reduction: not ssa_name: %T\n", latch_def);
3806 return NULL;
3809 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3810 if (!def_stmt_info
3811 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3812 return NULL;
3814 bool nested_in_vect_loop
3815 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3816 unsigned nlatch_def_loop_uses = 0;
3817 auto_vec<gphi *, 3> lcphis;
3818 bool inner_loop_of_double_reduc = false;
3819 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3821 gimple *use_stmt = USE_STMT (use_p);
3822 if (is_gimple_debug (use_stmt))
3823 continue;
3824 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3825 nlatch_def_loop_uses++;
3826 else
3828 /* We can have more than one loop-closed PHI. */
3829 lcphis.safe_push (as_a <gphi *> (use_stmt));
3830 if (nested_in_vect_loop
3831 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3832 == vect_double_reduction_def))
3833 inner_loop_of_double_reduc = true;
3837 /* If we are vectorizing an inner reduction we are executing that
3838 in the original order only in case we are not dealing with a
3839 double reduction. */
3840 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3842 if (dump_enabled_p ())
3843 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3844 "detected nested cycle: ");
3845 return def_stmt_info;
3848 /* When the inner loop of a double reduction ends up with more than
3849 one loop-closed PHI we have failed to classify alternate such
3850 PHIs as double reduction, leading to wrong code. See PR103237. */
3851 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3853 if (dump_enabled_p ())
3854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3855 "unhandle double reduction\n");
3856 return NULL;
3859 /* If this isn't a nested cycle or if the nested cycle reduction value
3860 is used ouside of the inner loop we cannot handle uses of the reduction
3861 value. */
3862 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3866 "reduction used in loop.\n");
3867 return NULL;
3870 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3871 defined in the inner loop. */
3872 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3874 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3875 if (gimple_phi_num_args (def_stmt) != 1
3876 || TREE_CODE (op1) != SSA_NAME)
3878 if (dump_enabled_p ())
3879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3880 "unsupported phi node definition.\n");
3882 return NULL;
3885 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3886 if (gimple_bb (def1)
3887 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3888 && loop->inner
3889 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3890 && (is_gimple_assign (def1) || is_gimple_call (def1))
3891 && is_a <gphi *> (phi_use_stmt)
3892 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3894 if (dump_enabled_p ())
3895 report_vect_op (MSG_NOTE, def_stmt,
3896 "detected double reduction: ");
3898 *double_reduc = true;
3899 return def_stmt_info;
3902 return NULL;
3905 /* Look for the expression computing latch_def from then loop PHI result. */
3906 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3907 code_helper code;
3908 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3909 path))
3911 STMT_VINFO_REDUC_CODE (phi_info) = code;
3912 if (code == COND_EXPR && !nested_in_vect_loop)
3913 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3915 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3916 reduction chain for which the additional restriction is that
3917 all operations in the chain are the same. */
3918 auto_vec<stmt_vec_info, 8> reduc_chain;
3919 unsigned i;
3920 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3921 for (i = path.length () - 1; i >= 1; --i)
3923 gimple *stmt = USE_STMT (path[i].second);
3924 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3925 gimple_match_op op;
3926 if (!gimple_extract_op (stmt, &op))
3927 gcc_unreachable ();
3928 if (gassign *assign = dyn_cast<gassign *> (stmt))
3929 STMT_VINFO_REDUC_IDX (stmt_info)
3930 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
3931 else
3933 gcall *call = as_a<gcall *> (stmt);
3934 STMT_VINFO_REDUC_IDX (stmt_info)
3935 = path[i].second->use - gimple_call_arg_ptr (call, 0);
3937 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
3938 && (i == 1 || i == path.length () - 1));
3939 if ((op.code != code && !leading_conversion)
3940 /* We can only handle the final value in epilogue
3941 generation for reduction chains. */
3942 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
3943 is_slp_reduc = false;
3944 /* For reduction chains we support a trailing/leading
3945 conversions. We do not store those in the actual chain. */
3946 if (leading_conversion)
3947 continue;
3948 reduc_chain.safe_push (stmt_info);
3950 if (slp && is_slp_reduc && reduc_chain.length () > 1)
3952 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3954 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3955 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3957 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3958 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3960 /* Save the chain for further analysis in SLP detection. */
3961 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3962 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3964 *reduc_chain_p = true;
3965 if (dump_enabled_p ())
3966 dump_printf_loc (MSG_NOTE, vect_location,
3967 "reduction: detected reduction chain\n");
3969 else if (dump_enabled_p ())
3970 dump_printf_loc (MSG_NOTE, vect_location,
3971 "reduction: detected reduction\n");
3973 return def_stmt_info;
3976 if (dump_enabled_p ())
3977 dump_printf_loc (MSG_NOTE, vect_location,
3978 "reduction: unknown pattern\n");
3980 return NULL;
3983 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3984 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3985 or -1 if not known. */
3987 static int
3988 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3990 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3991 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3993 if (dump_enabled_p ())
3994 dump_printf_loc (MSG_NOTE, vect_location,
3995 "cost model: epilogue peel iters set to vf/2 "
3996 "because loop iterations are unknown .\n");
3997 return assumed_vf / 2;
3999 else
4001 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4002 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4003 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4004 /* If we need to peel for gaps, but no peeling is required, we have to
4005 peel VF iterations. */
4006 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4007 peel_iters_epilogue = assumed_vf;
4008 return peel_iters_epilogue;
4012 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4014 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4015 int *peel_iters_epilogue,
4016 stmt_vector_for_cost *scalar_cost_vec,
4017 stmt_vector_for_cost *prologue_cost_vec,
4018 stmt_vector_for_cost *epilogue_cost_vec)
4020 int retval = 0;
4022 *peel_iters_epilogue
4023 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4025 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4027 /* If peeled iterations are known but number of scalar loop
4028 iterations are unknown, count a taken branch per peeled loop. */
4029 if (peel_iters_prologue > 0)
4030 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4031 vect_prologue);
4032 if (*peel_iters_epilogue > 0)
4033 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4034 vect_epilogue);
4037 stmt_info_for_cost *si;
4038 int j;
4039 if (peel_iters_prologue)
4040 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4041 retval += record_stmt_cost (prologue_cost_vec,
4042 si->count * peel_iters_prologue,
4043 si->kind, si->stmt_info, si->misalign,
4044 vect_prologue);
4045 if (*peel_iters_epilogue)
4046 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4047 retval += record_stmt_cost (epilogue_cost_vec,
4048 si->count * *peel_iters_epilogue,
4049 si->kind, si->stmt_info, si->misalign,
4050 vect_epilogue);
4052 return retval;
4055 /* Function vect_estimate_min_profitable_iters
4057 Return the number of iterations required for the vector version of the
4058 loop to be profitable relative to the cost of the scalar version of the
4059 loop.
4061 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4062 of iterations for vectorization. -1 value means loop vectorization
4063 is not profitable. This returned value may be used for dynamic
4064 profitability check.
4066 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4067 for static check against estimated number of iterations. */
4069 static void
4070 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4071 int *ret_min_profitable_niters,
4072 int *ret_min_profitable_estimate,
4073 unsigned *suggested_unroll_factor)
4075 int min_profitable_iters;
4076 int min_profitable_estimate;
4077 int peel_iters_prologue;
4078 int peel_iters_epilogue;
4079 unsigned vec_inside_cost = 0;
4080 int vec_outside_cost = 0;
4081 unsigned vec_prologue_cost = 0;
4082 unsigned vec_epilogue_cost = 0;
4083 int scalar_single_iter_cost = 0;
4084 int scalar_outside_cost = 0;
4085 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4086 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4087 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4089 /* Cost model disabled. */
4090 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4092 if (dump_enabled_p ())
4093 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4094 *ret_min_profitable_niters = 0;
4095 *ret_min_profitable_estimate = 0;
4096 return;
4099 /* Requires loop versioning tests to handle misalignment. */
4100 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4102 /* FIXME: Make cost depend on complexity of individual check. */
4103 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4104 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4105 if (dump_enabled_p ())
4106 dump_printf (MSG_NOTE,
4107 "cost model: Adding cost of checks for loop "
4108 "versioning to treat misalignment.\n");
4111 /* Requires loop versioning with alias checks. */
4112 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4114 /* FIXME: Make cost depend on complexity of individual check. */
4115 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4116 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4117 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4118 if (len)
4119 /* Count LEN - 1 ANDs and LEN comparisons. */
4120 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4121 scalar_stmt, vect_prologue);
4122 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4123 if (len)
4125 /* Count LEN - 1 ANDs and LEN comparisons. */
4126 unsigned int nstmts = len * 2 - 1;
4127 /* +1 for each bias that needs adding. */
4128 for (unsigned int i = 0; i < len; ++i)
4129 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4130 nstmts += 1;
4131 (void) add_stmt_cost (target_cost_data, nstmts,
4132 scalar_stmt, vect_prologue);
4134 if (dump_enabled_p ())
4135 dump_printf (MSG_NOTE,
4136 "cost model: Adding cost of checks for loop "
4137 "versioning aliasing.\n");
4140 /* Requires loop versioning with niter checks. */
4141 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4143 /* FIXME: Make cost depend on complexity of individual check. */
4144 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4145 NULL, NULL, NULL_TREE, 0, vect_prologue);
4146 if (dump_enabled_p ())
4147 dump_printf (MSG_NOTE,
4148 "cost model: Adding cost of checks for loop "
4149 "versioning niters.\n");
4152 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4153 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4154 vect_prologue);
4156 /* Count statements in scalar loop. Using this as scalar cost for a single
4157 iteration for now.
4159 TODO: Add outer loop support.
4161 TODO: Consider assigning different costs to different scalar
4162 statements. */
4164 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4166 /* Add additional cost for the peeled instructions in prologue and epilogue
4167 loop. (For fully-masked loops there will be no peeling.)
4169 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4170 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4172 TODO: Build an expression that represents peel_iters for prologue and
4173 epilogue to be used in a run-time test. */
4175 bool prologue_need_br_taken_cost = false;
4176 bool prologue_need_br_not_taken_cost = false;
4178 /* Calculate peel_iters_prologue. */
4179 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4180 peel_iters_prologue = 0;
4181 else if (npeel < 0)
4183 peel_iters_prologue = assumed_vf / 2;
4184 if (dump_enabled_p ())
4185 dump_printf (MSG_NOTE, "cost model: "
4186 "prologue peel iters set to vf/2.\n");
4188 /* If peeled iterations are unknown, count a taken branch and a not taken
4189 branch per peeled loop. Even if scalar loop iterations are known,
4190 vector iterations are not known since peeled prologue iterations are
4191 not known. Hence guards remain the same. */
4192 prologue_need_br_taken_cost = true;
4193 prologue_need_br_not_taken_cost = true;
4195 else
4197 peel_iters_prologue = npeel;
4198 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4199 /* If peeled iterations are known but number of scalar loop
4200 iterations are unknown, count a taken branch per peeled loop. */
4201 prologue_need_br_taken_cost = true;
4204 bool epilogue_need_br_taken_cost = false;
4205 bool epilogue_need_br_not_taken_cost = false;
4207 /* Calculate peel_iters_epilogue. */
4208 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4209 /* We need to peel exactly one iteration for gaps. */
4210 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4211 else if (npeel < 0)
4213 /* If peeling for alignment is unknown, loop bound of main loop
4214 becomes unknown. */
4215 peel_iters_epilogue = assumed_vf / 2;
4216 if (dump_enabled_p ())
4217 dump_printf (MSG_NOTE, "cost model: "
4218 "epilogue peel iters set to vf/2 because "
4219 "peeling for alignment is unknown.\n");
4221 /* See the same reason above in peel_iters_prologue calculation. */
4222 epilogue_need_br_taken_cost = true;
4223 epilogue_need_br_not_taken_cost = true;
4225 else
4227 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4228 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4229 /* If peeled iterations are known but number of scalar loop
4230 iterations are unknown, count a taken branch per peeled loop. */
4231 epilogue_need_br_taken_cost = true;
4234 stmt_info_for_cost *si;
4235 int j;
4236 /* Add costs associated with peel_iters_prologue. */
4237 if (peel_iters_prologue)
4238 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4240 (void) add_stmt_cost (target_cost_data,
4241 si->count * peel_iters_prologue, si->kind,
4242 si->stmt_info, si->node, si->vectype,
4243 si->misalign, vect_prologue);
4246 /* Add costs associated with peel_iters_epilogue. */
4247 if (peel_iters_epilogue)
4248 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4250 (void) add_stmt_cost (target_cost_data,
4251 si->count * peel_iters_epilogue, si->kind,
4252 si->stmt_info, si->node, si->vectype,
4253 si->misalign, vect_epilogue);
4256 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4258 if (prologue_need_br_taken_cost)
4259 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4260 vect_prologue);
4262 if (prologue_need_br_not_taken_cost)
4263 (void) add_stmt_cost (target_cost_data, 1,
4264 cond_branch_not_taken, vect_prologue);
4266 if (epilogue_need_br_taken_cost)
4267 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4268 vect_epilogue);
4270 if (epilogue_need_br_not_taken_cost)
4271 (void) add_stmt_cost (target_cost_data, 1,
4272 cond_branch_not_taken, vect_epilogue);
4274 /* Take care of special costs for rgroup controls of partial vectors. */
4275 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4277 /* Calculate how many masks we need to generate. */
4278 unsigned int num_masks = 0;
4279 rgroup_controls *rgm;
4280 unsigned int num_vectors_m1;
4281 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4282 if (rgm->type)
4283 num_masks += num_vectors_m1 + 1;
4284 gcc_assert (num_masks > 0);
4286 /* In the worst case, we need to generate each mask in the prologue
4287 and in the loop body. One of the loop body mask instructions
4288 replaces the comparison in the scalar loop, and since we don't
4289 count the scalar comparison against the scalar body, we shouldn't
4290 count that vector instruction against the vector body either.
4292 Sometimes we can use unpacks instead of generating prologue
4293 masks and sometimes the prologue mask will fold to a constant,
4294 so the actual prologue cost might be smaller. However, it's
4295 simpler and safer to use the worst-case cost; if this ends up
4296 being the tie-breaker between vectorizing or not, then it's
4297 probably better not to vectorize. */
4298 (void) add_stmt_cost (target_cost_data, num_masks,
4299 vector_stmt, NULL, NULL, NULL_TREE, 0,
4300 vect_prologue);
4301 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4302 vector_stmt, NULL, NULL, NULL_TREE, 0,
4303 vect_body);
4305 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4307 /* Referring to the functions vect_set_loop_condition_partial_vectors
4308 and vect_set_loop_controls_directly, we need to generate each
4309 length in the prologue and in the loop body if required. Although
4310 there are some possible optimizations, we consider the worst case
4311 here. */
4313 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4314 signed char partial_load_store_bias
4315 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4316 bool need_iterate_p
4317 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4318 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4320 /* Calculate how many statements to be added. */
4321 unsigned int prologue_stmts = 0;
4322 unsigned int body_stmts = 0;
4324 rgroup_controls *rgc;
4325 unsigned int num_vectors_m1;
4326 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4327 if (rgc->type)
4329 /* May need one SHIFT for nitems_total computation. */
4330 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4331 if (nitems != 1 && !niters_known_p)
4332 prologue_stmts += 1;
4334 /* May need one MAX and one MINUS for wrap around. */
4335 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4336 prologue_stmts += 2;
4338 /* Need one MAX and one MINUS for each batch limit excepting for
4339 the 1st one. */
4340 prologue_stmts += num_vectors_m1 * 2;
4342 unsigned int num_vectors = num_vectors_m1 + 1;
4344 /* Need to set up lengths in prologue, only one MIN required
4345 for each since start index is zero. */
4346 prologue_stmts += num_vectors;
4348 /* If we have a non-zero partial load bias, we need one PLUS
4349 to adjust the load length. */
4350 if (partial_load_store_bias != 0)
4351 body_stmts += 1;
4353 /* Each may need two MINs and one MINUS to update lengths in body
4354 for next iteration. */
4355 if (need_iterate_p)
4356 body_stmts += 3 * num_vectors;
4359 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4360 scalar_stmt, vect_prologue);
4361 (void) add_stmt_cost (target_cost_data, body_stmts,
4362 scalar_stmt, vect_body);
4365 /* FORNOW: The scalar outside cost is incremented in one of the
4366 following ways:
4368 1. The vectorizer checks for alignment and aliasing and generates
4369 a condition that allows dynamic vectorization. A cost model
4370 check is ANDED with the versioning condition. Hence scalar code
4371 path now has the added cost of the versioning check.
4373 if (cost > th & versioning_check)
4374 jmp to vector code
4376 Hence run-time scalar is incremented by not-taken branch cost.
4378 2. The vectorizer then checks if a prologue is required. If the
4379 cost model check was not done before during versioning, it has to
4380 be done before the prologue check.
4382 if (cost <= th)
4383 prologue = scalar_iters
4384 if (prologue == 0)
4385 jmp to vector code
4386 else
4387 execute prologue
4388 if (prologue == num_iters)
4389 go to exit
4391 Hence the run-time scalar cost is incremented by a taken branch,
4392 plus a not-taken branch, plus a taken branch cost.
4394 3. The vectorizer then checks if an epilogue is required. If the
4395 cost model check was not done before during prologue check, it
4396 has to be done with the epilogue check.
4398 if (prologue == 0)
4399 jmp to vector code
4400 else
4401 execute prologue
4402 if (prologue == num_iters)
4403 go to exit
4404 vector code:
4405 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4406 jmp to epilogue
4408 Hence the run-time scalar cost should be incremented by 2 taken
4409 branches.
4411 TODO: The back end may reorder the BBS's differently and reverse
4412 conditions/branch directions. Change the estimates below to
4413 something more reasonable. */
4415 /* If the number of iterations is known and we do not do versioning, we can
4416 decide whether to vectorize at compile time. Hence the scalar version
4417 do not carry cost model guard costs. */
4418 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4419 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4421 /* Cost model check occurs at versioning. */
4422 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4423 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4424 else
4426 /* Cost model check occurs at prologue generation. */
4427 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4428 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4429 + vect_get_stmt_cost (cond_branch_not_taken);
4430 /* Cost model check occurs at epilogue generation. */
4431 else
4432 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4436 /* Complete the target-specific cost calculations. */
4437 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4438 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4439 suggested_unroll_factor);
4441 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4442 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4443 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4444 *suggested_unroll_factor,
4445 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4447 if (dump_enabled_p ())
4448 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4449 "can't unroll as unrolled vectorization factor larger"
4450 " than maximum vectorization factor: "
4451 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4452 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4453 *suggested_unroll_factor = 1;
4456 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4458 if (dump_enabled_p ())
4460 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4461 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4462 vec_inside_cost);
4463 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4464 vec_prologue_cost);
4465 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4466 vec_epilogue_cost);
4467 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4468 scalar_single_iter_cost);
4469 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4470 scalar_outside_cost);
4471 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4472 vec_outside_cost);
4473 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4474 peel_iters_prologue);
4475 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4476 peel_iters_epilogue);
4479 /* Calculate number of iterations required to make the vector version
4480 profitable, relative to the loop bodies only. The following condition
4481 must hold true:
4482 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4483 where
4484 SIC = scalar iteration cost, VIC = vector iteration cost,
4485 VOC = vector outside cost, VF = vectorization factor,
4486 NPEEL = prologue iterations + epilogue iterations,
4487 SOC = scalar outside cost for run time cost model check. */
4489 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4490 - vec_inside_cost);
4491 if (saving_per_viter <= 0)
4493 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4494 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4495 "vectorization did not happen for a simd loop");
4497 if (dump_enabled_p ())
4498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4499 "cost model: the vector iteration cost = %d "
4500 "divided by the scalar iteration cost = %d "
4501 "is greater or equal to the vectorization factor = %d"
4502 ".\n",
4503 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4504 *ret_min_profitable_niters = -1;
4505 *ret_min_profitable_estimate = -1;
4506 return;
4509 /* ??? The "if" arm is written to handle all cases; see below for what
4510 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4511 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4513 /* Rewriting the condition above in terms of the number of
4514 vector iterations (vniters) rather than the number of
4515 scalar iterations (niters) gives:
4517 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4519 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4521 For integer N, X and Y when X > 0:
4523 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4524 int outside_overhead = (vec_outside_cost
4525 - scalar_single_iter_cost * peel_iters_prologue
4526 - scalar_single_iter_cost * peel_iters_epilogue
4527 - scalar_outside_cost);
4528 /* We're only interested in cases that require at least one
4529 vector iteration. */
4530 int min_vec_niters = 1;
4531 if (outside_overhead > 0)
4532 min_vec_niters = outside_overhead / saving_per_viter + 1;
4534 if (dump_enabled_p ())
4535 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4536 min_vec_niters);
4538 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4540 /* Now that we know the minimum number of vector iterations,
4541 find the minimum niters for which the scalar cost is larger:
4543 SIC * niters > VIC * vniters + VOC - SOC
4545 We know that the minimum niters is no more than
4546 vniters * VF + NPEEL, but it might be (and often is) less
4547 than that if a partial vector iteration is cheaper than the
4548 equivalent scalar code. */
4549 int threshold = (vec_inside_cost * min_vec_niters
4550 + vec_outside_cost
4551 - scalar_outside_cost);
4552 if (threshold <= 0)
4553 min_profitable_iters = 1;
4554 else
4555 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4557 else
4558 /* Convert the number of vector iterations into a number of
4559 scalar iterations. */
4560 min_profitable_iters = (min_vec_niters * assumed_vf
4561 + peel_iters_prologue
4562 + peel_iters_epilogue);
4564 else
4566 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4567 * assumed_vf
4568 - vec_inside_cost * peel_iters_prologue
4569 - vec_inside_cost * peel_iters_epilogue);
4570 if (min_profitable_iters <= 0)
4571 min_profitable_iters = 0;
4572 else
4574 min_profitable_iters /= saving_per_viter;
4576 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4577 <= (((int) vec_inside_cost * min_profitable_iters)
4578 + (((int) vec_outside_cost - scalar_outside_cost)
4579 * assumed_vf)))
4580 min_profitable_iters++;
4584 if (dump_enabled_p ())
4585 dump_printf (MSG_NOTE,
4586 " Calculated minimum iters for profitability: %d\n",
4587 min_profitable_iters);
4589 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4590 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4591 /* We want the vectorized loop to execute at least once. */
4592 min_profitable_iters = assumed_vf + peel_iters_prologue;
4593 else if (min_profitable_iters < peel_iters_prologue)
4594 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4595 vectorized loop executes at least once. */
4596 min_profitable_iters = peel_iters_prologue;
4598 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_NOTE, vect_location,
4600 " Runtime profitability threshold = %d\n",
4601 min_profitable_iters);
4603 *ret_min_profitable_niters = min_profitable_iters;
4605 /* Calculate number of iterations required to make the vector version
4606 profitable, relative to the loop bodies only.
4608 Non-vectorized variant is SIC * niters and it must win over vector
4609 variant on the expected loop trip count. The following condition must hold true:
4610 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4612 if (vec_outside_cost <= 0)
4613 min_profitable_estimate = 0;
4614 /* ??? This "else if" arm is written to handle all cases; see below for
4615 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4616 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4618 /* This is a repeat of the code above, but with + SOC rather
4619 than - SOC. */
4620 int outside_overhead = (vec_outside_cost
4621 - scalar_single_iter_cost * peel_iters_prologue
4622 - scalar_single_iter_cost * peel_iters_epilogue
4623 + scalar_outside_cost);
4624 int min_vec_niters = 1;
4625 if (outside_overhead > 0)
4626 min_vec_niters = outside_overhead / saving_per_viter + 1;
4628 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4630 int threshold = (vec_inside_cost * min_vec_niters
4631 + vec_outside_cost
4632 + scalar_outside_cost);
4633 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4635 else
4636 min_profitable_estimate = (min_vec_niters * assumed_vf
4637 + peel_iters_prologue
4638 + peel_iters_epilogue);
4640 else
4642 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4643 * assumed_vf
4644 - vec_inside_cost * peel_iters_prologue
4645 - vec_inside_cost * peel_iters_epilogue)
4646 / ((scalar_single_iter_cost * assumed_vf)
4647 - vec_inside_cost);
4649 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4650 if (dump_enabled_p ())
4651 dump_printf_loc (MSG_NOTE, vect_location,
4652 " Static estimate profitability threshold = %d\n",
4653 min_profitable_estimate);
4655 *ret_min_profitable_estimate = min_profitable_estimate;
4658 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4659 vector elements (not bits) for a vector with NELT elements. */
4660 static void
4661 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4662 vec_perm_builder *sel)
4664 /* The encoding is a single stepped pattern. Any wrap-around is handled
4665 by vec_perm_indices. */
4666 sel->new_vector (nelt, 1, 3);
4667 for (unsigned int i = 0; i < 3; i++)
4668 sel->quick_push (i + offset);
4671 /* Checks whether the target supports whole-vector shifts for vectors of mode
4672 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4673 it supports vec_perm_const with masks for all necessary shift amounts. */
4674 static bool
4675 have_whole_vector_shift (machine_mode mode)
4677 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4678 return true;
4680 /* Variable-length vectors should be handled via the optab. */
4681 unsigned int nelt;
4682 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4683 return false;
4685 vec_perm_builder sel;
4686 vec_perm_indices indices;
4687 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4689 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4690 indices.new_vector (sel, 2, nelt);
4691 if (!can_vec_perm_const_p (mode, mode, indices, false))
4692 return false;
4694 return true;
4697 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
4698 multiplication operands have differing signs and (b) we intend
4699 to emulate the operation using a series of signed DOT_PROD_EXPRs.
4700 See vect_emulate_mixed_dot_prod for the actual sequence used. */
4702 static bool
4703 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
4704 stmt_vec_info stmt_info)
4706 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
4707 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
4708 return false;
4710 tree rhs1 = gimple_assign_rhs1 (assign);
4711 tree rhs2 = gimple_assign_rhs2 (assign);
4712 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
4713 return false;
4715 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4716 gcc_assert (reduc_info->is_reduc_info);
4717 return !directly_supported_p (DOT_PROD_EXPR,
4718 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
4719 optab_vector_mixed_sign);
4722 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4723 functions. Design better to avoid maintenance issues. */
4725 /* Function vect_model_reduction_cost.
4727 Models cost for a reduction operation, including the vector ops
4728 generated within the strip-mine loop in some cases, the initial
4729 definition before the loop, and the epilogue code that must be generated. */
4731 static void
4732 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4733 stmt_vec_info stmt_info, internal_fn reduc_fn,
4734 vect_reduction_type reduction_type,
4735 int ncopies, stmt_vector_for_cost *cost_vec)
4737 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4738 tree vectype;
4739 machine_mode mode;
4740 class loop *loop = NULL;
4742 if (loop_vinfo)
4743 loop = LOOP_VINFO_LOOP (loop_vinfo);
4745 /* Condition reductions generate two reductions in the loop. */
4746 if (reduction_type == COND_REDUCTION)
4747 ncopies *= 2;
4749 vectype = STMT_VINFO_VECTYPE (stmt_info);
4750 mode = TYPE_MODE (vectype);
4751 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4753 gimple_match_op op;
4754 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
4755 gcc_unreachable ();
4757 bool emulated_mixed_dot_prod
4758 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
4759 if (reduction_type == EXTRACT_LAST_REDUCTION)
4760 /* No extra instructions are needed in the prologue. The loop body
4761 operations are costed in vectorizable_condition. */
4762 inside_cost = 0;
4763 else if (reduction_type == FOLD_LEFT_REDUCTION)
4765 /* No extra instructions needed in the prologue. */
4766 prologue_cost = 0;
4768 if (reduc_fn != IFN_LAST)
4769 /* Count one reduction-like operation per vector. */
4770 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4771 stmt_info, 0, vect_body);
4772 else
4774 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4775 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4776 inside_cost = record_stmt_cost (cost_vec, nelements,
4777 vec_to_scalar, stmt_info, 0,
4778 vect_body);
4779 inside_cost += record_stmt_cost (cost_vec, nelements,
4780 scalar_stmt, stmt_info, 0,
4781 vect_body);
4784 else
4786 /* Add in the cost of the initial definitions. */
4787 int prologue_stmts;
4788 if (reduction_type == COND_REDUCTION)
4789 /* For cond reductions we have four vectors: initial index, step,
4790 initial result of the data reduction, initial value of the index
4791 reduction. */
4792 prologue_stmts = 4;
4793 else if (emulated_mixed_dot_prod)
4794 /* We need the initial reduction value and two invariants:
4795 one that contains the minimum signed value and one that
4796 contains half of its negative. */
4797 prologue_stmts = 3;
4798 else
4799 prologue_stmts = 1;
4800 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4801 scalar_to_vec, stmt_info, 0,
4802 vect_prologue);
4805 /* Determine cost of epilogue code.
4807 We have a reduction operator that will reduce the vector in one statement.
4808 Also requires scalar extract. */
4810 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4812 if (reduc_fn != IFN_LAST)
4814 if (reduction_type == COND_REDUCTION)
4816 /* An EQ stmt and an COND_EXPR stmt. */
4817 epilogue_cost += record_stmt_cost (cost_vec, 2,
4818 vector_stmt, stmt_info, 0,
4819 vect_epilogue);
4820 /* Reduction of the max index and a reduction of the found
4821 values. */
4822 epilogue_cost += record_stmt_cost (cost_vec, 2,
4823 vec_to_scalar, stmt_info, 0,
4824 vect_epilogue);
4825 /* A broadcast of the max value. */
4826 epilogue_cost += record_stmt_cost (cost_vec, 1,
4827 scalar_to_vec, stmt_info, 0,
4828 vect_epilogue);
4830 else
4832 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4833 stmt_info, 0, vect_epilogue);
4834 epilogue_cost += record_stmt_cost (cost_vec, 1,
4835 vec_to_scalar, stmt_info, 0,
4836 vect_epilogue);
4839 else if (reduction_type == COND_REDUCTION)
4841 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4842 /* Extraction of scalar elements. */
4843 epilogue_cost += record_stmt_cost (cost_vec,
4844 2 * estimated_nunits,
4845 vec_to_scalar, stmt_info, 0,
4846 vect_epilogue);
4847 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4848 epilogue_cost += record_stmt_cost (cost_vec,
4849 2 * estimated_nunits - 3,
4850 scalar_stmt, stmt_info, 0,
4851 vect_epilogue);
4853 else if (reduction_type == EXTRACT_LAST_REDUCTION
4854 || reduction_type == FOLD_LEFT_REDUCTION)
4855 /* No extra instructions need in the epilogue. */
4857 else
4859 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4860 tree bitsize = TYPE_SIZE (op.type);
4861 int element_bitsize = tree_to_uhwi (bitsize);
4862 int nelements = vec_size_in_bits / element_bitsize;
4864 if (op.code == COND_EXPR)
4865 op.code = MAX_EXPR;
4867 /* We have a whole vector shift available. */
4868 if (VECTOR_MODE_P (mode)
4869 && directly_supported_p (op.code, vectype)
4870 && have_whole_vector_shift (mode))
4872 /* Final reduction via vector shifts and the reduction operator.
4873 Also requires scalar extract. */
4874 epilogue_cost += record_stmt_cost (cost_vec,
4875 exact_log2 (nelements) * 2,
4876 vector_stmt, stmt_info, 0,
4877 vect_epilogue);
4878 epilogue_cost += record_stmt_cost (cost_vec, 1,
4879 vec_to_scalar, stmt_info, 0,
4880 vect_epilogue);
4882 else
4883 /* Use extracts and reduction op for final reduction. For N
4884 elements, we have N extracts and N-1 reduction ops. */
4885 epilogue_cost += record_stmt_cost (cost_vec,
4886 nelements + nelements - 1,
4887 vector_stmt, stmt_info, 0,
4888 vect_epilogue);
4892 if (dump_enabled_p ())
4893 dump_printf (MSG_NOTE,
4894 "vect_model_reduction_cost: inside_cost = %d, "
4895 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4896 prologue_cost, epilogue_cost);
4899 /* SEQ is a sequence of instructions that initialize the reduction
4900 described by REDUC_INFO. Emit them in the appropriate place. */
4902 static void
4903 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4904 stmt_vec_info reduc_info, gimple *seq)
4906 if (reduc_info->reused_accumulator)
4908 /* When reusing an accumulator from the main loop, we only need
4909 initialization instructions if the main loop can be skipped.
4910 In that case, emit the initialization instructions at the end
4911 of the guard block that does the skip. */
4912 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4913 gcc_assert (skip_edge);
4914 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4915 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4917 else
4919 /* The normal case: emit the initialization instructions on the
4920 preheader edge. */
4921 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4922 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4926 /* Function get_initial_def_for_reduction
4928 Input:
4929 REDUC_INFO - the info_for_reduction
4930 INIT_VAL - the initial value of the reduction variable
4931 NEUTRAL_OP - a value that has no effect on the reduction, as per
4932 neutral_op_for_reduction
4934 Output:
4935 Return a vector variable, initialized according to the operation that
4936 STMT_VINFO performs. This vector will be used as the initial value
4937 of the vector of partial results.
4939 The value we need is a vector in which element 0 has value INIT_VAL
4940 and every other element has value NEUTRAL_OP. */
4942 static tree
4943 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4944 stmt_vec_info reduc_info,
4945 tree init_val, tree neutral_op)
4947 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4948 tree scalar_type = TREE_TYPE (init_val);
4949 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4950 tree init_def;
4951 gimple_seq stmts = NULL;
4953 gcc_assert (vectype);
4955 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4956 || SCALAR_FLOAT_TYPE_P (scalar_type));
4958 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4959 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4961 if (operand_equal_p (init_val, neutral_op))
4963 /* If both elements are equal then the vector described above is
4964 just a splat. */
4965 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4966 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4968 else
4970 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4971 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4972 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4974 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4975 element 0. */
4976 init_def = gimple_build_vector_from_val (&stmts, vectype,
4977 neutral_op);
4978 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4979 vectype, init_def, init_val);
4981 else
4983 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4984 tree_vector_builder elts (vectype, 1, 2);
4985 elts.quick_push (init_val);
4986 elts.quick_push (neutral_op);
4987 init_def = gimple_build_vector (&stmts, &elts);
4991 if (stmts)
4992 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4993 return init_def;
4996 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4997 which performs a reduction involving GROUP_SIZE scalar statements.
4998 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4999 is nonnull, introducing extra elements of that value will not change the
5000 result. */
5002 static void
5003 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5004 stmt_vec_info reduc_info,
5005 vec<tree> *vec_oprnds,
5006 unsigned int number_of_vectors,
5007 unsigned int group_size, tree neutral_op)
5009 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5010 unsigned HOST_WIDE_INT nunits;
5011 unsigned j, number_of_places_left_in_vector;
5012 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5013 unsigned int i;
5015 gcc_assert (group_size == initial_values.length () || neutral_op);
5017 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5018 created vectors. It is greater than 1 if unrolling is performed.
5020 For example, we have two scalar operands, s1 and s2 (e.g., group of
5021 strided accesses of size two), while NUNITS is four (i.e., four scalars
5022 of this type can be packed in a vector). The output vector will contain
5023 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5024 will be 2).
5026 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5027 vectors containing the operands.
5029 For example, NUNITS is four as before, and the group size is 8
5030 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5031 {s5, s6, s7, s8}. */
5033 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5034 nunits = group_size;
5036 number_of_places_left_in_vector = nunits;
5037 bool constant_p = true;
5038 tree_vector_builder elts (vector_type, nunits, 1);
5039 elts.quick_grow (nunits);
5040 gimple_seq ctor_seq = NULL;
5041 for (j = 0; j < nunits * number_of_vectors; ++j)
5043 tree op;
5044 i = j % group_size;
5046 /* Get the def before the loop. In reduction chain we have only
5047 one initial value. Else we have as many as PHIs in the group. */
5048 if (i >= initial_values.length () || (j > i && neutral_op))
5049 op = neutral_op;
5050 else
5051 op = initial_values[i];
5053 /* Create 'vect_ = {op0,op1,...,opn}'. */
5054 number_of_places_left_in_vector--;
5055 elts[nunits - number_of_places_left_in_vector - 1] = op;
5056 if (!CONSTANT_CLASS_P (op))
5057 constant_p = false;
5059 if (number_of_places_left_in_vector == 0)
5061 tree init;
5062 if (constant_p && !neutral_op
5063 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5064 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5065 /* Build the vector directly from ELTS. */
5066 init = gimple_build_vector (&ctor_seq, &elts);
5067 else if (neutral_op)
5069 /* Build a vector of the neutral value and shift the
5070 other elements into place. */
5071 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5072 neutral_op);
5073 int k = nunits;
5074 while (k > 0 && elts[k - 1] == neutral_op)
5075 k -= 1;
5076 while (k > 0)
5078 k -= 1;
5079 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5080 vector_type, init, elts[k]);
5083 else
5085 /* First time round, duplicate ELTS to fill the
5086 required number of vectors. */
5087 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5088 elts, number_of_vectors, *vec_oprnds);
5089 break;
5091 vec_oprnds->quick_push (init);
5093 number_of_places_left_in_vector = nunits;
5094 elts.new_vector (vector_type, nunits, 1);
5095 elts.quick_grow (nunits);
5096 constant_p = true;
5099 if (ctor_seq != NULL)
5100 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5103 /* For a statement STMT_INFO taking part in a reduction operation return
5104 the stmt_vec_info the meta information is stored on. */
5106 stmt_vec_info
5107 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5109 stmt_info = vect_orig_stmt (stmt_info);
5110 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5111 if (!is_a <gphi *> (stmt_info->stmt)
5112 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5113 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5114 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5115 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5117 if (gimple_phi_num_args (phi) == 1)
5118 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5120 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5122 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5123 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5124 stmt_info = info;
5126 return stmt_info;
5129 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5130 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5131 return false. */
5133 static bool
5134 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5135 stmt_vec_info reduc_info)
5137 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5138 if (!main_loop_vinfo)
5139 return false;
5141 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5142 return false;
5144 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5145 auto_vec<tree, 16> main_loop_results (num_phis);
5146 auto_vec<tree, 16> initial_values (num_phis);
5147 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5149 /* The epilogue loop can be entered either from the main loop or
5150 from an earlier guard block. */
5151 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5152 for (tree incoming_value : reduc_info->reduc_initial_values)
5154 /* Look for:
5156 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5157 INITIAL_VALUE(guard block)>. */
5158 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5160 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5161 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5163 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5164 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5166 main_loop_results.quick_push (from_main_loop);
5167 initial_values.quick_push (from_skip);
5170 else
5171 /* The main loop dominates the epilogue loop. */
5172 main_loop_results.splice (reduc_info->reduc_initial_values);
5174 /* See if the main loop has the kind of accumulator we need. */
5175 vect_reusable_accumulator *accumulator
5176 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5177 if (!accumulator
5178 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5179 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5180 accumulator->reduc_info->reduc_scalar_results.begin ()))
5181 return false;
5183 /* Handle the case where we can reduce wider vectors to narrower ones. */
5184 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5185 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5186 unsigned HOST_WIDE_INT m;
5187 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5188 TYPE_VECTOR_SUBPARTS (vectype), &m))
5189 return false;
5190 /* Check the intermediate vector types and operations are available. */
5191 tree prev_vectype = old_vectype;
5192 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5193 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5195 intermediate_nunits = exact_div (intermediate_nunits, 2);
5196 tree intermediate_vectype = get_related_vectype_for_scalar_type
5197 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5198 if (!intermediate_vectype
5199 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5200 intermediate_vectype)
5201 || !can_vec_extract (TYPE_MODE (prev_vectype),
5202 TYPE_MODE (intermediate_vectype)))
5203 return false;
5204 prev_vectype = intermediate_vectype;
5207 /* Non-SLP reductions might apply an adjustment after the reduction
5208 operation, in order to simplify the initialization of the accumulator.
5209 If the epilogue loop carries on from where the main loop left off,
5210 it should apply the same adjustment to the final reduction result.
5212 If the epilogue loop can also be entered directly (rather than via
5213 the main loop), we need to be able to handle that case in the same way,
5214 with the same adjustment. (In principle we could add a PHI node
5215 to select the correct adjustment, but in practice that shouldn't be
5216 necessary.) */
5217 tree main_adjustment
5218 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5219 if (loop_vinfo->main_loop_edge && main_adjustment)
5221 gcc_assert (num_phis == 1);
5222 tree initial_value = initial_values[0];
5223 /* Check that we can use INITIAL_VALUE as the adjustment and
5224 initialize the accumulator with a neutral value instead. */
5225 if (!operand_equal_p (initial_value, main_adjustment))
5226 return false;
5227 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5228 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5229 code, initial_value);
5231 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5232 reduc_info->reduc_initial_values.truncate (0);
5233 reduc_info->reduc_initial_values.splice (initial_values);
5234 reduc_info->reused_accumulator = accumulator;
5235 return true;
5238 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5239 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5241 static tree
5242 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5243 gimple_seq *seq)
5245 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5246 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5247 tree stype = TREE_TYPE (vectype);
5248 tree new_temp = vec_def;
5249 while (nunits > nunits1)
5251 nunits /= 2;
5252 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5253 stype, nunits);
5254 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5256 /* The target has to make sure we support lowpart/highpart
5257 extraction, either via direct vector extract or through
5258 an integer mode punning. */
5259 tree dst1, dst2;
5260 gimple *epilog_stmt;
5261 if (convert_optab_handler (vec_extract_optab,
5262 TYPE_MODE (TREE_TYPE (new_temp)),
5263 TYPE_MODE (vectype1))
5264 != CODE_FOR_nothing)
5266 /* Extract sub-vectors directly once vec_extract becomes
5267 a conversion optab. */
5268 dst1 = make_ssa_name (vectype1);
5269 epilog_stmt
5270 = gimple_build_assign (dst1, BIT_FIELD_REF,
5271 build3 (BIT_FIELD_REF, vectype1,
5272 new_temp, TYPE_SIZE (vectype1),
5273 bitsize_int (0)));
5274 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5275 dst2 = make_ssa_name (vectype1);
5276 epilog_stmt
5277 = gimple_build_assign (dst2, BIT_FIELD_REF,
5278 build3 (BIT_FIELD_REF, vectype1,
5279 new_temp, TYPE_SIZE (vectype1),
5280 bitsize_int (bitsize)));
5281 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5283 else
5285 /* Extract via punning to appropriately sized integer mode
5286 vector. */
5287 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5288 tree etype = build_vector_type (eltype, 2);
5289 gcc_assert (convert_optab_handler (vec_extract_optab,
5290 TYPE_MODE (etype),
5291 TYPE_MODE (eltype))
5292 != CODE_FOR_nothing);
5293 tree tem = make_ssa_name (etype);
5294 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5295 build1 (VIEW_CONVERT_EXPR,
5296 etype, new_temp));
5297 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5298 new_temp = tem;
5299 tem = make_ssa_name (eltype);
5300 epilog_stmt
5301 = gimple_build_assign (tem, BIT_FIELD_REF,
5302 build3 (BIT_FIELD_REF, eltype,
5303 new_temp, TYPE_SIZE (eltype),
5304 bitsize_int (0)));
5305 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5306 dst1 = make_ssa_name (vectype1);
5307 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5308 build1 (VIEW_CONVERT_EXPR,
5309 vectype1, tem));
5310 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5311 tem = make_ssa_name (eltype);
5312 epilog_stmt
5313 = gimple_build_assign (tem, BIT_FIELD_REF,
5314 build3 (BIT_FIELD_REF, eltype,
5315 new_temp, TYPE_SIZE (eltype),
5316 bitsize_int (bitsize)));
5317 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5318 dst2 = make_ssa_name (vectype1);
5319 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5320 build1 (VIEW_CONVERT_EXPR,
5321 vectype1, tem));
5322 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5325 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5328 return new_temp;
5331 /* Function vect_create_epilog_for_reduction
5333 Create code at the loop-epilog to finalize the result of a reduction
5334 computation.
5336 STMT_INFO is the scalar reduction stmt that is being vectorized.
5337 SLP_NODE is an SLP node containing a group of reduction statements. The
5338 first one in this group is STMT_INFO.
5339 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5340 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5341 (counting from 0)
5343 This function:
5344 1. Completes the reduction def-use cycles.
5345 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5346 by calling the function specified by REDUC_FN if available, or by
5347 other means (whole-vector shifts or a scalar loop).
5348 The function also creates a new phi node at the loop exit to preserve
5349 loop-closed form, as illustrated below.
5351 The flow at the entry to this function:
5353 loop:
5354 vec_def = phi <vec_init, null> # REDUCTION_PHI
5355 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5356 s_loop = scalar_stmt # (scalar) STMT_INFO
5357 loop_exit:
5358 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5359 use <s_out0>
5360 use <s_out0>
5362 The above is transformed by this function into:
5364 loop:
5365 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5366 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5367 s_loop = scalar_stmt # (scalar) STMT_INFO
5368 loop_exit:
5369 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5370 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5371 v_out2 = reduce <v_out1>
5372 s_out3 = extract_field <v_out2, 0>
5373 s_out4 = adjust_result <s_out3>
5374 use <s_out4>
5375 use <s_out4>
5378 static void
5379 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5380 stmt_vec_info stmt_info,
5381 slp_tree slp_node,
5382 slp_instance slp_node_instance)
5384 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5385 gcc_assert (reduc_info->is_reduc_info);
5386 /* For double reductions we need to get at the inner loop reduction
5387 stmt which has the meta info attached. Our stmt_info is that of the
5388 loop-closed PHI of the inner loop which we remember as
5389 def for the reduction PHI generation. */
5390 bool double_reduc = false;
5391 stmt_vec_info rdef_info = stmt_info;
5392 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5394 gcc_assert (!slp_node);
5395 double_reduc = true;
5396 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5397 (stmt_info->stmt, 0));
5398 stmt_info = vect_stmt_to_vectorize (stmt_info);
5400 gphi *reduc_def_stmt
5401 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5402 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5403 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5404 tree vectype;
5405 machine_mode mode;
5406 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5407 basic_block exit_bb;
5408 tree scalar_dest;
5409 tree scalar_type;
5410 gimple *new_phi = NULL, *phi;
5411 gimple_stmt_iterator exit_gsi;
5412 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5413 gimple *epilog_stmt = NULL;
5414 gimple *exit_phi;
5415 tree bitsize;
5416 tree def;
5417 tree orig_name, scalar_result;
5418 imm_use_iterator imm_iter, phi_imm_iter;
5419 use_operand_p use_p, phi_use_p;
5420 gimple *use_stmt;
5421 auto_vec<tree> reduc_inputs;
5422 int j, i;
5423 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5424 unsigned int group_size = 1, k;
5425 auto_vec<gimple *> phis;
5426 /* SLP reduction without reduction chain, e.g.,
5427 # a1 = phi <a2, a0>
5428 # b1 = phi <b2, b0>
5429 a2 = operation (a1)
5430 b2 = operation (b1) */
5431 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5432 bool direct_slp_reduc;
5433 tree induction_index = NULL_TREE;
5435 if (slp_node)
5436 group_size = SLP_TREE_LANES (slp_node);
5438 if (nested_in_vect_loop_p (loop, stmt_info))
5440 outer_loop = loop;
5441 loop = loop->inner;
5442 gcc_assert (!slp_node && double_reduc);
5445 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5446 gcc_assert (vectype);
5447 mode = TYPE_MODE (vectype);
5449 tree induc_val = NULL_TREE;
5450 tree adjustment_def = NULL;
5451 if (slp_node)
5453 else
5455 /* Optimize: for induction condition reduction, if we can't use zero
5456 for induc_val, use initial_def. */
5457 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5458 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5459 else if (double_reduc)
5461 else
5462 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5465 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5466 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5467 if (slp_reduc)
5468 /* All statements produce live-out values. */
5469 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5470 else if (slp_node)
5472 /* The last statement in the reduction chain produces the live-out
5473 value. Note SLP optimization can shuffle scalar stmts to
5474 optimize permutations so we have to search for the last stmt. */
5475 for (k = 0; k < group_size; ++k)
5476 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5478 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5479 break;
5483 unsigned vec_num;
5484 int ncopies;
5485 if (slp_node)
5487 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5488 ncopies = 1;
5490 else
5492 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5493 vec_num = 1;
5494 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5497 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5498 which is updated with the current index of the loop for every match of
5499 the original loop's cond_expr (VEC_STMT). This results in a vector
5500 containing the last time the condition passed for that vector lane.
5501 The first match will be a 1 to allow 0 to be used for non-matching
5502 indexes. If there are no matches at all then the vector will be all
5503 zeroes.
5505 PR92772: This algorithm is broken for architectures that support
5506 masked vectors, but do not provide fold_extract_last. */
5507 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5509 auto_vec<std::pair<tree, bool>, 2> ccompares;
5510 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5511 cond_info = vect_stmt_to_vectorize (cond_info);
5512 while (cond_info != reduc_info)
5514 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5516 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5517 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5518 ccompares.safe_push
5519 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5520 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5522 cond_info
5523 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5524 1 + STMT_VINFO_REDUC_IDX
5525 (cond_info)));
5526 cond_info = vect_stmt_to_vectorize (cond_info);
5528 gcc_assert (ccompares.length () != 0);
5530 tree indx_before_incr, indx_after_incr;
5531 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5532 int scalar_precision
5533 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5534 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5535 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5536 (TYPE_MODE (vectype), cr_index_scalar_type,
5537 TYPE_VECTOR_SUBPARTS (vectype));
5539 /* First we create a simple vector induction variable which starts
5540 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5541 vector size (STEP). */
5543 /* Create a {1,2,3,...} vector. */
5544 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5546 /* Create a vector of the step value. */
5547 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5548 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5550 /* Create an induction variable. */
5551 gimple_stmt_iterator incr_gsi;
5552 bool insert_after;
5553 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5554 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5555 insert_after, &indx_before_incr, &indx_after_incr);
5557 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5558 filled with zeros (VEC_ZERO). */
5560 /* Create a vector of 0s. */
5561 tree zero = build_zero_cst (cr_index_scalar_type);
5562 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5564 /* Create a vector phi node. */
5565 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5566 new_phi = create_phi_node (new_phi_tree, loop->header);
5567 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5568 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5570 /* Now take the condition from the loops original cond_exprs
5571 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5572 every match uses values from the induction variable
5573 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5574 (NEW_PHI_TREE).
5575 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5576 the new cond_expr (INDEX_COND_EXPR). */
5577 gimple_seq stmts = NULL;
5578 for (int i = ccompares.length () - 1; i != -1; --i)
5580 tree ccompare = ccompares[i].first;
5581 if (ccompares[i].second)
5582 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5583 cr_index_vector_type,
5584 ccompare,
5585 indx_before_incr, new_phi_tree);
5586 else
5587 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5588 cr_index_vector_type,
5589 ccompare,
5590 new_phi_tree, indx_before_incr);
5592 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5594 /* Update the phi with the vec cond. */
5595 induction_index = new_phi_tree;
5596 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5597 loop_latch_edge (loop), UNKNOWN_LOCATION);
5600 /* 2. Create epilog code.
5601 The reduction epilog code operates across the elements of the vector
5602 of partial results computed by the vectorized loop.
5603 The reduction epilog code consists of:
5605 step 1: compute the scalar result in a vector (v_out2)
5606 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5607 step 3: adjust the scalar result (s_out3) if needed.
5609 Step 1 can be accomplished using one the following three schemes:
5610 (scheme 1) using reduc_fn, if available.
5611 (scheme 2) using whole-vector shifts, if available.
5612 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5613 combined.
5615 The overall epilog code looks like this:
5617 s_out0 = phi <s_loop> # original EXIT_PHI
5618 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5619 v_out2 = reduce <v_out1> # step 1
5620 s_out3 = extract_field <v_out2, 0> # step 2
5621 s_out4 = adjust_result <s_out3> # step 3
5623 (step 3 is optional, and steps 1 and 2 may be combined).
5624 Lastly, the uses of s_out0 are replaced by s_out4. */
5627 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5628 v_out1 = phi <VECT_DEF>
5629 Store them in NEW_PHIS. */
5630 if (double_reduc)
5631 loop = outer_loop;
5632 exit_bb = single_exit (loop)->dest;
5633 exit_gsi = gsi_after_labels (exit_bb);
5634 reduc_inputs.create (slp_node ? vec_num : ncopies);
5635 for (unsigned i = 0; i < vec_num; i++)
5637 gimple_seq stmts = NULL;
5638 if (slp_node)
5639 def = vect_get_slp_vect_def (slp_node, i);
5640 else
5641 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5642 for (j = 0; j < ncopies; j++)
5644 tree new_def = copy_ssa_name (def);
5645 phi = create_phi_node (new_def, exit_bb);
5646 if (j)
5647 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5648 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5649 new_def = gimple_convert (&stmts, vectype, new_def);
5650 reduc_inputs.quick_push (new_def);
5652 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5655 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5656 (i.e. when reduc_fn is not available) and in the final adjustment
5657 code (if needed). Also get the original scalar reduction variable as
5658 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5659 represents a reduction pattern), the tree-code and scalar-def are
5660 taken from the original stmt that the pattern-stmt (STMT) replaces.
5661 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5662 are taken from STMT. */
5664 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5665 if (orig_stmt_info != stmt_info)
5667 /* Reduction pattern */
5668 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5669 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5672 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5673 scalar_type = TREE_TYPE (scalar_dest);
5674 scalar_results.truncate (0);
5675 scalar_results.reserve_exact (group_size);
5676 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5677 bitsize = TYPE_SIZE (scalar_type);
5679 /* True if we should implement SLP_REDUC using native reduction operations
5680 instead of scalar operations. */
5681 direct_slp_reduc = (reduc_fn != IFN_LAST
5682 && slp_reduc
5683 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5685 /* In case of reduction chain, e.g.,
5686 # a1 = phi <a3, a0>
5687 a2 = operation (a1)
5688 a3 = operation (a2),
5690 we may end up with more than one vector result. Here we reduce them
5691 to one vector.
5693 The same is true for a SLP reduction, e.g.,
5694 # a1 = phi <a2, a0>
5695 # b1 = phi <b2, b0>
5696 a2 = operation (a1)
5697 b2 = operation (a2),
5699 where we can end up with more than one vector as well. We can
5700 easily accumulate vectors when the number of vector elements is
5701 a multiple of the SLP group size.
5703 The same is true if we couldn't use a single defuse cycle. */
5704 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5705 || direct_slp_reduc
5706 || (slp_reduc
5707 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
5708 || ncopies > 1)
5710 gimple_seq stmts = NULL;
5711 tree single_input = reduc_inputs[0];
5712 for (k = 1; k < reduc_inputs.length (); k++)
5713 single_input = gimple_build (&stmts, code, vectype,
5714 single_input, reduc_inputs[k]);
5715 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5717 reduc_inputs.truncate (0);
5718 reduc_inputs.safe_push (single_input);
5721 tree orig_reduc_input = reduc_inputs[0];
5723 /* If this loop is an epilogue loop that can be skipped after the
5724 main loop, we can only share a reduction operation between the
5725 main loop and the epilogue if we put it at the target of the
5726 skip edge.
5728 We can still reuse accumulators if this check fails. Doing so has
5729 the minor(?) benefit of making the epilogue loop's scalar result
5730 independent of the main loop's scalar result. */
5731 bool unify_with_main_loop_p = false;
5732 if (reduc_info->reused_accumulator
5733 && loop_vinfo->skip_this_loop_edge
5734 && single_succ_p (exit_bb)
5735 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5737 unify_with_main_loop_p = true;
5739 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5740 reduc_inputs[0] = make_ssa_name (vectype);
5741 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5742 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5743 UNKNOWN_LOCATION);
5744 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5745 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5746 exit_gsi = gsi_after_labels (reduc_block);
5749 /* Shouldn't be used beyond this point. */
5750 exit_bb = nullptr;
5752 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5753 && reduc_fn != IFN_LAST)
5755 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5756 various data values where the condition matched and another vector
5757 (INDUCTION_INDEX) containing all the indexes of those matches. We
5758 need to extract the last matching index (which will be the index with
5759 highest value) and use this to index into the data vector.
5760 For the case where there were no matches, the data vector will contain
5761 all default values and the index vector will be all zeros. */
5763 /* Get various versions of the type of the vector of indexes. */
5764 tree index_vec_type = TREE_TYPE (induction_index);
5765 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5766 tree index_scalar_type = TREE_TYPE (index_vec_type);
5767 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5769 /* Get an unsigned integer version of the type of the data vector. */
5770 int scalar_precision
5771 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5772 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5773 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5774 vectype);
5776 /* First we need to create a vector (ZERO_VEC) of zeros and another
5777 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5778 can create using a MAX reduction and then expanding.
5779 In the case where the loop never made any matches, the max index will
5780 be zero. */
5782 /* Vector of {0, 0, 0,...}. */
5783 tree zero_vec = build_zero_cst (vectype);
5785 /* Find maximum value from the vector of found indexes. */
5786 tree max_index = make_ssa_name (index_scalar_type);
5787 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5788 1, induction_index);
5789 gimple_call_set_lhs (max_index_stmt, max_index);
5790 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5792 /* Vector of {max_index, max_index, max_index,...}. */
5793 tree max_index_vec = make_ssa_name (index_vec_type);
5794 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5795 max_index);
5796 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5797 max_index_vec_rhs);
5798 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5800 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5801 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5802 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5803 otherwise. Only one value should match, resulting in a vector
5804 (VEC_COND) with one data value and the rest zeros.
5805 In the case where the loop never made any matches, every index will
5806 match, resulting in a vector with all data values (which will all be
5807 the default value). */
5809 /* Compare the max index vector to the vector of found indexes to find
5810 the position of the max value. */
5811 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5812 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5813 induction_index,
5814 max_index_vec);
5815 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5817 /* Use the compare to choose either values from the data vector or
5818 zero. */
5819 tree vec_cond = make_ssa_name (vectype);
5820 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5821 vec_compare,
5822 reduc_inputs[0],
5823 zero_vec);
5824 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5826 /* Finally we need to extract the data value from the vector (VEC_COND)
5827 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5828 reduction, but because this doesn't exist, we can use a MAX reduction
5829 instead. The data value might be signed or a float so we need to cast
5830 it first.
5831 In the case where the loop never made any matches, the data values are
5832 all identical, and so will reduce down correctly. */
5834 /* Make the matched data values unsigned. */
5835 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5836 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5837 vec_cond);
5838 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5839 VIEW_CONVERT_EXPR,
5840 vec_cond_cast_rhs);
5841 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5843 /* Reduce down to a scalar value. */
5844 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5845 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5846 1, vec_cond_cast);
5847 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5848 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5850 /* Convert the reduced value back to the result type and set as the
5851 result. */
5852 gimple_seq stmts = NULL;
5853 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5854 data_reduc);
5855 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5856 scalar_results.safe_push (new_temp);
5858 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5859 && reduc_fn == IFN_LAST)
5861 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5862 idx = 0;
5863 idx_val = induction_index[0];
5864 val = data_reduc[0];
5865 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5866 if (induction_index[i] > idx_val)
5867 val = data_reduc[i], idx_val = induction_index[i];
5868 return val; */
5870 tree data_eltype = TREE_TYPE (vectype);
5871 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5872 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5873 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5874 /* Enforced by vectorizable_reduction, which ensures we have target
5875 support before allowing a conditional reduction on variable-length
5876 vectors. */
5877 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5878 tree idx_val = NULL_TREE, val = NULL_TREE;
5879 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5881 tree old_idx_val = idx_val;
5882 tree old_val = val;
5883 idx_val = make_ssa_name (idx_eltype);
5884 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5885 build3 (BIT_FIELD_REF, idx_eltype,
5886 induction_index,
5887 bitsize_int (el_size),
5888 bitsize_int (off)));
5889 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5890 val = make_ssa_name (data_eltype);
5891 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5892 build3 (BIT_FIELD_REF,
5893 data_eltype,
5894 reduc_inputs[0],
5895 bitsize_int (el_size),
5896 bitsize_int (off)));
5897 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5898 if (off != 0)
5900 tree new_idx_val = idx_val;
5901 if (off != v_size - el_size)
5903 new_idx_val = make_ssa_name (idx_eltype);
5904 epilog_stmt = gimple_build_assign (new_idx_val,
5905 MAX_EXPR, idx_val,
5906 old_idx_val);
5907 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5909 tree cond = make_ssa_name (boolean_type_node);
5910 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
5911 idx_val, old_idx_val);
5912 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5913 tree new_val = make_ssa_name (data_eltype);
5914 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
5915 cond, val, old_val);
5916 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5917 idx_val = new_idx_val;
5918 val = new_val;
5921 /* Convert the reduced value back to the result type and set as the
5922 result. */
5923 gimple_seq stmts = NULL;
5924 val = gimple_convert (&stmts, scalar_type, val);
5925 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5926 scalar_results.safe_push (val);
5929 /* 2.3 Create the reduction code, using one of the three schemes described
5930 above. In SLP we simply need to extract all the elements from the
5931 vector (without reducing them), so we use scalar shifts. */
5932 else if (reduc_fn != IFN_LAST && !slp_reduc)
5934 tree tmp;
5935 tree vec_elem_type;
5937 /* Case 1: Create:
5938 v_out2 = reduc_expr <v_out1> */
5940 if (dump_enabled_p ())
5941 dump_printf_loc (MSG_NOTE, vect_location,
5942 "Reduce using direct vector reduction.\n");
5944 gimple_seq stmts = NULL;
5945 vec_elem_type = TREE_TYPE (vectype);
5946 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5947 vec_elem_type, reduc_inputs[0]);
5948 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5949 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5951 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5952 && induc_val)
5954 /* Earlier we set the initial value to be a vector if induc_val
5955 values. Check the result and if it is induc_val then replace
5956 with the original initial value, unless induc_val is
5957 the same as initial_def already. */
5958 tree zcompare = make_ssa_name (boolean_type_node);
5959 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
5960 new_temp, induc_val);
5961 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5962 tree initial_def = reduc_info->reduc_initial_values[0];
5963 tmp = make_ssa_name (new_scalar_dest);
5964 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5965 initial_def, new_temp);
5966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5967 new_temp = tmp;
5970 scalar_results.safe_push (new_temp);
5972 else if (direct_slp_reduc)
5974 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5975 with the elements for other SLP statements replaced with the
5976 neutral value. We can then do a normal reduction on each vector. */
5978 /* Enforced by vectorizable_reduction. */
5979 gcc_assert (reduc_inputs.length () == 1);
5980 gcc_assert (pow2p_hwi (group_size));
5982 gimple_seq seq = NULL;
5984 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5985 and the same element size as VECTYPE. */
5986 tree index = build_index_vector (vectype, 0, 1);
5987 tree index_type = TREE_TYPE (index);
5988 tree index_elt_type = TREE_TYPE (index_type);
5989 tree mask_type = truth_type_for (index_type);
5991 /* Create a vector that, for each element, identifies which of
5992 the REDUC_GROUP_SIZE results should use it. */
5993 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5994 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5995 build_vector_from_val (index_type, index_mask));
5997 /* Get a neutral vector value. This is simply a splat of the neutral
5998 scalar value if we have one, otherwise the initial scalar value
5999 is itself a neutral value. */
6000 tree vector_identity = NULL_TREE;
6001 tree neutral_op = NULL_TREE;
6002 if (slp_node)
6004 tree initial_value = NULL_TREE;
6005 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6006 initial_value = reduc_info->reduc_initial_values[0];
6007 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6008 initial_value);
6010 if (neutral_op)
6011 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6012 neutral_op);
6013 for (unsigned int i = 0; i < group_size; ++i)
6015 /* If there's no univeral neutral value, we can use the
6016 initial scalar value from the original PHI. This is used
6017 for MIN and MAX reduction, for example. */
6018 if (!neutral_op)
6020 tree scalar_value = reduc_info->reduc_initial_values[i];
6021 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6022 scalar_value);
6023 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6024 scalar_value);
6027 /* Calculate the equivalent of:
6029 sel[j] = (index[j] == i);
6031 which selects the elements of REDUC_INPUTS[0] that should
6032 be included in the result. */
6033 tree compare_val = build_int_cst (index_elt_type, i);
6034 compare_val = build_vector_from_val (index_type, compare_val);
6035 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6036 index, compare_val);
6038 /* Calculate the equivalent of:
6040 vec = seq ? reduc_inputs[0] : vector_identity;
6042 VEC is now suitable for a full vector reduction. */
6043 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6044 sel, reduc_inputs[0], vector_identity);
6046 /* Do the reduction and convert it to the appropriate type. */
6047 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6048 TREE_TYPE (vectype), vec);
6049 scalar = gimple_convert (&seq, scalar_type, scalar);
6050 scalar_results.safe_push (scalar);
6052 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6054 else
6056 bool reduce_with_shift;
6057 tree vec_temp;
6059 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6061 /* See if the target wants to do the final (shift) reduction
6062 in a vector mode of smaller size and first reduce upper/lower
6063 halves against each other. */
6064 enum machine_mode mode1 = mode;
6065 tree stype = TREE_TYPE (vectype);
6066 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6067 unsigned nunits1 = nunits;
6068 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6069 && reduc_inputs.length () == 1)
6071 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6072 /* For SLP reductions we have to make sure lanes match up, but
6073 since we're doing individual element final reduction reducing
6074 vector width here is even more important.
6075 ??? We can also separate lanes with permutes, for the common
6076 case of power-of-two group-size odd/even extracts would work. */
6077 if (slp_reduc && nunits != nunits1)
6079 nunits1 = least_common_multiple (nunits1, group_size);
6080 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6083 if (!slp_reduc
6084 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6085 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6087 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6088 stype, nunits1);
6089 reduce_with_shift = have_whole_vector_shift (mode1);
6090 if (!VECTOR_MODE_P (mode1)
6091 || !directly_supported_p (code, vectype1))
6092 reduce_with_shift = false;
6094 /* First reduce the vector to the desired vector size we should
6095 do shift reduction on by combining upper and lower halves. */
6096 gimple_seq stmts = NULL;
6097 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6098 code, &stmts);
6099 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6100 reduc_inputs[0] = new_temp;
6102 if (reduce_with_shift && !slp_reduc)
6104 int element_bitsize = tree_to_uhwi (bitsize);
6105 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6106 for variable-length vectors and also requires direct target support
6107 for loop reductions. */
6108 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6109 int nelements = vec_size_in_bits / element_bitsize;
6110 vec_perm_builder sel;
6111 vec_perm_indices indices;
6113 int elt_offset;
6115 tree zero_vec = build_zero_cst (vectype1);
6116 /* Case 2: Create:
6117 for (offset = nelements/2; offset >= 1; offset/=2)
6119 Create: va' = vec_shift <va, offset>
6120 Create: va = vop <va, va'>
6121 } */
6123 tree rhs;
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_NOTE, vect_location,
6127 "Reduce using vector shifts\n");
6129 gimple_seq stmts = NULL;
6130 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6131 for (elt_offset = nelements / 2;
6132 elt_offset >= 1;
6133 elt_offset /= 2)
6135 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6136 indices.new_vector (sel, 2, nelements);
6137 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6138 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6139 new_temp, zero_vec, mask);
6140 new_temp = gimple_build (&stmts, code,
6141 vectype1, new_name, new_temp);
6143 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6145 /* 2.4 Extract the final scalar result. Create:
6146 s_out3 = extract_field <v_out2, bitpos> */
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_NOTE, vect_location,
6150 "extract scalar result\n");
6152 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6153 bitsize, bitsize_zero_node);
6154 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6155 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6156 gimple_assign_set_lhs (epilog_stmt, new_temp);
6157 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6158 scalar_results.safe_push (new_temp);
6160 else
6162 /* Case 3: Create:
6163 s = extract_field <v_out2, 0>
6164 for (offset = element_size;
6165 offset < vector_size;
6166 offset += element_size;)
6168 Create: s' = extract_field <v_out2, offset>
6169 Create: s = op <s, s'> // For non SLP cases
6170 } */
6172 if (dump_enabled_p ())
6173 dump_printf_loc (MSG_NOTE, vect_location,
6174 "Reduce using scalar code.\n");
6176 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6177 int element_bitsize = tree_to_uhwi (bitsize);
6178 tree compute_type = TREE_TYPE (vectype);
6179 gimple_seq stmts = NULL;
6180 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6182 int bit_offset;
6183 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6184 vec_temp, bitsize, bitsize_zero_node);
6186 /* In SLP we don't need to apply reduction operation, so we just
6187 collect s' values in SCALAR_RESULTS. */
6188 if (slp_reduc)
6189 scalar_results.safe_push (new_temp);
6191 for (bit_offset = element_bitsize;
6192 bit_offset < vec_size_in_bits;
6193 bit_offset += element_bitsize)
6195 tree bitpos = bitsize_int (bit_offset);
6196 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6197 compute_type, vec_temp,
6198 bitsize, bitpos);
6199 if (slp_reduc)
6201 /* In SLP we don't need to apply reduction operation, so
6202 we just collect s' values in SCALAR_RESULTS. */
6203 new_temp = new_name;
6204 scalar_results.safe_push (new_name);
6206 else
6207 new_temp = gimple_build (&stmts, code, compute_type,
6208 new_name, new_temp);
6212 /* The only case where we need to reduce scalar results in SLP, is
6213 unrolling. If the size of SCALAR_RESULTS is greater than
6214 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6215 REDUC_GROUP_SIZE. */
6216 if (slp_reduc)
6218 tree res, first_res, new_res;
6220 /* Reduce multiple scalar results in case of SLP unrolling. */
6221 for (j = group_size; scalar_results.iterate (j, &res);
6222 j++)
6224 first_res = scalar_results[j % group_size];
6225 new_res = gimple_build (&stmts, code, compute_type,
6226 first_res, res);
6227 scalar_results[j % group_size] = new_res;
6229 scalar_results.truncate (group_size);
6230 for (k = 0; k < group_size; k++)
6231 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6232 scalar_results[k]);
6234 else
6236 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6237 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6238 scalar_results.safe_push (new_temp);
6241 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6244 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6245 && induc_val)
6247 /* Earlier we set the initial value to be a vector if induc_val
6248 values. Check the result and if it is induc_val then replace
6249 with the original initial value, unless induc_val is
6250 the same as initial_def already. */
6251 tree zcompare = make_ssa_name (boolean_type_node);
6252 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6253 induc_val);
6254 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6255 tree initial_def = reduc_info->reduc_initial_values[0];
6256 tree tmp = make_ssa_name (new_scalar_dest);
6257 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6258 initial_def, new_temp);
6259 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6260 scalar_results[0] = tmp;
6264 /* 2.5 Adjust the final result by the initial value of the reduction
6265 variable. (When such adjustment is not needed, then
6266 'adjustment_def' is zero). For example, if code is PLUS we create:
6267 new_temp = loop_exit_def + adjustment_def */
6269 if (adjustment_def)
6271 gcc_assert (!slp_reduc);
6272 gimple_seq stmts = NULL;
6273 if (double_reduc)
6275 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6276 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6277 new_temp = gimple_build (&stmts, code, vectype,
6278 reduc_inputs[0], adjustment_def);
6280 else
6282 new_temp = scalar_results[0];
6283 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6284 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
6285 new_temp = gimple_build (&stmts, code, scalar_type,
6286 new_temp, adjustment_def);
6289 epilog_stmt = gimple_seq_last_stmt (stmts);
6290 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6291 scalar_results[0] = new_temp;
6294 /* Record this operation if it could be reused by the epilogue loop. */
6295 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6296 && reduc_inputs.length () == 1)
6297 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6298 { orig_reduc_input, reduc_info });
6300 if (double_reduc)
6301 loop = outer_loop;
6303 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6304 phis with new adjusted scalar results, i.e., replace use <s_out0>
6305 with use <s_out4>.
6307 Transform:
6308 loop_exit:
6309 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6310 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6311 v_out2 = reduce <v_out1>
6312 s_out3 = extract_field <v_out2, 0>
6313 s_out4 = adjust_result <s_out3>
6314 use <s_out0>
6315 use <s_out0>
6317 into:
6319 loop_exit:
6320 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6321 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6322 v_out2 = reduce <v_out1>
6323 s_out3 = extract_field <v_out2, 0>
6324 s_out4 = adjust_result <s_out3>
6325 use <s_out4>
6326 use <s_out4> */
6328 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6329 for (k = 0; k < live_out_stmts.size (); k++)
6331 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6332 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6334 phis.create (3);
6335 /* Find the loop-closed-use at the loop exit of the original scalar
6336 result. (The reduction result is expected to have two immediate uses,
6337 one at the latch block, and one at the loop exit). For double
6338 reductions we are looking for exit phis of the outer loop. */
6339 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6341 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6343 if (!is_gimple_debug (USE_STMT (use_p)))
6344 phis.safe_push (USE_STMT (use_p));
6346 else
6348 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6350 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6352 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6354 if (!flow_bb_inside_loop_p (loop,
6355 gimple_bb (USE_STMT (phi_use_p)))
6356 && !is_gimple_debug (USE_STMT (phi_use_p)))
6357 phis.safe_push (USE_STMT (phi_use_p));
6363 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6365 /* Replace the uses: */
6366 orig_name = PHI_RESULT (exit_phi);
6368 /* Look for a single use at the target of the skip edge. */
6369 if (unify_with_main_loop_p)
6371 use_operand_p use_p;
6372 gimple *user;
6373 if (!single_imm_use (orig_name, &use_p, &user))
6374 gcc_unreachable ();
6375 orig_name = gimple_get_lhs (user);
6378 scalar_result = scalar_results[k];
6379 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6381 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6382 SET_USE (use_p, scalar_result);
6383 update_stmt (use_stmt);
6387 phis.release ();
6391 /* Return a vector of type VECTYPE that is equal to the vector select
6392 operation "MASK ? VEC : IDENTITY". Insert the select statements
6393 before GSI. */
6395 static tree
6396 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6397 tree vec, tree identity)
6399 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6400 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6401 mask, vec, identity);
6402 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6403 return cond;
6406 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6407 order, starting with LHS. Insert the extraction statements before GSI and
6408 associate the new scalar SSA names with variable SCALAR_DEST.
6409 Return the SSA name for the result. */
6411 static tree
6412 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6413 tree_code code, tree lhs, tree vector_rhs)
6415 tree vectype = TREE_TYPE (vector_rhs);
6416 tree scalar_type = TREE_TYPE (vectype);
6417 tree bitsize = TYPE_SIZE (scalar_type);
6418 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6419 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6421 for (unsigned HOST_WIDE_INT bit_offset = 0;
6422 bit_offset < vec_size_in_bits;
6423 bit_offset += element_bitsize)
6425 tree bitpos = bitsize_int (bit_offset);
6426 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6427 bitsize, bitpos);
6429 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6430 rhs = make_ssa_name (scalar_dest, stmt);
6431 gimple_assign_set_lhs (stmt, rhs);
6432 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6434 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6435 tree new_name = make_ssa_name (scalar_dest, stmt);
6436 gimple_assign_set_lhs (stmt, new_name);
6437 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6438 lhs = new_name;
6440 return lhs;
6443 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6444 type of the vector input. */
6446 static internal_fn
6447 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6449 internal_fn mask_reduc_fn;
6451 switch (reduc_fn)
6453 case IFN_FOLD_LEFT_PLUS:
6454 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6455 break;
6457 default:
6458 return IFN_LAST;
6461 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6462 OPTIMIZE_FOR_SPEED))
6463 return mask_reduc_fn;
6464 return IFN_LAST;
6467 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6468 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6469 statement. CODE is the operation performed by STMT_INFO and OPS are
6470 its scalar operands. REDUC_INDEX is the index of the operand in
6471 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6472 implements in-order reduction, or IFN_LAST if we should open-code it.
6473 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6474 that should be used to control the operation in a fully-masked loop. */
6476 static bool
6477 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6478 stmt_vec_info stmt_info,
6479 gimple_stmt_iterator *gsi,
6480 gimple **vec_stmt, slp_tree slp_node,
6481 gimple *reduc_def_stmt,
6482 tree_code code, internal_fn reduc_fn,
6483 tree ops[3], tree vectype_in,
6484 int reduc_index, vec_loop_masks *masks)
6486 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6487 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6488 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6490 int ncopies;
6491 if (slp_node)
6492 ncopies = 1;
6493 else
6494 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6496 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6497 gcc_assert (ncopies == 1);
6498 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6500 if (slp_node)
6501 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6502 TYPE_VECTOR_SUBPARTS (vectype_in)));
6504 tree op0 = ops[1 - reduc_index];
6506 int group_size = 1;
6507 stmt_vec_info scalar_dest_def_info;
6508 auto_vec<tree> vec_oprnds0;
6509 if (slp_node)
6511 auto_vec<vec<tree> > vec_defs (2);
6512 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6513 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6514 vec_defs[0].release ();
6515 vec_defs[1].release ();
6516 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6517 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6519 else
6521 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6522 op0, &vec_oprnds0);
6523 scalar_dest_def_info = stmt_info;
6526 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6527 tree scalar_type = TREE_TYPE (scalar_dest);
6528 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6530 int vec_num = vec_oprnds0.length ();
6531 gcc_assert (vec_num == 1 || slp_node);
6532 tree vec_elem_type = TREE_TYPE (vectype_out);
6533 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6535 tree vector_identity = NULL_TREE;
6536 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6537 vector_identity = build_zero_cst (vectype_out);
6539 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6540 int i;
6541 tree def0;
6542 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6544 gimple *new_stmt;
6545 tree mask = NULL_TREE;
6546 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6547 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6549 /* Handle MINUS by adding the negative. */
6550 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6552 tree negated = make_ssa_name (vectype_out);
6553 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6554 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6555 def0 = negated;
6558 if (mask && mask_reduc_fn == IFN_LAST)
6559 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6560 vector_identity);
6562 /* On the first iteration the input is simply the scalar phi
6563 result, and for subsequent iterations it is the output of
6564 the preceding operation. */
6565 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6567 if (mask && mask_reduc_fn != IFN_LAST)
6568 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6569 def0, mask);
6570 else
6571 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6572 def0);
6573 /* For chained SLP reductions the output of the previous reduction
6574 operation serves as the input of the next. For the final statement
6575 the output cannot be a temporary - we reuse the original
6576 scalar destination of the last statement. */
6577 if (i != vec_num - 1)
6579 gimple_set_lhs (new_stmt, scalar_dest_var);
6580 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6581 gimple_set_lhs (new_stmt, reduc_var);
6584 else
6586 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6587 reduc_var, def0);
6588 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6589 /* Remove the statement, so that we can use the same code paths
6590 as for statements that we've just created. */
6591 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6592 gsi_remove (&tmp_gsi, true);
6595 if (i == vec_num - 1)
6597 gimple_set_lhs (new_stmt, scalar_dest);
6598 vect_finish_replace_stmt (loop_vinfo,
6599 scalar_dest_def_info,
6600 new_stmt);
6602 else
6603 vect_finish_stmt_generation (loop_vinfo,
6604 scalar_dest_def_info,
6605 new_stmt, gsi);
6607 if (slp_node)
6608 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6609 else
6611 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6612 *vec_stmt = new_stmt;
6616 return true;
6619 /* Function is_nonwrapping_integer_induction.
6621 Check if STMT_VINO (which is part of loop LOOP) both increments and
6622 does not cause overflow. */
6624 static bool
6625 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6627 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6628 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6629 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6630 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6631 widest_int ni, max_loop_value, lhs_max;
6632 wi::overflow_type overflow = wi::OVF_NONE;
6634 /* Make sure the loop is integer based. */
6635 if (TREE_CODE (base) != INTEGER_CST
6636 || TREE_CODE (step) != INTEGER_CST)
6637 return false;
6639 /* Check that the max size of the loop will not wrap. */
6641 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6642 return true;
6644 if (! max_stmt_executions (loop, &ni))
6645 return false;
6647 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6648 &overflow);
6649 if (overflow)
6650 return false;
6652 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6653 TYPE_SIGN (lhs_type), &overflow);
6654 if (overflow)
6655 return false;
6657 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6658 <= TYPE_PRECISION (lhs_type));
6661 /* Check if masking can be supported by inserting a conditional expression.
6662 CODE is the code for the operation. COND_FN is the conditional internal
6663 function, if it exists. VECTYPE_IN is the type of the vector input. */
6664 static bool
6665 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
6666 tree vectype_in)
6668 if (cond_fn != IFN_LAST
6669 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6670 OPTIMIZE_FOR_SPEED))
6671 return false;
6673 if (code.is_tree_code ())
6674 switch (tree_code (code))
6676 case DOT_PROD_EXPR:
6677 case SAD_EXPR:
6678 return true;
6680 default:
6681 break;
6683 return false;
6686 /* Insert a conditional expression to enable masked vectorization. CODE is the
6687 code for the operation. VOP is the array of operands. MASK is the loop
6688 mask. GSI is a statement iterator used to place the new conditional
6689 expression. */
6690 static void
6691 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
6692 gimple_stmt_iterator *gsi)
6694 switch (tree_code (code))
6696 case DOT_PROD_EXPR:
6698 tree vectype = TREE_TYPE (vop[1]);
6699 tree zero = build_zero_cst (vectype);
6700 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6701 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6702 mask, vop[1], zero);
6703 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6704 vop[1] = masked_op1;
6705 break;
6708 case SAD_EXPR:
6710 tree vectype = TREE_TYPE (vop[1]);
6711 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6712 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6713 mask, vop[1], vop[0]);
6714 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6715 vop[1] = masked_op1;
6716 break;
6719 default:
6720 gcc_unreachable ();
6724 /* Function vectorizable_reduction.
6726 Check if STMT_INFO performs a reduction operation that can be vectorized.
6727 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6728 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6729 Return true if STMT_INFO is vectorizable in this way.
6731 This function also handles reduction idioms (patterns) that have been
6732 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6733 may be of this form:
6734 X = pattern_expr (arg0, arg1, ..., X)
6735 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6736 sequence that had been detected and replaced by the pattern-stmt
6737 (STMT_INFO).
6739 This function also handles reduction of condition expressions, for example:
6740 for (int i = 0; i < N; i++)
6741 if (a[i] < value)
6742 last = a[i];
6743 This is handled by vectorising the loop and creating an additional vector
6744 containing the loop indexes for which "a[i] < value" was true. In the
6745 function epilogue this is reduced to a single max value and then used to
6746 index into the vector of results.
6748 In some cases of reduction patterns, the type of the reduction variable X is
6749 different than the type of the other arguments of STMT_INFO.
6750 In such cases, the vectype that is used when transforming STMT_INFO into
6751 a vector stmt is different than the vectype that is used to determine the
6752 vectorization factor, because it consists of a different number of elements
6753 than the actual number of elements that are being operated upon in parallel.
6755 For example, consider an accumulation of shorts into an int accumulator.
6756 On some targets it's possible to vectorize this pattern operating on 8
6757 shorts at a time (hence, the vectype for purposes of determining the
6758 vectorization factor should be V8HI); on the other hand, the vectype that
6759 is used to create the vector form is actually V4SI (the type of the result).
6761 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6762 indicates what is the actual level of parallelism (V8HI in the example), so
6763 that the right vectorization factor would be derived. This vectype
6764 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6765 be used to create the vectorized stmt. The right vectype for the vectorized
6766 stmt is obtained from the type of the result X:
6767 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6769 This means that, contrary to "regular" reductions (or "regular" stmts in
6770 general), the following equation:
6771 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6772 does *NOT* necessarily hold for reduction patterns. */
6774 bool
6775 vectorizable_reduction (loop_vec_info loop_vinfo,
6776 stmt_vec_info stmt_info, slp_tree slp_node,
6777 slp_instance slp_node_instance,
6778 stmt_vector_for_cost *cost_vec)
6780 tree vectype_in = NULL_TREE;
6781 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6782 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6783 stmt_vec_info cond_stmt_vinfo = NULL;
6784 int i;
6785 int ncopies;
6786 bool single_defuse_cycle = false;
6787 bool nested_cycle = false;
6788 bool double_reduc = false;
6789 int vec_num;
6790 tree tem;
6791 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6792 tree cond_reduc_val = NULL_TREE;
6794 /* Make sure it was already recognized as a reduction computation. */
6795 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6796 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6797 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6798 return false;
6800 /* The stmt we store reduction analysis meta on. */
6801 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6802 reduc_info->is_reduc_info = true;
6804 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6806 if (is_a <gphi *> (stmt_info->stmt))
6808 if (slp_node)
6810 /* We eventually need to set a vector type on invariant
6811 arguments. */
6812 unsigned j;
6813 slp_tree child;
6814 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6815 if (!vect_maybe_update_slp_op_vectype
6816 (child, SLP_TREE_VECTYPE (slp_node)))
6818 if (dump_enabled_p ())
6819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6820 "incompatible vector types for "
6821 "invariants\n");
6822 return false;
6825 /* Analysis for double-reduction is done on the outer
6826 loop PHI, nested cycles have no further restrictions. */
6827 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6829 else
6830 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6831 return true;
6834 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6835 stmt_vec_info phi_info = stmt_info;
6836 if (!is_a <gphi *> (stmt_info->stmt))
6838 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6839 return true;
6841 if (slp_node)
6843 slp_node_instance->reduc_phis = slp_node;
6844 /* ??? We're leaving slp_node to point to the PHIs, we only
6845 need it to get at the number of vector stmts which wasn't
6846 yet initialized for the instance root. */
6848 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6850 use_operand_p use_p;
6851 gimple *use_stmt;
6852 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6853 &use_p, &use_stmt);
6854 gcc_assert (res);
6855 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6858 /* PHIs should not participate in patterns. */
6859 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6860 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6862 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6863 and compute the reduction chain length. Discover the real
6864 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6865 tree reduc_def
6866 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6867 loop_latch_edge
6868 (gimple_bb (reduc_def_phi)->loop_father));
6869 unsigned reduc_chain_length = 0;
6870 bool only_slp_reduc_chain = true;
6871 stmt_info = NULL;
6872 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6873 while (reduc_def != PHI_RESULT (reduc_def_phi))
6875 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6876 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6877 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "reduction chain broken by patterns.\n");
6882 return false;
6884 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6885 only_slp_reduc_chain = false;
6886 /* For epilogue generation live members of the chain need
6887 to point back to the PHI via their original stmt for
6888 info_for_reduction to work. For SLP we need to look at
6889 all lanes here - even though we only will vectorize from
6890 the SLP node with live lane zero the other live lanes also
6891 need to be identified as part of a reduction to be able
6892 to skip code generation for them. */
6893 if (slp_for_stmt_info)
6895 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
6896 if (STMT_VINFO_LIVE_P (s))
6897 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
6899 else if (STMT_VINFO_LIVE_P (vdef))
6900 STMT_VINFO_REDUC_DEF (def) = phi_info;
6901 gimple_match_op op;
6902 if (!gimple_extract_op (vdef->stmt, &op))
6904 if (dump_enabled_p ())
6905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6906 "reduction chain includes unsupported"
6907 " statement type.\n");
6908 return false;
6910 if (CONVERT_EXPR_CODE_P (op.code))
6912 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
6914 if (dump_enabled_p ())
6915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6916 "conversion in the reduction chain.\n");
6917 return false;
6920 else if (!stmt_info)
6921 /* First non-conversion stmt. */
6922 stmt_info = vdef;
6923 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
6924 reduc_chain_length++;
6925 if (!stmt_info && slp_node)
6926 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6928 /* PHIs should not participate in patterns. */
6929 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6931 if (nested_in_vect_loop_p (loop, stmt_info))
6933 loop = loop->inner;
6934 nested_cycle = true;
6937 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6938 element. */
6939 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6941 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6942 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6944 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6945 gcc_assert (slp_node
6946 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6948 /* 1. Is vectorizable reduction? */
6949 /* Not supportable if the reduction variable is used in the loop, unless
6950 it's a reduction chain. */
6951 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6952 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6953 return false;
6955 /* Reductions that are not used even in an enclosing outer-loop,
6956 are expected to be "live" (used out of the loop). */
6957 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6958 && !STMT_VINFO_LIVE_P (stmt_info))
6959 return false;
6961 /* 2. Has this been recognized as a reduction pattern?
6963 Check if STMT represents a pattern that has been recognized
6964 in earlier analysis stages. For stmts that represent a pattern,
6965 the STMT_VINFO_RELATED_STMT field records the last stmt in
6966 the original sequence that constitutes the pattern. */
6968 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6969 if (orig_stmt_info)
6971 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6972 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6975 /* 3. Check the operands of the operation. The first operands are defined
6976 inside the loop body. The last operand is the reduction variable,
6977 which is defined by the loop-header-phi. */
6979 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6980 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6981 gimple_match_op op;
6982 if (!gimple_extract_op (stmt_info->stmt, &op))
6983 gcc_unreachable ();
6984 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
6985 || op.code == WIDEN_SUM_EXPR
6986 || op.code == SAD_EXPR);
6988 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
6989 && !SCALAR_FLOAT_TYPE_P (op.type))
6990 return false;
6992 /* Do not try to vectorize bit-precision reductions. */
6993 if (!type_has_mode_precision_p (op.type))
6994 return false;
6996 /* For lane-reducing ops we're reducing the number of reduction PHIs
6997 which means the only use of that may be in the lane-reducing operation. */
6998 if (lane_reduc_code_p
6999 && reduc_chain_length != 1
7000 && !only_slp_reduc_chain)
7002 if (dump_enabled_p ())
7003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7004 "lane-reducing reduction with extra stmts.\n");
7005 return false;
7008 /* All uses but the last are expected to be defined in the loop.
7009 The last use is the reduction variable. In case of nested cycle this
7010 assumption is not true: we use reduc_index to record the index of the
7011 reduction variable. */
7012 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7013 /* We need to skip an extra operand for COND_EXPRs with embedded
7014 comparison. */
7015 unsigned opno_adjust = 0;
7016 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7017 opno_adjust = 1;
7018 for (i = 0; i < (int) op.num_ops; i++)
7020 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7021 if (i == 0 && op.code == COND_EXPR)
7022 continue;
7024 stmt_vec_info def_stmt_info;
7025 enum vect_def_type dt;
7026 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7027 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7028 &tem, &def_stmt_info))
7030 if (dump_enabled_p ())
7031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7032 "use not simple.\n");
7033 return false;
7035 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7036 continue;
7038 /* There should be only one cycle def in the stmt, the one
7039 leading to reduc_def. */
7040 if (VECTORIZABLE_CYCLE_DEF (dt))
7041 return false;
7043 /* To properly compute ncopies we are interested in the widest
7044 non-reduction input type in case we're looking at a widening
7045 accumulation that we later handle in vect_transform_reduction. */
7046 if (lane_reduc_code_p
7047 && tem
7048 && (!vectype_in
7049 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7050 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
7051 vectype_in = tem;
7053 if (op.code == COND_EXPR)
7055 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7056 if (dt == vect_constant_def)
7058 cond_reduc_dt = dt;
7059 cond_reduc_val = op.ops[i];
7061 if (dt == vect_induction_def
7062 && def_stmt_info
7063 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7065 cond_reduc_dt = dt;
7066 cond_stmt_vinfo = def_stmt_info;
7070 if (!vectype_in)
7071 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7072 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7074 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7075 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7076 /* If we have a condition reduction, see if we can simplify it further. */
7077 if (v_reduc_type == COND_REDUCTION)
7079 if (slp_node)
7080 return false;
7082 /* When the condition uses the reduction value in the condition, fail. */
7083 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7085 if (dump_enabled_p ())
7086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7087 "condition depends on previous iteration\n");
7088 return false;
7091 if (reduc_chain_length == 1
7092 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7093 vectype_in, OPTIMIZE_FOR_SPEED))
7095 if (dump_enabled_p ())
7096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097 "optimizing condition reduction with"
7098 " FOLD_EXTRACT_LAST.\n");
7099 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7101 else if (cond_reduc_dt == vect_induction_def)
7103 tree base
7104 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7105 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7107 gcc_assert (TREE_CODE (base) == INTEGER_CST
7108 && TREE_CODE (step) == INTEGER_CST);
7109 cond_reduc_val = NULL_TREE;
7110 enum tree_code cond_reduc_op_code = ERROR_MARK;
7111 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7112 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7114 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7115 above base; punt if base is the minimum value of the type for
7116 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7117 else if (tree_int_cst_sgn (step) == -1)
7119 cond_reduc_op_code = MIN_EXPR;
7120 if (tree_int_cst_sgn (base) == -1)
7121 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7122 else if (tree_int_cst_lt (base,
7123 TYPE_MAX_VALUE (TREE_TYPE (base))))
7124 cond_reduc_val
7125 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7127 else
7129 cond_reduc_op_code = MAX_EXPR;
7130 if (tree_int_cst_sgn (base) == 1)
7131 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7132 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7133 base))
7134 cond_reduc_val
7135 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7137 if (cond_reduc_val)
7139 if (dump_enabled_p ())
7140 dump_printf_loc (MSG_NOTE, vect_location,
7141 "condition expression based on "
7142 "integer induction.\n");
7143 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7144 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7145 = cond_reduc_val;
7146 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7149 else if (cond_reduc_dt == vect_constant_def)
7151 enum vect_def_type cond_initial_dt;
7152 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7153 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7154 if (cond_initial_dt == vect_constant_def
7155 && types_compatible_p (TREE_TYPE (cond_initial_val),
7156 TREE_TYPE (cond_reduc_val)))
7158 tree e = fold_binary (LE_EXPR, boolean_type_node,
7159 cond_initial_val, cond_reduc_val);
7160 if (e && (integer_onep (e) || integer_zerop (e)))
7162 if (dump_enabled_p ())
7163 dump_printf_loc (MSG_NOTE, vect_location,
7164 "condition expression based on "
7165 "compile time constant.\n");
7166 /* Record reduction code at analysis stage. */
7167 STMT_VINFO_REDUC_CODE (reduc_info)
7168 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7169 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7175 if (STMT_VINFO_LIVE_P (phi_info))
7176 return false;
7178 if (slp_node)
7179 ncopies = 1;
7180 else
7181 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7183 gcc_assert (ncopies >= 1);
7185 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7187 if (nested_cycle)
7189 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7190 == vect_double_reduction_def);
7191 double_reduc = true;
7194 /* 4.2. Check support for the epilog operation.
7196 If STMT represents a reduction pattern, then the type of the
7197 reduction variable may be different than the type of the rest
7198 of the arguments. For example, consider the case of accumulation
7199 of shorts into an int accumulator; The original code:
7200 S1: int_a = (int) short_a;
7201 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7203 was replaced with:
7204 STMT: int_acc = widen_sum <short_a, int_acc>
7206 This means that:
7207 1. The tree-code that is used to create the vector operation in the
7208 epilog code (that reduces the partial results) is not the
7209 tree-code of STMT, but is rather the tree-code of the original
7210 stmt from the pattern that STMT is replacing. I.e, in the example
7211 above we want to use 'widen_sum' in the loop, but 'plus' in the
7212 epilog.
7213 2. The type (mode) we use to check available target support
7214 for the vector operation to be created in the *epilog*, is
7215 determined by the type of the reduction variable (in the example
7216 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7217 However the type (mode) we use to check available target support
7218 for the vector operation to be created *inside the loop*, is
7219 determined by the type of the other arguments to STMT (in the
7220 example we'd check this: optab_handler (widen_sum_optab,
7221 vect_short_mode)).
7223 This is contrary to "regular" reductions, in which the types of all
7224 the arguments are the same as the type of the reduction variable.
7225 For "regular" reductions we can therefore use the same vector type
7226 (and also the same tree-code) when generating the epilog code and
7227 when generating the code inside the loop. */
7229 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7230 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7232 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7233 if (reduction_type == TREE_CODE_REDUCTION)
7235 /* Check whether it's ok to change the order of the computation.
7236 Generally, when vectorizing a reduction we change the order of the
7237 computation. This may change the behavior of the program in some
7238 cases, so we need to check that this is ok. One exception is when
7239 vectorizing an outer-loop: the inner-loop is executed sequentially,
7240 and therefore vectorizing reductions in the inner-loop during
7241 outer-loop vectorization is safe. Likewise when we are vectorizing
7242 a series of reductions using SLP and the VF is one the reductions
7243 are performed in scalar order. */
7244 if (slp_node
7245 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7246 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7248 else if (needs_fold_left_reduction_p (op.type, orig_code))
7250 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7251 is not directy used in stmt. */
7252 if (!only_slp_reduc_chain
7253 && reduc_chain_length != 1)
7255 if (dump_enabled_p ())
7256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7257 "in-order reduction chain without SLP.\n");
7258 return false;
7260 STMT_VINFO_REDUC_TYPE (reduc_info)
7261 = reduction_type = FOLD_LEFT_REDUCTION;
7263 else if (!commutative_binary_op_p (orig_code, op.type)
7264 || !associative_binary_op_p (orig_code, op.type))
7266 if (dump_enabled_p ())
7267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7268 "reduction: not commutative/associative");
7269 return false;
7273 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7274 && ncopies > 1)
7276 if (dump_enabled_p ())
7277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7278 "multiple types in double reduction or condition "
7279 "reduction or fold-left reduction.\n");
7280 return false;
7283 internal_fn reduc_fn = IFN_LAST;
7284 if (reduction_type == TREE_CODE_REDUCTION
7285 || reduction_type == FOLD_LEFT_REDUCTION
7286 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7287 || reduction_type == CONST_COND_REDUCTION)
7289 if (reduction_type == FOLD_LEFT_REDUCTION
7290 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7291 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7293 if (reduc_fn != IFN_LAST
7294 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7295 OPTIMIZE_FOR_SPEED))
7297 if (dump_enabled_p ())
7298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7299 "reduc op not supported by target.\n");
7301 reduc_fn = IFN_LAST;
7304 else
7306 if (!nested_cycle || double_reduc)
7308 if (dump_enabled_p ())
7309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7310 "no reduc code for scalar code.\n");
7312 return false;
7316 else if (reduction_type == COND_REDUCTION)
7318 int scalar_precision
7319 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7320 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7321 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7322 vectype_out);
7324 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7325 OPTIMIZE_FOR_SPEED))
7326 reduc_fn = IFN_REDUC_MAX;
7328 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7330 if (reduction_type != EXTRACT_LAST_REDUCTION
7331 && (!nested_cycle || double_reduc)
7332 && reduc_fn == IFN_LAST
7333 && !nunits_out.is_constant ())
7335 if (dump_enabled_p ())
7336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7337 "missing target support for reduction on"
7338 " variable-length vectors.\n");
7339 return false;
7342 /* For SLP reductions, see if there is a neutral value we can use. */
7343 tree neutral_op = NULL_TREE;
7344 if (slp_node)
7346 tree initial_value = NULL_TREE;
7347 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7348 initial_value = vect_phi_initial_value (reduc_def_phi);
7349 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7350 orig_code, initial_value);
7353 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7355 /* We can't support in-order reductions of code such as this:
7357 for (int i = 0; i < n1; ++i)
7358 for (int j = 0; j < n2; ++j)
7359 l += a[j];
7361 since GCC effectively transforms the loop when vectorizing:
7363 for (int i = 0; i < n1 / VF; ++i)
7364 for (int j = 0; j < n2; ++j)
7365 for (int k = 0; k < VF; ++k)
7366 l += a[j];
7368 which is a reassociation of the original operation. */
7369 if (dump_enabled_p ())
7370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371 "in-order double reduction not supported.\n");
7373 return false;
7376 if (reduction_type == FOLD_LEFT_REDUCTION
7377 && slp_node
7378 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7380 /* We cannot use in-order reductions in this case because there is
7381 an implicit reassociation of the operations involved. */
7382 if (dump_enabled_p ())
7383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7384 "in-order unchained SLP reductions not supported.\n");
7385 return false;
7388 /* For double reductions, and for SLP reductions with a neutral value,
7389 we construct a variable-length initial vector by loading a vector
7390 full of the neutral value and then shift-and-inserting the start
7391 values into the low-numbered elements. */
7392 if ((double_reduc || neutral_op)
7393 && !nunits_out.is_constant ()
7394 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7395 vectype_out, OPTIMIZE_FOR_SPEED))
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7399 "reduction on variable-length vectors requires"
7400 " target support for a vector-shift-and-insert"
7401 " operation.\n");
7402 return false;
7405 /* Check extra constraints for variable-length unchained SLP reductions. */
7406 if (STMT_SLP_TYPE (stmt_info)
7407 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7408 && !nunits_out.is_constant ())
7410 /* We checked above that we could build the initial vector when
7411 there's a neutral element value. Check here for the case in
7412 which each SLP statement has its own initial value and in which
7413 that value needs to be repeated for every instance of the
7414 statement within the initial vector. */
7415 unsigned int group_size = SLP_TREE_LANES (slp_node);
7416 if (!neutral_op
7417 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7418 TREE_TYPE (vectype_out)))
7420 if (dump_enabled_p ())
7421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7422 "unsupported form of SLP reduction for"
7423 " variable-length vectors: cannot build"
7424 " initial vector.\n");
7425 return false;
7427 /* The epilogue code relies on the number of elements being a multiple
7428 of the group size. The duplicate-and-interleave approach to setting
7429 up the initial vector does too. */
7430 if (!multiple_p (nunits_out, group_size))
7432 if (dump_enabled_p ())
7433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7434 "unsupported form of SLP reduction for"
7435 " variable-length vectors: the vector size"
7436 " is not a multiple of the number of results.\n");
7437 return false;
7441 if (reduction_type == COND_REDUCTION)
7443 widest_int ni;
7445 if (! max_loop_iterations (loop, &ni))
7447 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_NOTE, vect_location,
7449 "loop count not known, cannot create cond "
7450 "reduction.\n");
7451 return false;
7453 /* Convert backedges to iterations. */
7454 ni += 1;
7456 /* The additional index will be the same type as the condition. Check
7457 that the loop can fit into this less one (because we'll use up the
7458 zero slot for when there are no matches). */
7459 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7460 if (wi::geu_p (ni, wi::to_widest (max_index)))
7462 if (dump_enabled_p ())
7463 dump_printf_loc (MSG_NOTE, vect_location,
7464 "loop size is greater than data size.\n");
7465 return false;
7469 /* In case the vectorization factor (VF) is bigger than the number
7470 of elements that we can fit in a vectype (nunits), we have to generate
7471 more than one vector stmt - i.e - we need to "unroll" the
7472 vector stmt by a factor VF/nunits. For more details see documentation
7473 in vectorizable_operation. */
7475 /* If the reduction is used in an outer loop we need to generate
7476 VF intermediate results, like so (e.g. for ncopies=2):
7477 r0 = phi (init, r0)
7478 r1 = phi (init, r1)
7479 r0 = x0 + r0;
7480 r1 = x1 + r1;
7481 (i.e. we generate VF results in 2 registers).
7482 In this case we have a separate def-use cycle for each copy, and therefore
7483 for each copy we get the vector def for the reduction variable from the
7484 respective phi node created for this copy.
7486 Otherwise (the reduction is unused in the loop nest), we can combine
7487 together intermediate results, like so (e.g. for ncopies=2):
7488 r = phi (init, r)
7489 r = x0 + r;
7490 r = x1 + r;
7491 (i.e. we generate VF/2 results in a single register).
7492 In this case for each copy we get the vector def for the reduction variable
7493 from the vectorized reduction operation generated in the previous iteration.
7495 This only works when we see both the reduction PHI and its only consumer
7496 in vectorizable_reduction and there are no intermediate stmts
7497 participating. When unrolling we want each unrolled iteration to have its
7498 own reduction accumulator since one of the main goals of unrolling a
7499 reduction is to reduce the aggregate loop-carried latency. */
7500 if (ncopies > 1
7501 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7502 && reduc_chain_length == 1
7503 && loop_vinfo->suggested_unroll_factor == 1)
7504 single_defuse_cycle = true;
7506 if (single_defuse_cycle || lane_reduc_code_p)
7508 gcc_assert (op.code != COND_EXPR);
7510 /* 4. Supportable by target? */
7511 bool ok = true;
7513 /* 4.1. check support for the operation in the loop
7515 This isn't necessary for the lane reduction codes, since they
7516 can only be produced by pattern matching, and it's up to the
7517 pattern matcher to test for support. The main reason for
7518 specifically skipping this step is to avoid rechecking whether
7519 mixed-sign dot-products can be implemented using signed
7520 dot-products. */
7521 machine_mode vec_mode = TYPE_MODE (vectype_in);
7522 if (!lane_reduc_code_p
7523 && !directly_supported_p (op.code, vectype_in, optab_vector))
7525 if (dump_enabled_p ())
7526 dump_printf (MSG_NOTE, "op not supported by target.\n");
7527 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7528 || !vect_can_vectorize_without_simd_p (op.code))
7529 ok = false;
7530 else
7531 if (dump_enabled_p ())
7532 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7535 if (vect_emulated_vector_p (vectype_in)
7536 && !vect_can_vectorize_without_simd_p (op.code))
7538 if (dump_enabled_p ())
7539 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7540 return false;
7543 /* lane-reducing operations have to go through vect_transform_reduction.
7544 For the other cases try without the single cycle optimization. */
7545 if (!ok)
7547 if (lane_reduc_code_p)
7548 return false;
7549 else
7550 single_defuse_cycle = false;
7553 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7555 /* If the reduction stmt is one of the patterns that have lane
7556 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7557 if ((ncopies > 1 && ! single_defuse_cycle)
7558 && lane_reduc_code_p)
7560 if (dump_enabled_p ())
7561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7562 "multi def-use cycle not possible for lane-reducing "
7563 "reduction operation\n");
7564 return false;
7567 if (slp_node
7568 && !(!single_defuse_cycle
7569 && !lane_reduc_code_p
7570 && reduction_type != FOLD_LEFT_REDUCTION))
7571 for (i = 0; i < (int) op.num_ops; i++)
7572 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 "incompatible vector types for invariants\n");
7577 return false;
7580 if (slp_node)
7581 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7582 else
7583 vec_num = 1;
7585 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7586 reduction_type, ncopies, cost_vec);
7587 /* Cost the reduction op inside the loop if transformed via
7588 vect_transform_reduction. Otherwise this is costed by the
7589 separate vectorizable_* routines. */
7590 if (single_defuse_cycle || lane_reduc_code_p)
7592 int factor = 1;
7593 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7594 /* Three dot-products and a subtraction. */
7595 factor = 4;
7596 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7597 stmt_info, 0, vect_body);
7600 if (dump_enabled_p ()
7601 && reduction_type == FOLD_LEFT_REDUCTION)
7602 dump_printf_loc (MSG_NOTE, vect_location,
7603 "using an in-order (fold-left) reduction.\n");
7604 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7605 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7606 reductions go through their own vectorizable_* routines. */
7607 if (!single_defuse_cycle
7608 && !lane_reduc_code_p
7609 && reduction_type != FOLD_LEFT_REDUCTION)
7611 stmt_vec_info tem
7612 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7613 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7615 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7616 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7618 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7619 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7621 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7623 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7624 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7626 if (reduction_type != FOLD_LEFT_REDUCTION
7627 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7628 && (cond_fn == IFN_LAST
7629 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7630 OPTIMIZE_FOR_SPEED)))
7632 if (dump_enabled_p ())
7633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7634 "can't operate on partial vectors because"
7635 " no conditional operation is available.\n");
7636 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7638 else if (reduction_type == FOLD_LEFT_REDUCTION
7639 && reduc_fn == IFN_LAST
7640 && !expand_vec_cond_expr_p (vectype_in,
7641 truth_type_for (vectype_in),
7642 SSA_NAME))
7644 if (dump_enabled_p ())
7645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646 "can't operate on partial vectors because"
7647 " no conditional operation is available.\n");
7648 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7650 else
7651 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7652 vectype_in, NULL);
7654 return true;
7657 /* STMT_INFO is a dot-product reduction whose multiplication operands
7658 have different signs. Emit a sequence to emulate the operation
7659 using a series of signed DOT_PROD_EXPRs and return the last
7660 statement generated. VEC_DEST is the result of the vector operation
7661 and VOP lists its inputs. */
7663 static gassign *
7664 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7665 gimple_stmt_iterator *gsi, tree vec_dest,
7666 tree vop[3])
7668 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
7669 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
7670 tree narrow_elttype = TREE_TYPE (narrow_vectype);
7671 gimple *new_stmt;
7673 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
7674 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
7675 std::swap (vop[0], vop[1]);
7677 /* Convert all inputs to signed types. */
7678 for (int i = 0; i < 3; ++i)
7679 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
7681 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
7682 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
7683 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7684 vop[i] = tmp;
7687 /* In the comments below we assume 8-bit inputs for simplicity,
7688 but the approach works for any full integer type. */
7690 /* Create a vector of -128. */
7691 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
7692 tree min_narrow = build_vector_from_val (narrow_vectype,
7693 min_narrow_elttype);
7695 /* Create a vector of 64. */
7696 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
7697 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
7698 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
7700 /* Emit: SUB_RES = VOP[0] - 128. */
7701 tree sub_res = make_ssa_name (narrow_vectype);
7702 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
7703 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7705 /* Emit:
7707 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
7708 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
7709 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
7711 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
7712 Doing the two 64 * y steps first allows more time to compute x. */
7713 tree stage1 = make_ssa_name (wide_vectype);
7714 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
7715 vop[1], half_narrow, vop[2]);
7716 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7718 tree stage2 = make_ssa_name (wide_vectype);
7719 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
7720 vop[1], half_narrow, stage1);
7721 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7723 tree stage3 = make_ssa_name (wide_vectype);
7724 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
7725 sub_res, vop[1], stage2);
7726 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7728 /* Convert STAGE3 to the reduction type. */
7729 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
7732 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7733 value. */
7735 bool
7736 vect_transform_reduction (loop_vec_info loop_vinfo,
7737 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7738 gimple **vec_stmt, slp_tree slp_node)
7740 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7741 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7742 int i;
7743 int ncopies;
7744 int vec_num;
7746 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7747 gcc_assert (reduc_info->is_reduc_info);
7749 if (nested_in_vect_loop_p (loop, stmt_info))
7751 loop = loop->inner;
7752 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7755 gimple_match_op op;
7756 if (!gimple_extract_op (stmt_info->stmt, &op))
7757 gcc_unreachable ();
7758 gcc_assert (op.code.is_tree_code ());
7759 auto code = tree_code (op.code);
7761 /* All uses but the last are expected to be defined in the loop.
7762 The last use is the reduction variable. In case of nested cycle this
7763 assumption is not true: we use reduc_index to record the index of the
7764 reduction variable. */
7765 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7766 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7767 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7768 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7770 if (slp_node)
7772 ncopies = 1;
7773 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7775 else
7777 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7778 vec_num = 1;
7781 internal_fn cond_fn = get_conditional_internal_fn (code);
7782 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7783 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7785 /* Transform. */
7786 tree new_temp = NULL_TREE;
7787 auto_vec<tree> vec_oprnds0;
7788 auto_vec<tree> vec_oprnds1;
7789 auto_vec<tree> vec_oprnds2;
7790 tree def0;
7792 if (dump_enabled_p ())
7793 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7795 /* FORNOW: Multiple types are not supported for condition. */
7796 if (code == COND_EXPR)
7797 gcc_assert (ncopies == 1);
7799 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7801 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7802 if (reduction_type == FOLD_LEFT_REDUCTION)
7804 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7805 return vectorize_fold_left_reduction
7806 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7807 reduc_fn, op.ops, vectype_in, reduc_index, masks);
7810 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7811 gcc_assert (single_defuse_cycle
7812 || code == DOT_PROD_EXPR
7813 || code == WIDEN_SUM_EXPR
7814 || code == SAD_EXPR);
7816 /* Create the destination vector */
7817 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
7818 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7820 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7821 single_defuse_cycle && reduc_index == 0
7822 ? NULL_TREE : op.ops[0], &vec_oprnds0,
7823 single_defuse_cycle && reduc_index == 1
7824 ? NULL_TREE : op.ops[1], &vec_oprnds1,
7825 op.num_ops == 3
7826 && !(single_defuse_cycle && reduc_index == 2)
7827 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
7828 if (single_defuse_cycle)
7830 gcc_assert (!slp_node);
7831 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7832 op.ops[reduc_index],
7833 reduc_index == 0 ? &vec_oprnds0
7834 : (reduc_index == 1 ? &vec_oprnds1
7835 : &vec_oprnds2));
7838 bool emulated_mixed_dot_prod
7839 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
7840 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7842 gimple *new_stmt;
7843 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7844 if (masked_loop_p && !mask_by_cond_expr)
7846 /* No conditional ifns have been defined for dot-product yet. */
7847 gcc_assert (code != DOT_PROD_EXPR);
7849 /* Make sure that the reduction accumulator is vop[0]. */
7850 if (reduc_index == 1)
7852 gcc_assert (commutative_tree_code (code));
7853 std::swap (vop[0], vop[1]);
7855 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7856 vectype_in, i);
7857 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7858 vop[0], vop[1], vop[0]);
7859 new_temp = make_ssa_name (vec_dest, call);
7860 gimple_call_set_lhs (call, new_temp);
7861 gimple_call_set_nothrow (call, true);
7862 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7863 new_stmt = call;
7865 else
7867 if (op.num_ops == 3)
7868 vop[2] = vec_oprnds2[i];
7870 if (masked_loop_p && mask_by_cond_expr)
7872 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7873 vectype_in, i);
7874 build_vect_cond_expr (code, vop, mask, gsi);
7877 if (emulated_mixed_dot_prod)
7878 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
7879 vec_dest, vop);
7880 else
7881 new_stmt = gimple_build_assign (vec_dest, code,
7882 vop[0], vop[1], vop[2]);
7883 new_temp = make_ssa_name (vec_dest, new_stmt);
7884 gimple_assign_set_lhs (new_stmt, new_temp);
7885 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7888 if (slp_node)
7889 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7890 else if (single_defuse_cycle
7891 && i < ncopies - 1)
7893 if (reduc_index == 0)
7894 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7895 else if (reduc_index == 1)
7896 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7897 else if (reduc_index == 2)
7898 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7900 else
7901 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7904 if (!slp_node)
7905 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7907 return true;
7910 /* Transform phase of a cycle PHI. */
7912 bool
7913 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7914 stmt_vec_info stmt_info, gimple **vec_stmt,
7915 slp_tree slp_node, slp_instance slp_node_instance)
7917 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7918 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7919 int i;
7920 int ncopies;
7921 int j;
7922 bool nested_cycle = false;
7923 int vec_num;
7925 if (nested_in_vect_loop_p (loop, stmt_info))
7927 loop = loop->inner;
7928 nested_cycle = true;
7931 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7932 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7933 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7934 gcc_assert (reduc_info->is_reduc_info);
7936 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7937 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7938 /* Leave the scalar phi in place. */
7939 return true;
7941 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7942 /* For a nested cycle we do not fill the above. */
7943 if (!vectype_in)
7944 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7945 gcc_assert (vectype_in);
7947 if (slp_node)
7949 /* The size vect_schedule_slp_instance computes is off for us. */
7950 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7951 * SLP_TREE_LANES (slp_node), vectype_in);
7952 ncopies = 1;
7954 else
7956 vec_num = 1;
7957 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7960 /* Check whether we should use a single PHI node and accumulate
7961 vectors to one before the backedge. */
7962 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7963 ncopies = 1;
7965 /* Create the destination vector */
7966 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7967 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7968 vectype_out);
7970 /* Get the loop-entry arguments. */
7971 tree vec_initial_def = NULL_TREE;
7972 auto_vec<tree> vec_initial_defs;
7973 if (slp_node)
7975 vec_initial_defs.reserve (vec_num);
7976 if (nested_cycle)
7978 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7979 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7980 &vec_initial_defs);
7982 else
7984 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7985 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7986 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7988 unsigned int num_phis = stmts.length ();
7989 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7990 num_phis = 1;
7991 initial_values.reserve (num_phis);
7992 for (unsigned int i = 0; i < num_phis; ++i)
7994 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7995 initial_values.quick_push (vect_phi_initial_value (this_phi));
7997 if (vec_num == 1)
7998 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7999 if (!initial_values.is_empty ())
8001 tree initial_value
8002 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8003 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8004 tree neutral_op
8005 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8006 code, initial_value);
8007 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8008 &vec_initial_defs, vec_num,
8009 stmts.length (), neutral_op);
8013 else
8015 /* Get at the scalar def before the loop, that defines the initial
8016 value of the reduction variable. */
8017 tree initial_def = vect_phi_initial_value (phi);
8018 reduc_info->reduc_initial_values.safe_push (initial_def);
8019 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8020 and we can't use zero for induc_val, use initial_def. Similarly
8021 for REDUC_MIN and initial_def larger than the base. */
8022 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8024 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8025 if (TREE_CODE (initial_def) == INTEGER_CST
8026 && !integer_zerop (induc_val)
8027 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8028 && tree_int_cst_lt (initial_def, induc_val))
8029 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8030 && tree_int_cst_lt (induc_val, initial_def))))
8032 induc_val = initial_def;
8033 /* Communicate we used the initial_def to epilouge
8034 generation. */
8035 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8037 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8039 else if (nested_cycle)
8041 /* Do not use an adjustment def as that case is not supported
8042 correctly if ncopies is not one. */
8043 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8044 ncopies, initial_def,
8045 &vec_initial_defs);
8047 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8048 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8049 /* Fill the initial vector with the initial scalar value. */
8050 vec_initial_def
8051 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8052 initial_def, initial_def);
8053 else
8055 if (ncopies == 1)
8056 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8057 if (!reduc_info->reduc_initial_values.is_empty ())
8059 initial_def = reduc_info->reduc_initial_values[0];
8060 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8061 tree neutral_op
8062 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8063 code, initial_def);
8064 gcc_assert (neutral_op);
8065 /* Try to simplify the vector initialization by applying an
8066 adjustment after the reduction has been performed. */
8067 if (!reduc_info->reused_accumulator
8068 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8069 && !operand_equal_p (neutral_op, initial_def))
8071 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8072 = initial_def;
8073 initial_def = neutral_op;
8075 vec_initial_def
8076 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8077 initial_def, neutral_op);
8082 if (vec_initial_def)
8084 vec_initial_defs.create (ncopies);
8085 for (i = 0; i < ncopies; ++i)
8086 vec_initial_defs.quick_push (vec_initial_def);
8089 if (auto *accumulator = reduc_info->reused_accumulator)
8091 tree def = accumulator->reduc_input;
8092 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8094 unsigned int nreduc;
8095 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8096 (TREE_TYPE (def)),
8097 TYPE_VECTOR_SUBPARTS (vectype_out),
8098 &nreduc);
8099 gcc_assert (res);
8100 gimple_seq stmts = NULL;
8101 /* Reduce the single vector to a smaller one. */
8102 if (nreduc != 1)
8104 /* Perform the reduction in the appropriate type. */
8105 tree rvectype = vectype_out;
8106 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8107 TREE_TYPE (TREE_TYPE (def))))
8108 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8109 TYPE_VECTOR_SUBPARTS
8110 (vectype_out));
8111 def = vect_create_partial_epilog (def, rvectype,
8112 STMT_VINFO_REDUC_CODE
8113 (reduc_info),
8114 &stmts);
8116 /* The epilogue loop might use a different vector mode, like
8117 VNx2DI vs. V2DI. */
8118 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8120 tree reduc_type = build_vector_type_for_mode
8121 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8122 def = gimple_convert (&stmts, reduc_type, def);
8124 /* Adjust the input so we pick up the partially reduced value
8125 for the skip edge in vect_create_epilog_for_reduction. */
8126 accumulator->reduc_input = def;
8127 /* And the reduction could be carried out using a different sign. */
8128 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8129 def = gimple_convert (&stmts, vectype_out, def);
8130 if (loop_vinfo->main_loop_edge)
8132 /* While we'd like to insert on the edge this will split
8133 blocks and disturb bookkeeping, we also will eventually
8134 need this on the skip edge. Rely on sinking to
8135 fixup optimal placement and insert in the pred. */
8136 gimple_stmt_iterator gsi
8137 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8138 /* Insert before a cond that eventually skips the
8139 epilogue. */
8140 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8141 gsi_prev (&gsi);
8142 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8144 else
8145 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8146 stmts);
8148 if (loop_vinfo->main_loop_edge)
8149 vec_initial_defs[0]
8150 = vect_get_main_loop_result (loop_vinfo, def,
8151 vec_initial_defs[0]);
8152 else
8153 vec_initial_defs.safe_push (def);
8156 /* Generate the reduction PHIs upfront. */
8157 for (i = 0; i < vec_num; i++)
8159 tree vec_init_def = vec_initial_defs[i];
8160 for (j = 0; j < ncopies; j++)
8162 /* Create the reduction-phi that defines the reduction
8163 operand. */
8164 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8166 /* Set the loop-entry arg of the reduction-phi. */
8167 if (j != 0 && nested_cycle)
8168 vec_init_def = vec_initial_defs[j];
8169 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8170 UNKNOWN_LOCATION);
8172 /* The loop-latch arg is set in epilogue processing. */
8174 if (slp_node)
8175 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8176 else
8178 if (j == 0)
8179 *vec_stmt = new_phi;
8180 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8185 return true;
8188 /* Vectorizes LC PHIs. */
8190 bool
8191 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8192 stmt_vec_info stmt_info, gimple **vec_stmt,
8193 slp_tree slp_node)
8195 if (!loop_vinfo
8196 || !is_a <gphi *> (stmt_info->stmt)
8197 || gimple_phi_num_args (stmt_info->stmt) != 1)
8198 return false;
8200 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8201 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8202 return false;
8204 if (!vec_stmt) /* transformation not required. */
8206 /* Deal with copies from externs or constants that disguise as
8207 loop-closed PHI nodes (PR97886). */
8208 if (slp_node
8209 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8210 SLP_TREE_VECTYPE (slp_node)))
8212 if (dump_enabled_p ())
8213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8214 "incompatible vector types for invariants\n");
8215 return false;
8217 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8218 return true;
8221 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8222 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8223 basic_block bb = gimple_bb (stmt_info->stmt);
8224 edge e = single_pred_edge (bb);
8225 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8226 auto_vec<tree> vec_oprnds;
8227 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8228 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8229 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8230 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8232 /* Create the vectorized LC PHI node. */
8233 gphi *new_phi = create_phi_node (vec_dest, bb);
8234 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8235 if (slp_node)
8236 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8237 else
8238 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8240 if (!slp_node)
8241 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8243 return true;
8246 /* Vectorizes PHIs. */
8248 bool
8249 vectorizable_phi (vec_info *,
8250 stmt_vec_info stmt_info, gimple **vec_stmt,
8251 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8253 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8254 return false;
8256 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8257 return false;
8259 tree vectype = SLP_TREE_VECTYPE (slp_node);
8261 if (!vec_stmt) /* transformation not required. */
8263 slp_tree child;
8264 unsigned i;
8265 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8266 if (!child)
8268 if (dump_enabled_p ())
8269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8270 "PHI node with unvectorized backedge def\n");
8271 return false;
8273 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8275 if (dump_enabled_p ())
8276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8277 "incompatible vector types for invariants\n");
8278 return false;
8280 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8281 && !useless_type_conversion_p (vectype,
8282 SLP_TREE_VECTYPE (child)))
8284 /* With bools we can have mask and non-mask precision vectors
8285 or different non-mask precisions. while pattern recog is
8286 supposed to guarantee consistency here bugs in it can cause
8287 mismatches (PR103489 and PR103800 for example).
8288 Deal with them here instead of ICEing later. */
8289 if (dump_enabled_p ())
8290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8291 "incompatible vector type setup from "
8292 "bool pattern detection\n");
8293 return false;
8296 /* For single-argument PHIs assume coalescing which means zero cost
8297 for the scalar and the vector PHIs. This avoids artificially
8298 favoring the vector path (but may pessimize it in some cases). */
8299 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8300 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8301 vector_stmt, stmt_info, vectype, 0, vect_body);
8302 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8303 return true;
8306 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8307 basic_block bb = gimple_bb (stmt_info->stmt);
8308 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8309 auto_vec<gphi *> new_phis;
8310 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8312 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8314 /* Skip not yet vectorized defs. */
8315 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8316 && SLP_TREE_VEC_STMTS (child).is_empty ())
8317 continue;
8319 auto_vec<tree> vec_oprnds;
8320 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8321 if (!new_phis.exists ())
8323 new_phis.create (vec_oprnds.length ());
8324 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8326 /* Create the vectorized LC PHI node. */
8327 new_phis.quick_push (create_phi_node (vec_dest, bb));
8328 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8331 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8332 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8333 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8335 /* We should have at least one already vectorized child. */
8336 gcc_assert (new_phis.exists ());
8338 return true;
8341 /* Vectorizes first order recurrences. An overview of the transformation
8342 is described below. Suppose we have the following loop.
8344 int t = 0;
8345 for (int i = 0; i < n; ++i)
8347 b[i] = a[i] - t;
8348 t = a[i];
8351 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8352 looks (simplified) like:
8354 scalar.preheader:
8355 init = 0;
8357 scalar.body:
8358 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8359 _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
8360 _1 = a[i]
8361 b[i] = _1 - _2
8362 if (i < n) goto scalar.body
8364 In this example, _2 is a recurrence because it's value depends on the
8365 previous iteration. We vectorize this as (VF = 4)
8367 vector.preheader:
8368 vect_init = vect_cst(..., ..., ..., 0)
8370 vector.body
8371 i = PHI <0(vector.preheader), i+4(vector.body)>
8372 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8373 vect_2 = a[i, i+1, i+2, i+3];
8374 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8375 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8376 if (..) goto vector.body
8378 In this function, vectorizable_recurr, we code generate both the
8379 vector PHI node and the permute since those together compute the
8380 vectorized value of the scalar PHI. We do not yet have the
8381 backedge value to fill in there nor into the vec_perm. Those
8382 are filled in maybe_set_vectorized_backedge_value and
8383 vect_schedule_scc.
8385 TODO: Since the scalar loop does not have a use of the recurrence
8386 outside of the loop the natural way to implement peeling via
8387 vectorizing the live value doesn't work. For now peeling of loops
8388 with a recurrence is not implemented. For SLP the supported cases
8389 are restricted to those requiring a single vector recurrence PHI. */
8391 bool
8392 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8393 gimple **vec_stmt, slp_tree slp_node,
8394 stmt_vector_for_cost *cost_vec)
8396 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8397 return false;
8399 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8401 /* So far we only support first-order recurrence auto-vectorization. */
8402 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8403 return false;
8405 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8406 unsigned ncopies;
8407 if (slp_node)
8408 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8409 else
8410 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8411 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8412 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8413 /* We need to be able to make progress with a single vector. */
8414 if (maybe_gt (dist * 2, nunits))
8416 if (dump_enabled_p ())
8417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8418 "first order recurrence exceeds half of "
8419 "a vector\n");
8420 return false;
8423 /* First-order recurrence autovectorization needs to handle permutation
8424 with indices = [nunits-1, nunits, nunits+1, ...]. */
8425 vec_perm_builder sel (nunits, 1, 3);
8426 for (int i = 0; i < 3; ++i)
8427 sel.quick_push (nunits - dist + i);
8428 vec_perm_indices indices (sel, 2, nunits);
8430 if (!vec_stmt) /* transformation not required. */
8432 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8433 indices))
8434 return false;
8436 if (slp_node)
8438 /* We eventually need to set a vector type on invariant
8439 arguments. */
8440 unsigned j;
8441 slp_tree child;
8442 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8443 if (!vect_maybe_update_slp_op_vectype
8444 (child, SLP_TREE_VECTYPE (slp_node)))
8446 if (dump_enabled_p ())
8447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8448 "incompatible vector types for "
8449 "invariants\n");
8450 return false;
8453 /* The recurrence costs the initialization vector and one permute
8454 for each copy. */
8455 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8456 stmt_info, 0, vect_prologue);
8457 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8458 stmt_info, 0, vect_body);
8459 if (dump_enabled_p ())
8460 dump_printf_loc (MSG_NOTE, vect_location,
8461 "vectorizable_recurr: inside_cost = %d, "
8462 "prologue_cost = %d .\n", inside_cost,
8463 prologue_cost);
8465 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8466 return true;
8469 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8470 basic_block bb = gimple_bb (phi);
8471 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8472 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8474 gimple_seq stmts = NULL;
8475 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8476 gsi_insert_seq_on_edge_immediate (pe, stmts);
8478 tree vec_init = build_vector_from_val (vectype, preheader);
8479 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8481 /* Create the vectorized first-order PHI node. */
8482 tree vec_dest = vect_get_new_vect_var (vectype,
8483 vect_simple_var, "vec_recur_");
8484 gphi *new_phi = create_phi_node (vec_dest, bb);
8485 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8487 /* Insert shuffles the first-order recurrence autovectorization.
8488 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8489 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8491 /* Insert the required permute after the latch definition. The
8492 second and later operands are tentative and will be updated when we have
8493 vectorized the latch definition. */
8494 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8495 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8496 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8497 gsi_next (&gsi2);
8499 for (unsigned i = 0; i < ncopies; ++i)
8501 vec_dest = make_ssa_name (vectype);
8502 gassign *vperm
8503 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8504 i == 0 ? gimple_phi_result (new_phi) : NULL,
8505 NULL, perm);
8506 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8508 if (slp_node)
8509 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8510 else
8511 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8514 if (!slp_node)
8515 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8516 return true;
8519 /* Return true if VECTYPE represents a vector that requires lowering
8520 by the vector lowering pass. */
8522 bool
8523 vect_emulated_vector_p (tree vectype)
8525 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8526 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8527 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8530 /* Return true if we can emulate CODE on an integer mode representation
8531 of a vector. */
8533 bool
8534 vect_can_vectorize_without_simd_p (tree_code code)
8536 switch (code)
8538 case PLUS_EXPR:
8539 case MINUS_EXPR:
8540 case NEGATE_EXPR:
8541 case BIT_AND_EXPR:
8542 case BIT_IOR_EXPR:
8543 case BIT_XOR_EXPR:
8544 case BIT_NOT_EXPR:
8545 return true;
8547 default:
8548 return false;
8552 /* Likewise, but taking a code_helper. */
8554 bool
8555 vect_can_vectorize_without_simd_p (code_helper code)
8557 return (code.is_tree_code ()
8558 && vect_can_vectorize_without_simd_p (tree_code (code)));
8561 /* Create vector init for vectorized iv. */
8562 static tree
8563 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8564 tree step_expr, poly_uint64 nunits,
8565 tree vectype,
8566 enum vect_induction_op_type induction_type)
8568 unsigned HOST_WIDE_INT const_nunits;
8569 tree vec_shift, vec_init, new_name;
8570 unsigned i;
8571 tree itype = TREE_TYPE (vectype);
8573 /* iv_loop is the loop to be vectorized. Create:
8574 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
8575 new_name = gimple_convert (stmts, itype, init_expr);
8576 switch (induction_type)
8578 case vect_step_op_shr:
8579 case vect_step_op_shl:
8580 /* Build the Initial value from shift_expr. */
8581 vec_init = gimple_build_vector_from_val (stmts,
8582 vectype,
8583 new_name);
8584 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8585 build_zero_cst (itype), step_expr);
8586 vec_init = gimple_build (stmts,
8587 (induction_type == vect_step_op_shr
8588 ? RSHIFT_EXPR : LSHIFT_EXPR),
8589 vectype, vec_init, vec_shift);
8590 break;
8592 case vect_step_op_neg:
8594 vec_init = gimple_build_vector_from_val (stmts,
8595 vectype,
8596 new_name);
8597 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8598 vectype, vec_init);
8599 /* The encoding has 2 interleaved stepped patterns. */
8600 vec_perm_builder sel (nunits, 2, 3);
8601 sel.quick_grow (6);
8602 for (i = 0; i < 3; i++)
8604 sel[2 * i] = i;
8605 sel[2 * i + 1] = i + nunits;
8607 vec_perm_indices indices (sel, 2, nunits);
8608 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8609 fail when vec_init is const vector. In that situation vec_perm is not
8610 really needed. */
8611 tree perm_mask_even
8612 = vect_gen_perm_mask_any (vectype, indices);
8613 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8614 vectype,
8615 vec_init, vec_neg,
8616 perm_mask_even);
8618 break;
8620 case vect_step_op_mul:
8622 /* Use unsigned mult to avoid UD integer overflow. */
8623 gcc_assert (nunits.is_constant (&const_nunits));
8624 tree utype = unsigned_type_for (itype);
8625 tree uvectype = build_vector_type (utype,
8626 TYPE_VECTOR_SUBPARTS (vectype));
8627 new_name = gimple_convert (stmts, utype, new_name);
8628 vec_init = gimple_build_vector_from_val (stmts,
8629 uvectype,
8630 new_name);
8631 tree_vector_builder elts (uvectype, const_nunits, 1);
8632 tree elt_step = build_one_cst (utype);
8634 elts.quick_push (elt_step);
8635 for (i = 1; i < const_nunits; i++)
8637 /* Create: new_name_i = new_name + step_expr. */
8638 elt_step = gimple_build (stmts, MULT_EXPR,
8639 utype, elt_step, step_expr);
8640 elts.quick_push (elt_step);
8642 /* Create a vector from [new_name_0, new_name_1, ...,
8643 new_name_nunits-1]. */
8644 tree vec_mul = gimple_build_vector (stmts, &elts);
8645 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
8646 vec_init, vec_mul);
8647 vec_init = gimple_convert (stmts, vectype, vec_init);
8649 break;
8651 default:
8652 gcc_unreachable ();
8655 return vec_init;
8658 /* Peel init_expr by skip_niter for induction_type. */
8659 tree
8660 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8661 tree skip_niters, tree step_expr,
8662 enum vect_induction_op_type induction_type)
8664 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
8665 tree type = TREE_TYPE (init_expr);
8666 unsigned prec = TYPE_PRECISION (type);
8667 switch (induction_type)
8669 case vect_step_op_neg:
8670 if (TREE_INT_CST_LOW (skip_niters) % 2)
8671 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
8672 /* else no change. */
8673 break;
8675 case vect_step_op_shr:
8676 case vect_step_op_shl:
8677 skip_niters = gimple_convert (stmts, type, skip_niters);
8678 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
8679 /* When shift mount >= precision, need to avoid UD.
8680 In the original loop, there's no UD, and according to semantic,
8681 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr. */
8682 if (!tree_fits_uhwi_p (step_expr)
8683 || tree_to_uhwi (step_expr) >= prec)
8685 if (induction_type == vect_step_op_shl
8686 || TYPE_UNSIGNED (type))
8687 init_expr = build_zero_cst (type);
8688 else
8689 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
8690 init_expr,
8691 wide_int_to_tree (type, prec - 1));
8693 else
8694 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
8695 ? RSHIFT_EXPR : LSHIFT_EXPR),
8696 type, init_expr, step_expr);
8697 break;
8699 case vect_step_op_mul:
8701 tree utype = unsigned_type_for (type);
8702 init_expr = gimple_convert (stmts, utype, init_expr);
8703 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
8704 wide_int begin = wi::to_wide (step_expr);
8705 for (unsigned i = 0; i != skipn - 1; i++)
8706 begin = wi::mul (begin, wi::to_wide (step_expr));
8707 tree mult_expr = wide_int_to_tree (utype, begin);
8708 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
8709 init_expr = gimple_convert (stmts, type, init_expr);
8711 break;
8713 default:
8714 gcc_unreachable ();
8717 return init_expr;
8720 /* Create vector step for vectorized iv. */
8721 static tree
8722 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
8723 poly_uint64 vf,
8724 enum vect_induction_op_type induction_type)
8726 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
8727 tree new_name = NULL;
8728 /* Step should be pow (step, vf) for mult induction. */
8729 if (induction_type == vect_step_op_mul)
8731 gcc_assert (vf.is_constant ());
8732 wide_int begin = wi::to_wide (step_expr);
8734 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
8735 begin = wi::mul (begin, wi::to_wide (step_expr));
8737 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
8739 else if (induction_type == vect_step_op_neg)
8740 /* Do nothing. */
8742 else
8743 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
8744 expr, step_expr);
8745 return new_name;
8748 static tree
8749 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
8750 stmt_vec_info stmt_info,
8751 tree new_name, tree vectype,
8752 enum vect_induction_op_type induction_type)
8754 /* No step is needed for neg induction. */
8755 if (induction_type == vect_step_op_neg)
8756 return NULL;
8758 tree t = unshare_expr (new_name);
8759 gcc_assert (CONSTANT_CLASS_P (new_name)
8760 || TREE_CODE (new_name) == SSA_NAME);
8761 tree new_vec = build_vector_from_val (vectype, t);
8762 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
8763 new_vec, vectype, NULL);
8764 return vec_step;
8767 /* Update vectorized iv with vect_step, induc_def is init. */
8768 static tree
8769 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
8770 tree induc_def, tree vec_step,
8771 enum vect_induction_op_type induction_type)
8773 tree vec_def = induc_def;
8774 switch (induction_type)
8776 case vect_step_op_mul:
8778 /* Use unsigned mult to avoid UD integer overflow. */
8779 tree uvectype
8780 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
8781 TYPE_VECTOR_SUBPARTS (vectype));
8782 vec_def = gimple_convert (stmts, uvectype, vec_def);
8783 vec_step = gimple_convert (stmts, uvectype, vec_step);
8784 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
8785 vec_def, vec_step);
8786 vec_def = gimple_convert (stmts, vectype, vec_def);
8788 break;
8790 case vect_step_op_shr:
8791 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
8792 vec_def, vec_step);
8793 break;
8795 case vect_step_op_shl:
8796 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
8797 vec_def, vec_step);
8798 break;
8799 case vect_step_op_neg:
8800 vec_def = induc_def;
8801 /* Do nothing. */
8802 break;
8803 default:
8804 gcc_unreachable ();
8807 return vec_def;
8811 /* Return true if vectorizer can peel for nonlinear iv. */
8812 bool
8813 vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
8814 enum vect_induction_op_type induction_type)
8816 tree niters_skip;
8817 /* Init_expr will be update by vect_update_ivs_after_vectorizer,
8818 if niters is unkown:
8819 For shift, when shift mount >= precision, there would be UD.
8820 For mult, don't known how to generate
8821 init_expr * pow (step, niters) for variable niters.
8822 For neg, it should be ok, since niters of vectorized main loop
8823 will always be multiple of 2. */
8824 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8825 && induction_type != vect_step_op_neg)
8827 if (dump_enabled_p ())
8828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8829 "Peeling for epilogue is not supported"
8830 " for nonlinear induction except neg"
8831 " when iteration count is unknown.\n");
8832 return false;
8835 /* Also doens't support peel for neg when niter is variable.
8836 ??? generate something like niter_expr & 1 ? init_expr : -init_expr? */
8837 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8838 if ((niters_skip != NULL_TREE
8839 && TREE_CODE (niters_skip) != INTEGER_CST)
8840 || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
8841 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
8843 if (dump_enabled_p ())
8844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8845 "Peeling for alignement is not supported"
8846 " for nonlinear induction when niters_skip"
8847 " is not constant.\n");
8848 return false;
8851 return true;
8854 /* Function vectorizable_induction
8856 Check if STMT_INFO performs an nonlinear induction computation that can be
8857 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
8858 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
8859 basic block.
8860 Return true if STMT_INFO is vectorizable in this way. */
8862 static bool
8863 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
8864 stmt_vec_info stmt_info,
8865 gimple **vec_stmt, slp_tree slp_node,
8866 stmt_vector_for_cost *cost_vec)
8868 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8869 unsigned ncopies;
8870 bool nested_in_vect_loop = false;
8871 class loop *iv_loop;
8872 tree vec_def;
8873 edge pe = loop_preheader_edge (loop);
8874 basic_block new_bb;
8875 tree vec_init, vec_step;
8876 tree new_name;
8877 gimple *new_stmt;
8878 gphi *induction_phi;
8879 tree induc_def, vec_dest;
8880 tree init_expr, step_expr;
8881 tree niters_skip;
8882 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8883 unsigned i;
8884 gimple_stmt_iterator si;
8886 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
8888 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8889 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8890 enum vect_induction_op_type induction_type
8891 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
8893 gcc_assert (induction_type > vect_step_op_add);
8895 if (slp_node)
8896 ncopies = 1;
8897 else
8898 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8899 gcc_assert (ncopies >= 1);
8901 /* FORNOW. Only handle nonlinear induction in the same loop. */
8902 if (nested_in_vect_loop_p (loop, stmt_info))
8904 if (dump_enabled_p ())
8905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8906 "nonlinear induction in nested loop.\n");
8907 return false;
8910 iv_loop = loop;
8911 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8913 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
8914 update for each iv and a permutation to generate wanted vector iv. */
8915 if (slp_node)
8917 if (dump_enabled_p ())
8918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8919 "SLP induction not supported for nonlinear"
8920 " induction.\n");
8921 return false;
8924 if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, induction_type))
8925 return false;
8927 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
8929 if (dump_enabled_p ())
8930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8931 "floating point nonlinear induction vectorization"
8932 " not supported.\n");
8933 return false;
8936 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8937 init_expr = vect_phi_initial_value (phi);
8938 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
8939 && TREE_CODE (step_expr) == INTEGER_CST);
8940 /* step_expr should be aligned with init_expr,
8941 .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used. */
8942 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
8944 if (TREE_CODE (init_expr) == INTEGER_CST)
8945 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
8946 else
8947 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
8948 TREE_TYPE (init_expr)));
8950 switch (induction_type)
8952 case vect_step_op_neg:
8953 if (TREE_CODE (init_expr) != INTEGER_CST
8954 && TREE_CODE (init_expr) != REAL_CST)
8956 /* Check for backend support of NEGATE_EXPR and vec_perm. */
8957 if (!directly_supported_p (NEGATE_EXPR, vectype))
8958 return false;
8960 /* The encoding has 2 interleaved stepped patterns. */
8961 vec_perm_builder sel (nunits, 2, 3);
8962 machine_mode mode = TYPE_MODE (vectype);
8963 sel.quick_grow (6);
8964 for (i = 0; i < 3; i++)
8966 sel[i * 2] = i;
8967 sel[i * 2 + 1] = i + nunits;
8969 vec_perm_indices indices (sel, 2, nunits);
8970 if (!can_vec_perm_const_p (mode, mode, indices))
8971 return false;
8973 break;
8975 case vect_step_op_mul:
8977 /* Check for backend support of MULT_EXPR. */
8978 if (!directly_supported_p (MULT_EXPR, vectype))
8979 return false;
8981 /* ?? How to construct vector step for variable number vector.
8982 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
8983 if (!vf.is_constant ())
8984 return false;
8986 break;
8988 case vect_step_op_shr:
8989 /* Check for backend support of RSHIFT_EXPR. */
8990 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
8991 return false;
8993 /* Don't shift more than type precision to avoid UD. */
8994 if (!tree_fits_uhwi_p (step_expr)
8995 || maybe_ge (nunits * tree_to_uhwi (step_expr),
8996 TYPE_PRECISION (TREE_TYPE (init_expr))))
8997 return false;
8998 break;
9000 case vect_step_op_shl:
9001 /* Check for backend support of RSHIFT_EXPR. */
9002 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9003 return false;
9005 /* Don't shift more than type precision to avoid UD. */
9006 if (!tree_fits_uhwi_p (step_expr)
9007 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9008 TYPE_PRECISION (TREE_TYPE (init_expr))))
9009 return false;
9011 break;
9013 default:
9014 gcc_unreachable ();
9017 if (!vec_stmt) /* transformation not required. */
9019 unsigned inside_cost = 0, prologue_cost = 0;
9020 /* loop cost for vec_loop. Neg induction doesn't have any
9021 inside_cost. */
9022 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9023 stmt_info, 0, vect_body);
9025 /* loop cost for vec_loop. Neg induction doesn't have any
9026 inside_cost. */
9027 if (induction_type == vect_step_op_neg)
9028 inside_cost = 0;
9030 /* prologue cost for vec_init and vec_step. */
9031 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9032 stmt_info, 0, vect_prologue);
9034 if (dump_enabled_p ())
9035 dump_printf_loc (MSG_NOTE, vect_location,
9036 "vect_model_induction_cost: inside_cost = %d, "
9037 "prologue_cost = %d. \n", inside_cost,
9038 prologue_cost);
9040 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9041 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9042 return true;
9045 /* Transform. */
9047 /* Compute a vector variable, initialized with the first VF values of
9048 the induction variable. E.g., for an iv with IV_PHI='X' and
9049 evolution S, for a vector of 4 units, we want to compute:
9050 [X, X + S, X + 2*S, X + 3*S]. */
9052 if (dump_enabled_p ())
9053 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9055 pe = loop_preheader_edge (iv_loop);
9056 /* Find the first insertion point in the BB. */
9057 basic_block bb = gimple_bb (phi);
9058 si = gsi_after_labels (bb);
9060 gimple_seq stmts = NULL;
9062 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9063 /* If we are using the loop mask to "peel" for alignment then we need
9064 to adjust the start value here. */
9065 if (niters_skip != NULL_TREE)
9066 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9067 step_expr, induction_type);
9069 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9070 step_expr, nunits, vectype,
9071 induction_type);
9072 if (stmts)
9074 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9075 gcc_assert (!new_bb);
9078 stmts = NULL;
9079 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9080 vf, induction_type);
9081 if (stmts)
9083 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9084 gcc_assert (!new_bb);
9087 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9088 new_name, vectype,
9089 induction_type);
9090 /* Create the following def-use cycle:
9091 loop prolog:
9092 vec_init = ...
9093 vec_step = ...
9094 loop:
9095 vec_iv = PHI <vec_init, vec_loop>
9097 STMT
9099 vec_loop = vec_iv + vec_step; */
9101 /* Create the induction-phi that defines the induction-operand. */
9102 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9103 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9104 induc_def = PHI_RESULT (induction_phi);
9106 /* Create the iv update inside the loop. */
9107 stmts = NULL;
9108 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9109 induc_def, vec_step,
9110 induction_type);
9112 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9113 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9115 /* Set the arguments of the phi node: */
9116 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9117 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9118 UNKNOWN_LOCATION);
9120 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9121 *vec_stmt = induction_phi;
9123 /* In case that vectorization factor (VF) is bigger than the number
9124 of elements that we can fit in a vectype (nunits), we have to generate
9125 more than one vector stmt - i.e - we need to "unroll" the
9126 vector stmt by a factor VF/nunits. For more details see documentation
9127 in vectorizable_operation. */
9129 if (ncopies > 1)
9131 stmts = NULL;
9132 /* FORNOW. This restriction should be relaxed. */
9133 gcc_assert (!nested_in_vect_loop);
9135 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9136 nunits, induction_type);
9138 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9139 new_name, vectype,
9140 induction_type);
9141 vec_def = induc_def;
9142 for (i = 1; i < ncopies; i++)
9144 /* vec_i = vec_prev + vec_step. */
9145 stmts = NULL;
9146 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9147 vec_def, vec_step,
9148 induction_type);
9149 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9150 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9151 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9155 if (dump_enabled_p ())
9156 dump_printf_loc (MSG_NOTE, vect_location,
9157 "transform induction: created def-use cycle: %G%G",
9158 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9160 return true;
9163 /* Function vectorizable_induction
9165 Check if STMT_INFO performs an induction computation that can be vectorized.
9166 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9167 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9168 Return true if STMT_INFO is vectorizable in this way. */
9170 bool
9171 vectorizable_induction (loop_vec_info loop_vinfo,
9172 stmt_vec_info stmt_info,
9173 gimple **vec_stmt, slp_tree slp_node,
9174 stmt_vector_for_cost *cost_vec)
9176 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9177 unsigned ncopies;
9178 bool nested_in_vect_loop = false;
9179 class loop *iv_loop;
9180 tree vec_def;
9181 edge pe = loop_preheader_edge (loop);
9182 basic_block new_bb;
9183 tree new_vec, vec_init, vec_step, t;
9184 tree new_name;
9185 gimple *new_stmt;
9186 gphi *induction_phi;
9187 tree induc_def, vec_dest;
9188 tree init_expr, step_expr;
9189 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9190 unsigned i;
9191 tree expr;
9192 gimple_stmt_iterator si;
9193 enum vect_induction_op_type induction_type
9194 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9196 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9197 if (!phi)
9198 return false;
9200 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9201 return false;
9203 /* Make sure it was recognized as induction computation. */
9204 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9205 return false;
9207 /* Handle nonlinear induction in a separate place. */
9208 if (induction_type != vect_step_op_add)
9209 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9210 vec_stmt, slp_node, cost_vec);
9212 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9213 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9215 if (slp_node)
9216 ncopies = 1;
9217 else
9218 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9219 gcc_assert (ncopies >= 1);
9221 /* FORNOW. These restrictions should be relaxed. */
9222 if (nested_in_vect_loop_p (loop, stmt_info))
9224 imm_use_iterator imm_iter;
9225 use_operand_p use_p;
9226 gimple *exit_phi;
9227 edge latch_e;
9228 tree loop_arg;
9230 if (ncopies > 1)
9232 if (dump_enabled_p ())
9233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9234 "multiple types in nested loop.\n");
9235 return false;
9238 exit_phi = NULL;
9239 latch_e = loop_latch_edge (loop->inner);
9240 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9241 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9243 gimple *use_stmt = USE_STMT (use_p);
9244 if (is_gimple_debug (use_stmt))
9245 continue;
9247 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9249 exit_phi = use_stmt;
9250 break;
9253 if (exit_phi)
9255 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9256 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9257 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9259 if (dump_enabled_p ())
9260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9261 "inner-loop induction only used outside "
9262 "of the outer vectorized loop.\n");
9263 return false;
9267 nested_in_vect_loop = true;
9268 iv_loop = loop->inner;
9270 else
9271 iv_loop = loop;
9272 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9274 if (slp_node && !nunits.is_constant ())
9276 /* The current SLP code creates the step value element-by-element. */
9277 if (dump_enabled_p ())
9278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9279 "SLP induction not supported for variable-length"
9280 " vectors.\n");
9281 return false;
9284 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9286 if (dump_enabled_p ())
9287 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9288 "floating point induction vectorization disabled\n");
9289 return false;
9292 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9293 gcc_assert (step_expr != NULL_TREE);
9294 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9296 /* Check for backend support of PLUS/MINUS_EXPR. */
9297 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9298 || !directly_supported_p (MINUS_EXPR, step_vectype))
9299 return false;
9301 if (!vec_stmt) /* transformation not required. */
9303 unsigned inside_cost = 0, prologue_cost = 0;
9304 if (slp_node)
9306 /* We eventually need to set a vector type on invariant
9307 arguments. */
9308 unsigned j;
9309 slp_tree child;
9310 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9311 if (!vect_maybe_update_slp_op_vectype
9312 (child, SLP_TREE_VECTYPE (slp_node)))
9314 if (dump_enabled_p ())
9315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9316 "incompatible vector types for "
9317 "invariants\n");
9318 return false;
9320 /* loop cost for vec_loop. */
9321 inside_cost
9322 = record_stmt_cost (cost_vec,
9323 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9324 vector_stmt, stmt_info, 0, vect_body);
9325 /* prologue cost for vec_init (if not nested) and step. */
9326 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9327 scalar_to_vec,
9328 stmt_info, 0, vect_prologue);
9330 else /* if (!slp_node) */
9332 /* loop cost for vec_loop. */
9333 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9334 stmt_info, 0, vect_body);
9335 /* prologue cost for vec_init and vec_step. */
9336 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9337 stmt_info, 0, vect_prologue);
9339 if (dump_enabled_p ())
9340 dump_printf_loc (MSG_NOTE, vect_location,
9341 "vect_model_induction_cost: inside_cost = %d, "
9342 "prologue_cost = %d .\n", inside_cost,
9343 prologue_cost);
9345 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9346 DUMP_VECT_SCOPE ("vectorizable_induction");
9347 return true;
9350 /* Transform. */
9352 /* Compute a vector variable, initialized with the first VF values of
9353 the induction variable. E.g., for an iv with IV_PHI='X' and
9354 evolution S, for a vector of 4 units, we want to compute:
9355 [X, X + S, X + 2*S, X + 3*S]. */
9357 if (dump_enabled_p ())
9358 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9360 pe = loop_preheader_edge (iv_loop);
9361 /* Find the first insertion point in the BB. */
9362 basic_block bb = gimple_bb (phi);
9363 si = gsi_after_labels (bb);
9365 /* For SLP induction we have to generate several IVs as for example
9366 with group size 3 we need
9367 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9368 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9369 if (slp_node)
9371 /* Enforced above. */
9372 unsigned int const_nunits = nunits.to_constant ();
9374 /* The initial values are vectorized, but any lanes > group_size
9375 need adjustment. */
9376 slp_tree init_node
9377 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9379 /* Gather steps. Since we do not vectorize inductions as
9380 cycles we have to reconstruct the step from SCEV data. */
9381 unsigned group_size = SLP_TREE_LANES (slp_node);
9382 tree *steps = XALLOCAVEC (tree, group_size);
9383 tree *inits = XALLOCAVEC (tree, group_size);
9384 stmt_vec_info phi_info;
9385 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9387 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9388 if (!init_node)
9389 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9390 pe->dest_idx);
9393 /* Now generate the IVs. */
9394 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9395 gcc_assert ((const_nunits * nvects) % group_size == 0);
9396 unsigned nivs;
9397 if (nested_in_vect_loop)
9398 nivs = nvects;
9399 else
9401 /* Compute the number of distinct IVs we need. First reduce
9402 group_size if it is a multiple of const_nunits so we get
9403 one IV for a group_size of 4 but const_nunits 2. */
9404 unsigned group_sizep = group_size;
9405 if (group_sizep % const_nunits == 0)
9406 group_sizep = group_sizep / const_nunits;
9407 nivs = least_common_multiple (group_sizep,
9408 const_nunits) / const_nunits;
9410 tree stept = TREE_TYPE (step_vectype);
9411 tree lupdate_mul = NULL_TREE;
9412 if (!nested_in_vect_loop)
9414 /* The number of iterations covered in one vector iteration. */
9415 unsigned lup_mul = (nvects * const_nunits) / group_size;
9416 lupdate_mul
9417 = build_vector_from_val (step_vectype,
9418 SCALAR_FLOAT_TYPE_P (stept)
9419 ? build_real_from_wide (stept, lup_mul,
9420 UNSIGNED)
9421 : build_int_cstu (stept, lup_mul));
9423 tree peel_mul = NULL_TREE;
9424 gimple_seq init_stmts = NULL;
9425 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9427 if (SCALAR_FLOAT_TYPE_P (stept))
9428 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9429 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9430 else
9431 peel_mul = gimple_convert (&init_stmts, stept,
9432 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9433 peel_mul = gimple_build_vector_from_val (&init_stmts,
9434 step_vectype, peel_mul);
9436 unsigned ivn;
9437 auto_vec<tree> vec_steps;
9438 for (ivn = 0; ivn < nivs; ++ivn)
9440 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9441 tree_vector_builder init_elts (vectype, const_nunits, 1);
9442 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9443 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9445 /* The scalar steps of the IVs. */
9446 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9447 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9448 step_elts.quick_push (elt);
9449 if (!init_node)
9451 /* The scalar inits of the IVs if not vectorized. */
9452 elt = inits[(ivn*const_nunits + eltn) % group_size];
9453 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9454 TREE_TYPE (elt)))
9455 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9456 TREE_TYPE (vectype), elt);
9457 init_elts.quick_push (elt);
9459 /* The number of steps to add to the initial values. */
9460 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9461 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9462 ? build_real_from_wide (stept,
9463 mul_elt, UNSIGNED)
9464 : build_int_cstu (stept, mul_elt));
9466 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9467 vec_steps.safe_push (vec_step);
9468 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9469 if (peel_mul)
9470 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9471 step_mul, peel_mul);
9472 if (!init_node)
9473 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9475 /* Create the induction-phi that defines the induction-operand. */
9476 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9477 "vec_iv_");
9478 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9479 induc_def = PHI_RESULT (induction_phi);
9481 /* Create the iv update inside the loop */
9482 tree up = vec_step;
9483 if (lupdate_mul)
9484 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9485 vec_step, lupdate_mul);
9486 gimple_seq stmts = NULL;
9487 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9488 vec_def = gimple_build (&stmts,
9489 PLUS_EXPR, step_vectype, vec_def, up);
9490 vec_def = gimple_convert (&stmts, vectype, vec_def);
9491 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9492 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9493 UNKNOWN_LOCATION);
9495 if (init_node)
9496 vec_init = vect_get_slp_vect_def (init_node, ivn);
9497 if (!nested_in_vect_loop
9498 && !integer_zerop (step_mul))
9500 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9501 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9502 vec_step, step_mul);
9503 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9504 vec_def, up);
9505 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9508 /* Set the arguments of the phi node: */
9509 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9511 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9513 if (!nested_in_vect_loop)
9515 /* Fill up to the number of vectors we need for the whole group. */
9516 nivs = least_common_multiple (group_size,
9517 const_nunits) / const_nunits;
9518 vec_steps.reserve (nivs-ivn);
9519 for (; ivn < nivs; ++ivn)
9521 SLP_TREE_VEC_STMTS (slp_node)
9522 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9523 vec_steps.quick_push (vec_steps[0]);
9527 /* Re-use IVs when we can. We are generating further vector
9528 stmts by adding VF' * stride to the IVs generated above. */
9529 if (ivn < nvects)
9531 unsigned vfp
9532 = least_common_multiple (group_size, const_nunits) / group_size;
9533 tree lupdate_mul
9534 = build_vector_from_val (step_vectype,
9535 SCALAR_FLOAT_TYPE_P (stept)
9536 ? build_real_from_wide (stept,
9537 vfp, UNSIGNED)
9538 : build_int_cstu (stept, vfp));
9539 for (; ivn < nvects; ++ivn)
9541 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9542 tree def = gimple_get_lhs (iv);
9543 if (ivn < 2*nivs)
9544 vec_steps[ivn - nivs]
9545 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9546 vec_steps[ivn - nivs], lupdate_mul);
9547 gimple_seq stmts = NULL;
9548 def = gimple_convert (&stmts, step_vectype, def);
9549 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9550 def, vec_steps[ivn % nivs]);
9551 def = gimple_convert (&stmts, vectype, def);
9552 if (gimple_code (iv) == GIMPLE_PHI)
9553 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9554 else
9556 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9557 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9559 SLP_TREE_VEC_STMTS (slp_node)
9560 .quick_push (SSA_NAME_DEF_STMT (def));
9564 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9565 gcc_assert (!new_bb);
9567 return true;
9570 init_expr = vect_phi_initial_value (phi);
9572 gimple_seq stmts = NULL;
9573 if (!nested_in_vect_loop)
9575 /* Convert the initial value to the IV update type. */
9576 tree new_type = TREE_TYPE (step_expr);
9577 init_expr = gimple_convert (&stmts, new_type, init_expr);
9579 /* If we are using the loop mask to "peel" for alignment then we need
9580 to adjust the start value here. */
9581 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9582 if (skip_niters != NULL_TREE)
9584 if (FLOAT_TYPE_P (vectype))
9585 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9586 skip_niters);
9587 else
9588 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9589 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9590 skip_niters, step_expr);
9591 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9592 init_expr, skip_step);
9596 if (stmts)
9598 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9599 gcc_assert (!new_bb);
9602 /* Create the vector that holds the initial_value of the induction. */
9603 if (nested_in_vect_loop)
9605 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9606 been created during vectorization of previous stmts. We obtain it
9607 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9608 auto_vec<tree> vec_inits;
9609 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9610 init_expr, &vec_inits);
9611 vec_init = vec_inits[0];
9612 /* If the initial value is not of proper type, convert it. */
9613 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9615 new_stmt
9616 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9617 vect_simple_var,
9618 "vec_iv_"),
9619 VIEW_CONVERT_EXPR,
9620 build1 (VIEW_CONVERT_EXPR, vectype,
9621 vec_init));
9622 vec_init = gimple_assign_lhs (new_stmt);
9623 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9624 new_stmt);
9625 gcc_assert (!new_bb);
9628 else
9630 /* iv_loop is the loop to be vectorized. Create:
9631 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9632 stmts = NULL;
9633 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9635 unsigned HOST_WIDE_INT const_nunits;
9636 if (nunits.is_constant (&const_nunits))
9638 tree_vector_builder elts (step_vectype, const_nunits, 1);
9639 elts.quick_push (new_name);
9640 for (i = 1; i < const_nunits; i++)
9642 /* Create: new_name_i = new_name + step_expr */
9643 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9644 new_name, step_expr);
9645 elts.quick_push (new_name);
9647 /* Create a vector from [new_name_0, new_name_1, ...,
9648 new_name_nunits-1] */
9649 vec_init = gimple_build_vector (&stmts, &elts);
9651 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9652 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9653 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9654 new_name, step_expr);
9655 else
9657 /* Build:
9658 [base, base, base, ...]
9659 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9660 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9661 gcc_assert (flag_associative_math);
9662 tree index = build_index_vector (step_vectype, 0, 1);
9663 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9664 new_name);
9665 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9666 step_expr);
9667 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9668 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9669 vec_init, step_vec);
9670 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9671 vec_init, base_vec);
9673 vec_init = gimple_convert (&stmts, vectype, vec_init);
9675 if (stmts)
9677 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9678 gcc_assert (!new_bb);
9683 /* Create the vector that holds the step of the induction. */
9684 if (nested_in_vect_loop)
9685 /* iv_loop is nested in the loop to be vectorized. Generate:
9686 vec_step = [S, S, S, S] */
9687 new_name = step_expr;
9688 else
9690 /* iv_loop is the loop to be vectorized. Generate:
9691 vec_step = [VF*S, VF*S, VF*S, VF*S] */
9692 gimple_seq seq = NULL;
9693 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9695 expr = build_int_cst (integer_type_node, vf);
9696 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9698 else
9699 expr = build_int_cst (TREE_TYPE (step_expr), vf);
9700 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9701 expr, step_expr);
9702 if (seq)
9704 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9705 gcc_assert (!new_bb);
9709 t = unshare_expr (new_name);
9710 gcc_assert (CONSTANT_CLASS_P (new_name)
9711 || TREE_CODE (new_name) == SSA_NAME);
9712 new_vec = build_vector_from_val (step_vectype, t);
9713 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9714 new_vec, step_vectype, NULL);
9717 /* Create the following def-use cycle:
9718 loop prolog:
9719 vec_init = ...
9720 vec_step = ...
9721 loop:
9722 vec_iv = PHI <vec_init, vec_loop>
9724 STMT
9726 vec_loop = vec_iv + vec_step; */
9728 /* Create the induction-phi that defines the induction-operand. */
9729 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9730 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9731 induc_def = PHI_RESULT (induction_phi);
9733 /* Create the iv update inside the loop */
9734 stmts = NULL;
9735 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9736 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
9737 vec_def = gimple_convert (&stmts, vectype, vec_def);
9738 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9739 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9741 /* Set the arguments of the phi node: */
9742 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9743 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9744 UNKNOWN_LOCATION);
9746 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9747 *vec_stmt = induction_phi;
9749 /* In case that vectorization factor (VF) is bigger than the number
9750 of elements that we can fit in a vectype (nunits), we have to generate
9751 more than one vector stmt - i.e - we need to "unroll" the
9752 vector stmt by a factor VF/nunits. For more details see documentation
9753 in vectorizable_operation. */
9755 if (ncopies > 1)
9757 gimple_seq seq = NULL;
9758 /* FORNOW. This restriction should be relaxed. */
9759 gcc_assert (!nested_in_vect_loop);
9761 /* Create the vector that holds the step of the induction. */
9762 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
9764 expr = build_int_cst (integer_type_node, nunits);
9765 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
9767 else
9768 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
9769 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
9770 expr, step_expr);
9771 if (seq)
9773 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
9774 gcc_assert (!new_bb);
9777 t = unshare_expr (new_name);
9778 gcc_assert (CONSTANT_CLASS_P (new_name)
9779 || TREE_CODE (new_name) == SSA_NAME);
9780 new_vec = build_vector_from_val (step_vectype, t);
9781 vec_step = vect_init_vector (loop_vinfo, stmt_info,
9782 new_vec, step_vectype, NULL);
9784 vec_def = induc_def;
9785 for (i = 1; i < ncopies; i++)
9787 /* vec_i = vec_prev + vec_step */
9788 gimple_seq stmts = NULL;
9789 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
9790 vec_def = gimple_build (&stmts,
9791 PLUS_EXPR, step_vectype, vec_def, vec_step);
9792 vec_def = gimple_convert (&stmts, vectype, vec_def);
9794 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9795 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9796 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9800 if (dump_enabled_p ())
9801 dump_printf_loc (MSG_NOTE, vect_location,
9802 "transform induction: created def-use cycle: %G%G",
9803 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9805 return true;
9808 /* Function vectorizable_live_operation.
9810 STMT_INFO computes a value that is used outside the loop. Check if
9811 it can be supported. */
9813 bool
9814 vectorizable_live_operation (vec_info *vinfo,
9815 stmt_vec_info stmt_info,
9816 gimple_stmt_iterator *gsi,
9817 slp_tree slp_node, slp_instance slp_node_instance,
9818 int slp_index, bool vec_stmt_p,
9819 stmt_vector_for_cost *cost_vec)
9821 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
9822 imm_use_iterator imm_iter;
9823 tree lhs, lhs_type, bitsize;
9824 tree vectype = (slp_node
9825 ? SLP_TREE_VECTYPE (slp_node)
9826 : STMT_VINFO_VECTYPE (stmt_info));
9827 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9828 int ncopies;
9829 gimple *use_stmt;
9830 auto_vec<tree> vec_oprnds;
9831 int vec_entry = 0;
9832 poly_uint64 vec_index = 0;
9834 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
9836 /* If a stmt of a reduction is live, vectorize it via
9837 vect_create_epilog_for_reduction. vectorizable_reduction assessed
9838 validity so just trigger the transform here. */
9839 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
9841 if (!vec_stmt_p)
9842 return true;
9843 if (slp_node)
9845 /* For reduction chains the meta-info is attached to
9846 the group leader. */
9847 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
9848 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
9849 /* For SLP reductions we vectorize the epilogue for
9850 all involved stmts together. */
9851 else if (slp_index != 0)
9852 return true;
9854 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
9855 gcc_assert (reduc_info->is_reduc_info);
9856 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
9857 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
9858 return true;
9859 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
9860 slp_node_instance);
9861 return true;
9864 /* If STMT is not relevant and it is a simple assignment and its inputs are
9865 invariant then it can remain in place, unvectorized. The original last
9866 scalar value that it computes will be used. */
9867 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9869 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
9870 if (dump_enabled_p ())
9871 dump_printf_loc (MSG_NOTE, vect_location,
9872 "statement is simple and uses invariant. Leaving in "
9873 "place.\n");
9874 return true;
9877 if (slp_node)
9878 ncopies = 1;
9879 else
9880 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9882 if (slp_node)
9884 gcc_assert (slp_index >= 0);
9886 /* Get the last occurrence of the scalar index from the concatenation of
9887 all the slp vectors. Calculate which slp vector it is and the index
9888 within. */
9889 int num_scalar = SLP_TREE_LANES (slp_node);
9890 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9891 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
9893 /* Calculate which vector contains the result, and which lane of
9894 that vector we need. */
9895 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
9897 if (dump_enabled_p ())
9898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9899 "Cannot determine which vector holds the"
9900 " final result.\n");
9901 return false;
9905 if (!vec_stmt_p)
9907 /* No transformation required. */
9908 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
9910 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
9911 OPTIMIZE_FOR_SPEED))
9913 if (dump_enabled_p ())
9914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9915 "can't operate on partial vectors "
9916 "because the target doesn't support extract "
9917 "last reduction.\n");
9918 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9920 else if (slp_node)
9922 if (dump_enabled_p ())
9923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9924 "can't operate on partial vectors "
9925 "because an SLP statement is live after "
9926 "the loop.\n");
9927 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9929 else if (ncopies > 1)
9931 if (dump_enabled_p ())
9932 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9933 "can't operate on partial vectors "
9934 "because ncopies is greater than 1.\n");
9935 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
9937 else
9939 gcc_assert (ncopies == 1 && !slp_node);
9940 vect_record_loop_mask (loop_vinfo,
9941 &LOOP_VINFO_MASKS (loop_vinfo),
9942 1, vectype, NULL);
9945 /* ??? Enable for loop costing as well. */
9946 if (!loop_vinfo)
9947 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
9948 0, vect_epilogue);
9949 return true;
9952 /* Use the lhs of the original scalar statement. */
9953 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
9954 if (dump_enabled_p ())
9955 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
9956 "stmt %G", stmt);
9958 lhs = gimple_get_lhs (stmt);
9959 lhs_type = TREE_TYPE (lhs);
9961 bitsize = vector_element_bits_tree (vectype);
9963 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
9964 tree vec_lhs, bitstart;
9965 gimple *vec_stmt;
9966 if (slp_node)
9968 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
9970 /* Get the correct slp vectorized stmt. */
9971 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
9972 vec_lhs = gimple_get_lhs (vec_stmt);
9974 /* Get entry to use. */
9975 bitstart = bitsize_int (vec_index);
9976 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
9978 else
9980 /* For multiple copies, get the last copy. */
9981 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
9982 vec_lhs = gimple_get_lhs (vec_stmt);
9984 /* Get the last lane in the vector. */
9985 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
9988 if (loop_vinfo)
9990 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
9991 requirement, insert one phi node for it. It looks like:
9992 loop;
9994 # lhs' = PHI <lhs>
9996 loop;
9998 # vec_lhs' = PHI <vec_lhs>
9999 new_tree = lane_extract <vec_lhs', ...>;
10000 lhs' = new_tree; */
10002 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10003 basic_block exit_bb = single_exit (loop)->dest;
10004 gcc_assert (single_pred_p (exit_bb));
10006 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10007 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10008 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10010 gimple_seq stmts = NULL;
10011 tree new_tree;
10012 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10014 /* Emit:
10016 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10018 where VEC_LHS is the vectorized live-out result and MASK is
10019 the loop mask for the final iteration. */
10020 gcc_assert (ncopies == 1 && !slp_node);
10021 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10022 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
10023 1, vectype, 0);
10024 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10025 mask, vec_lhs_phi);
10027 /* Convert the extracted vector element to the scalar type. */
10028 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10030 else
10032 tree bftype = TREE_TYPE (vectype);
10033 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10034 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10035 new_tree = build3 (BIT_FIELD_REF, bftype,
10036 vec_lhs_phi, bitsize, bitstart);
10037 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10038 &stmts, true, NULL_TREE);
10041 if (stmts)
10043 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10044 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10046 /* Remove existing phi from lhs and create one copy from new_tree. */
10047 tree lhs_phi = NULL_TREE;
10048 gimple_stmt_iterator gsi;
10049 for (gsi = gsi_start_phis (exit_bb);
10050 !gsi_end_p (gsi); gsi_next (&gsi))
10052 gimple *phi = gsi_stmt (gsi);
10053 if ((gimple_phi_arg_def (phi, 0) == lhs))
10055 remove_phi_node (&gsi, false);
10056 lhs_phi = gimple_phi_result (phi);
10057 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10058 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10059 break;
10064 /* Replace use of lhs with newly computed result. If the use stmt is a
10065 single arg PHI, just replace all uses of PHI result. It's necessary
10066 because lcssa PHI defining lhs may be before newly inserted stmt. */
10067 use_operand_p use_p;
10068 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10069 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10070 && !is_gimple_debug (use_stmt))
10072 if (gimple_code (use_stmt) == GIMPLE_PHI
10073 && gimple_phi_num_args (use_stmt) == 1)
10075 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10077 else
10079 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10080 SET_USE (use_p, new_tree);
10082 update_stmt (use_stmt);
10085 else
10087 /* For basic-block vectorization simply insert the lane-extraction. */
10088 tree bftype = TREE_TYPE (vectype);
10089 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10090 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10091 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10092 vec_lhs, bitsize, bitstart);
10093 gimple_seq stmts = NULL;
10094 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10095 &stmts, true, NULL_TREE);
10096 if (TREE_CODE (new_tree) == SSA_NAME
10097 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10098 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10099 if (is_a <gphi *> (vec_stmt))
10101 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10102 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10104 else
10106 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10107 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10110 /* Replace use of lhs with newly computed result. If the use stmt is a
10111 single arg PHI, just replace all uses of PHI result. It's necessary
10112 because lcssa PHI defining lhs may be before newly inserted stmt. */
10113 use_operand_p use_p;
10114 stmt_vec_info use_stmt_info;
10115 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10116 if (!is_gimple_debug (use_stmt)
10117 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10118 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10120 /* ??? This can happen when the live lane ends up being
10121 used in a vector construction code-generated by an
10122 external SLP node (and code-generation for that already
10123 happened). See gcc.dg/vect/bb-slp-47.c.
10124 Doing this is what would happen if that vector CTOR
10125 were not code-generated yet so it is not too bad.
10126 ??? In fact we'd likely want to avoid this situation
10127 in the first place. */
10128 if (TREE_CODE (new_tree) == SSA_NAME
10129 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10130 && gimple_code (use_stmt) != GIMPLE_PHI
10131 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10132 use_stmt))
10134 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10135 gcc_assert (code == CONSTRUCTOR
10136 || code == VIEW_CONVERT_EXPR
10137 || CONVERT_EXPR_CODE_P (code));
10138 if (dump_enabled_p ())
10139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10140 "Using original scalar computation for "
10141 "live lane because use preceeds vector "
10142 "def\n");
10143 continue;
10145 /* ??? It can also happen that we end up pulling a def into
10146 a loop where replacing out-of-loop uses would require
10147 a new LC SSA PHI node. Retain the original scalar in
10148 those cases as well. PR98064. */
10149 if (TREE_CODE (new_tree) == SSA_NAME
10150 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10151 && (gimple_bb (use_stmt)->loop_father
10152 != gimple_bb (vec_stmt)->loop_father)
10153 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10154 gimple_bb (use_stmt)->loop_father))
10156 if (dump_enabled_p ())
10157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10158 "Using original scalar computation for "
10159 "live lane because there is an out-of-loop "
10160 "definition for it\n");
10161 continue;
10163 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10164 SET_USE (use_p, new_tree);
10165 update_stmt (use_stmt);
10169 return true;
10172 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10174 static void
10175 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10177 ssa_op_iter op_iter;
10178 imm_use_iterator imm_iter;
10179 def_operand_p def_p;
10180 gimple *ustmt;
10182 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10184 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10186 basic_block bb;
10188 if (!is_gimple_debug (ustmt))
10189 continue;
10191 bb = gimple_bb (ustmt);
10193 if (!flow_bb_inside_loop_p (loop, bb))
10195 if (gimple_debug_bind_p (ustmt))
10197 if (dump_enabled_p ())
10198 dump_printf_loc (MSG_NOTE, vect_location,
10199 "killing debug use\n");
10201 gimple_debug_bind_reset_value (ustmt);
10202 update_stmt (ustmt);
10204 else
10205 gcc_unreachable ();
10211 /* Given loop represented by LOOP_VINFO, return true if computation of
10212 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10213 otherwise. */
10215 static bool
10216 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10218 /* Constant case. */
10219 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10221 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10222 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10224 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10225 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10226 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10227 return true;
10230 widest_int max;
10231 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10232 /* Check the upper bound of loop niters. */
10233 if (get_max_loop_iterations (loop, &max))
10235 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10236 signop sgn = TYPE_SIGN (type);
10237 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10238 if (max < type_max)
10239 return true;
10241 return false;
10244 /* Return a mask type with half the number of elements as OLD_TYPE,
10245 given that it should have mode NEW_MODE. */
10247 tree
10248 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10250 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10251 return build_truth_vector_type_for_mode (nunits, new_mode);
10254 /* Return a mask type with twice as many elements as OLD_TYPE,
10255 given that it should have mode NEW_MODE. */
10257 tree
10258 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10260 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10261 return build_truth_vector_type_for_mode (nunits, new_mode);
10264 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10265 contain a sequence of NVECTORS masks that each control a vector of type
10266 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10267 these vector masks with the vector version of SCALAR_MASK. */
10269 void
10270 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10271 unsigned int nvectors, tree vectype, tree scalar_mask)
10273 gcc_assert (nvectors != 0);
10274 if (masks->length () < nvectors)
10275 masks->safe_grow_cleared (nvectors, true);
10276 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10277 /* The number of scalars per iteration and the number of vectors are
10278 both compile-time constants. */
10279 unsigned int nscalars_per_iter
10280 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10281 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10283 if (scalar_mask)
10285 scalar_cond_masked_key cond (scalar_mask, nvectors);
10286 loop_vinfo->scalar_cond_masked_set.add (cond);
10289 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
10291 rgm->max_nscalars_per_iter = nscalars_per_iter;
10292 rgm->type = truth_type_for (vectype);
10293 rgm->factor = 1;
10297 /* Given a complete set of masks MASKS, extract mask number INDEX
10298 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10299 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10301 See the comment above vec_loop_masks for more details about the mask
10302 arrangement. */
10304 tree
10305 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10306 unsigned int nvectors, tree vectype, unsigned int index)
10308 rgroup_controls *rgm = &(*masks)[nvectors - 1];
10309 tree mask_type = rgm->type;
10311 /* Populate the rgroup's mask array, if this is the first time we've
10312 used it. */
10313 if (rgm->controls.is_empty ())
10315 rgm->controls.safe_grow_cleared (nvectors, true);
10316 for (unsigned int i = 0; i < nvectors; ++i)
10318 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10319 /* Provide a dummy definition until the real one is available. */
10320 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10321 rgm->controls[i] = mask;
10325 tree mask = rgm->controls[index];
10326 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10327 TYPE_VECTOR_SUBPARTS (vectype)))
10329 /* A loop mask for data type X can be reused for data type Y
10330 if X has N times more elements than Y and if Y's elements
10331 are N times bigger than X's. In this case each sequence
10332 of N elements in the loop mask will be all-zero or all-one.
10333 We can then view-convert the mask so that each sequence of
10334 N elements is replaced by a single element. */
10335 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10336 TYPE_VECTOR_SUBPARTS (vectype)));
10337 gimple_seq seq = NULL;
10338 mask_type = truth_type_for (vectype);
10339 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10340 if (seq)
10341 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10343 return mask;
10346 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10347 lengths for controlling an operation on VECTYPE. The operation splits
10348 each element of VECTYPE into FACTOR separate subelements, measuring the
10349 length as a number of these subelements. */
10351 void
10352 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10353 unsigned int nvectors, tree vectype, unsigned int factor)
10355 gcc_assert (nvectors != 0);
10356 if (lens->length () < nvectors)
10357 lens->safe_grow_cleared (nvectors, true);
10358 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10360 /* The number of scalars per iteration, scalar occupied bytes and
10361 the number of vectors are both compile-time constants. */
10362 unsigned int nscalars_per_iter
10363 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10364 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10366 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10368 /* For now, we only support cases in which all loads and stores fall back
10369 to VnQI or none do. */
10370 gcc_assert (!rgl->max_nscalars_per_iter
10371 || (rgl->factor == 1 && factor == 1)
10372 || (rgl->max_nscalars_per_iter * rgl->factor
10373 == nscalars_per_iter * factor));
10374 rgl->max_nscalars_per_iter = nscalars_per_iter;
10375 rgl->type = vectype;
10376 rgl->factor = factor;
10380 /* Given a complete set of length LENS, extract length number INDEX for an
10381 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
10383 tree
10384 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10385 unsigned int nvectors, unsigned int index)
10387 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10388 bool use_bias_adjusted_len =
10389 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10391 /* Populate the rgroup's len array, if this is the first time we've
10392 used it. */
10393 if (rgl->controls.is_empty ())
10395 rgl->controls.safe_grow_cleared (nvectors, true);
10396 for (unsigned int i = 0; i < nvectors; ++i)
10398 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10399 gcc_assert (len_type != NULL_TREE);
10401 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10403 /* Provide a dummy definition until the real one is available. */
10404 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10405 rgl->controls[i] = len;
10407 if (use_bias_adjusted_len)
10409 gcc_assert (i == 0);
10410 tree adjusted_len =
10411 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10412 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10413 rgl->bias_adjusted_ctrl = adjusted_len;
10418 if (use_bias_adjusted_len)
10419 return rgl->bias_adjusted_ctrl;
10420 else
10421 return rgl->controls[index];
10424 /* Scale profiling counters by estimation for LOOP which is vectorized
10425 by factor VF. */
10427 static void
10428 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
10430 edge preheader = loop_preheader_edge (loop);
10431 /* Reduce loop iterations by the vectorization factor. */
10432 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
10433 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
10435 if (freq_h.nonzero_p ())
10437 profile_probability p;
10439 /* Avoid dropping loop body profile counter to 0 because of zero count
10440 in loop's preheader. */
10441 if (!(freq_e == profile_count::zero ()))
10442 freq_e = freq_e.force_nonzero ();
10443 p = (freq_e * (new_est_niter + 1)).probability_in (freq_h);
10444 scale_loop_frequencies (loop, p);
10447 edge exit_e = single_exit (loop);
10448 exit_e->probability = profile_probability::always () / (new_est_niter + 1);
10450 edge exit_l = single_pred_edge (loop->latch);
10451 profile_probability prob = exit_l->probability;
10452 exit_l->probability = exit_e->probability.invert ();
10453 if (prob.initialized_p () && exit_l->probability.initialized_p ())
10454 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
10457 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10458 latch edge values originally defined by it. */
10460 static void
10461 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10462 stmt_vec_info def_stmt_info)
10464 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10465 if (!def || TREE_CODE (def) != SSA_NAME)
10466 return;
10467 stmt_vec_info phi_info;
10468 imm_use_iterator iter;
10469 use_operand_p use_p;
10470 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10472 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10473 if (!phi)
10474 continue;
10475 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10476 && (phi_info = loop_vinfo->lookup_stmt (phi))
10477 && STMT_VINFO_RELEVANT_P (phi_info)))
10478 continue;
10479 loop_p loop = gimple_bb (phi)->loop_father;
10480 edge e = loop_latch_edge (loop);
10481 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10482 continue;
10484 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10485 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10486 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10488 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10489 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10490 gcc_assert (phi_defs.length () == latch_defs.length ());
10491 for (unsigned i = 0; i < phi_defs.length (); ++i)
10492 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10493 gimple_get_lhs (latch_defs[i]), e,
10494 gimple_phi_arg_location (phi, e->dest_idx));
10496 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10498 /* For first order recurrences we have to update both uses of
10499 the latch definition, the one in the PHI node and the one
10500 in the generated VEC_PERM_EXPR. */
10501 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10502 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10503 gcc_assert (phi_defs.length () == latch_defs.length ());
10504 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10505 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10506 for (unsigned i = 0; i < phi_defs.length (); ++i)
10508 gassign *perm = as_a <gassign *> (phi_defs[i]);
10509 if (i > 0)
10510 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10511 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10512 update_stmt (perm);
10514 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10515 gimple_phi_arg_location (phi, e->dest_idx));
10520 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10521 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10522 stmt_vec_info. */
10524 static bool
10525 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10526 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10528 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10529 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10531 if (dump_enabled_p ())
10532 dump_printf_loc (MSG_NOTE, vect_location,
10533 "------>vectorizing statement: %G", stmt_info->stmt);
10535 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10536 vect_loop_kill_debug_uses (loop, stmt_info);
10538 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10539 && !STMT_VINFO_LIVE_P (stmt_info))
10540 return false;
10542 if (STMT_VINFO_VECTYPE (stmt_info))
10544 poly_uint64 nunits
10545 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10546 if (!STMT_SLP_TYPE (stmt_info)
10547 && maybe_ne (nunits, vf)
10548 && dump_enabled_p ())
10549 /* For SLP VF is set according to unrolling factor, and not
10550 to vector size, hence for SLP this print is not valid. */
10551 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10554 /* Pure SLP statements have already been vectorized. We still need
10555 to apply loop vectorization to hybrid SLP statements. */
10556 if (PURE_SLP_STMT (stmt_info))
10557 return false;
10559 if (dump_enabled_p ())
10560 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10562 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10563 *seen_store = stmt_info;
10565 return true;
10568 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
10569 in the hash_map with its corresponding values. */
10571 static tree
10572 find_in_mapping (tree t, void *context)
10574 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
10576 tree *value = mapping->get (t);
10577 return value ? *value : t;
10580 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
10581 original loop that has now been vectorized.
10583 The inits of the data_references need to be advanced with the number of
10584 iterations of the main loop. This has been computed in vect_do_peeling and
10585 is stored in parameter ADVANCE. We first restore the data_references
10586 initial offset with the values recored in ORIG_DRS_INIT.
10588 Since the loop_vec_info of this EPILOGUE was constructed for the original
10589 loop, its stmt_vec_infos all point to the original statements. These need
10590 to be updated to point to their corresponding copies as well as the SSA_NAMES
10591 in their PATTERN_DEF_SEQs and RELATED_STMTs.
10593 The data_reference's connections also need to be updated. Their
10594 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
10595 stmt_vec_infos, their statements need to point to their corresponding copy,
10596 if they are gather loads or scatter stores then their reference needs to be
10597 updated to point to its corresponding copy and finally we set
10598 'base_misaligned' to false as we have already peeled for alignment in the
10599 prologue of the main loop. */
10601 static void
10602 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
10604 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
10605 auto_vec<gimple *> stmt_worklist;
10606 hash_map<tree,tree> mapping;
10607 gimple *orig_stmt, *new_stmt;
10608 gimple_stmt_iterator epilogue_gsi;
10609 gphi_iterator epilogue_phi_gsi;
10610 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
10611 basic_block *epilogue_bbs = get_loop_body (epilogue);
10612 unsigned i;
10614 free (LOOP_VINFO_BBS (epilogue_vinfo));
10615 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
10617 /* Advance data_reference's with the number of iterations of the previous
10618 loop and its prologue. */
10619 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
10622 /* The EPILOGUE loop is a copy of the original loop so they share the same
10623 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
10624 point to the copied statements. We also create a mapping of all LHS' in
10625 the original loop and all the LHS' in the EPILOGUE and create worklists to
10626 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
10627 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
10629 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
10630 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
10632 new_stmt = epilogue_phi_gsi.phi ();
10634 gcc_assert (gimple_uid (new_stmt) > 0);
10635 stmt_vinfo
10636 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10638 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10639 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10641 mapping.put (gimple_phi_result (orig_stmt),
10642 gimple_phi_result (new_stmt));
10643 /* PHI nodes can not have patterns or related statements. */
10644 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
10645 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
10648 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
10649 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
10651 new_stmt = gsi_stmt (epilogue_gsi);
10652 if (is_gimple_debug (new_stmt))
10653 continue;
10655 gcc_assert (gimple_uid (new_stmt) > 0);
10656 stmt_vinfo
10657 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
10659 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
10660 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
10662 if (tree old_lhs = gimple_get_lhs (orig_stmt))
10663 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
10665 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
10667 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
10668 for (gimple_stmt_iterator gsi = gsi_start (seq);
10669 !gsi_end_p (gsi); gsi_next (&gsi))
10670 stmt_worklist.safe_push (gsi_stmt (gsi));
10673 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
10674 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
10676 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
10677 stmt_worklist.safe_push (stmt);
10678 /* Set BB such that the assert in
10679 'get_initial_def_for_reduction' is able to determine that
10680 the BB of the related stmt is inside this loop. */
10681 gimple_set_bb (stmt,
10682 gimple_bb (new_stmt));
10683 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
10684 gcc_assert (related_vinfo == NULL
10685 || related_vinfo == stmt_vinfo);
10690 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
10691 using the original main loop and thus need to be updated to refer to the
10692 cloned variables used in the epilogue. */
10693 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
10695 gimple *stmt = stmt_worklist[i];
10696 tree *new_op;
10698 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
10700 tree op = gimple_op (stmt, j);
10701 if ((new_op = mapping.get(op)))
10702 gimple_set_op (stmt, j, *new_op);
10703 else
10705 /* PR92429: The last argument of simplify_replace_tree disables
10706 folding when replacing arguments. This is required as
10707 otherwise you might end up with different statements than the
10708 ones analyzed in vect_loop_analyze, leading to different
10709 vectorization. */
10710 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
10711 &find_in_mapping, &mapping, false);
10712 gimple_set_op (stmt, j, op);
10717 struct data_reference *dr;
10718 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
10719 FOR_EACH_VEC_ELT (datarefs, i, dr)
10721 orig_stmt = DR_STMT (dr);
10722 gcc_assert (gimple_uid (orig_stmt) > 0);
10723 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
10724 /* Data references for gather loads and scatter stores do not use the
10725 updated offset we set using ADVANCE. Instead we have to make sure the
10726 reference in the data references point to the corresponding copy of
10727 the original in the epilogue. */
10728 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
10729 == VMAT_GATHER_SCATTER)
10731 DR_REF (dr)
10732 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
10733 &find_in_mapping, &mapping);
10734 DR_BASE_ADDRESS (dr)
10735 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
10736 &find_in_mapping, &mapping);
10738 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
10739 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
10740 /* The vector size of the epilogue is smaller than that of the main loop
10741 so the alignment is either the same or lower. This means the dr will
10742 thus by definition be aligned. */
10743 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
10746 epilogue_vinfo->shared->datarefs_copy.release ();
10747 epilogue_vinfo->shared->save_datarefs ();
10750 /* Function vect_transform_loop.
10752 The analysis phase has determined that the loop is vectorizable.
10753 Vectorize the loop - created vectorized stmts to replace the scalar
10754 stmts in the loop, and update the loop exit condition.
10755 Returns scalar epilogue loop if any. */
10757 class loop *
10758 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
10760 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10761 class loop *epilogue = NULL;
10762 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
10763 int nbbs = loop->num_nodes;
10764 int i;
10765 tree niters_vector = NULL_TREE;
10766 tree step_vector = NULL_TREE;
10767 tree niters_vector_mult_vf = NULL_TREE;
10768 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10769 unsigned int lowest_vf = constant_lower_bound (vf);
10770 gimple *stmt;
10771 bool check_profitability = false;
10772 unsigned int th;
10774 DUMP_VECT_SCOPE ("vec_transform_loop");
10776 loop_vinfo->shared->check_datarefs ();
10778 /* Use the more conservative vectorization threshold. If the number
10779 of iterations is constant assume the cost check has been performed
10780 by our caller. If the threshold makes all loops profitable that
10781 run at least the (estimated) vectorization factor number of times
10782 checking is pointless, too. */
10783 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
10784 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
10786 if (dump_enabled_p ())
10787 dump_printf_loc (MSG_NOTE, vect_location,
10788 "Profitability threshold is %d loop iterations.\n",
10789 th);
10790 check_profitability = true;
10793 /* Make sure there exists a single-predecessor exit bb. Do this before
10794 versioning. */
10795 edge e = single_exit (loop);
10796 if (! single_pred_p (e->dest))
10798 split_loop_exit_edge (e, true);
10799 if (dump_enabled_p ())
10800 dump_printf (MSG_NOTE, "split exit edge\n");
10803 /* Version the loop first, if required, so the profitability check
10804 comes first. */
10806 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
10808 class loop *sloop
10809 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
10810 sloop->force_vectorize = false;
10811 check_profitability = false;
10814 /* Make sure there exists a single-predecessor exit bb also on the
10815 scalar loop copy. Do this after versioning but before peeling
10816 so CFG structure is fine for both scalar and if-converted loop
10817 to make slpeel_duplicate_current_defs_from_edges face matched
10818 loop closed PHI nodes on the exit. */
10819 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
10821 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
10822 if (! single_pred_p (e->dest))
10824 split_loop_exit_edge (e, true);
10825 if (dump_enabled_p ())
10826 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
10830 tree niters = vect_build_loop_niters (loop_vinfo);
10831 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
10832 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
10833 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
10834 tree advance;
10835 drs_init_vec orig_drs_init;
10837 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
10838 &step_vector, &niters_vector_mult_vf, th,
10839 check_profitability, niters_no_overflow,
10840 &advance);
10842 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
10843 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
10844 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
10845 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
10847 if (niters_vector == NULL_TREE)
10849 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10850 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
10851 && known_eq (lowest_vf, vf))
10853 niters_vector
10854 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
10855 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
10856 step_vector = build_one_cst (TREE_TYPE (niters));
10858 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10859 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
10860 &step_vector, niters_no_overflow);
10861 else
10862 /* vect_do_peeling subtracted the number of peeled prologue
10863 iterations from LOOP_VINFO_NITERS. */
10864 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
10865 &niters_vector, &step_vector,
10866 niters_no_overflow);
10869 /* 1) Make sure the loop header has exactly two entries
10870 2) Make sure we have a preheader basic block. */
10872 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
10874 split_edge (loop_preheader_edge (loop));
10876 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
10877 /* This will deal with any possible peeling. */
10878 vect_prepare_for_masked_peels (loop_vinfo);
10880 /* Schedule the SLP instances first, then handle loop vectorization
10881 below. */
10882 if (!loop_vinfo->slp_instances.is_empty ())
10884 DUMP_VECT_SCOPE ("scheduling SLP instances");
10885 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
10888 /* FORNOW: the vectorizer supports only loops which body consist
10889 of one basic block (header + empty latch). When the vectorizer will
10890 support more involved loop forms, the order by which the BBs are
10891 traversed need to be reconsidered. */
10893 for (i = 0; i < nbbs; i++)
10895 basic_block bb = bbs[i];
10896 stmt_vec_info stmt_info;
10898 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10899 gsi_next (&si))
10901 gphi *phi = si.phi ();
10902 if (dump_enabled_p ())
10903 dump_printf_loc (MSG_NOTE, vect_location,
10904 "------>vectorizing phi: %G", (gimple *) phi);
10905 stmt_info = loop_vinfo->lookup_stmt (phi);
10906 if (!stmt_info)
10907 continue;
10909 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10910 vect_loop_kill_debug_uses (loop, stmt_info);
10912 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10913 && !STMT_VINFO_LIVE_P (stmt_info))
10914 continue;
10916 if (STMT_VINFO_VECTYPE (stmt_info)
10917 && (maybe_ne
10918 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
10919 && dump_enabled_p ())
10920 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10922 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10923 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10924 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10925 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10926 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
10927 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
10928 && ! PURE_SLP_STMT (stmt_info))
10930 if (dump_enabled_p ())
10931 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
10932 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
10936 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
10937 gsi_next (&si))
10939 gphi *phi = si.phi ();
10940 stmt_info = loop_vinfo->lookup_stmt (phi);
10941 if (!stmt_info)
10942 continue;
10944 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10945 && !STMT_VINFO_LIVE_P (stmt_info))
10946 continue;
10948 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
10949 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
10950 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
10951 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
10952 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
10953 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
10954 && ! PURE_SLP_STMT (stmt_info))
10955 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
10958 for (gimple_stmt_iterator si = gsi_start_bb (bb);
10959 !gsi_end_p (si);)
10961 stmt = gsi_stmt (si);
10962 /* During vectorization remove existing clobber stmts. */
10963 if (gimple_clobber_p (stmt))
10965 unlink_stmt_vdef (stmt);
10966 gsi_remove (&si, true);
10967 release_defs (stmt);
10969 else
10971 /* Ignore vector stmts created in the outer loop. */
10972 stmt_info = loop_vinfo->lookup_stmt (stmt);
10974 /* vector stmts created in the outer-loop during vectorization of
10975 stmts in an inner-loop may not have a stmt_info, and do not
10976 need to be vectorized. */
10977 stmt_vec_info seen_store = NULL;
10978 if (stmt_info)
10980 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
10982 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
10983 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
10984 !gsi_end_p (subsi); gsi_next (&subsi))
10986 stmt_vec_info pat_stmt_info
10987 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
10988 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10989 &si, &seen_store);
10991 stmt_vec_info pat_stmt_info
10992 = STMT_VINFO_RELATED_STMT (stmt_info);
10993 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
10994 &si, &seen_store))
10995 maybe_set_vectorized_backedge_value (loop_vinfo,
10996 pat_stmt_info);
10998 else
11000 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11001 &seen_store))
11002 maybe_set_vectorized_backedge_value (loop_vinfo,
11003 stmt_info);
11006 gsi_next (&si);
11007 if (seen_store)
11009 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11010 /* Interleaving. If IS_STORE is TRUE, the
11011 vectorization of the interleaving chain was
11012 completed - free all the stores in the chain. */
11013 vect_remove_stores (loop_vinfo,
11014 DR_GROUP_FIRST_ELEMENT (seen_store));
11015 else
11016 /* Free the attached stmt_vec_info and remove the stmt. */
11017 loop_vinfo->remove_stmt (stmt_info);
11022 /* Stub out scalar statements that must not survive vectorization.
11023 Doing this here helps with grouped statements, or statements that
11024 are involved in patterns. */
11025 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11026 !gsi_end_p (gsi); gsi_next (&gsi))
11028 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11029 if (!call || !gimple_call_internal_p (call))
11030 continue;
11031 internal_fn ifn = gimple_call_internal_fn (call);
11032 if (ifn == IFN_MASK_LOAD)
11034 tree lhs = gimple_get_lhs (call);
11035 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11037 tree zero = build_zero_cst (TREE_TYPE (lhs));
11038 gimple *new_stmt = gimple_build_assign (lhs, zero);
11039 gsi_replace (&gsi, new_stmt, true);
11042 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11044 tree lhs = gimple_get_lhs (call);
11045 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11047 tree else_arg
11048 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11049 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11050 gsi_replace (&gsi, new_stmt, true);
11054 } /* BBs in loop */
11056 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
11057 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11058 if (integer_onep (step_vector))
11059 niters_no_overflow = true;
11060 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11061 niters_vector_mult_vf, !niters_no_overflow);
11063 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11064 scale_profile_for_vect_loop (loop, assumed_vf);
11066 /* True if the final iteration might not handle a full vector's
11067 worth of scalar iterations. */
11068 bool final_iter_may_be_partial
11069 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11070 /* The minimum number of iterations performed by the epilogue. This
11071 is 1 when peeling for gaps because we always need a final scalar
11072 iteration. */
11073 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11074 /* +1 to convert latch counts to loop iteration counts,
11075 -min_epilogue_iters to remove iterations that cannot be performed
11076 by the vector code. */
11077 int bias_for_lowest = 1 - min_epilogue_iters;
11078 int bias_for_assumed = bias_for_lowest;
11079 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11080 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11082 /* When the amount of peeling is known at compile time, the first
11083 iteration will have exactly alignment_npeels active elements.
11084 In the worst case it will have at least one. */
11085 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11086 bias_for_lowest += lowest_vf - min_first_active;
11087 bias_for_assumed += assumed_vf - min_first_active;
11089 /* In these calculations the "- 1" converts loop iteration counts
11090 back to latch counts. */
11091 if (loop->any_upper_bound)
11093 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11094 loop->nb_iterations_upper_bound
11095 = (final_iter_may_be_partial
11096 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11097 lowest_vf) - 1
11098 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11099 lowest_vf) - 1);
11100 if (main_vinfo
11101 /* Both peeling for alignment and peeling for gaps can end up
11102 with the scalar epilogue running for more than VF-1 iterations. */
11103 && !main_vinfo->peeling_for_alignment
11104 && !main_vinfo->peeling_for_gaps)
11106 unsigned int bound;
11107 poly_uint64 main_iters
11108 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11109 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11110 main_iters
11111 = upper_bound (main_iters,
11112 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11113 if (can_div_away_from_zero_p (main_iters,
11114 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11115 &bound))
11116 loop->nb_iterations_upper_bound
11117 = wi::umin ((widest_int) (bound - 1),
11118 loop->nb_iterations_upper_bound);
11121 if (loop->any_likely_upper_bound)
11122 loop->nb_iterations_likely_upper_bound
11123 = (final_iter_may_be_partial
11124 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11125 + bias_for_lowest, lowest_vf) - 1
11126 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11127 + bias_for_lowest, lowest_vf) - 1);
11128 if (loop->any_estimate)
11129 loop->nb_iterations_estimate
11130 = (final_iter_may_be_partial
11131 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11132 assumed_vf) - 1
11133 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11134 assumed_vf) - 1);
11136 if (dump_enabled_p ())
11138 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11140 dump_printf_loc (MSG_NOTE, vect_location,
11141 "LOOP VECTORIZED\n");
11142 if (loop->inner)
11143 dump_printf_loc (MSG_NOTE, vect_location,
11144 "OUTER LOOP VECTORIZED\n");
11145 dump_printf (MSG_NOTE, "\n");
11147 else
11148 dump_printf_loc (MSG_NOTE, vect_location,
11149 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11150 GET_MODE_NAME (loop_vinfo->vector_mode));
11153 /* Loops vectorized with a variable factor won't benefit from
11154 unrolling/peeling. */
11155 if (!vf.is_constant ())
11157 loop->unroll = 1;
11158 if (dump_enabled_p ())
11159 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11160 " variable-length vectorization factor\n");
11162 /* Free SLP instances here because otherwise stmt reference counting
11163 won't work. */
11164 slp_instance instance;
11165 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11166 vect_free_slp_instance (instance);
11167 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11168 /* Clear-up safelen field since its value is invalid after vectorization
11169 since vectorized loop can have loop-carried dependencies. */
11170 loop->safelen = 0;
11172 if (epilogue)
11174 update_epilogue_loop_vinfo (epilogue, advance);
11176 epilogue->simduid = loop->simduid;
11177 epilogue->force_vectorize = loop->force_vectorize;
11178 epilogue->dont_vectorize = false;
11181 return epilogue;
11184 /* The code below is trying to perform simple optimization - revert
11185 if-conversion for masked stores, i.e. if the mask of a store is zero
11186 do not perform it and all stored value producers also if possible.
11187 For example,
11188 for (i=0; i<n; i++)
11189 if (c[i])
11191 p1[i] += 1;
11192 p2[i] = p3[i] +2;
11194 this transformation will produce the following semi-hammock:
11196 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11198 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11199 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11200 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11201 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11202 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11203 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11207 void
11208 optimize_mask_stores (class loop *loop)
11210 basic_block *bbs = get_loop_body (loop);
11211 unsigned nbbs = loop->num_nodes;
11212 unsigned i;
11213 basic_block bb;
11214 class loop *bb_loop;
11215 gimple_stmt_iterator gsi;
11216 gimple *stmt;
11217 auto_vec<gimple *> worklist;
11218 auto_purge_vect_location sentinel;
11220 vect_location = find_loop_location (loop);
11221 /* Pick up all masked stores in loop if any. */
11222 for (i = 0; i < nbbs; i++)
11224 bb = bbs[i];
11225 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11226 gsi_next (&gsi))
11228 stmt = gsi_stmt (gsi);
11229 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11230 worklist.safe_push (stmt);
11234 free (bbs);
11235 if (worklist.is_empty ())
11236 return;
11238 /* Loop has masked stores. */
11239 while (!worklist.is_empty ())
11241 gimple *last, *last_store;
11242 edge e, efalse;
11243 tree mask;
11244 basic_block store_bb, join_bb;
11245 gimple_stmt_iterator gsi_to;
11246 tree vdef, new_vdef;
11247 gphi *phi;
11248 tree vectype;
11249 tree zero;
11251 last = worklist.pop ();
11252 mask = gimple_call_arg (last, 2);
11253 bb = gimple_bb (last);
11254 /* Create then_bb and if-then structure in CFG, then_bb belongs to
11255 the same loop as if_bb. It could be different to LOOP when two
11256 level loop-nest is vectorized and mask_store belongs to the inner
11257 one. */
11258 e = split_block (bb, last);
11259 bb_loop = bb->loop_father;
11260 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11261 join_bb = e->dest;
11262 store_bb = create_empty_bb (bb);
11263 add_bb_to_loop (store_bb, bb_loop);
11264 e->flags = EDGE_TRUE_VALUE;
11265 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11266 /* Put STORE_BB to likely part. */
11267 efalse->probability = profile_probability::unlikely ();
11268 store_bb->count = efalse->count ();
11269 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11270 if (dom_info_available_p (CDI_DOMINATORS))
11271 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11272 if (dump_enabled_p ())
11273 dump_printf_loc (MSG_NOTE, vect_location,
11274 "Create new block %d to sink mask stores.",
11275 store_bb->index);
11276 /* Create vector comparison with boolean result. */
11277 vectype = TREE_TYPE (mask);
11278 zero = build_zero_cst (vectype);
11279 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11280 gsi = gsi_last_bb (bb);
11281 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11282 /* Create new PHI node for vdef of the last masked store:
11283 .MEM_2 = VDEF <.MEM_1>
11284 will be converted to
11285 .MEM.3 = VDEF <.MEM_1>
11286 and new PHI node will be created in join bb
11287 .MEM_2 = PHI <.MEM_1, .MEM_3>
11289 vdef = gimple_vdef (last);
11290 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11291 gimple_set_vdef (last, new_vdef);
11292 phi = create_phi_node (vdef, join_bb);
11293 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11295 /* Put all masked stores with the same mask to STORE_BB if possible. */
11296 while (true)
11298 gimple_stmt_iterator gsi_from;
11299 gimple *stmt1 = NULL;
11301 /* Move masked store to STORE_BB. */
11302 last_store = last;
11303 gsi = gsi_for_stmt (last);
11304 gsi_from = gsi;
11305 /* Shift GSI to the previous stmt for further traversal. */
11306 gsi_prev (&gsi);
11307 gsi_to = gsi_start_bb (store_bb);
11308 gsi_move_before (&gsi_from, &gsi_to);
11309 /* Setup GSI_TO to the non-empty block start. */
11310 gsi_to = gsi_start_bb (store_bb);
11311 if (dump_enabled_p ())
11312 dump_printf_loc (MSG_NOTE, vect_location,
11313 "Move stmt to created bb\n%G", last);
11314 /* Move all stored value producers if possible. */
11315 while (!gsi_end_p (gsi))
11317 tree lhs;
11318 imm_use_iterator imm_iter;
11319 use_operand_p use_p;
11320 bool res;
11322 /* Skip debug statements. */
11323 if (is_gimple_debug (gsi_stmt (gsi)))
11325 gsi_prev (&gsi);
11326 continue;
11328 stmt1 = gsi_stmt (gsi);
11329 /* Do not consider statements writing to memory or having
11330 volatile operand. */
11331 if (gimple_vdef (stmt1)
11332 || gimple_has_volatile_ops (stmt1))
11333 break;
11334 gsi_from = gsi;
11335 gsi_prev (&gsi);
11336 lhs = gimple_get_lhs (stmt1);
11337 if (!lhs)
11338 break;
11340 /* LHS of vectorized stmt must be SSA_NAME. */
11341 if (TREE_CODE (lhs) != SSA_NAME)
11342 break;
11344 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11346 /* Remove dead scalar statement. */
11347 if (has_zero_uses (lhs))
11349 gsi_remove (&gsi_from, true);
11350 continue;
11354 /* Check that LHS does not have uses outside of STORE_BB. */
11355 res = true;
11356 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11358 gimple *use_stmt;
11359 use_stmt = USE_STMT (use_p);
11360 if (is_gimple_debug (use_stmt))
11361 continue;
11362 if (gimple_bb (use_stmt) != store_bb)
11364 res = false;
11365 break;
11368 if (!res)
11369 break;
11371 if (gimple_vuse (stmt1)
11372 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11373 break;
11375 /* Can move STMT1 to STORE_BB. */
11376 if (dump_enabled_p ())
11377 dump_printf_loc (MSG_NOTE, vect_location,
11378 "Move stmt to created bb\n%G", stmt1);
11379 gsi_move_before (&gsi_from, &gsi_to);
11380 /* Shift GSI_TO for further insertion. */
11381 gsi_prev (&gsi_to);
11383 /* Put other masked stores with the same mask to STORE_BB. */
11384 if (worklist.is_empty ()
11385 || gimple_call_arg (worklist.last (), 2) != mask
11386 || worklist.last () != stmt1)
11387 break;
11388 last = worklist.pop ();
11390 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11394 /* Decide whether it is possible to use a zero-based induction variable
11395 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11396 the value that the induction variable must be able to hold in order
11397 to ensure that the rgroups eventually have no active vector elements.
11398 Return -1 otherwise. */
11400 widest_int
11401 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11403 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11404 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11405 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11407 /* Calculate the value that the induction variable must be able
11408 to hit in order to ensure that we end the loop with an all-false mask.
11409 This involves adding the maximum number of inactive trailing scalar
11410 iterations. */
11411 widest_int iv_limit = -1;
11412 if (max_loop_iterations (loop, &iv_limit))
11414 if (niters_skip)
11416 /* Add the maximum number of skipped iterations to the
11417 maximum iteration count. */
11418 if (TREE_CODE (niters_skip) == INTEGER_CST)
11419 iv_limit += wi::to_widest (niters_skip);
11420 else
11421 iv_limit += max_vf - 1;
11423 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11424 /* Make a conservatively-correct assumption. */
11425 iv_limit += max_vf - 1;
11427 /* IV_LIMIT is the maximum number of latch iterations, which is also
11428 the maximum in-range IV value. Round this value down to the previous
11429 vector alignment boundary and then add an extra full iteration. */
11430 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11431 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11433 return iv_limit;
11436 /* For the given rgroup_controls RGC, check whether an induction variable
11437 would ever hit a value that produces a set of all-false masks or zero
11438 lengths before wrapping around. Return true if it's possible to wrap
11439 around before hitting the desirable value, otherwise return false. */
11441 bool
11442 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11444 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11446 if (iv_limit == -1)
11447 return true;
11449 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11450 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11451 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11453 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11454 return true;
11456 return false;