[official-gcc.git] / gcc / tree-vect-loop.c
blob 39b7319e8253c351a4f6fbdd8c154330f08f2b1b
/* Loop Vectorization
   Copyright (C) 2003-2020 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "diagnostic-core.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "cfganal.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
#include "tree-if-conv.h"
#include "internal-fn.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "tree-eh.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it was manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of following
   stmts which use the def of stmt S.  Stmt S is removed if it writes to memory;
   otherwise, we rely on dead code elimination for removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

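/* Illustrative sketch (editorial, not from the upstream sources): the optab
   check described above boils down to a query of the form

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;   (no V8HI addition pattern, so the stmt is not vectorizable)

   where optab_handler returns the insn code of the named pattern that
   implements the operation in the given mode, or CODE_FOR_nothing if the
   target provides none.  */
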
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
                                               bool *, bool *);

/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).  */

static opt_result
vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
                              bool vectype_maybe_set_p,
                              poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
                                                   &stmt_vectype,
                                                   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
        /* The only case when a vectype had been already set is for stmts
           that contain a data ref, or for "pattern-stmts" (stmts generated
           by the vectorizer to represent/replace a certain idiom).  */
        gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
                     || vectype_maybe_set_p)
                    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
        STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}

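/* Editorial note (illustrative, not from the upstream sources): loosely
   speaking, vect_update_max_nunits above raises the running VF to a common
   multiple of the lane counts seen so far; e.g. starting from the initial
   value 1, a statement whose nunits_vectype is V8HI (8 lanes) raises the
   factor to 8.  */
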
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (vec_info *vinfo,
                            stmt_vec_info stmt_info, poly_uint64 *vf)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
                     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "==> examining pattern def stmt: %G",
                             def_stmt_info->stmt);
          res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
          if (!res)
            return res;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "==> examining pattern statement: %G",
                         stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
      if (!res)
        return res;
    }

  return opt_result::success ();
}

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4-byte elements,
   on a target with vector size (VS) of 16 bytes, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

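/* Editorial worked example (not from the upstream sources): with 16-byte
   vectors and the 2-byte "short" arrays from the file-header example,
   VF = 16 / 2 = 8, so the strip-mined loop above becomes
   for (i=0; i<N; i+=8) and each a[i:8] group is a single V8HI operation.  */
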
static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          phi = si.phi ();
          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
                             phi);

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info)
              || STMT_VINFO_LIVE_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "get vectype for scalar type: %T\n",
                                 scalar_type);

              vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
              if (!vectype)
                return opt_result::failure_at (phi,
                                               "not vectorized: unsupported "
                                               "data-type %T\n",
                                               scalar_type);
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
                                 vectype);

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
                  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
                  dump_printf (MSG_NOTE, "\n");
                }

              vect_update_max_nunits (&vectorization_factor, vectype);
            }
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          opt_result res
            = vect_determine_vf_for_stmt (loop_vinfo,
                                          stmt_info, &vectorization_factor);
          if (!res)
            return res;
        }
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}

/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
                     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
          || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
              && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
          || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
              && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
                  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
          || !flag_associative_math))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step unknown.\n");
      return false;
    }

  return true;
}

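/* Editorial example (not from the upstream sources), using scev's chrec
   notation {init, +, step}_loopnum: an IV starting at 0 and incremented by 4
   each iteration has access function {0, +, 4}_1, so the evolution part is
   the INTEGER_CST 4 and the function above returns true.  For a degree-2
   chrec such as {0, +, {1, +, 1}_1}_1 the evolution part is itself a chrec,
   tree_is_chrec is true, and the evolution is rejected as not "simple".  */
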
/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
        x_1 = PHI <x_4(outer2), ...>;
        ...

      inner:
        x_2 = PHI <x_1(outer1), ...>;
        ...
        x_3 = ...;

      outer2:
        x_4 = PHI <x_3(inner)>;
        ...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */

static bool
vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
{
  use_operand_p use_p;
  ssa_op_iter op_iter;
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
        return true;
  return false;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      /* Skip virtual phi's.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        {
          STRIP_NOPS (access_fn);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Access function of PHI: %T\n", access_fn);
          STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
            = initial_condition_in_loop_num (access_fn, loop->num);
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }

      if (!access_fn
          || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
        {
          worklist.safe_push (stmt_vinfo);
          continue;
        }

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
                  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
        = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
                                    &reduc_chain);
      if (reduc_stmt_info)
        {
          STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
          STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
          if (double_reduc)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Detected double reduction.\n");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
            }
          else
            {
              if (loop != LOOP_VINFO_LOOP (loop_vinfo))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected vectorizable nested cycle.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected reduction.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP if it was not detected as reduction
                     chain.  */
                  if (! reduc_chain)
                    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
                      (reduc_stmt_info);
                }
            }
        }
      else
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown def-use cycle pattern.\n");
    }
}

/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner loop, if one exists.
   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
                           == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
        REDUC_GROUP_NEXT_ELEMENT (stmtp)
          = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}

/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    {
      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
      while (next)
        {
          if ((STMT_VINFO_IN_PATTERN_P (next)
               != STMT_VINFO_IN_PATTERN_P (first))
              || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
            break;
          next = REDUC_GROUP_NEXT_ELEMENT (next);
        }
      /* If all reduction chain members are well-formed patterns adjust
         the group to group the pattern stmts instead.  */
      if (! next
          && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
        {
          if (STMT_VINFO_IN_PATTERN_P (first))
            {
              vect_fixup_reduc_chain (first);
              LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
                = STMT_VINFO_RELATED_STMT (first);
            }
        }
      /* If not all stmts in the chain are patterns or if we failed
         to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
         it as a regular reduction instead.  */
      else
        {
          stmt_vec_info vinfo = first;
          stmt_vec_info last = NULL;
          while (vinfo)
            {
              next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
              REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
              REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
              last = vinfo;
              vinfo = next;
            }
          STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
            = vect_internal_def;
          loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
          LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
          --i;
        }
    }
}

/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */


static gcond *
vect_get_loop_niters (class loop *loop, tree *assumptions,
                      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  DUMP_VECT_SCOPE ("get_loop_niters");

  if (!exit)
    return cond;

  may_be_zero = NULL_TREE;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
        {
          /* Try to combine may_be_zero with assumptions, this can simplify
             computation of niter expression.  */
          if (niter_assumptions && !integer_nonzerop (niter_assumptions))
            niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
                                             niter_assumptions,
                                             fold_build1 (TRUTH_NOT_EXPR,
                                                          boolean_type_node,
                                                          may_be_zero));
          else
            niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
                                 build_int_cst (TREE_TYPE (niter), 0),
                                 rewrite_to_non_trapping_overflow (niter));

          may_be_zero = NULL_TREE;
        }
      else if (integer_nonzerop (may_be_zero))
        {
          *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
          *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
          return cond;
        }
      else
        return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ???  For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
                         build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}

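/* Editorial worked example (not from the upstream sources): for a loop such
   as for (i = 0; i < 4; i++) the latch is executed 3 times, so
   NUMBER_OF_ITERATIONSM1 is 3 and NUMBER_OF_ITERATIONS is 3 + 1 = 4 header
   executions; the ??? note above is the corner case where the latch count is
   already the maximum value of the niter type and the +1 wraps to 0.  */
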
/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const class loop *const loop = (const class loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}

/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, init_cost (loop_in), shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    ivexpr_map (NULL),
    scan_map (NULL),
    slp_unrolling_factor (1),
    single_scalar_iteration_cost (0),
    vec_outside_cost (0),
    vec_inside_cost (0),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    epil_using_partial_vectors_p (false),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    orig_loop_info (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the same
     as a reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                                          bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *phi = gsi_stmt (si);
          gimple_set_uid (phi, 0);
          add_stmt (phi);
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          gimple_set_uid (stmt, 0);
          if (is_gimple_debug (stmt))
            continue;
          add_stmt (stmt);
          /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
             the third argument is the #pragma omp simd if (x) condition:
             when 0, the loop shouldn't be vectorized; when a non-zero
             constant, it should be vectorized normally; otherwise the loop is
             versioned, with the vectorized copy used if the condition is
             non-zero at runtime.  */
          if (loop_in->simduid
              && is_gimple_call (stmt)
              && gimple_call_internal_p (stmt)
              && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
              && gimple_call_num_args (stmt) >= 3
              && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
              && (loop_in->simduid
                  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
            {
              tree arg = gimple_call_arg (stmt, 2);
              if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
                simd_if_cond = arg;
              else
                gcc_assert (integer_nonzerop (arg));
            }
        }
    }

  epilogue_vinfos.create (6);
}

/* Free all levels of rgroup CONTROLS.  */

void
release_vec_loop_controls (vec<rgroup_controls> *controls)
{
  rgroup_controls *rgc;
  unsigned int i;
  FOR_EACH_VEC_ELT (*controls, i, rgc)
    rgc->controls.release ();
  controls->release ();
}

/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  free (bbs);

  release_vec_loop_controls (&masks);
  release_vec_loop_controls (&lens);
  delete ivexpr_map;
  delete scan_map;
  epilogue_vinfos.release ();

  loop->aux = NULL;
}

/* Return an invariant or register for EXPR and emit necessary
   computations in the LOOP_VINFO loop preheader.  */

tree
cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
{
  if (is_gimple_reg (expr)
      || is_gimple_min_invariant (expr))
    return expr;

  if (! loop_vinfo->ivexpr_map)
    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
  if (! cached)
    {
      gimple_seq stmts = NULL;
      cached = force_gimple_operand (unshare_expr (expr),
                                     &stmts, true, NULL_TREE);
      if (stmts)
        {
          edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
          gsi_insert_seq_on_edge_immediate (e, stmts);
        }
    }
  return cached;
}

/* Return true if we can use CMP_TYPE as the comparison type to produce
   all masks required to mask LOOP_VINFO.  */

static bool
can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
{
  rgroup_controls *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    if (rgm->type != NULL_TREE
        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
                                            cmp_type, rgm->type,
                                            OPTIMIZE_FOR_SPEED))
      return false;
  return true;
}

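/* Editorial example (not from the upstream sources): IFN_WHILE_ULT computes
   an "index < limit" mask per lane, so for an 8-lane mask type a call such as
   WHILE_ULT (8, 10, mask) activates only the first two lanes (8 < 10 and
   9 < 10 hold, 10 < 10 does not).  The check above merely asks whether the
   target can produce such masks when the comparison is done in CMP_TYPE.  */
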
/* Calculate the maximum number of scalars per iteration for every
   rgroup in LOOP_VINFO.  */

static unsigned int
vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
{
  unsigned int res = 1;
  unsigned int i;
  rgroup_controls *rgm;
  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
    res = MAX (res, rgm->max_nscalars_per_iter);
  return res;
}

/* Calculate the minimum precision necessary to represent:

      MAX_NITERS * FACTOR

   as an unsigned integer, where MAX_NITERS is the maximum number of
   loop header iterations for the original scalar form of LOOP_VINFO.  */

static unsigned
vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Work out how many bits we need to represent the limit.  */
  return wi::min_precision (max_ni * factor, UNSIGNED);
}

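/* Editorial worked example (not from the upstream sources): if the niter
   analysis proves at most 999 latch executions, MAX_NITERS is 999 + 1 = 1000
   header iterations; with FACTOR == 2 the limit is 2000 and
   wi::min_precision (2000, UNSIGNED) is 11, since 2000 needs 11 bits
   (2^10 = 1024 < 2000 <= 2047).  */
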
/* True if the loop needs peeling or partial vectors when vectorized.  */

static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
                                          (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
         peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
        peel_niter += 1;
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
                       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
        return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
           /* ??? When peeling for gaps but not alignment, we could
              try to check whether the (variable) niters is known to be
              VF * N + 1.  That's something of a niche case though.  */
           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
           || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
           || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
                < (unsigned) exact_log2 (const_vf))
               /* In case of versioning, check if the maximum number of
                  iterations is greater than th.  If they are identical,
                  the epilogue is unnecessary.  */
               && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
                   || ((unsigned HOST_WIDE_INT) max_niter
                       > (th / const_vf) * const_vf))))
    return true;

  return false;
}

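/* Editorial worked example (not from the upstream sources): with a known
   iteration count of 17, a vectorization factor of 8 and no peeling for
   alignment or gaps, 17 is not a multiple of 8, so the function above
   returns true and the loop will need either an epilogue or partial
   vectors.  */
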
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;
  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
                                      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
          && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
        {
          tree this_type = build_nonstandard_integer_type (cmp_bits, true);
          if (this_type
              && can_produce_all_loop_masks_p (loop_vinfo, this_type))
            {
              /* Although we could stop as soon as we find a valid mode,
                 there are at least two reasons why that's not always the
                 best choice:

                 - An IV that's Pmode or wider is more likely to be reusable
                   in address calculations than an IV that's narrower than
                   Pmode.

                 - Doing the comparison in IV_PRECISION or wider allows
                   a natural 0-based IV, whereas using a narrower comparison
                   type requires mitigations against wrap-around.

                 Conversely, if the IV limit is variable, doing the comparison
                 in a wider type than the original type can introduce
                 unnecessary extensions, so picking the widest valid mode
                 is not always a good choice either.

                 Here we prefer the first IV type that's Pmode or wider,
                 and the first comparison type that's IV_PRECISION or wider.
                 (The comparison type must be no wider than the IV type,
                 to avoid extensions in the vector loop.)

                 ??? We might want to try continuing beyond Pmode for ILP32
                 targets if CMP_BITS < IV_PRECISION.  */
              iv_type = this_type;
              if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
                cmp_type = this_type;
              if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
                break;
            }
        }
    }

  if (!cmp_type)
    return false;

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  return true;
}

/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target-supported length is larger than the precision
   required by the loop niters.  */

static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of the below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     a wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
         BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
        break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
        {
          iv_type = build_nonstandard_integer_type (tbits, true);
          break;
        }
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't vectorize with length-based partial vectors"
                         " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;

  return true;
}

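/* Editorial illustration (not from the upstream sources): with length-based
   partial vectors, a loop over 10 elements at 4 elements per vector would,
   loosely speaking, run three vector iterations whose active lengths are
   4, 4 and 2; the function above only verifies that a sufficiently wide IV
   type exists to drive such length computations.  */
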
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

          if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
            continue;

          /* Skip stmts that are not vectorized inside the loop.  */
          stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
          if (!STMT_VINFO_RELEVANT_P (vstmt_info)
              && (!STMT_VINFO_LIVE_P (vstmt_info)
                  || !VECTORIZABLE_CYCLE_DEF
                        (STMT_VINFO_DEF_TYPE (vstmt_info))))
            continue;

          vect_cost_for_stmt kind;
          if (STMT_VINFO_DATA_REF (stmt_info))
            {
              if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
                kind = scalar_load;
              else
                kind = scalar_store;
            }
          else if (vect_nop_conversion_p (stmt_info))
            continue;
          else
            kind = scalar_stmt;

          record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                            factor, kind, stmt_info, 0, vect_prologue);
        }
    }

  /* Now accumulate cost.  */
  void *target_cost_data = init_cost (loop);
  stmt_info_for_cost *si;
  int j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                    j, si)
    (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
                          si->kind, si->stmt_info, si->vectype,
                          si->misalign, vect_body);
  unsigned dummy, body_cost = 0;
  finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
  destroy_cost_data (target_cost_data);
  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
}

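/* Editorial worked example (not from the upstream sources): a scalar body
   consisting of one load, one add and one store records one scalar_load,
   one scalar_stmt and one scalar_store entry (each weighted by FACTOR, which
   is 50 for statements in the inner loop of an outer-loop nest), and the
   target's add_stmt_cost hook then turns those entries into the summed
   single-scalar-iteration cost stored above.  */
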
/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e., a countable loop.  The
     niter could be analyzed under some assumptions.  */

opt_result
vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
                          tree *assumptions, tree *number_of_iterationsm1,
                          tree *number_of_iterations, gcond **inner_loop_cond)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " control flow in loop.\n");

      if (empty_block_p (loop->header))
        return opt_result::failure_at (vect_location,
                                       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " multiple nested loops.\n");

      if (loop->num_nodes != 5)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " control flow in loop.\n");

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        return opt_result::failure_at (vect_location,
                                       "not vectorized:"
                                       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      tree inner_niterm1, inner_niter, inner_assumptions;
      opt_result res
        = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
                                    &inner_assumptions, &inner_niterm1,
                                    &inner_niter, NULL);
      if (!res)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.\n");
          return res;
        }

      /* Don't support analyzing niter under assumptions for inner
         loop.  */
      if (!integer_onep (inner_assumptions))
        return opt_result::failure_at (vect_location,
                                       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner_niter))
        return opt_result::failure_at (vect_location,
                                       "not vectorized: inner-loop count not"
                                       " invariant.\n");

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.\n");
    }

  if (!single_exit (loop))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: multiple exits.\n");
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
                                   "not vectorized:"
                                   " too many incoming edges.\n");

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: latch block not empty.\n");

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    return opt_result::failure_at (vect_location,
                                   "not vectorized:"
                                   " abnormal loop exit edge.\n");

  *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
                                     number_of_iterationsm1);
  if (!*loop_cond)
    return opt_result::failure_at
      (vect_location,
       "not vectorized: complicated exit condition.\n");

  if (integer_zerop (*assumptions)
      || !*number_of_iterations
      || chrec_contains_undetermined (*number_of_iterations))
    return opt_result::failure_at
      (*loop_cond,
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (*number_of_iterations))
    return opt_result::failure_at
      (*loop_cond,
       "not vectorized: number of iterations = 0.\n");

  return opt_result::success ();
}

/* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */

opt_loop_vec_info
vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
{
  tree assumptions, number_of_iterations, number_of_iterationsm1;
  gcond *loop_cond, *inner_loop_cond = NULL;

  opt_result res
    = vect_analyze_loop_form_1 (loop, &loop_cond,
                                &assumptions, &number_of_iterationsm1,
                                &number_of_iterations, &inner_loop_cond);
  if (!res)
    return opt_loop_vec_info::propagate_failure (res);

  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
  if (!integer_onep (assumptions))
    {
      /* We consider to vectorize this loop by versioning it under
         some assumptions.  In order to do this, we need to clear
         existing information computed by scev and niter analyzer.  */
      scev_reset_htab ();
      free_numbers_of_iterations_estimates (loop);
      /* Also set flag for this loop so that following scev and niter
         analysis are done under the assumptions.  */
      loop_constraint_set (loop, LOOP_C_FINITE);
      /* Also record the assumptions for versioning.  */
      LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }

  stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
  STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
  if (inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
        = loop_vinfo->lookup_stmt (inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
    }

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return opt_loop_vec_info::success (loop_vinfo);
}

/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
          if (!stmt_info)
            continue;
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          stmt_info = vect_stmt_to_vectorize (stmt_info);
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
         GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
         so they must have a common multiple.  */
      vectorization_factor
        = force_common_multiple (vectorization_factor,
                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}

/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
        x_1 = PHI <x_3(outer2), ...>;
        ...

      inner:
        x_2 = ...;
        ...

      outer2:
        x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}

/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          ok = true;

          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
          if (virtual_operand_p (gimple_phi_result (phi)))
            continue;

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 would require us to actually do something here.  */
              if (STMT_VINFO_LIVE_P (stmt_info)
                  && !vect_active_double_reduction_p (stmt_info))
                return opt_result::failure_at (phi,
                                               "Unsupported loop-closed phi"
                                               " in outer-loop.\n");

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;

                  if (gimple_phi_num_args (phi) != 1)
                    return opt_result::failure_at (phi, "unsupported phi");

                  phi_op = PHI_ARG_DEF (phi, 0);
                  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
                  if (!op_def_info)
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
                      && (STMT_VINFO_RELEVANT (op_def_info)
                          != vect_used_in_outer_by_reduction))
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
                       || (STMT_VINFO_DEF_TYPE (stmt_info)
                           == vect_double_reduction_def))
                      && !vectorizable_lc_phi (loop_vinfo,
                                               stmt_info, NULL, NULL))
                    return opt_result::failure_at (phi, "unsupported phi\n");
                }

              continue;
            }

          gcc_assert (stmt_info);

          if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
               || STMT_VINFO_LIVE_P (stmt_info))
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            /* A scalar-dependence cycle that we don't support.  */
            return opt_result::failure_at (phi,
                                           "not vectorized:"
                                           " scalar dependence cycle.\n");

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
                  && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_induction (loop_vinfo,
                                             stmt_info, NULL, NULL,
                                             &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                        || (STMT_VINFO_DEF_TYPE (stmt_info)
                            == vect_double_reduction_def)
                        || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
                       && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_reduction (loop_vinfo,
                                             stmt_info, NULL, NULL, &cost_vec);
            }

          /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
          if (ok
              && STMT_VINFO_LIVE_P (stmt_info)
              && !PURE_SLP_STMT (stmt_info))
            ok = vectorizable_live_operation (loop_vinfo,
                                              stmt_info, NULL, NULL, NULL,
                                              -1, false, &cost_vec);

          if (!ok)
            return opt_result::failure_at (phi,
                                           "not vectorized: relevant phi not "
                                           "supported: %G",
                                           static_cast <gimple *> (phi));
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          if (!gimple_clobber_p (stmt)
              && !is_gimple_debug (stmt))
            {
              opt_result res
                = vect_analyze_stmt (loop_vinfo,
                                     loop_vinfo->lookup_stmt (stmt),
                                     &need_to_vectorize,
                                     NULL, NULL, &cost_vec);
              if (!res)
                return res;
            }
        }
    } /* bbs */

  add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
        (vect_location,
         "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}

1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1803 return false;
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1830 int min_profitable_iters, min_profitable_estimate;
1831 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1832 &min_profitable_estimate);
1834 if (min_profitable_iters < 0)
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "not vectorized: vector version will never be "
1842 "profitable.\n");
1843 return -1;
1846 int min_scalar_loop_bound = (param_min_vect_loop_bound
1847 * assumed_vf);
1849 /* Use the cost model only if it is more conservative than the
1850 user-specified threshold. */
1851 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1852 min_profitable_iters);
1854 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1856 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1857 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "not vectorized: iteration count smaller than user "
1865 "specified loop bound parameter or minimum profitable "
1866 "iterations (whichever is more conservative).\n");
1867 return 0;
1870 /* The static profitability threshold min_profitable_estimate includes
1871 the cost of having to check at runtime whether the scalar loop
1872 should be used instead. If it turns out that we don't need or want
1873 such a check, the threshold we should use for the static estimate
1874 is simply the point at which the vector loop becomes more profitable
1875 than the scalar loop. */
1876 if (min_profitable_estimate > min_profitable_iters
1877 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1878 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1879 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1884 " choice between the scalar and vector loops\n");
1885 min_profitable_estimate = min_profitable_iters;
1888 HOST_WIDE_INT estimated_niter;
1890 /* If we are vectorizing an epilogue then we know the maximum number of
1891 scalar iterations it will cover is at least one lower than the
1892 vectorization factor of the main loop. */
1893 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1894 estimated_niter
1895 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1896 else
1898 estimated_niter = estimated_stmt_executions_int (loop);
1899 if (estimated_niter == -1)
1900 estimated_niter = likely_max_stmt_executions_int (loop);
1902 if (estimated_niter != -1
1903 && ((unsigned HOST_WIDE_INT) estimated_niter
1904 < MAX (th, (unsigned) min_profitable_estimate)))
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "not vectorized: estimated iteration count too "
1909 "small.\n");
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "not vectorized: estimated iteration count smaller "
1913 "than specified loop bound parameter or minimum "
1914 "profitable iterations (whichever is more "
1915 "conservative).\n");
1916 return -1;
1919 return 1;
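/* As a rough worked example of the thresholds above (the numbers are
   made up rather than taken from any particular cost model): with
   param_min_vect_loop_bound == 0, assumed_vf == 4 and
   min_profitable_iters == 7 we get

     min_scalar_loop_bound = 0 * 4 = 0
     th = MAX (min_scalar_loop_bound, min_profitable_iters) = 7

   so a loop whose iteration count is known to be below 7 is rejected
   for good (return 0), while a loop whose *estimated* iteration count
   is below MAX (th, min_profitable_estimate) is rejected with the
   option of retrying at a different vectorization factor (return -1).  */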
1922 static opt_result
1923 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1924 vec<data_reference_p> *datarefs,
1925 unsigned int *n_stmts)
1927 *n_stmts = 0;
1928 for (unsigned i = 0; i < loop->num_nodes; i++)
1929 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1930 !gsi_end_p (gsi); gsi_next (&gsi))
1932 gimple *stmt = gsi_stmt (gsi);
1933 if (is_gimple_debug (stmt))
1934 continue;
1935 ++(*n_stmts);
1936 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1937 NULL, 0);
1938 if (!res)
1940 if (is_gimple_call (stmt) && loop->safelen)
1942 tree fndecl = gimple_call_fndecl (stmt), op;
1943 if (fndecl != NULL_TREE)
1945 cgraph_node *node = cgraph_node::get (fndecl);
1946 if (node != NULL && node->simd_clones != NULL)
1948 unsigned int j, n = gimple_call_num_args (stmt);
1949 for (j = 0; j < n; j++)
1951 op = gimple_call_arg (stmt, j);
1952 if (DECL_P (op)
1953 || (REFERENCE_CLASS_P (op)
1954 && get_base_address (op)))
1955 break;
1957 op = gimple_call_lhs (stmt);
1958 /* Ignore #pragma omp declare simd functions
1959 if they don't have data references in the
1960 call stmt itself. */
1961 if (j == n
1962 && !(op
1963 && (DECL_P (op)
1964 || (REFERENCE_CLASS_P (op)
1965 && get_base_address (op)))))
1966 continue;
1970 return res;
1972 /* If dependence analysis will give up due to the limit on the
1973 number of datarefs, stop here and fail fatally. */
1974 if (datarefs->length ()
1975 > (unsigned)param_loop_max_datarefs_for_datadeps)
1976 return opt_result::failure_at (stmt, "exceeded param "
1977 "loop-max-datarefs-for-datadeps\n");
1979 return opt_result::success ();
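/* As an illustrative sketch of the safelen special case above: in a
   loop with a nonzero safelen, e.g. one annotated with
   "#pragma omp simd", a call to a function that has SIMD clones,
   such as

     #pragma omp declare simd
     extern float foo (float);

   does not make the analysis fail as long as neither its arguments
   nor its lhs contain a data reference (a DECL or a memory reference
   with a known base address); such a call is skipped here and left
   for the SIMD-clone handling to deal with later.  If any argument or
   the lhs does touch memory, the failure from
   vect_find_stmt_data_reference is propagated as usual.  */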
1982 /* Look for SLP-only access groups and turn each individual access into its own
1983 group. */
1984 static void
1985 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1987 unsigned int i;
1988 struct data_reference *dr;
1990 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1992 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1993 FOR_EACH_VEC_ELT (datarefs, i, dr)
1995 gcc_assert (DR_REF (dr));
1996 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1998 /* Check if the access is part of an interleaving chain. */
1999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2001 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2002 unsigned int group_size = DR_GROUP_SIZE (first_element);
2004 /* Check whether this is an SLP-only group. */
2005 if (!STMT_SLP_TYPE (stmt_info)
2006 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2008 /* Dissolve the group. */
2009 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2011 stmt_vec_info vinfo = first_element;
2012 while (vinfo)
2014 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2015 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2016 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2017 DR_GROUP_SIZE (vinfo) = 1;
2018 if (STMT_VINFO_STRIDED_P (first_element))
2019 DR_GROUP_GAP (vinfo) = 0;
2020 else
2021 DR_GROUP_GAP (vinfo) = group_size - 1;
2022 vinfo = next;
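/* A small example of the dissolving done above (hypothetical group,
   for illustration only): a group of two loads

     ... = a[2*i];
     ... = a[2*i+1];

   that was marked SLP_VECT_ONLY but did not end up being vectorized
   with SLP is split so that each load becomes its own group of size 1,
   with DR_GROUP_NEXT_ELEMENT cleared.  For a non-strided access the
   recorded gap becomes group_size - 1 (here 1), so the element of the
   original group that is no longer covered is still accounted for;
   for strided accesses the gap is simply 0.  */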
2029 /* Determine if operating on full vectors for LOOP_VINFO might leave
2030 some scalar iterations still to do. If so, decide how we should
2031 handle those scalar iterations. The possibilities are:
2033 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2034 In this case:
2036 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2037 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2038 LOOP_VINFO_PEELING_FOR_NITER == false
2040 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2041 to handle the remaining scalar iterations. In this case:
2043 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2044 LOOP_VINFO_PEELING_FOR_NITER == true
2046 There are two choices:
2048 (2a) Consider vectorizing the epilogue loop at the same VF as the
2049 main loop, but using partial vectors instead of full vectors.
2050 In this case:
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2054 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2055 In this case:
2057 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2059 When FOR_EPILOGUE_P is true, make this determination based on the
2060 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2061 based on the assumption that LOOP_VINFO is the main loop. The caller
2062 has made sure that the number of iterations is set appropriately for
2063 this value of FOR_EPILOGUE_P. */
2065 opt_result
2066 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2067 bool for_epilogue_p)
2069 /* Determine whether there would be any scalar iterations left over. */
2070 bool need_peeling_or_partial_vectors_p
2071 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2073 /* Decide whether to vectorize the loop with partial vectors. */
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2075 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2076 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2077 && need_peeling_or_partial_vectors_p)
2079 /* For partial-vector-usage=1, try to push the handling of partial
2080 vectors to the epilogue, with the main loop continuing to operate
2081 on full vectors.
2083 ??? We could then end up failing to use partial vectors if we
2084 decide to peel iterations into a prologue, and if the main loop
2085 then ends up processing fewer than VF iterations. */
2086 if (param_vect_partial_vector_usage == 1
2087 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2088 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2089 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2090 else
2091 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2094 if (dump_enabled_p ())
2096 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "operating on partial vectors%s.\n",
2099 for_epilogue_p ? " for epilogue loop" : "");
2100 else
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "operating only on full vectors%s.\n",
2103 for_epilogue_p ? " for epilogue loop" : "");
2106 if (for_epilogue_p)
2108 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2109 gcc_assert (orig_loop_vinfo);
2110 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2111 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2112 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2115 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2116 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2118 /* Check that the loop processes at least one full vector. */
2119 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2120 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2121 if (known_lt (wi::to_widest (scalar_niters), vf))
2122 return opt_result::failure_at (vect_location,
2123 "loop does not have enough iterations"
2124 " to support vectorization.\n");
2126 /* If we need to peel an extra epilogue iteration to handle data
2127 accesses with gaps, check that there are enough scalar iterations
2128 available.
2130 The check above is redundant with this one when peeling for gaps,
2131 but the distinction is useful for diagnostics. */
2132 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2135 return opt_result::failure_at (vect_location,
2136 "loop does not have enough iterations"
2137 " to support peeling for gaps.\n");
2140 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2141 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2142 && need_peeling_or_partial_vectors_p);
2144 return opt_result::success ();
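/* To make the cases above concrete, here is a source-level sketch
   (not the exact code the vectorizer emits) for a loop with N
   iterations and a vectorization factor of 4:

     for (i = 0; i < N; i++)
       a[i] = b[i] + c[i];

   Case (1), partial vectors: the single vector loop also covers the
   N % 4 leftover iterations, e.g. by masking off the excess lanes in
   the last iteration, and no scalar epilogue is needed.

   Case (2), full vectors plus peeling for niters:

     for (i = 0; i + 4 <= N; i += 4)
       a[i:i+4] = b[i:i+4] + c[i:i+4];
     for (; i < N; i++)          <-- epilogue loop
       a[i] = b[i] + c[i];

   where the epilogue itself may later be vectorized, either with
   partial vectors at the same VF (case 2a) or at a lower VF
   (case 2b).  */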
2147 /* Function vect_analyze_loop_2.
2149 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2150 for it. The different analyses will record information in the
2151 loop_vec_info struct. */
2152 static opt_result
2153 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2155 opt_result ok = opt_result::success ();
2156 int res;
2157 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2158 poly_uint64 min_vf = 2;
2159 loop_vec_info orig_loop_vinfo = NULL;
2161 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2162 loop_vec_info of the first vectorized loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2164 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165 else
2166 orig_loop_vinfo = loop_vinfo;
2167 gcc_assert (orig_loop_vinfo);
2169 /* The first group of checks is independent of the vector size. */
2170 fatal = true;
2172 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2173 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2174 return opt_result::failure_at (vect_location,
2175 "not vectorized: simd if(0)\n");
2177 /* Find all data references in the loop (which correspond to vdefs/vuses)
2178 and analyze their evolution in the loop. */
2180 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2182 /* Gather the data references and count stmts in the loop. */
2183 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2185 opt_result res
2186 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2187 &LOOP_VINFO_DATAREFS (loop_vinfo),
2188 n_stmts);
2189 if (!res)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "not vectorized: loop contains function "
2194 "calls or data references that cannot "
2195 "be analyzed\n");
2196 return res;
2198 loop_vinfo->shared->save_datarefs ();
2200 else
2201 loop_vinfo->shared->check_datarefs ();
2203 /* Analyze the data references and also adjust the minimal
2204 vectorization factor according to the loads and stores. */
2206 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2207 if (!ok)
2209 if (dump_enabled_p ())
2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211 "bad data references.\n");
2212 return ok;
2215 /* Classify all cross-iteration scalar data-flow cycles.
2216 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2217 vect_analyze_scalar_cycles (loop_vinfo);
2219 vect_pattern_recog (loop_vinfo);
2221 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2223 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2224 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2226 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2227 if (!ok)
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231 "bad data access.\n");
2232 return ok;
2235 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2237 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2238 if (!ok)
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "unexpected pattern.\n");
2243 return ok;
2246 /* The rest of the analysis below depends on the vector size in some
2246 way, so from here on failures are no longer fatal. */
2247 fatal = false;
2249 /* Analyze data dependences between the data-refs in the loop
2250 and adjust the maximum vectorization factor according to
2251 the dependences.
2252 FORNOW: fail at the first data dependence that we encounter. */
2254 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2255 if (!ok)
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad data dependence.\n");
2260 return ok;
2262 if (max_vf != MAX_VECTORIZATION_FACTOR
2263 && maybe_lt (max_vf, min_vf))
2264 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2265 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2267 ok = vect_determine_vectorization_factor (loop_vinfo);
2268 if (!ok)
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "can't determine vectorization factor.\n");
2273 return ok;
2275 if (max_vf != MAX_VECTORIZATION_FACTOR
2276 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2277 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2279 /* Compute the scalar iteration cost. */
2280 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2282 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2284 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2285 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2286 if (!ok)
2287 return ok;
2289 /* If there are any SLP instances mark them as pure_slp. */
2290 bool slp = vect_make_slp_decision (loop_vinfo);
2291 if (slp)
2293 /* Find stmts that need to be both vectorized and SLPed. */
2294 vect_detect_hybrid_slp (loop_vinfo);
2296 /* Update the vectorization factor based on the SLP decision. */
2297 vect_update_vf_for_slp (loop_vinfo);
2299 /* Optimize the SLP graph with the vectorization factor fixed. */
2300 vect_optimize_slp (loop_vinfo);
2303 bool saved_can_use_partial_vectors_p
2304 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2306 /* We don't expect to have to roll back to anything other than an empty
2307 set of rgroups. */
2308 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2310 /* This is the point where we can re-start analysis with SLP forced off. */
2311 start_over:
2313 /* Now the vectorization factor is final. */
2314 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2315 gcc_assert (known_ne (vectorization_factor, 0U));
2317 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "vectorization_factor = ");
2321 dump_dec (MSG_NOTE, vectorization_factor);
2322 dump_printf (MSG_NOTE, ", niters = %wd\n",
2323 LOOP_VINFO_INT_NITERS (loop_vinfo));
2326 /* Analyze the alignment of the data-refs in the loop.
2327 Fail if a data reference is found that cannot be vectorized. */
2329 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2330 if (!ok)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad data alignment.\n");
2335 return ok;
2338 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2339 It is important to call pruning after vect_analyze_data_ref_accesses,
2340 since we use grouping information gathered by interleaving analysis. */
2341 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2342 if (!ok)
2343 return ok;
2345 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2346 vectorization, since we do not want to add extra peeling or
2347 add versioning for alignment. */
2348 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2349 /* This pass will decide on using loop versioning and/or loop peeling in
2350 order to enhance the alignment of data references in the loop. */
2351 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2352 if (!ok)
2353 return ok;
2355 if (slp)
2357 /* Analyze operations in the SLP instances. Note this may
2358 remove unsupported SLP instances which makes the above
2359 SLP kind detection invalid. */
2360 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2361 vect_slp_analyze_operations (loop_vinfo);
2362 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2364 ok = opt_result::failure_at (vect_location,
2365 "unsupported SLP instances\n");
2366 goto again;
2369 /* Check whether any load in ALL SLP instances is possibly permuted. */
2370 slp_tree load_node, slp_root;
2371 unsigned i, x;
2372 slp_instance instance;
2373 bool can_use_lanes = true;
2374 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2376 slp_root = SLP_INSTANCE_TREE (instance);
2377 int group_size = SLP_TREE_LANES (slp_root);
2378 tree vectype = SLP_TREE_VECTYPE (slp_root);
2379 bool loads_permuted = false;
2380 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2382 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2383 continue;
2384 unsigned j;
2385 stmt_vec_info load_info;
2386 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2387 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2389 loads_permuted = true;
2390 break;
2394 /* If the loads and stores can be handled with load/store-lane
2395 instructions record it and move on to the next instance. */
2396 if (loads_permuted
2397 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2398 && vect_store_lanes_supported (vectype, group_size, false))
2400 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2402 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2403 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2404 /* Use SLP for strided accesses (or if we can't
2405 load-lanes). */
2406 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2407 || ! vect_load_lanes_supported
2408 (STMT_VINFO_VECTYPE (stmt_vinfo),
2409 DR_GROUP_SIZE (stmt_vinfo), false))
2410 break;
2413 can_use_lanes
2414 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2416 if (can_use_lanes && dump_enabled_p ())
2417 dump_printf_loc (MSG_NOTE, vect_location,
2418 "SLP instance %p can use load/store-lanes\n",
2419 instance);
2421 else
2423 can_use_lanes = false;
2424 break;
2428 /* If all SLP instances can use load/store-lanes abort SLP and try again
2429 with SLP disabled. */
2430 if (can_use_lanes)
2432 ok = opt_result::failure_at (vect_location,
2433 "Built SLP cancelled: can use "
2434 "load/store-lanes\n");
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2437 "Built SLP cancelled: all SLP instances support "
2438 "load/store-lanes\n");
2439 goto again;
2443 /* Dissolve SLP-only groups. */
2444 vect_dissolve_slp_only_groups (loop_vinfo);
2446 /* Scan all the remaining operations in the loop that are not subject
2447 to SLP and make sure they are vectorizable. */
2448 ok = vect_analyze_loop_operations (loop_vinfo);
2449 if (!ok)
2451 if (dump_enabled_p ())
2452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2453 "bad operation or unsupported loop bound.\n");
2454 return ok;
2457 /* For now we don't expect to mix both masking and length approaches for one
2458 loop; disable the use of partial vectors if both are recorded. */
2459 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2460 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2461 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2463 if (dump_enabled_p ())
2464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2465 "can't vectorize a loop with partial vectors"
2466 " because we don't expect to mix different"
2467 " approaches with partial vectors for the"
2468 " same loop.\n");
2469 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2472 /* If we still have the option of using partial vectors,
2473 check whether we can generate the necessary loop controls. */
2474 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2475 && !vect_verify_full_masking (loop_vinfo)
2476 && !vect_verify_loop_lens (loop_vinfo))
2477 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2479 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2480 to be able to handle fewer than VF scalars, or needs to have a lower VF
2481 than the main loop. */
2482 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2483 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2484 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2485 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2486 return opt_result::failure_at (vect_location,
2487 "Vectorization factor too high for"
2488 " epilogue loop.\n");
2490 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2491 assuming that the loop will be used as a main loop. We will redo
2492 this analysis later if we instead decide to use the loop as an
2493 epilogue loop. */
2494 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2495 if (!ok)
2496 return ok;
2498 /* Check the costings of the loop make vectorizing worthwhile. */
2499 res = vect_analyze_loop_costing (loop_vinfo);
2500 if (res < 0)
2502 ok = opt_result::failure_at (vect_location,
2503 "Loop costings may not be worthwhile.\n");
2504 goto again;
2506 if (!res)
2507 return opt_result::failure_at (vect_location,
2508 "Loop costings not worthwhile.\n");
2510 /* If an epilogue loop is required make sure we can create one. */
2511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2512 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2514 if (dump_enabled_p ())
2515 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2516 if (!vect_can_advance_ivs_p (loop_vinfo)
2517 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2518 single_exit (LOOP_VINFO_LOOP
2519 (loop_vinfo))))
2521 ok = opt_result::failure_at (vect_location,
2522 "not vectorized: can't create required "
2523 "epilog loop\n");
2524 goto again;
2528 /* During peeling, we need to check whether the number of loop iterations
2529 is enough for both the peeled prolog loop and the vector loop. This
2530 check can be merged with the threshold check of loop versioning, so
2531 increase the threshold for this case if necessary.
2533 If we are analyzing an epilogue we still want to check what its
2534 versioning threshold would be. If we decide to vectorize the epilogues we
2535 will want to use the lowest versioning threshold of all epilogues and main
2536 loop. This will enable us to enter a vectorized epilogue even when
2537 versioning the loop. We can't simply check whether the epilogue requires
2538 versioning though since we may have skipped some versioning checks when
2539 analyzing the epilogue. For instance, checks for alias versioning will be
2540 skipped when dealing with epilogues as we assume we already checked them
2541 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2542 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2544 poly_uint64 niters_th = 0;
2545 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2547 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2549 /* Niters for peeled prolog loop. */
2550 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2552 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2553 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2554 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2556 else
2557 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2560 /* Niters for at least one iteration of vectorized loop. */
2561 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2562 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2563 /* One additional iteration because of peeling for gap. */
2564 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2565 niters_th += 1;
2567 /* Use the same condition as vect_transform_loop to decide when to use
2568 the cost to determine a versioning threshold. */
2569 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2570 && ordered_p (th, niters_th))
2571 niters_th = ordered_max (poly_uint64 (th), niters_th);
2573 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2576 gcc_assert (known_eq (vectorization_factor,
2577 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2579 /* Ok to vectorize! */
2580 return opt_result::success ();
2582 again:
2583 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2584 gcc_assert (!ok);
2586 /* Try again with SLP forced off, but if we didn't do any SLP there is
2587 no point in re-trying. */
2588 if (!slp)
2589 return ok;
2591 /* If there are reduction chains re-trying will fail anyway. */
2592 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2593 return ok;
2595 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2596 via interleaving or lane instructions. */
2597 slp_instance instance;
2598 slp_tree node;
2599 unsigned i, j;
2600 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2602 stmt_vec_info vinfo;
2603 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2604 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2605 continue;
2606 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2607 unsigned int size = DR_GROUP_SIZE (vinfo);
2608 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2609 if (! vect_store_lanes_supported (vectype, size, false)
2610 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2611 && ! vect_grouped_store_supported (vectype, size))
2612 return opt_result::failure_at (vinfo->stmt,
2613 "unsupported grouped store\n");
2614 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2616 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2617 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2618 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2619 size = DR_GROUP_SIZE (vinfo);
2620 vectype = STMT_VINFO_VECTYPE (vinfo);
2621 if (! vect_load_lanes_supported (vectype, size, false)
2622 && ! vect_grouped_load_supported (vectype, single_element_p,
2623 size))
2624 return opt_result::failure_at (vinfo->stmt,
2625 "unsupported grouped load\n");
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_NOTE, vect_location,
2631 "re-trying with SLP disabled\n");
2633 /* Roll back state appropriately. No SLP this time. */
2634 slp = false;
2635 /* Restore the vectorization factor as it was without SLP. */
2636 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2637 /* Free the SLP instances. */
2638 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2639 vect_free_slp_instance (instance);
2640 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2641 /* Reset SLP type to loop_vect on all stmts. */
2642 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2644 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2645 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2646 !gsi_end_p (si); gsi_next (&si))
2648 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2649 STMT_SLP_TYPE (stmt_info) = loop_vect;
2650 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2651 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2653 /* vectorizable_reduction adjusts reduction stmt def-types;
2654 restore them to that of the PHI. */
2655 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2656 = STMT_VINFO_DEF_TYPE (stmt_info);
2657 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2658 (STMT_VINFO_REDUC_DEF (stmt_info)))
2659 = STMT_VINFO_DEF_TYPE (stmt_info);
2662 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2663 !gsi_end_p (si); gsi_next (&si))
2665 if (is_gimple_debug (gsi_stmt (si)))
2666 continue;
2667 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2668 STMT_SLP_TYPE (stmt_info) = loop_vect;
2669 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2671 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2672 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2673 STMT_SLP_TYPE (stmt_info) = loop_vect;
2674 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2675 !gsi_end_p (pi); gsi_next (&pi))
2676 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2677 = loop_vect;
2681 /* Free optimized alias test DDRS. */
2682 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2683 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2684 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2685 /* Reset target cost data. */
2686 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2687 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2688 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2689 /* Reset accumulated rgroup information. */
2690 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2691 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2692 /* Reset assorted flags. */
2693 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2694 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2695 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2696 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2697 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2698 = saved_can_use_partial_vectors_p;
2700 goto start_over;
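/* One example of the "again" path above being taken (illustrative
   only; the real decision is made by vect_store_lanes_supported and
   vect_load_lanes_supported): a loop such as

     for (i = 0; i < n; i++)
       {
         out[2*i]   = in[2*i+1];
         out[2*i+1] = in[2*i];
       }

   can be SLP-vectorized using load permutations, but on a target with
   load/store-lanes style instructions (for instance the ld2/st2 family
   on AArch64) the non-SLP interleaving scheme is assumed to be
   cheaper, so the SLP attempt is cancelled and the whole analysis is
   redone from start_over with SLP disabled.  */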
2703 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2704 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2705 OLD_LOOP_VINFO is better unless something specifically indicates
2706 otherwise.
2708 Note that this deliberately isn't a partial order. */
2710 static bool
2711 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2712 loop_vec_info old_loop_vinfo)
2714 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2715 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2717 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2718 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2720 /* Always prefer a VF of loop->simdlen over any other VF. */
2721 if (loop->simdlen)
2723 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2724 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2725 if (new_simdlen_p != old_simdlen_p)
2726 return new_simdlen_p;
2729 /* Limit the VFs to what is likely to be the maximum number of iterations,
2730 to handle cases in which at least one loop_vinfo is fully-masked. */
2731 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2732 if (estimated_max_niter != -1)
2734 if (known_le (estimated_max_niter, new_vf))
2735 new_vf = estimated_max_niter;
2736 if (known_le (estimated_max_niter, old_vf))
2737 old_vf = estimated_max_niter;
2740 /* Check whether the (fractional) cost per scalar iteration is lower
2741 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2742 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2743 * poly_widest_int (old_vf));
2744 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2745 * poly_widest_int (new_vf));
2746 if (maybe_lt (rel_old, rel_new))
2748 /* When old_loop_vinfo uses a variable vectorization factor,
2749 we know that it has a lower cost for at least one runtime VF.
2750 However, we don't know how likely that VF is.
2752 One option would be to compare the costs for the estimated VFs.
2753 The problem is that that can put too much pressure on the cost
2754 model. E.g. if the estimated VF is also the lowest possible VF,
2755 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2756 for the estimated VF, we'd then choose new_loop_vinfo even
2757 though (a) new_loop_vinfo might not actually be better than
2758 old_loop_vinfo for that VF and (b) it would be significantly
2759 worse at larger VFs.
2761 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2762 no more expensive than old_loop_vinfo even after doubling the
2763 estimated old_loop_vinfo VF. For all but trivial loops, this
2764 ensures that we only pick new_loop_vinfo if it is significantly
2765 better than old_loop_vinfo at the estimated VF. */
2766 if (rel_new.is_constant ())
2767 return false;
2769 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2770 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2771 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2772 * widest_int (old_estimated_vf));
2773 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2774 * widest_int (new_estimated_vf));
2775 return estimated_rel_new * 2 <= estimated_rel_old;
2777 if (known_lt (rel_new, rel_old))
2778 return true;
2780 /* If there's nothing to choose between the loop bodies, see whether
2781 there's a difference in the prologue and epilogue costs. */
2782 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2783 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2785 return false;
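/* The body-cost comparison above avoids divisions by cross-multiplying:
   new_inside_cost / new_vf is compared with old_inside_cost / old_vf via

     rel_new = new_inside_cost * old_vf
     rel_old = old_inside_cost * new_vf

   For example (made-up costs), an inside cost of 20 at VF 8 beats an
   inside cost of 12 at VF 4, because 20 * 4 = 80 < 12 * 8 = 96,
   i.e. 2.5 units per scalar iteration versus 3.  If old_loop_vinfo
   might be cheaper only for some runtime VF, the doubling heuristic
   described in the comment above is used instead.  */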
2788 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2789 true if we should. */
2791 static bool
2792 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2793 loop_vec_info old_loop_vinfo)
2795 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2796 return false;
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_NOTE, vect_location,
2800 "***** Preferring vector mode %s to vector mode %s\n",
2801 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2802 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2803 return true;
2806 /* Function vect_analyze_loop.
2808 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2809 for it. The different analyses will record information in the
2810 loop_vec_info struct. */
2811 opt_loop_vec_info
2812 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2814 auto_vector_modes vector_modes;
2816 /* Autodetect first vector size we try. */
2817 unsigned int autovec_flags
2818 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2819 loop->simdlen != 0);
2820 unsigned int mode_i = 0;
2822 DUMP_VECT_SCOPE ("analyze_loop_nest");
2824 if (loop_outer (loop)
2825 && loop_vec_info_for_loop (loop_outer (loop))
2826 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2827 return opt_loop_vec_info::failure_at (vect_location,
2828 "outer-loop already vectorized.\n");
2830 if (!find_loop_nest (loop, &shared->loop_nest))
2831 return opt_loop_vec_info::failure_at
2832 (vect_location,
2833 "not vectorized: loop nest containing two or more consecutive inner"
2834 " loops cannot be vectorized\n");
2836 unsigned n_stmts = 0;
2837 machine_mode autodetected_vector_mode = VOIDmode;
2838 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2839 machine_mode next_vector_mode = VOIDmode;
2840 poly_uint64 lowest_th = 0;
2841 unsigned vectorized_loops = 0;
2842 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2843 && !unlimited_cost_model (loop));
2845 bool vect_epilogues = false;
2846 opt_result res = opt_result::success ();
2847 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2848 while (1)
2850 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2851 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2852 if (!loop_vinfo)
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2856 "bad loop form.\n");
2857 gcc_checking_assert (first_loop_vinfo == NULL);
2858 return loop_vinfo;
2860 loop_vinfo->vector_mode = next_vector_mode;
2862 bool fatal = false;
2864 /* When pick_lowest_cost_p is true, we should in principle iterate
2865 over all the loop_vec_infos that LOOP_VINFO could replace and
2866 try to vectorize LOOP_VINFO under the same conditions.
2867 E.g. when trying to replace an epilogue loop, we should vectorize
2868 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2869 to replace the main loop, we should vectorize LOOP_VINFO as a main
2870 loop too.
2872 However, autovectorize_vector_modes is usually sorted as follows:
2874 - Modes that naturally produce lower VFs usually follow modes that
2875 naturally produce higher VFs.
2877 - When modes naturally produce the same VF, maskable modes
2878 usually follow unmaskable ones, so that the maskable mode
2879 can be used to vectorize the epilogue of the unmaskable mode.
2881 This order is preferred because it leads to the maximum
2882 epilogue vectorization opportunities. Targets should only use
2883 a different order if they want to make wide modes available while
2884 disparaging them relative to earlier, smaller modes. The assumption
2885 in that case is that the wider modes are more expensive in some
2886 way that isn't reflected directly in the costs.
2888 There should therefore be few interesting cases in which
2889 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2890 treated as a standalone loop, and ends up being genuinely cheaper
2891 than FIRST_LOOP_VINFO. */
2892 if (vect_epilogues)
2893 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2895 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2896 if (mode_i == 0)
2897 autodetected_vector_mode = loop_vinfo->vector_mode;
2898 if (dump_enabled_p ())
2900 if (res)
2901 dump_printf_loc (MSG_NOTE, vect_location,
2902 "***** Analysis succeeded with vector mode %s\n",
2903 GET_MODE_NAME (loop_vinfo->vector_mode));
2904 else
2905 dump_printf_loc (MSG_NOTE, vect_location,
2906 "***** Analysis failed with vector mode %s\n",
2907 GET_MODE_NAME (loop_vinfo->vector_mode));
2910 loop->aux = NULL;
2912 if (!fatal)
2913 while (mode_i < vector_modes.length ()
2914 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "***** The result for vector mode %s would"
2919 " be the same\n",
2920 GET_MODE_NAME (vector_modes[mode_i]));
2921 mode_i += 1;
2924 if (res)
2926 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2927 vectorized_loops++;
2929 /* Once we hit the desired simdlen for the first time,
2930 discard any previous attempts. */
2931 if (simdlen
2932 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2934 delete first_loop_vinfo;
2935 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2936 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2937 simdlen = 0;
2939 else if (pick_lowest_cost_p && first_loop_vinfo)
2941 /* Keep trying to roll back vectorization attempts while the
2942 loop_vec_infos they produced were worse than this one. */
2943 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2944 while (!vinfos.is_empty ()
2945 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2947 gcc_assert (vect_epilogues);
2948 delete vinfos.pop ();
2950 if (vinfos.is_empty ()
2951 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2953 delete first_loop_vinfo;
2954 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2955 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2959 if (first_loop_vinfo == NULL)
2961 first_loop_vinfo = loop_vinfo;
2962 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2964 else if (vect_epilogues
2965 /* For now only allow one epilogue loop. */
2966 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2968 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2969 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2970 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2971 || maybe_ne (lowest_th, 0U));
2972 /* Keep track of the known smallest versioning
2973 threshold. */
2974 if (ordered_p (lowest_th, th))
2975 lowest_th = ordered_min (lowest_th, th);
2977 else
2979 delete loop_vinfo;
2980 loop_vinfo = opt_loop_vec_info::success (NULL);
2983 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2984 enabled, SIMDUID is not set, it is the innermost loop and we have
2985 either already found the loop's SIMDLEN or there was no SIMDLEN to
2986 begin with.
2987 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2988 vect_epilogues = (!simdlen
2989 && loop->inner == NULL
2990 && param_vect_epilogues_nomask
2991 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2992 && !loop->simduid
2993 /* For now only allow one epilogue loop, but allow
2994 pick_lowest_cost_p to replace it. */
2995 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2996 || pick_lowest_cost_p));
2998 /* Commit to first_loop_vinfo if we have no reason to try
2999 alternatives. */
3000 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3001 break;
3003 else
3005 delete loop_vinfo;
3006 loop_vinfo = opt_loop_vec_info::success (NULL);
3007 if (fatal)
3009 gcc_checking_assert (first_loop_vinfo == NULL);
3010 break;
3014 /* Handle the case in which the original loop can use partial
3015 vectorization, but we only want to adopt it for the epilogue.
3016 The retry should be in the same vector mode as the original. */
3017 if (vect_epilogues
3018 && loop_vinfo
3019 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3021 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3022 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3023 if (dump_enabled_p ())
3024 dump_printf_loc (MSG_NOTE, vect_location,
3025 "***** Re-trying analysis with same vector mode"
3026 " %s for epilogue with partial vectors.\n",
3027 GET_MODE_NAME (loop_vinfo->vector_mode));
3028 continue;
3031 if (mode_i < vector_modes.length ()
3032 && VECTOR_MODE_P (autodetected_vector_mode)
3033 && (related_vector_mode (vector_modes[mode_i],
3034 GET_MODE_INNER (autodetected_vector_mode))
3035 == autodetected_vector_mode)
3036 && (related_vector_mode (autodetected_vector_mode,
3037 GET_MODE_INNER (vector_modes[mode_i]))
3038 == vector_modes[mode_i]))
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location,
3042 "***** Skipping vector mode %s, which would"
3043 " repeat the analysis for %s\n",
3044 GET_MODE_NAME (vector_modes[mode_i]),
3045 GET_MODE_NAME (autodetected_vector_mode));
3046 mode_i += 1;
3049 if (mode_i == vector_modes.length ()
3050 || autodetected_vector_mode == VOIDmode)
3051 break;
3053 /* Try the next biggest vector size. */
3054 next_vector_mode = vector_modes[mode_i++];
3055 if (dump_enabled_p ())
3056 dump_printf_loc (MSG_NOTE, vect_location,
3057 "***** Re-trying analysis with vector mode %s\n",
3058 GET_MODE_NAME (next_vector_mode));
3061 if (first_loop_vinfo)
3063 loop->aux = (loop_vec_info) first_loop_vinfo;
3064 if (dump_enabled_p ())
3065 dump_printf_loc (MSG_NOTE, vect_location,
3066 "***** Choosing vector mode %s\n",
3067 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3068 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3069 return first_loop_vinfo;
3072 return opt_loop_vec_info::propagate_failure (res);
3075 /* Return true if there is an in-order reduction function for CODE, storing
3076 it in *REDUC_FN if so. */
3078 static bool
3079 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3081 switch (code)
3083 case PLUS_EXPR:
3084 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3085 return true;
3087 default:
3088 return false;
3092 /* Function reduction_fn_for_scalar_code
3094 Input:
3095 CODE - tree_code of a reduction operation.
3097 Output:
3098 REDUC_FN - the corresponding internal function to be used to reduce the
3099 vector of partial results into a single scalar result, or IFN_LAST
3100 if the operation is a supported reduction operation, but does not have
3101 such an internal function.
3103 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3105 static bool
3106 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3108 switch (code)
3110 case MAX_EXPR:
3111 *reduc_fn = IFN_REDUC_MAX;
3112 return true;
3114 case MIN_EXPR:
3115 *reduc_fn = IFN_REDUC_MIN;
3116 return true;
3118 case PLUS_EXPR:
3119 *reduc_fn = IFN_REDUC_PLUS;
3120 return true;
3122 case BIT_AND_EXPR:
3123 *reduc_fn = IFN_REDUC_AND;
3124 return true;
3126 case BIT_IOR_EXPR:
3127 *reduc_fn = IFN_REDUC_IOR;
3128 return true;
3130 case BIT_XOR_EXPR:
3131 *reduc_fn = IFN_REDUC_XOR;
3132 return true;
3134 case MULT_EXPR:
3135 case MINUS_EXPR:
3136 *reduc_fn = IFN_LAST;
3137 return true;
3139 default:
3140 return false;
3144 /* If there is a neutral value X such that SLP reduction NODE would not
3145 be affected by the introduction of additional X elements, return that X,
3146 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3147 is the vector type that would hold element X. REDUC_CHAIN is true if
3148 the SLP statements perform a single reduction, false if each statement
3149 performs an independent reduction. */
3151 static tree
3152 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3153 tree_code code, bool reduc_chain)
3155 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3156 stmt_vec_info stmt_vinfo = stmts[0];
3157 tree scalar_type = TREE_TYPE (vector_type);
3158 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3159 gcc_assert (loop);
3161 switch (code)
3163 case WIDEN_SUM_EXPR:
3164 case DOT_PROD_EXPR:
3165 case SAD_EXPR:
3166 case PLUS_EXPR:
3167 case MINUS_EXPR:
3168 case BIT_IOR_EXPR:
3169 case BIT_XOR_EXPR:
3170 return build_zero_cst (scalar_type);
3172 case MULT_EXPR:
3173 return build_one_cst (scalar_type);
3175 case BIT_AND_EXPR:
3176 return build_all_ones_cst (scalar_type);
3178 case MAX_EXPR:
3179 case MIN_EXPR:
3180 /* For MIN/MAX the initial values are neutral. A reduction chain
3181 has only a single initial value, so that value is neutral for
3182 all statements. */
3183 if (reduc_chain)
3184 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3185 loop_preheader_edge (loop));
3186 return NULL_TREE;
3188 default:
3189 return NULL_TREE;
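/* For example, an SLP reduction of the form

     sum0 += a[2*i];
     sum1 += a[2*i+1];

   can be padded with extra 0 elements without changing either sum, so
   zero is the neutral value returned for PLUS_EXPR (and similarly one
   for MULT_EXPR and an all-ones value for BIT_AND_EXPR).  MIN_EXPR and
   MAX_EXPR have no universal neutral element, which is why a non-NULL
   value is returned for them only in the reduction-chain case, where
   the single initial value itself acts as the neutral element.  */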
3193 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3194 STMT is printed with a message MSG. */
3196 static void
3197 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3199 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3202 /* Return true if we need an in-order reduction for operation CODE
3203 on type TYPE. */
3206 bool
3207 needs_fold_left_reduction_p (tree type, tree_code code)
3209 /* CHECKME: check for !flag_finite_math_only too? */
3210 if (SCALAR_FLOAT_TYPE_P (type))
3211 switch (code)
3213 case MIN_EXPR:
3214 case MAX_EXPR:
3215 return false;
3217 default:
3218 return !flag_associative_math;
3221 if (INTEGRAL_TYPE_P (type))
3223 if (!operation_no_trapping_overflow (type, code))
3224 return true;
3225 return false;
3228 if (SAT_FIXED_POINT_TYPE_P (type))
3229 return true;
3231 return false;
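/* For example, with

     double s = 0.0;
     for (i = 0; i < n; i++)
       s += x[i];

   reassociating the additions changes the rounding of the result, so
   unless -fassociative-math is in effect this function returns true
   and the reduction must be done in order (via IFN_FOLD_LEFT_PLUS,
   see fold_left_reduction_fn above, when that is available).  A
   floating-point MIN/MAX can always be reassociated, and an integer
   sum only needs in-order handling when the addition can trap on
   overflow (e.g. with -ftrapv).  */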
3234 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3235 has a handled computation expression. Store the main reduction
3236 operation in *CODE. */
3238 static bool
3239 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3240 tree loop_arg, enum tree_code *code,
3241 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3243 auto_bitmap visited;
3244 tree lookfor = PHI_RESULT (phi);
3245 ssa_op_iter curri;
3246 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3247 while (USE_FROM_PTR (curr) != loop_arg)
3248 curr = op_iter_next_use (&curri);
3249 curri.i = curri.numops;
3252 path.safe_push (std::make_pair (curri, curr));
3253 tree use = USE_FROM_PTR (curr);
3254 if (use == lookfor)
3255 break;
3256 gimple *def = SSA_NAME_DEF_STMT (use);
3257 if (gimple_nop_p (def)
3258 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3260 pop:
3263 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3264 curri = x.first;
3265 curr = x.second;
3267 curr = op_iter_next_use (&curri);
3268 /* Skip already visited or non-SSA operands (from iterating
3269 over PHI args). */
3270 while (curr != NULL_USE_OPERAND_P
3271 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3272 || ! bitmap_set_bit (visited,
3273 SSA_NAME_VERSION
3274 (USE_FROM_PTR (curr)))));
3276 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3277 if (curr == NULL_USE_OPERAND_P)
3278 break;
3280 else
3282 if (gimple_code (def) == GIMPLE_PHI)
3283 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3284 else
3285 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3286 while (curr != NULL_USE_OPERAND_P
3287 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3288 || ! bitmap_set_bit (visited,
3289 SSA_NAME_VERSION
3290 (USE_FROM_PTR (curr)))))
3291 curr = op_iter_next_use (&curri);
3292 if (curr == NULL_USE_OPERAND_P)
3293 goto pop;
3296 while (1);
3297 if (dump_file && (dump_flags & TDF_DETAILS))
3299 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3300 unsigned i;
3301 std::pair<ssa_op_iter, use_operand_p> *x;
3302 FOR_EACH_VEC_ELT (path, i, x)
3303 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3304 dump_printf (MSG_NOTE, "\n");
3307 /* Check whether the reduction path detected is valid. */
3308 bool fail = path.length () == 0;
3309 bool neg = false;
3310 int sign = -1;
3311 *code = ERROR_MARK;
3312 for (unsigned i = 1; i < path.length (); ++i)
3314 gimple *use_stmt = USE_STMT (path[i].second);
3315 tree op = USE_FROM_PTR (path[i].second);
3316 if (! is_gimple_assign (use_stmt)
3317 /* The following makes sure we can compute the operand index
3318 easily, plus it mostly disallows chaining via COND_EXPR condition
3319 operands. */
3320 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3321 && (gimple_num_ops (use_stmt) <= 2
3322 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3323 && (gimple_num_ops (use_stmt) <= 3
3324 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3326 fail = true;
3327 break;
3329 /* Check that the op is used on only a single stmt. For the
3330 non-value-changing tail and the last stmt, allow out-of-loop uses.
3331 ??? We could relax this and handle arbitrary live stmts by
3332 forcing a scalar epilogue for example. */
3333 imm_use_iterator imm_iter;
3334 gimple *op_use_stmt;
3335 unsigned cnt = 0;
3336 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3337 if (!is_gimple_debug (op_use_stmt)
3338 && (*code != ERROR_MARK
3339 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3341 /* We want to allow x + x but not x < 1 ? x : 2. */
3342 if (is_gimple_assign (op_use_stmt)
3343 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3345 use_operand_p use_p;
3346 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3347 cnt++;
3349 else
3350 cnt++;
3352 if (cnt != 1)
3354 fail = true;
3355 break;
3357 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3358 if (use_code == MINUS_EXPR)
3360 use_code = PLUS_EXPR;
3361 /* Track whether we negate the reduction value each iteration. */
3362 if (gimple_assign_rhs2 (use_stmt) == op)
3363 neg = ! neg;
3365 if (CONVERT_EXPR_CODE_P (use_code)
3366 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3367 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3369 else if (*code == ERROR_MARK)
3371 *code = use_code;
3372 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3374 else if (use_code != *code)
3376 fail = true;
3377 break;
3379 else if ((use_code == MIN_EXPR
3380 || use_code == MAX_EXPR)
3381 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3383 fail = true;
3384 break;
3387 return ! fail && ! neg && *code != ERROR_MARK;
3390 bool
3391 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3392 tree loop_arg, enum tree_code code)
3394 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3395 enum tree_code code_;
3396 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3397 && code_ == code);
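/* A sketch of what the path walk above finds: for a simple sum

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     _2  = a[i_5];
     s_3 = s_1 + _2;

   the walk from the latch value s_3 back to the PHI result s_1 goes
   through the single statement s_3 = s_1 + _2, so *CODE is set to
   PLUS_EXPR and the check succeeds.  A path that mixes operation
   codes, e.g. s_3 = (s_1 + _2) * _4, or one in which an intermediate
   value has additional uses inside the loop, causes the check to
   fail.  */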
3402 /* Function vect_is_simple_reduction
3404 (1) Detect a cross-iteration def-use cycle that represents a simple
3405 reduction computation. We look for the following pattern:
3407 loop_header:
3408 a1 = phi < a0, a2 >
3409 a3 = ...
3410 a2 = operation (a3, a1)
or
3414 a3 = ...
3415 loop_header:
3416 a1 = phi < a0, a2 >
3417 a2 = operation (a3, a1)
3419 such that:
3420 1. operation is commutative and associative and it is safe to
3421 change the order of the computation
3422 2. no uses for a2 in the loop (a2 is used out of the loop)
3423 3. no uses of a1 in the loop besides the reduction operation
3424 4. no uses of a1 outside the loop.
3426 Conditions 1,4 are tested here.
3427 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3429 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3430 nested cycles.
3432 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3433 reductions:
3435 a1 = phi < a0, a2 >
3436 inner loop (def of a3)
3437 a2 = phi < a3 >
3439 (4) Detect condition expressions, i.e.:
3440 for (int i = 0; i < N; i++)
3441 if (a[i] < val)
3442 ret_val = a[i];
3446 static stmt_vec_info
3447 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3448 bool *double_reduc, bool *reduc_chain_p)
3450 gphi *phi = as_a <gphi *> (phi_info->stmt);
3451 gimple *phi_use_stmt = NULL;
3452 imm_use_iterator imm_iter;
3453 use_operand_p use_p;
3455 *double_reduc = false;
3456 *reduc_chain_p = false;
3457 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3459 tree phi_name = PHI_RESULT (phi);
3460 /* ??? If there are no uses of the PHI result the inner loop reduction
3461 won't be detected as possibly double-reduction by vectorizable_reduction
3462 because that tries to walk the PHI arg from the preheader edge which
3463 can be constant. See PR60382. */
3464 if (has_zero_uses (phi_name))
3465 return NULL;
3466 class loop *loop = (gimple_bb (phi))->loop_father;
3467 unsigned nphi_def_loop_uses = 0;
3468 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3470 gimple *use_stmt = USE_STMT (use_p);
3471 if (is_gimple_debug (use_stmt))
3472 continue;
3474 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3476 if (dump_enabled_p ())
3477 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3478 "intermediate value used outside loop.\n");
3480 return NULL;
3483 nphi_def_loop_uses++;
3484 phi_use_stmt = use_stmt;
3487 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3488 if (TREE_CODE (latch_def) != SSA_NAME)
3490 if (dump_enabled_p ())
3491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3492 "reduction: not ssa_name: %T\n", latch_def);
3493 return NULL;
3496 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3497 if (!def_stmt_info
3498 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3499 return NULL;
3501 bool nested_in_vect_loop
3502 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3503 unsigned nlatch_def_loop_uses = 0;
3504 auto_vec<gphi *, 3> lcphis;
3505 bool inner_loop_of_double_reduc = false;
3506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3508 gimple *use_stmt = USE_STMT (use_p);
3509 if (is_gimple_debug (use_stmt))
3510 continue;
3511 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3512 nlatch_def_loop_uses++;
3513 else
3515 /* We can have more than one loop-closed PHI. */
3516 lcphis.safe_push (as_a <gphi *> (use_stmt));
3517 if (nested_in_vect_loop
3518 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3519 == vect_double_reduction_def))
3520 inner_loop_of_double_reduc = true;
3524 /* If we are vectorizing an inner reduction we are executing that
3525 in the original order only in case we are not dealing with a
3526 double reduction. */
3527 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3529 if (dump_enabled_p ())
3530 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3531 "detected nested cycle: ");
3532 return def_stmt_info;
3535 /* If this isn't a nested cycle or if the nested cycle reduction value
3536 is used outside of the inner loop we cannot handle uses of the reduction
3537 value. */
3538 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "reduction used in loop.\n");
3543 return NULL;
3546 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3547 defined in the inner loop. */
3548 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3550 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3551 if (gimple_phi_num_args (def_stmt) != 1
3552 || TREE_CODE (op1) != SSA_NAME)
3554 if (dump_enabled_p ())
3555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3556 "unsupported phi node definition.\n");
3558 return NULL;
3561 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3562 if (gimple_bb (def1)
3563 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3564 && loop->inner
3565 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3566 && is_gimple_assign (def1)
3567 && is_a <gphi *> (phi_use_stmt)
3568 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3570 if (dump_enabled_p ())
3571 report_vect_op (MSG_NOTE, def_stmt,
3572 "detected double reduction: ");
3574 *double_reduc = true;
3575 return def_stmt_info;
3578 return NULL;
3581 /* Look for the expression computing latch_def from the loop PHI result. */
3582 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3583 enum tree_code code;
3584 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3585 path))
3587 STMT_VINFO_REDUC_CODE (phi_info) = code;
3588 if (code == COND_EXPR && !nested_in_vect_loop)
3589 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3591 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3592 reduction chain for which the additional restriction is that
3593 all operations in the chain are the same. */
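	 /* For example (an illustrative sketch), a loop body like
	      s += a[i];  s += b[i];
	    becomes  s_2 = s_1 + a[i];  s_3 = s_2 + b[i];  in SSA form: both
	    statements use PLUS_EXPR and each intermediate value has a single
	    use, so both end up in one reduction chain.  */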
3594 auto_vec<stmt_vec_info, 8> reduc_chain;
3595 unsigned i;
3596 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3597 for (i = path.length () - 1; i >= 1; --i)
3599 gimple *stmt = USE_STMT (path[i].second);
3600 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3601 STMT_VINFO_REDUC_IDX (stmt_info)
3602 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3603 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3604 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3605 && (i == 1 || i == path.length () - 1));
3606 if ((stmt_code != code && !leading_conversion)
3607 /* We can only handle the final value in epilogue
3608 generation for reduction chains. */
3609 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3610 is_slp_reduc = false;
3611 /* For reduction chains we support trailing/leading
3612 conversions. We do not store those in the actual chain. */
3613 if (leading_conversion)
3614 continue;
3615 reduc_chain.safe_push (stmt_info);
3617 if (is_slp_reduc && reduc_chain.length () > 1)
3619 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3621 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3622 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3624 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3625 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3627 /* Save the chain for further analysis in SLP detection. */
3628 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3629 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3631 *reduc_chain_p = true;
3632 if (dump_enabled_p ())
3633 dump_printf_loc (MSG_NOTE, vect_location,
3634 "reduction: detected reduction chain\n");
3636 else if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "reduction: detected reduction\n");
3640 return def_stmt_info;
3643 if (dump_enabled_p ())
3644 dump_printf_loc (MSG_NOTE, vect_location,
3645 "reduction: unknown pattern\n");
3647 return NULL;
3650 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3651 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3652 or -1 if not known. */
3654 static int
3655 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3657 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3658 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_NOTE, vect_location,
3662 "cost model: epilogue peel iters set to vf/2 "
3663 "because loop iterations are unknown .\n");
3664 return assumed_vf / 2;
3666 else
3668 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3669 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3670 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3671 /* If we need to peel for gaps but no epilogue peeling would otherwise
3672 be required, we have to peel VF iterations. */
3673 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3674 peel_iters_epilogue = assumed_vf;
3675 return peel_iters_epilogue;
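/* For example (illustrative numbers): with NITERS = 100, an assumed VF of 8
   and PEEL_ITERS_PROLOGUE = 3 this returns (100 - 3) % 8 = 1; if peeling for
   gaps were required and that remainder were 0, it would return a full 8
   iterations instead.  */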
3679 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3680 int
3681 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3682 int *peel_iters_epilogue,
3683 stmt_vector_for_cost *scalar_cost_vec,
3684 stmt_vector_for_cost *prologue_cost_vec,
3685 stmt_vector_for_cost *epilogue_cost_vec)
3687 int retval = 0;
3689 *peel_iters_epilogue
3690 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3692 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3694 /* If peeled iterations are known but the number of scalar loop
3695 iterations is unknown, count a taken branch per peeled loop. */
3696 if (peel_iters_prologue > 0)
3697 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3698 NULL, NULL_TREE, 0, vect_prologue);
3699 if (*peel_iters_epilogue > 0)
3700 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3701 NULL, NULL_TREE, 0, vect_epilogue);
3704 stmt_info_for_cost *si;
3705 int j;
3706 if (peel_iters_prologue)
3707 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3708 retval += record_stmt_cost (prologue_cost_vec,
3709 si->count * peel_iters_prologue,
3710 si->kind, si->stmt_info, si->misalign,
3711 vect_prologue);
3712 if (*peel_iters_epilogue)
3713 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3714 retval += record_stmt_cost (epilogue_cost_vec,
3715 si->count * *peel_iters_epilogue,
3716 si->kind, si->stmt_info, si->misalign,
3717 vect_epilogue);
3719 return retval;
3722 /* Function vect_estimate_min_profitable_iters
3724 Return the number of iterations required for the vector version of the
3725 loop to be profitable relative to the cost of the scalar version of the
3726 loop.
3728 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3729 of iterations for vectorization. A value of -1 means loop vectorization
3730 is not profitable. This returned value may be used for a dynamic
3731 profitability check.
3733 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3734 for a static check against the estimated number of iterations. */
3736 static void
3737 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3738 int *ret_min_profitable_niters,
3739 int *ret_min_profitable_estimate)
3741 int min_profitable_iters;
3742 int min_profitable_estimate;
3743 int peel_iters_prologue;
3744 int peel_iters_epilogue;
3745 unsigned vec_inside_cost = 0;
3746 int vec_outside_cost = 0;
3747 unsigned vec_prologue_cost = 0;
3748 unsigned vec_epilogue_cost = 0;
3749 int scalar_single_iter_cost = 0;
3750 int scalar_outside_cost = 0;
3751 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3752 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3753 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3755 /* Cost model disabled. */
3756 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3758 if (dump_enabled_p ())
3759 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3760 *ret_min_profitable_niters = 0;
3761 *ret_min_profitable_estimate = 0;
3762 return;
3765 /* Requires loop versioning tests to handle misalignment. */
3766 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3768 /* FIXME: Make cost depend on complexity of individual check. */
3769 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3770 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3771 NULL, NULL_TREE, 0, vect_prologue);
3772 if (dump_enabled_p ())
3773 dump_printf (MSG_NOTE,
3774 "cost model: Adding cost of checks for loop "
3775 "versioning to treat misalignment.\n");
3778 /* Requires loop versioning with alias checks. */
3779 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3781 /* FIXME: Make cost depend on complexity of individual check. */
3782 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3783 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3784 NULL, NULL_TREE, 0, vect_prologue);
3785 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3786 if (len)
3787 /* Count LEN - 1 ANDs and LEN comparisons. */
3788 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3789 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3790 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3791 if (len)
3793 /* Count LEN - 1 ANDs and LEN comparisons. */
3794 unsigned int nstmts = len * 2 - 1;
3795 /* +1 for each bias that needs adding. */
3796 for (unsigned int i = 0; i < len; ++i)
3797 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3798 nstmts += 1;
3799 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3800 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3802 if (dump_enabled_p ())
3803 dump_printf (MSG_NOTE,
3804 "cost model: Adding cost of checks for loop "
3805 "versioning aliasing.\n");
3808 /* Requires loop versioning with niter checks. */
3809 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3811 /* FIXME: Make cost depend on complexity of individual check. */
3812 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3813 NULL, NULL_TREE, 0, vect_prologue);
3814 if (dump_enabled_p ())
3815 dump_printf (MSG_NOTE,
3816 "cost model: Adding cost of checks for loop "
3817 "versioning niters.\n");
3820 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3821 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3822 NULL, NULL_TREE, 0, vect_prologue);
3824 /* Count statements in the scalar loop. Use this as the scalar cost of a
3825 single iteration for now.
3827 TODO: Add outer loop support.
3829 TODO: Consider assigning different costs to different scalar
3830 statements. */
3832 scalar_single_iter_cost
3833 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3835 /* Add additional cost for the peeled instructions in prologue and epilogue
3836 loop. (For fully-masked loops there will be no peeling.)
3838 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3839 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3841 TODO: Build an expression that represents peel_iters for prologue and
3842 epilogue to be used in a run-time test. */
3844 bool prologue_need_br_taken_cost = false;
3845 bool prologue_need_br_not_taken_cost = false;
3847 /* Calculate peel_iters_prologue. */
3848 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3849 peel_iters_prologue = 0;
3850 else if (npeel < 0)
3852 peel_iters_prologue = assumed_vf / 2;
3853 if (dump_enabled_p ())
3854 dump_printf (MSG_NOTE, "cost model: "
3855 "prologue peel iters set to vf/2.\n");
3857 /* If peeled iterations are unknown, count a taken branch and a not taken
3858 branch per peeled loop. Even if scalar loop iterations are known,
3859 vector iterations are not known since peeled prologue iterations are
3860 not known. Hence guards remain the same. */
3861 prologue_need_br_taken_cost = true;
3862 prologue_need_br_not_taken_cost = true;
3864 else
3866 peel_iters_prologue = npeel;
3867 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3868 /* If peeled iterations are known but the number of scalar loop
3869 iterations is unknown, count a taken branch per peeled loop. */
3870 prologue_need_br_taken_cost = true;
3873 bool epilogue_need_br_taken_cost = false;
3874 bool epilogue_need_br_not_taken_cost = false;
3876 /* Calculate peel_iters_epilogue. */
3877 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3878 /* We need to peel exactly one iteration for gaps. */
3879 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3880 else if (npeel < 0)
3882 /* If peeling for alignment is unknown, the loop bound of the main loop
3883 becomes unknown. */
3884 peel_iters_epilogue = assumed_vf / 2;
3885 if (dump_enabled_p ())
3886 dump_printf (MSG_NOTE, "cost model: "
3887 "epilogue peel iters set to vf/2 because "
3888 "peeling for alignment is unknown.\n");
3890 /* See the same reasoning above in the peel_iters_prologue calculation. */
3891 epilogue_need_br_taken_cost = true;
3892 epilogue_need_br_not_taken_cost = true;
3894 else
3896 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3897 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3898 /* If peeled iterations are known but the number of scalar loop
3899 iterations is unknown, count a taken branch per peeled loop. */
3900 epilogue_need_br_taken_cost = true;
3903 stmt_info_for_cost *si;
3904 int j;
3905 /* Add costs associated with peel_iters_prologue. */
3906 if (peel_iters_prologue)
3907 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3909 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3910 si->count * peel_iters_prologue, si->kind,
3911 si->stmt_info, si->vectype, si->misalign,
3912 vect_prologue);
3915 /* Add costs associated with peel_iters_epilogue. */
3916 if (peel_iters_epilogue)
3917 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3919 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3920 si->count * peel_iters_epilogue, si->kind,
3921 si->stmt_info, si->vectype, si->misalign,
3922 vect_epilogue);
3925 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3927 if (prologue_need_br_taken_cost)
3928 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3929 NULL, NULL_TREE, 0, vect_prologue);
3931 if (prologue_need_br_not_taken_cost)
3932 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3933 cond_branch_not_taken, NULL, NULL_TREE, 0,
3934 vect_prologue);
3936 if (epilogue_need_br_taken_cost)
3937 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3938 NULL, NULL_TREE, 0, vect_epilogue);
3940 if (epilogue_need_br_not_taken_cost)
3941 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3942 cond_branch_not_taken, NULL, NULL_TREE, 0,
3943 vect_epilogue);
3945 /* Take care of special costs for rgroup controls of partial vectors. */
3946 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3948 /* Calculate how many masks we need to generate. */
3949 unsigned int num_masks = 0;
3950 rgroup_controls *rgm;
3951 unsigned int num_vectors_m1;
3952 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3953 if (rgm->type)
3954 num_masks += num_vectors_m1 + 1;
3955 gcc_assert (num_masks > 0);
3957 /* In the worst case, we need to generate each mask in the prologue
3958 and in the loop body. One of the loop body mask instructions
3959 replaces the comparison in the scalar loop, and since we don't
3960 count the scalar comparison against the scalar body, we shouldn't
3961 count that vector instruction against the vector body either.
3963 Sometimes we can use unpacks instead of generating prologue
3964 masks and sometimes the prologue mask will fold to a constant,
3965 so the actual prologue cost might be smaller. However, it's
3966 simpler and safer to use the worst-case cost; if this ends up
3967 being the tie-breaker between vectorizing or not, then it's
3968 probably better not to vectorize. */
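	 /* For example (illustrative): with two mask rgroups needing one and
	    two vectors respectively, num_masks is 3, so the worst case charges
	    three vector statements to the prologue and 3 - 1 = 2 to the loop
	    body, one body mask standing in for the scalar loop's
	    comparison.  */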
3969 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3970 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3971 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3972 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3974 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3976 /* Referring to the functions vect_set_loop_condition_partial_vectors
3977 and vect_set_loop_controls_directly, we need to generate each
3978 length in the prologue and in the loop body if required. Although
3979 there are some possible optimizations, we consider the worst case
3980 here. */
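      /* For example (illustrative): a single length rgroup with two vectors
	 (num_vectors_m1 == 1), nitems != 1, unknown niters, a possibly
	 wrapping IV and need_iterate_p set would count 1 + 2 + 2 + 2 = 7
	 prologue statements and 3 * 2 = 6 body statements below.  */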
3982 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3983 bool need_iterate_p
3984 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3985 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3987 /* Calculate how many statements to be added. */
3988 unsigned int prologue_stmts = 0;
3989 unsigned int body_stmts = 0;
3991 rgroup_controls *rgc;
3992 unsigned int num_vectors_m1;
3993 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3994 if (rgc->type)
3996 /* May need one SHIFT for nitems_total computation. */
3997 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3998 if (nitems != 1 && !niters_known_p)
3999 prologue_stmts += 1;
4001 /* May need one MAX and one MINUS for wrap around. */
4002 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4003 prologue_stmts += 2;
4005 /* Need one MAX and one MINUS for each batch limit except for
4006 the first one. */
4007 prologue_stmts += num_vectors_m1 * 2;
4009 unsigned int num_vectors = num_vectors_m1 + 1;
4011 /* Need to set up lengths in prologue, only one MIN required
4012 for each since start index is zero. */
4013 prologue_stmts += num_vectors;
4015 /* Each may need two MINs and one MINUS to update lengths in body
4016 for next iteration. */
4017 if (need_iterate_p)
4018 body_stmts += 3 * num_vectors;
4021 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4022 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4023 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4024 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4027 /* FORNOW: The scalar outside cost is incremented in one of the
4028 following ways:
4030 1. The vectorizer checks for alignment and aliasing and generates
4031 a condition that allows dynamic vectorization. A cost model
4032 check is ANDED with the versioning condition. Hence scalar code
4033 path now has the added cost of the versioning check.
4035 if (cost > th & versioning_check)
4036 jmp to vector code
4038 Hence the run-time scalar cost is incremented by the not-taken branch cost.
4040 2. The vectorizer then checks if a prologue is required. If the
4041 cost model check was not done before during versioning, it has to
4042 be done before the prologue check.
4044 if (cost <= th)
4045 prologue = scalar_iters
4046 if (prologue == 0)
4047 jmp to vector code
4048 else
4049 execute prologue
4050 if (prologue == num_iters)
4051 go to exit
4053 Hence the run-time scalar cost is incremented by a taken branch,
4054 plus a not-taken branch, plus a taken branch cost.
4056 3. The vectorizer then checks if an epilogue is required. If the
4057 cost model check was not done before during prologue check, it
4058 has to be done with the epilogue check.
4060 if (prologue == 0)
4061 jmp to vector code
4062 else
4063 execute prologue
4064 if (prologue == num_iters)
4065 go to exit
4066 vector code:
4067 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4068 jmp to epilogue
4070 Hence the run-time scalar cost should be incremented by 2 taken
4071 branches.
4073 TODO: The back end may reorder the BBs differently and reverse
4074 conditions/branch directions. Change the estimates below to
4075 something more reasonable. */
4077 /* If the number of iterations is known and we do not do versioning, we can
4078 decide whether to vectorize at compile time. Hence the scalar version
4079 does not carry cost model guard costs. */
4080 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4081 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4083 /* Cost model check occurs at versioning. */
4084 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4085 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4086 else
4088 /* Cost model check occurs at prologue generation. */
4089 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4090 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4091 + vect_get_stmt_cost (cond_branch_not_taken);
4092 /* Cost model check occurs at epilogue generation. */
4093 else
4094 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4098 /* Complete the target-specific cost calculations. */
4099 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4100 &vec_inside_cost, &vec_epilogue_cost);
4102 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4104 /* Stash the costs so that we can compare two loop_vec_infos. */
4105 loop_vinfo->vec_inside_cost = vec_inside_cost;
4106 loop_vinfo->vec_outside_cost = vec_outside_cost;
4108 if (dump_enabled_p ())
4110 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4111 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4112 vec_inside_cost);
4113 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4114 vec_prologue_cost);
4115 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4116 vec_epilogue_cost);
4117 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4118 scalar_single_iter_cost);
4119 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4120 scalar_outside_cost);
4121 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4122 vec_outside_cost);
4123 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4124 peel_iters_prologue);
4125 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4126 peel_iters_epilogue);
4129 /* Calculate number of iterations required to make the vector version
4130 profitable, relative to the loop bodies only. The following condition
4131 must hold true:
4132 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4133 where
4134 SIC = scalar iteration cost, VIC = vector iteration cost,
4135 VOC = vector outside cost, VF = vectorization factor,
4136 NPEEL = prologue iterations + epilogue iterations,
4137 SOC = scalar outside cost for run time cost model check. */
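  /* As an illustrative example (made-up numbers): with SIC = 4, VIC = 8,
     VF = 4, VOC = 20, SOC = 0 and NPEEL = 0 the scalar cost is 4 * niters
     while the vector cost is 8 * (niters / 4) + 20 = 2 * niters + 20, so
     vectorization pays off from 11 scalar iterations on; nonzero NPEEL and
     SOC shift this threshold accordingly.  */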
4139 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4140 - vec_inside_cost);
4141 if (saving_per_viter <= 0)
4143 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4144 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4145 "vectorization did not happen for a simd loop");
4147 if (dump_enabled_p ())
4148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4149 "cost model: the vector iteration cost = %d "
4150 "divided by the scalar iteration cost = %d "
4151 "is greater or equal to the vectorization factor = %d"
4152 ".\n",
4153 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4154 *ret_min_profitable_niters = -1;
4155 *ret_min_profitable_estimate = -1;
4156 return;
4159 /* ??? The "if" arm is written to handle all cases; see below for what
4160 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4161 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4163 /* Rewriting the condition above in terms of the number of
4164 vector iterations (vniters) rather than the number of
4165 scalar iterations (niters) gives:
4167 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4169 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4171 For integer N, X and Y when X > 0:
4173 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4174 int outside_overhead = (vec_outside_cost
4175 - scalar_single_iter_cost * peel_iters_prologue
4176 - scalar_single_iter_cost * peel_iters_epilogue
4177 - scalar_outside_cost);
4178 /* We're only interested in cases that require at least one
4179 vector iteration. */
4180 int min_vec_niters = 1;
4181 if (outside_overhead > 0)
4182 min_vec_niters = outside_overhead / saving_per_viter + 1;
4184 if (dump_enabled_p ())
4185 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4186 min_vec_niters);
4188 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4190 /* Now that we know the minimum number of vector iterations,
4191 find the minimum niters for which the scalar cost is larger:
4193 SIC * niters > VIC * vniters + VOC - SOC
4195 We know that the minimum niters is no more than
4196 vniters * VF + NPEEL, but it might be (and often is) less
4197 than that if a partial vector iteration is cheaper than the
4198 equivalent scalar code. */
4199 int threshold = (vec_inside_cost * min_vec_niters
4200 + vec_outside_cost
4201 - scalar_outside_cost);
4202 if (threshold <= 0)
4203 min_profitable_iters = 1;
4204 else
4205 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4207 else
4208 /* Convert the number of vector iterations into a number of
4209 scalar iterations. */
4210 min_profitable_iters = (min_vec_niters * assumed_vf
4211 + peel_iters_prologue
4212 + peel_iters_epilogue);
4214 else
4216 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4217 * assumed_vf
4218 - vec_inside_cost * peel_iters_prologue
4219 - vec_inside_cost * peel_iters_epilogue);
4220 if (min_profitable_iters <= 0)
4221 min_profitable_iters = 0;
4222 else
4224 min_profitable_iters /= saving_per_viter;
4226 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4227 <= (((int) vec_inside_cost * min_profitable_iters)
4228 + (((int) vec_outside_cost - scalar_outside_cost)
4229 * assumed_vf)))
4230 min_profitable_iters++;
4234 if (dump_enabled_p ())
4235 dump_printf (MSG_NOTE,
4236 " Calculated minimum iters for profitability: %d\n",
4237 min_profitable_iters);
4239 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4240 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4241 /* We want the vectorized loop to execute at least once. */
4242 min_profitable_iters = assumed_vf + peel_iters_prologue;
4243 else if (min_profitable_iters < peel_iters_prologue)
4244 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4245 vectorized loop executes at least once. */
4246 min_profitable_iters = peel_iters_prologue;
4248 if (dump_enabled_p ())
4249 dump_printf_loc (MSG_NOTE, vect_location,
4250 " Runtime profitability threshold = %d\n",
4251 min_profitable_iters);
4253 *ret_min_profitable_niters = min_profitable_iters;
4255 /* Calculate number of iterations required to make the vector version
4256 profitable, relative to the loop bodies only.
4258 The non-vectorized variant costs SIC * niters and must win over the vector
4259 variant on the expected loop trip count. The following condition must hold true:
4260 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4262 if (vec_outside_cost <= 0)
4263 min_profitable_estimate = 0;
4264 /* ??? This "else if" arm is written to handle all cases; see below for
4265 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4266 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4268 /* This is a repeat of the code above, but with + SOC rather
4269 than - SOC. */
4270 int outside_overhead = (vec_outside_cost
4271 - scalar_single_iter_cost * peel_iters_prologue
4272 - scalar_single_iter_cost * peel_iters_epilogue
4273 + scalar_outside_cost);
4274 int min_vec_niters = 1;
4275 if (outside_overhead > 0)
4276 min_vec_niters = outside_overhead / saving_per_viter + 1;
4278 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4280 int threshold = (vec_inside_cost * min_vec_niters
4281 + vec_outside_cost
4282 + scalar_outside_cost);
4283 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4285 else
4286 min_profitable_estimate = (min_vec_niters * assumed_vf
4287 + peel_iters_prologue
4288 + peel_iters_epilogue);
4290 else
4292 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4293 * assumed_vf
4294 - vec_inside_cost * peel_iters_prologue
4295 - vec_inside_cost * peel_iters_epilogue)
4296 / ((scalar_single_iter_cost * assumed_vf)
4297 - vec_inside_cost);
4299 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4300 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_NOTE, vect_location,
4302 " Static estimate profitability threshold = %d\n",
4303 min_profitable_estimate);
4305 *ret_min_profitable_estimate = min_profitable_estimate;
4308 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4309 vector elements (not bits) for a vector with NELT elements. */
4310 static void
4311 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4312 vec_perm_builder *sel)
4314 /* The encoding is a single stepped pattern. Any wrap-around is handled
4315 by vec_perm_indices. */
4316 sel->new_vector (nelt, 1, 3);
4317 for (unsigned int i = 0; i < 3; i++)
4318 sel->quick_push (i + offset);
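/* For instance (illustrative), OFFSET = 2 and NELT = 8 encode the series
   { 2, 3, 4, 5, 6, 7, 8, 9 }; in the two-input permutation built by
   have_whole_vector_shift the indices 8 and 9 select from the second
   operand, i.e. they are the elements shifted in.  */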
4321 /* Checks whether the target supports whole-vector shifts for vectors of mode
4322 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4323 it supports vec_perm_const with masks for all necessary shift amounts. */
4324 static bool
4325 have_whole_vector_shift (machine_mode mode)
4327 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4328 return true;
4330 /* Variable-length vectors should be handled via the optab. */
4331 unsigned int nelt;
4332 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4333 return false;
4335 vec_perm_builder sel;
4336 vec_perm_indices indices;
4337 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4339 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4340 indices.new_vector (sel, 2, nelt);
4341 if (!can_vec_perm_const_p (mode, indices, false))
4342 return false;
4344 return true;
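/* The reduction epilogue relies on this, for example: with whole-vector
   shifts an 8-element vector can be reduced in log2(8) = 3 shift-and-combine
   steps (shift by 4 elements and combine, then by 2, then by 1) followed by
   a single extraction of element 0; this is the scheme behind the
   exact_log2-based cost in vect_model_reduction_cost (a sketch of the idea
   only).  */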
4347 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4348 functions. Design better to avoid maintenance issues. */
4350 /* Function vect_model_reduction_cost.
4352 Models cost for a reduction operation, including the vector ops
4353 generated within the strip-mine loop, the initial definition before
4354 the loop, and the epilogue code that must be generated. */
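/* For instance (a rough sketch for a plain sum reduction with REDUC_FN
   available and no nesting): the prologue is charged one scalar_to_vec for
   broadcasting the initial value, the loop body NCOPIES vector statements
   for the adds, and the epilogue one vector statement for the reduction
   plus one vec_to_scalar for extracting the final scalar.  */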
4356 static void
4357 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4358 stmt_vec_info stmt_info, internal_fn reduc_fn,
4359 vect_reduction_type reduction_type,
4360 int ncopies, stmt_vector_for_cost *cost_vec)
4362 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4363 enum tree_code code;
4364 optab optab;
4365 tree vectype;
4366 machine_mode mode;
4367 class loop *loop = NULL;
4369 if (loop_vinfo)
4370 loop = LOOP_VINFO_LOOP (loop_vinfo);
4372 /* Condition reductions generate two reductions in the loop. */
4373 if (reduction_type == COND_REDUCTION)
4374 ncopies *= 2;
4376 vectype = STMT_VINFO_VECTYPE (stmt_info);
4377 mode = TYPE_MODE (vectype);
4378 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4380 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4382 if (reduction_type == EXTRACT_LAST_REDUCTION)
4383 /* No extra instructions are needed in the prologue. The loop body
4384 operations are costed in vectorizable_condition. */
4385 inside_cost = 0;
4386 else if (reduction_type == FOLD_LEFT_REDUCTION)
4388 /* No extra instructions needed in the prologue. */
4389 prologue_cost = 0;
4391 if (reduc_fn != IFN_LAST)
4392 /* Count one reduction-like operation per vector. */
4393 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4394 stmt_info, 0, vect_body);
4395 else
4397 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4398 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4399 inside_cost = record_stmt_cost (cost_vec, nelements,
4400 vec_to_scalar, stmt_info, 0,
4401 vect_body);
4402 inside_cost += record_stmt_cost (cost_vec, nelements,
4403 scalar_stmt, stmt_info, 0,
4404 vect_body);
4407 else
4409 /* Add in cost for initial definition.
4410 For cond reduction we have four vectors: initial index, step,
4411 initial result of the data reduction, initial value of the index
4412 reduction. */
4413 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4414 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4415 scalar_to_vec, stmt_info, 0,
4416 vect_prologue);
4418 /* Cost of reduction op inside loop. */
4419 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4420 stmt_info, 0, vect_body);
4423 /* Determine cost of epilogue code.
4425 We have a reduction operator that will reduce the vector in one statement.
4426 Also requires scalar extract. */
4428 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4430 if (reduc_fn != IFN_LAST)
4432 if (reduction_type == COND_REDUCTION)
4434 /* An EQ stmt and a COND_EXPR stmt. */
4435 epilogue_cost += record_stmt_cost (cost_vec, 2,
4436 vector_stmt, stmt_info, 0,
4437 vect_epilogue);
4438 /* Reduction of the max index and a reduction of the found
4439 values. */
4440 epilogue_cost += record_stmt_cost (cost_vec, 2,
4441 vec_to_scalar, stmt_info, 0,
4442 vect_epilogue);
4443 /* A broadcast of the max value. */
4444 epilogue_cost += record_stmt_cost (cost_vec, 1,
4445 scalar_to_vec, stmt_info, 0,
4446 vect_epilogue);
4448 else
4450 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4451 stmt_info, 0, vect_epilogue);
4452 epilogue_cost += record_stmt_cost (cost_vec, 1,
4453 vec_to_scalar, stmt_info, 0,
4454 vect_epilogue);
4457 else if (reduction_type == COND_REDUCTION)
4459 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4460 /* Extraction of scalar elements. */
4461 epilogue_cost += record_stmt_cost (cost_vec,
4462 2 * estimated_nunits,
4463 vec_to_scalar, stmt_info, 0,
4464 vect_epilogue);
4465 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4466 epilogue_cost += record_stmt_cost (cost_vec,
4467 2 * estimated_nunits - 3,
4468 scalar_stmt, stmt_info, 0,
4469 vect_epilogue);
4471 else if (reduction_type == EXTRACT_LAST_REDUCTION
4472 || reduction_type == FOLD_LEFT_REDUCTION)
4473 /* No extra instructions needed in the epilogue. */
4475 else
4477 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4478 tree bitsize =
4479 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4480 int element_bitsize = tree_to_uhwi (bitsize);
4481 int nelements = vec_size_in_bits / element_bitsize;
4483 if (code == COND_EXPR)
4484 code = MAX_EXPR;
4486 optab = optab_for_tree_code (code, vectype, optab_default);
4488 /* We have a whole vector shift available. */
4489 if (optab != unknown_optab
4490 && VECTOR_MODE_P (mode)
4491 && optab_handler (optab, mode) != CODE_FOR_nothing
4492 && have_whole_vector_shift (mode))
4494 /* Final reduction via vector shifts and the reduction operator.
4495 Also requires scalar extract. */
4496 epilogue_cost += record_stmt_cost (cost_vec,
4497 exact_log2 (nelements) * 2,
4498 vector_stmt, stmt_info, 0,
4499 vect_epilogue);
4500 epilogue_cost += record_stmt_cost (cost_vec, 1,
4501 vec_to_scalar, stmt_info, 0,
4502 vect_epilogue);
4504 else
4505 /* Use extracts and reduction op for final reduction. For N
4506 elements, we have N extracts and N-1 reduction ops. */
4507 epilogue_cost += record_stmt_cost (cost_vec,
4508 nelements + nelements - 1,
4509 vector_stmt, stmt_info, 0,
4510 vect_epilogue);
4514 if (dump_enabled_p ())
4515 dump_printf (MSG_NOTE,
4516 "vect_model_reduction_cost: inside_cost = %d, "
4517 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4518 prologue_cost, epilogue_cost);
4523 /* Function get_initial_def_for_reduction
4525 Input:
4526 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4527 INIT_VAL - the initial value of the reduction variable
4529 Output:
4530 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4531 of the reduction (used for adjusting the epilog - see below).
4532 Return a vector variable, initialized according to the operation that
4533 STMT_VINFO performs. This vector will be used as the initial value
4534 of the vector of partial results.
4536 Option1 (adjust in epilog): Initialize the vector as follows:
4537 add/bit or/xor: [0,0,...,0,0]
4538 mult/bit and: [1,1,...,1,1]
4539 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4540 and when necessary (e.g. add/mult case) let the caller know
4541 that it needs to adjust the result by init_val.
4543 Option2: Initialize the vector as follows:
4544 add/bit or/xor: [init_val,0,0,...,0]
4545 mult/bit and: [init_val,1,1,...,1]
4546 min/max/cond_expr: [init_val,init_val,...,init_val]
4547 and no adjustments are needed.
4549 For example, for the following code:
4551 s = init_val;
4552 for (i=0;i<n;i++)
4553 s = s + a[i];
4555 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4556 For a vector of 4 units, we want to return either [0,0,0,init_val],
4557 or [0,0,0,0] and let the caller know that it needs to adjust
4558 the result at the end by 'init_val'.
4560 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4561 is not NULL, because this way the initialization vector is simpler (same
4562 element in all entries), and Option2 otherwise.
4564 A cost model should help decide between these two schemes. */
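/* A further illustration (made-up values): for  s = init;  ...  s *= a[i];
   with a 4-lane vector, Option1 yields {1,1,1,1} and reports INIT as the
   adjustment to be multiplied in after the loop, whereas Option2 yields
   {init,1,1,1} and needs no adjustment; for MIN/MAX/COND_EXPR the vector is
   {init,init,init,init} either way.  */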
4566 static tree
4567 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4568 stmt_vec_info stmt_vinfo,
4569 enum tree_code code, tree init_val,
4570 tree *adjustment_def)
4572 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4573 tree scalar_type = TREE_TYPE (init_val);
4574 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4575 tree def_for_init;
4576 tree init_def;
4577 REAL_VALUE_TYPE real_init_val = dconst0;
4578 int int_init_val = 0;
4579 gimple_seq stmts = NULL;
4581 gcc_assert (vectype);
4583 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4584 || SCALAR_FLOAT_TYPE_P (scalar_type));
4586 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4587 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4589 /* ADJUSTMENT_DEF is NULL when called from
4590 vect_create_epilog_for_reduction to vectorize double reduction. */
4591 if (adjustment_def)
4592 *adjustment_def = NULL;
4594 switch (code)
4596 case WIDEN_SUM_EXPR:
4597 case DOT_PROD_EXPR:
4598 case SAD_EXPR:
4599 case PLUS_EXPR:
4600 case MINUS_EXPR:
4601 case BIT_IOR_EXPR:
4602 case BIT_XOR_EXPR:
4603 case MULT_EXPR:
4604 case BIT_AND_EXPR:
4606 if (code == MULT_EXPR)
4608 real_init_val = dconst1;
4609 int_init_val = 1;
4612 if (code == BIT_AND_EXPR)
4613 int_init_val = -1;
4615 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4616 def_for_init = build_real (scalar_type, real_init_val);
4617 else
4618 def_for_init = build_int_cst (scalar_type, int_init_val);
4620 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4622 /* Option1: the first element is '0' or '1' as well. */
4623 if (!operand_equal_p (def_for_init, init_val, 0))
4624 *adjustment_def = init_val;
4625 init_def = gimple_build_vector_from_val (&stmts, vectype,
4626 def_for_init);
4628 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4630 /* Option2 (variable length): the first element is INIT_VAL. */
4631 init_def = gimple_build_vector_from_val (&stmts, vectype,
4632 def_for_init);
4633 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4634 vectype, init_def, init_val);
4636 else
4638 /* Option2: the first element is INIT_VAL. */
4639 tree_vector_builder elts (vectype, 1, 2);
4640 elts.quick_push (init_val);
4641 elts.quick_push (def_for_init);
4642 init_def = gimple_build_vector (&stmts, &elts);
4645 break;
4647 case MIN_EXPR:
4648 case MAX_EXPR:
4649 case COND_EXPR:
4651 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4652 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4654 break;
4656 default:
4657 gcc_unreachable ();
4660 if (stmts)
4661 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4662 return init_def;
4665 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4666 NUMBER_OF_VECTORS is the number of vector defs to create.
4667 If NEUTRAL_OP is nonnull, introducing extra elements of that
4668 value will not change the result. */
4670 static void
4671 get_initial_defs_for_reduction (vec_info *vinfo,
4672 slp_tree slp_node,
4673 vec<tree> *vec_oprnds,
4674 unsigned int number_of_vectors,
4675 bool reduc_chain, tree neutral_op)
4677 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4678 stmt_vec_info stmt_vinfo = stmts[0];
4679 unsigned HOST_WIDE_INT nunits;
4680 unsigned j, number_of_places_left_in_vector;
4681 tree vector_type;
4682 unsigned int group_size = stmts.length ();
4683 unsigned int i;
4684 class loop *loop;
4686 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4688 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4690 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4691 gcc_assert (loop);
4692 edge pe = loop_preheader_edge (loop);
4694 gcc_assert (!reduc_chain || neutral_op);
4696 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4697 created vectors. It is greater than 1 if unrolling is performed.
4699 For example, we have two scalar operands, s1 and s2 (e.g., group of
4700 strided accesses of size two), while NUNITS is four (i.e., four scalars
4701 of this type can be packed in a vector). The output vector will contain
4702 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4703 will be 2).
4705 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4706 vectors containing the operands.
4708 For example, NUNITS is four as before, and the group size is 8
4709 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4710 {s5, s6, s7, s8}. */
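      /* A further illustration: with NUNITS = 4, GROUP_SIZE = 3 and a neutral
	 value available, the first vector can be completed as
	 {s1, s2, s3, neutral}, because extra neutral elements do not change
	 the reduction result.  */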
4712 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4713 nunits = group_size;
4715 number_of_places_left_in_vector = nunits;
4716 bool constant_p = true;
4717 tree_vector_builder elts (vector_type, nunits, 1);
4718 elts.quick_grow (nunits);
4719 gimple_seq ctor_seq = NULL;
4720 for (j = 0; j < nunits * number_of_vectors; ++j)
4722 tree op;
4723 i = j % group_size;
4724 stmt_vinfo = stmts[i];
4726 /* Get the def before the loop. In a reduction chain we have only one
4727 initial value; otherwise we have as many as there are PHIs in the group. */
4728 if (reduc_chain)
4729 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4730 else if (((vec_oprnds->length () + 1) * nunits
4731 - number_of_places_left_in_vector >= group_size)
4732 && neutral_op)
4733 op = neutral_op;
4734 else
4735 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4737 /* Create 'vect_ = {op0,op1,...,opn}'. */
4738 number_of_places_left_in_vector--;
4739 elts[nunits - number_of_places_left_in_vector - 1] = op;
4740 if (!CONSTANT_CLASS_P (op))
4741 constant_p = false;
4743 if (number_of_places_left_in_vector == 0)
4745 tree init;
4746 if (constant_p && !neutral_op
4747 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4748 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4749 /* Build the vector directly from ELTS. */
4750 init = gimple_build_vector (&ctor_seq, &elts);
4751 else if (neutral_op)
4753 /* Build a vector of the neutral value and shift the
4754 other elements into place. */
4755 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4756 neutral_op);
4757 int k = nunits;
4758 while (k > 0 && elts[k - 1] == neutral_op)
4759 k -= 1;
4760 while (k > 0)
4762 k -= 1;
4763 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4764 vector_type, init, elts[k]);
4767 else
4769 /* First time round, duplicate ELTS to fill the
4770 required number of vectors. */
4771 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4772 number_of_vectors, *vec_oprnds);
4773 break;
4775 vec_oprnds->quick_push (init);
4777 number_of_places_left_in_vector = nunits;
4778 elts.new_vector (vector_type, nunits, 1);
4779 elts.quick_grow (nunits);
4780 constant_p = true;
4783 if (ctor_seq != NULL)
4784 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4787 /* For a statement STMT_INFO taking part in a reduction operation, return
4788 the stmt_vec_info that the meta information is stored on. */
4790 stmt_vec_info
4791 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4793 stmt_info = vect_orig_stmt (stmt_info);
4794 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4795 if (!is_a <gphi *> (stmt_info->stmt)
4796 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4797 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4798 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4799 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4801 if (gimple_phi_num_args (phi) == 1)
4802 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4804 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4806 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4807 stmt_vec_info info
4808 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4809 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4810 stmt_info = info;
4812 return stmt_info;
4815 /* Function vect_create_epilog_for_reduction
4817 Create code at the loop-epilog to finalize the result of a reduction
4818 computation.
4820 STMT_INFO is the scalar reduction stmt that is being vectorized.
4821 SLP_NODE is an SLP node containing a group of reduction statements. The
4822 first one in this group is STMT_INFO.
4823 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4824 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4825 (counting from 0)
4827 This function:
4828 1. Completes the reduction def-use cycles.
4829 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4830 by calling the function specified by REDUC_FN if available, or by
4831 other means (whole-vector shifts or a scalar loop).
4832 The function also creates a new phi node at the loop exit to preserve
4833 loop-closed form, as illustrated below.
4835 The flow at the entry to this function:
4837 loop:
4838 vec_def = phi <vec_init, null> # REDUCTION_PHI
4839 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4840 s_loop = scalar_stmt # (scalar) STMT_INFO
4841 loop_exit:
4842 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4843 use <s_out0>
4844 use <s_out0>
4846 The above is transformed by this function into:
4848 loop:
4849 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4850 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4851 s_loop = scalar_stmt # (scalar) STMT_INFO
4852 loop_exit:
4853 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4854 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4855 v_out2 = reduce <v_out1>
4856 s_out3 = extract_field <v_out2, 0>
4857 s_out4 = adjust_result <s_out3>
4858 use <s_out4>
4859 use <s_out4>
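     As an illustration (a sketch with made-up details): for a four-lane sum
     reduction, v_out2 above is a REDUC_PLUS-style reduction of v_out1,
     s_out3 extracts its element 0, and s_out4 adds the epilogue adjustment
     (e.g. the scalar initial value) when the 'adjust in epilog' scheme was
     chosen.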
4862 static void
4863 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4864 stmt_vec_info stmt_info,
4865 slp_tree slp_node,
4866 slp_instance slp_node_instance)
4868 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4869 gcc_assert (reduc_info->is_reduc_info);
4870 /* For double reductions we need to get at the inner loop reduction
4871 stmt which has the meta info attached. Our stmt_info is that of the
4872 loop-closed PHI of the inner loop which we remember as
4873 def for the reduction PHI generation. */
4874 bool double_reduc = false;
4875 stmt_vec_info rdef_info = stmt_info;
4876 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4878 gcc_assert (!slp_node);
4879 double_reduc = true;
4880 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4881 (stmt_info->stmt, 0));
4882 stmt_info = vect_stmt_to_vectorize (stmt_info);
4884 gphi *reduc_def_stmt
4885 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4886 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4887 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4888 tree vectype;
4889 machine_mode mode;
4890 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4891 basic_block exit_bb;
4892 tree scalar_dest;
4893 tree scalar_type;
4894 gimple *new_phi = NULL, *phi;
4895 gimple_stmt_iterator exit_gsi;
4896 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4897 gimple *epilog_stmt = NULL;
4898 gimple *exit_phi;
4899 tree bitsize;
4900 tree def;
4901 tree orig_name, scalar_result;
4902 imm_use_iterator imm_iter, phi_imm_iter;
4903 use_operand_p use_p, phi_use_p;
4904 gimple *use_stmt;
4905 bool nested_in_vect_loop = false;
4906 auto_vec<gimple *> new_phis;
4907 int j, i;
4908 auto_vec<tree> scalar_results;
4909 unsigned int group_size = 1, k;
4910 auto_vec<gimple *> phis;
4911 bool slp_reduc = false;
4912 bool direct_slp_reduc;
4913 tree new_phi_result;
4914 tree induction_index = NULL_TREE;
4916 if (slp_node)
4917 group_size = SLP_TREE_LANES (slp_node);
4919 if (nested_in_vect_loop_p (loop, stmt_info))
4921 outer_loop = loop;
4922 loop = loop->inner;
4923 nested_in_vect_loop = true;
4924 gcc_assert (!slp_node);
4926 gcc_assert (!nested_in_vect_loop || double_reduc);
4928 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4929 gcc_assert (vectype);
4930 mode = TYPE_MODE (vectype);
4932 tree initial_def = NULL;
4933 tree induc_val = NULL_TREE;
4934 tree adjustment_def = NULL;
4935 if (slp_node)
4937 else
4939 /* Get at the scalar def before the loop, that defines the initial value
4940 of the reduction variable. */
4941 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4942 loop_preheader_edge (loop));
4943 /* Optimize: for induction condition reduction, if we can't use zero
4944 for induc_val, use initial_def. */
4945 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4946 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4947 else if (double_reduc)
4949 else if (nested_in_vect_loop)
4951 else
4952 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4955 unsigned vec_num;
4956 int ncopies;
4957 if (slp_node)
4959 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4960 ncopies = 1;
4962 else
4964 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4965 vec_num = 1;
4966 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4969 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4970 which is updated with the current index of the loop for every match of
4971 the original loop's cond_expr (VEC_STMT). This results in a vector
4972 containing the last time the condition passed for that vector lane.
4973 The first match will be a 1 to allow 0 to be used for non-matching
4974 indexes. If there are no matches at all then the vector will be all
4975 zeroes.
4977 PR92772: This algorithm is broken for architectures that support
4978 masked vectors, but do not provide fold_extract_last. */
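  /* For illustration (made-up values): with four lanes the index IV is
     {1,2,3,4} in the first vector iteration and {5,6,7,8} in the second.
     Starting from {0,0,0,0}, if the condition matches only in the third lane
     of the first iteration and only in the second lane of the second, the
     phi value becomes {0,0,3,0} and then {0,6,3,0}; the epilogue later takes
     the maximum (6) to find the lane holding the last matching value.  */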
4979 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4981 auto_vec<std::pair<tree, bool>, 2> ccompares;
4982 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4983 cond_info = vect_stmt_to_vectorize (cond_info);
4984 while (cond_info != reduc_info)
4986 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4988 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4989 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4990 ccompares.safe_push
4991 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4992 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4994 cond_info
4995 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4996 1 + STMT_VINFO_REDUC_IDX
4997 (cond_info)));
4998 cond_info = vect_stmt_to_vectorize (cond_info);
5000 gcc_assert (ccompares.length () != 0);
5002 tree indx_before_incr, indx_after_incr;
5003 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5004 int scalar_precision
5005 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5006 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5007 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5008 (TYPE_MODE (vectype), cr_index_scalar_type,
5009 TYPE_VECTOR_SUBPARTS (vectype));
5011 /* First we create a simple vector induction variable which starts
5012 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5013 vector size (STEP). */
5015 /* Create a {1,2,3,...} vector. */
5016 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5018 /* Create a vector of the step value. */
5019 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5020 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5022 /* Create an induction variable. */
5023 gimple_stmt_iterator incr_gsi;
5024 bool insert_after;
5025 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5026 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5027 insert_after, &indx_before_incr, &indx_after_incr);
5029 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5030 filled with zeros (VEC_ZERO). */
5032 /* Create a vector of 0s. */
5033 tree zero = build_zero_cst (cr_index_scalar_type);
5034 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5036 /* Create a vector phi node. */
5037 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5038 new_phi = create_phi_node (new_phi_tree, loop->header);
5039 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5040 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5042 /* Now take the condition from the loop's original cond_exprs
5043 and produce a new cond_expr (INDEX_COND_EXPR) which for
5044 every match uses values from the induction variable
5045 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5046 (NEW_PHI_TREE).
5047 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5048 the new cond_expr (INDEX_COND_EXPR). */
5049 gimple_seq stmts = NULL;
5050 for (int i = ccompares.length () - 1; i != -1; --i)
5052 tree ccompare = ccompares[i].first;
5053 if (ccompares[i].second)
5054 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5055 cr_index_vector_type,
5056 ccompare,
5057 indx_before_incr, new_phi_tree);
5058 else
5059 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5060 cr_index_vector_type,
5061 ccompare,
5062 new_phi_tree, indx_before_incr);
5064 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5066 /* Update the phi with the vec cond. */
5067 induction_index = new_phi_tree;
5068 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5069 loop_latch_edge (loop), UNKNOWN_LOCATION);
5072 /* 2. Create epilog code.
5073 The reduction epilog code operates across the elements of the vector
5074 of partial results computed by the vectorized loop.
5075 The reduction epilog code consists of:
5077 step 1: compute the scalar result in a vector (v_out2)
5078 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5079 step 3: adjust the scalar result (s_out3) if needed.
5081 Step 1 can be accomplished using one of the following three schemes:
5082 (scheme 1) using reduc_fn, if available.
5083 (scheme 2) using whole-vector shifts, if available.
5084 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5085 combined.
5087 The overall epilog code looks like this:
5089 s_out0 = phi <s_loop> # original EXIT_PHI
5090 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5091 v_out2 = reduce <v_out1> # step 1
5092 s_out3 = extract_field <v_out2, 0> # step 2
5093 s_out4 = adjust_result <s_out3> # step 3
5095 (step 3 is optional, and steps 1 and 2 may be combined).
5096 Lastly, the uses of s_out0 are replaced by s_out4. */
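/* As a rough example, for a 4-lane sum reduction the loop leaves
   v_out1 = {s0,s1,s2,s3} holding four partial sums; step 1 computes
   s0+s1+s2+s3 (directly via reduc_fn or with shifts), step 2 extracts
   that scalar and step 3, if needed, adds back the initial value of
   the reduction variable.  */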
5099 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5100 v_out1 = phi <VECT_DEF>
5101 Store them in NEW_PHIS. */
5102 if (double_reduc)
5103 loop = outer_loop;
5104 exit_bb = single_exit (loop)->dest;
5105 new_phis.create (slp_node ? vec_num : ncopies);
5106 for (unsigned i = 0; i < vec_num; i++)
5108 if (slp_node)
5109 def = vect_get_slp_vect_def (slp_node, i);
5110 else
5111 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5112 for (j = 0; j < ncopies; j++)
5114 tree new_def = copy_ssa_name (def);
5115 phi = create_phi_node (new_def, exit_bb);
5116 if (j == 0)
5117 new_phis.quick_push (phi);
5118 else
5120 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5121 new_phis.quick_push (phi);
5124 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5128 exit_gsi = gsi_after_labels (exit_bb);
5130 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5131 (i.e. when reduc_fn is not available) and in the final adjustment
5132 code (if needed). Also get the original scalar reduction variable as
5133 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5134 represents a reduction pattern), the tree-code and scalar-def are
5135 taken from the original stmt that the pattern-stmt (STMT) replaces.
5136 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5137 are taken from STMT. */
5139 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5140 if (orig_stmt_info != stmt_info)
5142 /* Reduction pattern */
5143 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5144 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5147 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5148 scalar_type = TREE_TYPE (scalar_dest);
5149 scalar_results.create (group_size);
5150 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5151 bitsize = TYPE_SIZE (scalar_type);
5153 /* SLP reduction without reduction chain, e.g.,
5154 # a1 = phi <a2, a0>
5155 # b1 = phi <b2, b0>
5156 a2 = operation (a1)
5157 b2 = operation (b1) */
5158 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5160 /* True if we should implement SLP_REDUC using native reduction operations
5161 instead of scalar operations. */
5162 direct_slp_reduc = (reduc_fn != IFN_LAST
5163 && slp_reduc
5164 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5166 /* In case of reduction chain, e.g.,
5167 # a1 = phi <a3, a0>
5168 a2 = operation (a1)
5169 a3 = operation (a2),
5171 we may end up with more than one vector result. Here we reduce them to
5172 one vector. */
5173 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5175 gimple_seq stmts = NULL;
5176 tree first_vect = PHI_RESULT (new_phis[0]);
5177 first_vect = gimple_convert (&stmts, vectype, first_vect);
5178 for (k = 1; k < new_phis.length (); k++)
5180 gimple *next_phi = new_phis[k];
5181 tree second_vect = PHI_RESULT (next_phi);
5182 second_vect = gimple_convert (&stmts, vectype, second_vect);
5183 first_vect = gimple_build (&stmts, code, vectype,
5184 first_vect, second_vect);
5186 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5188 new_phi_result = first_vect;
5189 new_phis.truncate (0);
5190 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5192 /* Likewise if we couldn't use a single defuse cycle. */
5193 else if (ncopies > 1)
5195 gimple_seq stmts = NULL;
5196 tree first_vect = PHI_RESULT (new_phis[0]);
5197 first_vect = gimple_convert (&stmts, vectype, first_vect);
5198 for (int k = 1; k < ncopies; ++k)
5200 tree second_vect = PHI_RESULT (new_phis[k]);
5201 second_vect = gimple_convert (&stmts, vectype, second_vect);
5202 first_vect = gimple_build (&stmts, code, vectype,
5203 first_vect, second_vect);
5205 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5206 new_phi_result = first_vect;
5207 new_phis.truncate (0);
5208 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5210 else
5211 new_phi_result = PHI_RESULT (new_phis[0]);
5213 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5214 && reduc_fn != IFN_LAST)
5216 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5217 various data values where the condition matched and another vector
5218 (INDUCTION_INDEX) containing all the indexes of those matches. We
5219 need to extract the last matching index (which will be the index with
5220 highest value) and use this to index into the data vector.
5221 For the case where there were no matches, the data vector will contain
5222 all default values and the index vector will be all zeros. */
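/* For example, assuming 4 lanes, a data vector {d0,d1,d2,d3} and an
   index vector {0,3,0,7}: the last match happened in lane 3, so
   REDUC_MAX yields 7, the comparison below selects {0,0,0,d3} and the
   final max-reduction of that vector (after casting to unsigned)
   produces d3.  With no matches the index vector is {0,0,0,0}, every
   lane compares equal and the identical default values reduce to the
   default.  */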
5224 /* Get various versions of the type of the vector of indexes. */
5225 tree index_vec_type = TREE_TYPE (induction_index);
5226 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5227 tree index_scalar_type = TREE_TYPE (index_vec_type);
5228 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5230 /* Get an unsigned integer version of the type of the data vector. */
5231 int scalar_precision
5232 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5233 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5234 tree vectype_unsigned = build_vector_type
5235 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5237 /* First we need to create a vector (ZERO_VEC) of zeros and another
5238 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5239 can create using a MAX reduction and then expanding.
5240 In the case where the loop never made any matches, the max index will
5241 be zero. */
5243 /* Vector of {0, 0, 0,...}. */
5244 tree zero_vec = build_zero_cst (vectype);
5246 gimple_seq stmts = NULL;
5247 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5248 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5250 /* Find maximum value from the vector of found indexes. */
5251 tree max_index = make_ssa_name (index_scalar_type);
5252 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5253 1, induction_index);
5254 gimple_call_set_lhs (max_index_stmt, max_index);
5255 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5257 /* Vector of {max_index, max_index, max_index,...}. */
5258 tree max_index_vec = make_ssa_name (index_vec_type);
5259 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5260 max_index);
5261 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5262 max_index_vec_rhs);
5263 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5265 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5266 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5267 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5268 otherwise. Only one value should match, resulting in a vector
5269 (VEC_COND) with one data value and the rest zeros.
5270 In the case where the loop never made any matches, every index will
5271 match, resulting in a vector with all data values (which will all be
5272 the default value). */
5274 /* Compare the max index vector to the vector of found indexes to find
5275 the position of the max value. */
5276 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5277 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5278 induction_index,
5279 max_index_vec);
5280 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5282 /* Use the compare to choose either values from the data vector or
5283 zero. */
5284 tree vec_cond = make_ssa_name (vectype);
5285 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5286 vec_compare, new_phi_result,
5287 zero_vec);
5288 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5290 /* Finally we need to extract the data value from the vector (VEC_COND)
5291 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5292 reduction, but because this doesn't exist, we can use a MAX reduction
5293 instead. The data value might be signed or a float so we need to cast
5294 it first.
5295 In the case where the loop never made any matches, the data values are
5296 all identical, and so will reduce down correctly. */
5298 /* Make the matched data values unsigned. */
5299 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5300 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5301 vec_cond);
5302 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5303 VIEW_CONVERT_EXPR,
5304 vec_cond_cast_rhs);
5305 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5307 /* Reduce down to a scalar value. */
5308 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5309 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5310 1, vec_cond_cast);
5311 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5312 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5314 /* Convert the reduced value back to the result type and set as the
5315 result. */
5316 stmts = NULL;
5317 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5318 data_reduc);
5319 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5320 scalar_results.safe_push (new_temp);
5322 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5323 && reduc_fn == IFN_LAST)
5325 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5326 idx = 0;
5327 idx_val = induction_index[0];
5328 val = data_reduc[0];
5329 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5330 if (induction_index[i] > idx_val)
5331 val = data_reduc[i], idx_val = induction_index[i];
5332 return val; */
5334 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5335 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5336 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5337 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5338 /* Enforced by vectorizable_reduction, which ensures we have target
5339 support before allowing a conditional reduction on variable-length
5340 vectors. */
5341 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5342 tree idx_val = NULL_TREE, val = NULL_TREE;
5343 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5345 tree old_idx_val = idx_val;
5346 tree old_val = val;
5347 idx_val = make_ssa_name (idx_eltype);
5348 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5349 build3 (BIT_FIELD_REF, idx_eltype,
5350 induction_index,
5351 bitsize_int (el_size),
5352 bitsize_int (off)));
5353 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5354 val = make_ssa_name (data_eltype);
5355 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5356 build3 (BIT_FIELD_REF,
5357 data_eltype,
5358 new_phi_result,
5359 bitsize_int (el_size),
5360 bitsize_int (off)));
5361 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5362 if (off != 0)
5364 tree new_idx_val = idx_val;
5365 if (off != v_size - el_size)
5367 new_idx_val = make_ssa_name (idx_eltype);
5368 epilog_stmt = gimple_build_assign (new_idx_val,
5369 MAX_EXPR, idx_val,
5370 old_idx_val);
5371 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5373 tree new_val = make_ssa_name (data_eltype);
5374 epilog_stmt = gimple_build_assign (new_val,
5375 COND_EXPR,
5376 build2 (GT_EXPR,
5377 boolean_type_node,
5378 idx_val,
5379 old_idx_val),
5380 val, old_val);
5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5382 idx_val = new_idx_val;
5383 val = new_val;
5386 /* Convert the reduced value back to the result type and set as the
5387 result. */
5388 gimple_seq stmts = NULL;
5389 val = gimple_convert (&stmts, scalar_type, val);
5390 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5391 scalar_results.safe_push (val);
5394 /* 2.3 Create the reduction code, using one of the three schemes described
5395 above. In SLP we simply need to extract all the elements from the
5396 vector (without reducing them), so we use scalar shifts. */
5397 else if (reduc_fn != IFN_LAST && !slp_reduc)
5399 tree tmp;
5400 tree vec_elem_type;
5402 /* Case 1: Create:
5403 v_out2 = reduc_expr <v_out1> */
5405 if (dump_enabled_p ())
5406 dump_printf_loc (MSG_NOTE, vect_location,
5407 "Reduce using direct vector reduction.\n");
5409 gimple_seq stmts = NULL;
5410 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5411 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5412 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5413 vec_elem_type, new_phi_result);
5414 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5415 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5417 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5418 && induc_val)
5420 /* Earlier we set the initial value to be a vector of induc_val
5421 values. Check the result and if it is induc_val then replace
5422 with the original initial value, unless induc_val is
5423 the same as initial_def already. */
5424 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5425 induc_val);
5427 tmp = make_ssa_name (new_scalar_dest);
5428 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5429 initial_def, new_temp);
5430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5431 new_temp = tmp;
5434 scalar_results.safe_push (new_temp);
5436 else if (direct_slp_reduc)
5438 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5439 with the elements for other SLP statements replaced with the
5440 neutral value. We can then do a normal reduction on each vector. */
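/* Illustrative example (with a fixed length for clarity): for
   group_size == 2 and a 4-lane accumulator the lanes interleave the
   two SLP results as {a0,b0,a1,b1}.  For the first result we build
   {a0,neutral,a1,neutral} and reduce it, for the second we build
   {neutral,b0,neutral,b1} and reduce that, giving the two scalar
   results.  */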
5442 /* Enforced by vectorizable_reduction. */
5443 gcc_assert (new_phis.length () == 1);
5444 gcc_assert (pow2p_hwi (group_size));
5446 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5447 vec<stmt_vec_info> orig_phis
5448 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5449 gimple_seq seq = NULL;
5451 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5452 and the same element size as VECTYPE. */
5453 tree index = build_index_vector (vectype, 0, 1);
5454 tree index_type = TREE_TYPE (index);
5455 tree index_elt_type = TREE_TYPE (index_type);
5456 tree mask_type = truth_type_for (index_type);
5458 /* Create a vector that, for each element, identifies which of
5459 the REDUC_GROUP_SIZE results should use it. */
5460 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5461 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5462 build_vector_from_val (index_type, index_mask));
5464 /* Get a neutral vector value. This is simply a splat of the neutral
5465 scalar value if we have one, otherwise the initial scalar value
5466 is itself a neutral value. */
5467 tree vector_identity = NULL_TREE;
5468 tree neutral_op = NULL_TREE;
5469 if (slp_node)
5471 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5472 neutral_op
5473 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5474 vectype, code, first != NULL);
5476 if (neutral_op)
5477 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5478 neutral_op);
5479 for (unsigned int i = 0; i < group_size; ++i)
5481 /* If there's no universal neutral value, we can use the
5482 initial scalar value from the original PHI. This is used
5483 for MIN and MAX reduction, for example. */
5484 if (!neutral_op)
5486 tree scalar_value
5487 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5488 loop_preheader_edge (loop));
5489 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5490 scalar_value);
5491 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5492 scalar_value);
5495 /* Calculate the equivalent of:
5497 sel[j] = (index[j] == i);
5499 which selects the elements of NEW_PHI_RESULT that should
5500 be included in the result. */
5501 tree compare_val = build_int_cst (index_elt_type, i);
5502 compare_val = build_vector_from_val (index_type, compare_val);
5503 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5504 index, compare_val);
5506 /* Calculate the equivalent of:
5508 vec = sel ? new_phi_result : vector_identity;
5510 VEC is now suitable for a full vector reduction. */
5511 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5512 sel, new_phi_result, vector_identity);
5514 /* Do the reduction and convert it to the appropriate type. */
5515 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5516 TREE_TYPE (vectype), vec);
5517 scalar = gimple_convert (&seq, scalar_type, scalar);
5518 scalar_results.safe_push (scalar);
5520 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5522 else
5524 bool reduce_with_shift;
5525 tree vec_temp;
5527 gcc_assert (slp_reduc || new_phis.length () == 1);
5529 /* See if the target wants to do the final (shift) reduction
5530 in a vector mode of smaller size and first reduce upper/lower
5531 halves against each other. */
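/* For instance, a V8HI accumulator may first be split into its two
   V4HI halves which are combined with CODE (added, for a sum
   reduction) so that the shift-based reduction below only has to
   operate on the narrower mode.  */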
5532 enum machine_mode mode1 = mode;
5533 tree stype = TREE_TYPE (vectype);
5534 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5535 unsigned nunits1 = nunits;
5536 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5537 && new_phis.length () == 1)
5539 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5540 /* For SLP reductions we have to make sure lanes match up, but
5541 since we're doing an individual-element final reduction, reducing
5542 the vector width here is even more important.
5543 ??? We can also separate lanes with permutes, for the common
5544 case of power-of-two group-size odd/even extracts would work. */
5545 if (slp_reduc && nunits != nunits1)
5547 nunits1 = least_common_multiple (nunits1, group_size);
5548 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5551 if (!slp_reduc
5552 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5553 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5555 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5556 stype, nunits1);
5557 reduce_with_shift = have_whole_vector_shift (mode1);
5558 if (!VECTOR_MODE_P (mode1))
5559 reduce_with_shift = false;
5560 else
5562 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5563 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5564 reduce_with_shift = false;
5567 /* First reduce the vector to the desired vector size on which we
5568 should do the shift reduction, by combining upper and lower halves. */
5569 new_temp = new_phi_result;
5570 while (nunits > nunits1)
5572 nunits /= 2;
5573 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5574 stype, nunits);
5575 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5577 /* The target has to make sure we support lowpart/highpart
5578 extraction, either via direct vector extract or through
5579 an integer mode punning. */
5580 tree dst1, dst2;
5581 if (convert_optab_handler (vec_extract_optab,
5582 TYPE_MODE (TREE_TYPE (new_temp)),
5583 TYPE_MODE (vectype1))
5584 != CODE_FOR_nothing)
5586 /* Extract sub-vectors directly once vec_extract becomes
5587 a conversion optab. */
5588 dst1 = make_ssa_name (vectype1);
5589 epilog_stmt
5590 = gimple_build_assign (dst1, BIT_FIELD_REF,
5591 build3 (BIT_FIELD_REF, vectype1,
5592 new_temp, TYPE_SIZE (vectype1),
5593 bitsize_int (0)));
5594 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5595 dst2 = make_ssa_name (vectype1);
5596 epilog_stmt
5597 = gimple_build_assign (dst2, BIT_FIELD_REF,
5598 build3 (BIT_FIELD_REF, vectype1,
5599 new_temp, TYPE_SIZE (vectype1),
5600 bitsize_int (bitsize)));
5601 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5603 else
5605 /* Extract via punning to appropriately sized integer mode
5606 vector. */
5607 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5608 tree etype = build_vector_type (eltype, 2);
5609 gcc_assert (convert_optab_handler (vec_extract_optab,
5610 TYPE_MODE (etype),
5611 TYPE_MODE (eltype))
5612 != CODE_FOR_nothing);
5613 tree tem = make_ssa_name (etype);
5614 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5615 build1 (VIEW_CONVERT_EXPR,
5616 etype, new_temp));
5617 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5618 new_temp = tem;
5619 tem = make_ssa_name (eltype);
5620 epilog_stmt
5621 = gimple_build_assign (tem, BIT_FIELD_REF,
5622 build3 (BIT_FIELD_REF, eltype,
5623 new_temp, TYPE_SIZE (eltype),
5624 bitsize_int (0)));
5625 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5626 dst1 = make_ssa_name (vectype1);
5627 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5628 build1 (VIEW_CONVERT_EXPR,
5629 vectype1, tem));
5630 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5631 tem = make_ssa_name (eltype);
5632 epilog_stmt
5633 = gimple_build_assign (tem, BIT_FIELD_REF,
5634 build3 (BIT_FIELD_REF, eltype,
5635 new_temp, TYPE_SIZE (eltype),
5636 bitsize_int (bitsize)));
5637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5638 dst2 = make_ssa_name (vectype1);
5639 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5640 build1 (VIEW_CONVERT_EXPR,
5641 vectype1, tem));
5642 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5645 new_temp = make_ssa_name (vectype1);
5646 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5647 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5648 new_phis[0] = epilog_stmt;
5651 if (reduce_with_shift && !slp_reduc)
5653 int element_bitsize = tree_to_uhwi (bitsize);
5654 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5655 for variable-length vectors and also requires direct target support
5656 for loop reductions. */
5657 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5658 int nelements = vec_size_in_bits / element_bitsize;
5659 vec_perm_builder sel;
5660 vec_perm_indices indices;
5662 int elt_offset;
5664 tree zero_vec = build_zero_cst (vectype1);
5665 /* Case 2: Create:
5666 for (offset = nelements/2; offset >= 1; offset/=2)
5668 Create: va' = vec_shift <va, offset>
5669 Create: va = vop <va, va'>
5670 } */
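/* Illustration for a 4-element sum: starting from {s0,s1,s2,s3},
   shifting by 2 and adding gives {s0+s2, s1+s3, ...}, shifting by 1
   and adding gives {s0+s1+s2+s3, ...}; the remaining lanes hold
   values that are ignored and the scalar result is extracted from
   element 0 below.  */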
5672 tree rhs;
5674 if (dump_enabled_p ())
5675 dump_printf_loc (MSG_NOTE, vect_location,
5676 "Reduce using vector shifts\n");
5678 gimple_seq stmts = NULL;
5679 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5680 for (elt_offset = nelements / 2;
5681 elt_offset >= 1;
5682 elt_offset /= 2)
5684 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5685 indices.new_vector (sel, 2, nelements);
5686 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5687 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5688 new_temp, zero_vec, mask);
5689 new_temp = gimple_build (&stmts, code,
5690 vectype1, new_name, new_temp);
5692 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5694 /* 2.4 Extract the final scalar result. Create:
5695 s_out3 = extract_field <v_out2, bitpos> */
5697 if (dump_enabled_p ())
5698 dump_printf_loc (MSG_NOTE, vect_location,
5699 "extract scalar result\n");
5701 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5702 bitsize, bitsize_zero_node);
5703 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5704 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5705 gimple_assign_set_lhs (epilog_stmt, new_temp);
5706 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5707 scalar_results.safe_push (new_temp);
5709 else
5711 /* Case 3: Create:
5712 s = extract_field <v_out2, 0>
5713 for (offset = element_size;
5714 offset < vector_size;
5715 offset += element_size;)
5717 Create: s' = extract_field <v_out2, offset>
5718 Create: s = op <s, s'> // For non SLP cases
5719 } */
5721 if (dump_enabled_p ())
5722 dump_printf_loc (MSG_NOTE, vect_location,
5723 "Reduce using scalar code.\n");
5725 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5726 int element_bitsize = tree_to_uhwi (bitsize);
5727 tree compute_type = TREE_TYPE (vectype);
5728 gimple_seq stmts = NULL;
5729 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5731 int bit_offset;
5732 if (gimple_code (new_phi) == GIMPLE_PHI)
5733 vec_temp = PHI_RESULT (new_phi);
5734 else
5735 vec_temp = gimple_assign_lhs (new_phi);
5736 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5737 vec_temp, bitsize, bitsize_zero_node);
5739 /* In SLP we don't need to apply the reduction operation, so we just
5740 collect s' values in SCALAR_RESULTS. */
5741 if (slp_reduc)
5742 scalar_results.safe_push (new_temp);
5744 for (bit_offset = element_bitsize;
5745 bit_offset < vec_size_in_bits;
5746 bit_offset += element_bitsize)
5748 tree bitpos = bitsize_int (bit_offset);
5749 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5750 compute_type, vec_temp,
5751 bitsize, bitpos);
5752 if (slp_reduc)
5754 /* In SLP we don't need to apply the reduction operation, so
5755 we just collect s' values in SCALAR_RESULTS. */
5756 new_temp = new_name;
5757 scalar_results.safe_push (new_name);
5759 else
5760 new_temp = gimple_build (&stmts, code, compute_type,
5761 new_name, new_temp);
5765 /* The only case where we need to reduce scalar results in SLP is
5766 unrolling. If the size of SCALAR_RESULTS is greater than
5767 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5768 REDUC_GROUP_SIZE. */
5769 if (slp_reduc)
5771 tree res, first_res, new_res;
5773 /* Reduce multiple scalar results in case of SLP unrolling. */
5774 for (j = group_size; scalar_results.iterate (j, &res);
5775 j++)
5777 first_res = scalar_results[j % group_size];
5778 new_res = gimple_build (&stmts, code, compute_type,
5779 first_res, res);
5780 scalar_results[j % group_size] = new_res;
5782 for (k = 0; k < group_size; k++)
5783 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5784 scalar_results[k]);
5786 else
5788 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5789 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5790 scalar_results.safe_push (new_temp);
5793 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5797 && induc_val)
5799 /* Earlier we set the initial value to be a vector of induc_val
5800 values. Check the result and if it is induc_val then replace
5801 with the original initial value, unless induc_val is
5802 the same as initial_def already. */
5803 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5804 induc_val);
5806 tree tmp = make_ssa_name (new_scalar_dest);
5807 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5808 initial_def, new_temp);
5809 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5810 scalar_results[0] = tmp;
5814 /* 2.5 Adjust the final result by the initial value of the reduction
5815 variable. (When such adjustment is not needed, then
5816 'adjustment_def' is zero). For example, if code is PLUS we create:
5817 new_temp = loop_exit_def + adjustment_def */
5819 if (adjustment_def)
5821 gcc_assert (!slp_reduc);
5822 gimple_seq stmts = NULL;
5823 if (nested_in_vect_loop)
5825 new_phi = new_phis[0];
5826 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5827 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5828 new_temp = gimple_build (&stmts, code, vectype,
5829 PHI_RESULT (new_phi), adjustment_def);
5831 else
5833 new_temp = scalar_results[0];
5834 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5835 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5836 new_temp = gimple_build (&stmts, code, scalar_type,
5837 new_temp, adjustment_def);
5840 epilog_stmt = gimple_seq_last_stmt (stmts);
5841 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5842 if (nested_in_vect_loop)
5844 if (!double_reduc)
5845 scalar_results.quick_push (new_temp);
5846 else
5847 scalar_results[0] = new_temp;
5849 else
5850 scalar_results[0] = new_temp;
5852 new_phis[0] = epilog_stmt;
5855 if (double_reduc)
5856 loop = loop->inner;
5858 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5859 phis with new adjusted scalar results, i.e., replace use <s_out0>
5860 with use <s_out4>.
5862 Transform:
5863 loop_exit:
5864 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5865 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5866 v_out2 = reduce <v_out1>
5867 s_out3 = extract_field <v_out2, 0>
5868 s_out4 = adjust_result <s_out3>
5869 use <s_out0>
5870 use <s_out0>
5872 into:
5874 loop_exit:
5875 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5876 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5877 v_out2 = reduce <v_out1>
5878 s_out3 = extract_field <v_out2, 0>
5879 s_out4 = adjust_result <s_out3>
5880 use <s_out4>
5881 use <s_out4> */
5884 /* In an SLP reduction chain we reduce vector results into one vector if
5885 necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5886 LHS of the last stmt in the reduction chain, since we are looking for
5887 the loop exit phi node. */
5888 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5890 stmt_vec_info dest_stmt_info
5891 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5892 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5893 group_size = 1;
5896 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5897 case that REDUC_GROUP_SIZE is greater than the vectorization factor).
5898 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5899 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5900 correspond to the first vector stmt, etc.
5901 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5902 if (group_size > new_phis.length ())
5903 gcc_assert (!(group_size % new_phis.length ()));
5905 for (k = 0; k < group_size; k++)
5907 if (slp_reduc)
5909 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5911 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5912 /* SLP statements can't participate in patterns. */
5913 gcc_assert (!orig_stmt_info);
5914 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5917 if (nested_in_vect_loop)
5919 if (double_reduc)
5920 loop = outer_loop;
5921 else
5922 gcc_unreachable ();
5925 phis.create (3);
5926 /* Find the loop-closed-use at the loop exit of the original scalar
5927 result. (The reduction result is expected to have two immediate uses,
5928 one at the latch block, and one at the loop exit). For double
5929 reductions we are looking for exit phis of the outer loop. */
5930 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5932 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5934 if (!is_gimple_debug (USE_STMT (use_p)))
5935 phis.safe_push (USE_STMT (use_p));
5937 else
5939 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5941 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5943 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5945 if (!flow_bb_inside_loop_p (loop,
5946 gimple_bb (USE_STMT (phi_use_p)))
5947 && !is_gimple_debug (USE_STMT (phi_use_p)))
5948 phis.safe_push (USE_STMT (phi_use_p));
5954 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5956 /* Replace the uses: */
5957 orig_name = PHI_RESULT (exit_phi);
5958 scalar_result = scalar_results[k];
5959 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5961 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5962 SET_USE (use_p, scalar_result);
5963 update_stmt (use_stmt);
5967 phis.release ();
5971 /* Return a vector of type VECTYPE that is equal to the vector select
5972 operation "MASK ? VEC : IDENTITY". Insert the select statements
5973 before GSI. */
5975 static tree
5976 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5977 tree vec, tree identity)
5979 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5980 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5981 mask, vec, identity);
5982 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5983 return cond;
5986 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5987 order, starting with LHS. Insert the extraction statements before GSI and
5988 associate the new scalar SSA names with variable SCALAR_DEST.
5989 Return the SSA name for the result. */
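/* For example, for a 4-element VECTOR_RHS this emits roughly
     s = LHS CODE rhs[0];
     s = s CODE rhs[1];
     s = s CODE rhs[2];
     s = s CODE rhs[3];
   and returns the final s.  */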
5991 static tree
5992 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5993 tree_code code, tree lhs, tree vector_rhs)
5995 tree vectype = TREE_TYPE (vector_rhs);
5996 tree scalar_type = TREE_TYPE (vectype);
5997 tree bitsize = TYPE_SIZE (scalar_type);
5998 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5999 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6001 for (unsigned HOST_WIDE_INT bit_offset = 0;
6002 bit_offset < vec_size_in_bits;
6003 bit_offset += element_bitsize)
6005 tree bitpos = bitsize_int (bit_offset);
6006 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6007 bitsize, bitpos);
6009 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6010 rhs = make_ssa_name (scalar_dest, stmt);
6011 gimple_assign_set_lhs (stmt, rhs);
6012 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6014 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6015 tree new_name = make_ssa_name (scalar_dest, stmt);
6016 gimple_assign_set_lhs (stmt, new_name);
6017 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6018 lhs = new_name;
6020 return lhs;
6023 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6024 type of the vector input. */
6026 static internal_fn
6027 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6029 internal_fn mask_reduc_fn;
6031 switch (reduc_fn)
6033 case IFN_FOLD_LEFT_PLUS:
6034 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6035 break;
6037 default:
6038 return IFN_LAST;
6041 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6042 OPTIMIZE_FOR_SPEED))
6043 return mask_reduc_fn;
6044 return IFN_LAST;
6047 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6048 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6049 statement. CODE is the operation performed by STMT_INFO and OPS are
6050 its scalar operands. REDUC_INDEX is the index of the operand in
6051 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6052 implements in-order reduction, or IFN_LAST if we should open-code it.
6053 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6054 that should be used to control the operation in a fully-masked loop. */
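/* Conceptually, an in-order (fold-left) reduction of one vector
   {v0,v1,v2,v3} into the accumulator ACC performs
     acc = acc OP v0; acc = acc OP v1; acc = acc OP v2; acc = acc OP v3;
   preserving the scalar evaluation order (important for FP), instead
   of a tree-shaped reduction of the lanes.  */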
6056 static bool
6057 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6058 stmt_vec_info stmt_info,
6059 gimple_stmt_iterator *gsi,
6060 gimple **vec_stmt, slp_tree slp_node,
6061 gimple *reduc_def_stmt,
6062 tree_code code, internal_fn reduc_fn,
6063 tree ops[3], tree vectype_in,
6064 int reduc_index, vec_loop_masks *masks)
6066 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6067 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6068 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6070 int ncopies;
6071 if (slp_node)
6072 ncopies = 1;
6073 else
6074 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6076 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6077 gcc_assert (ncopies == 1);
6078 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6080 if (slp_node)
6081 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6082 TYPE_VECTOR_SUBPARTS (vectype_in)));
6084 tree op0 = ops[1 - reduc_index];
6086 int group_size = 1;
6087 stmt_vec_info scalar_dest_def_info;
6088 auto_vec<tree> vec_oprnds0;
6089 if (slp_node)
6091 auto_vec<vec<tree> > vec_defs (2);
6092 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6093 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6094 vec_defs[0].release ();
6095 vec_defs[1].release ();
6096 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6097 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6099 else
6101 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6102 op0, &vec_oprnds0);
6103 scalar_dest_def_info = stmt_info;
6106 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6107 tree scalar_type = TREE_TYPE (scalar_dest);
6108 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6110 int vec_num = vec_oprnds0.length ();
6111 gcc_assert (vec_num == 1 || slp_node);
6112 tree vec_elem_type = TREE_TYPE (vectype_out);
6113 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6115 tree vector_identity = NULL_TREE;
6116 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6117 vector_identity = build_zero_cst (vectype_out);
6119 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6120 int i;
6121 tree def0;
6122 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6124 gimple *new_stmt;
6125 tree mask = NULL_TREE;
6126 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6127 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6129 /* Handle MINUS by adding the negative. */
6130 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6132 tree negated = make_ssa_name (vectype_out);
6133 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6134 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6135 def0 = negated;
6138 if (mask && mask_reduc_fn == IFN_LAST)
6139 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6140 vector_identity);
6142 /* On the first iteration the input is simply the scalar phi
6143 result, and for subsequent iterations it is the output of
6144 the preceding operation. */
6145 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6147 if (mask && mask_reduc_fn != IFN_LAST)
6148 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6149 def0, mask);
6150 else
6151 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6152 def0);
6153 /* For chained SLP reductions the output of the previous reduction
6154 operation serves as the input of the next. For the final statement
6155 the output cannot be a temporary - we reuse the original
6156 scalar destination of the last statement. */
6157 if (i != vec_num - 1)
6159 gimple_set_lhs (new_stmt, scalar_dest_var);
6160 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6161 gimple_set_lhs (new_stmt, reduc_var);
6164 else
6166 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6167 reduc_var, def0);
6168 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6169 /* Remove the statement, so that we can use the same code paths
6170 as for statements that we've just created. */
6171 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6172 gsi_remove (&tmp_gsi, true);
6175 if (i == vec_num - 1)
6177 gimple_set_lhs (new_stmt, scalar_dest);
6178 vect_finish_replace_stmt (loop_vinfo,
6179 scalar_dest_def_info,
6180 new_stmt);
6182 else
6183 vect_finish_stmt_generation (loop_vinfo,
6184 scalar_dest_def_info,
6185 new_stmt, gsi);
6187 if (slp_node)
6188 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6189 else
6191 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6192 *vec_stmt = new_stmt;
6196 return true;
6199 /* Function is_nonwrapping_integer_induction.
6201 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6202 does not cause overflow. */
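/* For example, with base B, step S and at most NI iterations, the
   check below verifies (unless signed overflow is already undefined
   for the type) that B + S * NI still fits in the precision of the
   induction's type.  */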
6204 static bool
6205 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6207 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6208 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6209 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6210 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6211 widest_int ni, max_loop_value, lhs_max;
6212 wi::overflow_type overflow = wi::OVF_NONE;
6214 /* Make sure the loop is integer based. */
6215 if (TREE_CODE (base) != INTEGER_CST
6216 || TREE_CODE (step) != INTEGER_CST)
6217 return false;
6219 /* Check that the max size of the loop will not wrap. */
6221 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6222 return true;
6224 if (! max_stmt_executions (loop, &ni))
6225 return false;
6227 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6228 &overflow);
6229 if (overflow)
6230 return false;
6232 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6233 TYPE_SIGN (lhs_type), &overflow);
6234 if (overflow)
6235 return false;
6237 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6238 <= TYPE_PRECISION (lhs_type));
6241 /* Check if masking can be supported by inserting a conditional expression.
6242 CODE is the code for the operation. COND_FN is the conditional internal
6243 function, if it exists. VECTYPE_IN is the type of the vector input. */
6244 static bool
6245 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6246 tree vectype_in)
6248 if (cond_fn != IFN_LAST
6249 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6250 OPTIMIZE_FOR_SPEED))
6251 return false;
6253 switch (code)
6255 case DOT_PROD_EXPR:
6256 case SAD_EXPR:
6257 return true;
6259 default:
6260 return false;
6264 /* Insert a conditional expression to enable masked vectorization. CODE is the
6265 code for the operation. VOP is the array of operands. MASK is the loop
6266 mask. GSI is a statement iterator used to place the new conditional
6267 expression. */
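/* For example, for DOT_PROD_EXPR the masked-out lanes of the
   multiplied operand are replaced with zero so they contribute
   nothing to the accumulated sum, while for SAD_EXPR one operand is
   replaced by the other so that the absolute difference in an
   inactive lane is zero.  */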
6268 static void
6269 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6270 gimple_stmt_iterator *gsi)
6272 switch (code)
6274 case DOT_PROD_EXPR:
6276 tree vectype = TREE_TYPE (vop[1]);
6277 tree zero = build_zero_cst (vectype);
6278 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6279 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6280 mask, vop[1], zero);
6281 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6282 vop[1] = masked_op1;
6283 break;
6286 case SAD_EXPR:
6288 tree vectype = TREE_TYPE (vop[1]);
6289 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6290 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6291 mask, vop[1], vop[0]);
6292 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6293 vop[1] = masked_op1;
6294 break;
6297 default:
6298 gcc_unreachable ();
6302 /* Function vectorizable_reduction.
6304 Check if STMT_INFO performs a reduction operation that can be vectorized.
6305 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6306 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6307 Return true if STMT_INFO is vectorizable in this way.
6309 This function also handles reduction idioms (patterns) that have been
6310 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6311 may be of this form:
6312 X = pattern_expr (arg0, arg1, ..., X)
6313 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6314 sequence that had been detected and replaced by the pattern-stmt
6315 (STMT_INFO).
6317 This function also handles reduction of condition expressions, for example:
6318 for (int i = 0; i < N; i++)
6319 if (a[i] < value)
6320 last = a[i];
6321 This is handled by vectorising the loop and creating an additional vector
6322 containing the loop indexes for which "a[i] < value" was true. In the
6323 function epilogue this is reduced to a single max value and then used to
6324 index into the vector of results.
6326 In some cases of reduction patterns, the type of the reduction variable X is
6327 different than the type of the other arguments of STMT_INFO.
6328 In such cases, the vectype that is used when transforming STMT_INFO into
6329 a vector stmt is different than the vectype that is used to determine the
6330 vectorization factor, because it consists of a different number of elements
6331 than the actual number of elements that are being operated upon in parallel.
6333 For example, consider an accumulation of shorts into an int accumulator.
6334 On some targets it's possible to vectorize this pattern operating on 8
6335 shorts at a time (hence, the vectype for purposes of determining the
6336 vectorization factor should be V8HI); on the other hand, the vectype that
6337 is used to create the vector form is actually V4SI (the type of the result).
6339 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6340 indicates what is the actual level of parallelism (V8HI in the example), so
6341 that the right vectorization factor would be derived. This vectype
6342 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6343 be used to create the vectorized stmt. The right vectype for the vectorized
6344 stmt is obtained from the type of the result X:
6345 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6347 This means that, contrary to "regular" reductions (or "regular" stmts in
6348 general), the following equation:
6349 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6350 does *NOT* necessarily hold for reduction patterns. */
6352 bool
6353 vectorizable_reduction (loop_vec_info loop_vinfo,
6354 stmt_vec_info stmt_info, slp_tree slp_node,
6355 slp_instance slp_node_instance,
6356 stmt_vector_for_cost *cost_vec)
6358 tree scalar_dest;
6359 tree vectype_in = NULL_TREE;
6360 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6361 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6362 stmt_vec_info cond_stmt_vinfo = NULL;
6363 tree scalar_type;
6364 int i;
6365 int ncopies;
6366 bool single_defuse_cycle = false;
6367 bool nested_cycle = false;
6368 bool double_reduc = false;
6369 int vec_num;
6370 tree tem;
6371 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6372 tree cond_reduc_val = NULL_TREE;
6374 /* Make sure it was already recognized as a reduction computation. */
6375 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6376 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6377 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6378 return false;
6380 /* The stmt we store reduction analysis meta on. */
6381 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6382 reduc_info->is_reduc_info = true;
6384 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6386 if (is_a <gphi *> (stmt_info->stmt))
6388 if (slp_node)
6390 /* We eventually need to set a vector type on invariant
6391 arguments. */
6392 unsigned j;
6393 slp_tree child;
6394 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6395 if (!vect_maybe_update_slp_op_vectype
6396 (child, SLP_TREE_VECTYPE (slp_node)))
6398 if (dump_enabled_p ())
6399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6400 "incompatible vector types for "
6401 "invariants\n");
6402 return false;
6405 /* Analysis for double-reduction is done on the outer
6406 loop PHI, nested cycles have no further restrictions. */
6407 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6409 else
6410 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6411 return true;
6414 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6415 stmt_vec_info phi_info = stmt_info;
6416 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6417 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6419 if (!is_a <gphi *> (stmt_info->stmt))
6421 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6422 return true;
6424 if (slp_node)
6426 slp_node_instance->reduc_phis = slp_node;
6427 /* ??? We're leaving slp_node to point to the PHIs, we only
6428 need it to get at the number of vector stmts which wasn't
6429 yet initialized for the instance root. */
6431 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6432 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6433 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6435 use_operand_p use_p;
6436 gimple *use_stmt;
6437 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6438 &use_p, &use_stmt);
6439 gcc_assert (res);
6440 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6441 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6445 /* PHIs should not participate in patterns. */
6446 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6447 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6449 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6450 and compute the reduction chain length. Discover the real
6451 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6452 tree reduc_def
6453 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6454 loop_latch_edge
6455 (gimple_bb (reduc_def_phi)->loop_father));
6456 unsigned reduc_chain_length = 0;
6457 bool only_slp_reduc_chain = true;
6458 stmt_info = NULL;
6459 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6460 while (reduc_def != PHI_RESULT (reduc_def_phi))
6462 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6463 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6464 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6466 if (dump_enabled_p ())
6467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6468 "reduction chain broken by patterns.\n");
6469 return false;
6471 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6472 only_slp_reduc_chain = false;
6473 /* ??? For epilogue generation live members of the chain need
6474 to point back to the PHI via their original stmt for
6475 info_for_reduction to work. */
6476 if (STMT_VINFO_LIVE_P (vdef))
6477 STMT_VINFO_REDUC_DEF (def) = phi_info;
6478 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6479 if (!assign)
6481 if (dump_enabled_p ())
6482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6483 "reduction chain includes calls.\n");
6484 return false;
6486 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6488 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6489 TREE_TYPE (gimple_assign_rhs1 (assign))))
6491 if (dump_enabled_p ())
6492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493 "conversion in the reduction chain.\n");
6494 return false;
6497 else if (!stmt_info)
6498 /* First non-conversion stmt. */
6499 stmt_info = vdef;
6500 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6501 reduc_chain_length++;
6502 if (!stmt_info && slp_node)
6503 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6505 /* PHIs should not participate in patterns. */
6506 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6508 if (nested_in_vect_loop_p (loop, stmt_info))
6510 loop = loop->inner;
6511 nested_cycle = true;
6514 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6515 element. */
6516 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6518 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6519 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6521 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6522 gcc_assert (slp_node
6523 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6525 /* 1. Is vectorizable reduction? */
6526 /* Not supportable if the reduction variable is used in the loop, unless
6527 it's a reduction chain. */
6528 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6529 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6530 return false;
6532 /* Reductions that are not used even in an enclosing outer-loop,
6533 are expected to be "live" (used out of the loop). */
6534 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6535 && !STMT_VINFO_LIVE_P (stmt_info))
6536 return false;
6538 /* 2. Has this been recognized as a reduction pattern?
6540 Check if STMT represents a pattern that has been recognized
6541 in earlier analysis stages. For stmts that represent a pattern,
6542 the STMT_VINFO_RELATED_STMT field records the last stmt in
6543 the original sequence that constitutes the pattern. */
6545 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6546 if (orig_stmt_info)
6548 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6549 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6552 /* 3. Check the operands of the operation. The first operands are defined
6553 inside the loop body. The last operand is the reduction variable,
6554 which is defined by the loop-header-phi. */
6556 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6557 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6558 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6559 enum tree_code code = gimple_assign_rhs_code (stmt);
6560 bool lane_reduc_code_p
6561 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6562 int op_type = TREE_CODE_LENGTH (code);
6564 scalar_dest = gimple_assign_lhs (stmt);
6565 scalar_type = TREE_TYPE (scalar_dest);
6566 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6567 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6568 return false;
6570 /* Do not try to vectorize bit-precision reductions. */
6571 if (!type_has_mode_precision_p (scalar_type))
6572 return false;
6574 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6575 which means the only use of that PHI may be in the lane-reducing operation. */
6576 if (lane_reduc_code_p
6577 && reduc_chain_length != 1
6578 && !only_slp_reduc_chain)
6580 if (dump_enabled_p ())
6581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582 "lane-reducing reduction with extra stmts.\n");
6583 return false;
6586 /* All uses but the last are expected to be defined in the loop.
6587 The last use is the reduction variable. In case of nested cycle this
6588 assumption is not true: we use reduc_index to record the index of the
6589 reduction variable. */
6590 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6591 /* We need to skip an extra operand for COND_EXPRs with embedded
6592 comparison. */
6593 unsigned opno_adjust = 0;
6594 if (code == COND_EXPR
6595 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6596 opno_adjust = 1;
6597 for (i = 0; i < op_type; i++)
6599 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6600 if (i == 0 && code == COND_EXPR)
6601 continue;
6603 stmt_vec_info def_stmt_info;
6604 enum vect_def_type dt;
6605 tree op;
6606 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6607 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6608 &def_stmt_info))
6610 if (dump_enabled_p ())
6611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6612 "use not simple.\n");
6613 return false;
6615 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6616 continue;
6618 /* There should be only one cycle def in the stmt, the one
6619 leading to reduc_def. */
6620 if (VECTORIZABLE_CYCLE_DEF (dt))
6621 return false;
6623 /* To properly compute ncopies we are interested in the widest
6624 non-reduction input type in case we're looking at a widening
6625 accumulation that we later handle in vect_transform_reduction. */
6626 if (lane_reduc_code_p
6627 && tem
6628 && (!vectype_in
6629 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6630 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6631 vectype_in = tem;
6633 if (code == COND_EXPR)
6635 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6636 if (dt == vect_constant_def)
6638 cond_reduc_dt = dt;
6639 cond_reduc_val = op;
6641 if (dt == vect_induction_def
6642 && def_stmt_info
6643 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6645 cond_reduc_dt = dt;
6646 cond_stmt_vinfo = def_stmt_info;
6650 if (!vectype_in)
6651 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6652 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6654 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6655 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6656 /* If we have a condition reduction, see if we can simplify it further. */
6657 if (v_reduc_type == COND_REDUCTION)
6659 if (slp_node)
6660 return false;
6662 /* When the condition uses the reduction value in the condition, fail. */
6663 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6665 if (dump_enabled_p ())
6666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6667 "condition depends on previous iteration\n");
6668 return false;
6671 if (reduc_chain_length == 1
6672 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6673 vectype_in, OPTIMIZE_FOR_SPEED))
6675 if (dump_enabled_p ())
6676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6677 "optimizing condition reduction with"
6678 " FOLD_EXTRACT_LAST.\n");
6679 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6681 else if (cond_reduc_dt == vect_induction_def)
6683 tree base
6684 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6685 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6687 gcc_assert (TREE_CODE (base) == INTEGER_CST
6688 && TREE_CODE (step) == INTEGER_CST);
6689 cond_reduc_val = NULL_TREE;
6690 enum tree_code cond_reduc_op_code = ERROR_MARK;
6691 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6692 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6694 /* Find a suitable "no match" value: below the base for MAX_EXPR,
6695 above the base for MIN_EXPR; punt for now if the base is the type's
6696 minimum value for MAX_EXPR or its maximum value for MIN_EXPR. */
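/* For instance (illustrative): with a decreasing induction starting at
   100 every recorded value is <= 100, so 101 (base + 1) can act as the
   "no match" value and the epilogue reduces with MIN_EXPR; symmetrically,
   an increasing induction uses a value below the base (or simply 0)
   together with MAX_EXPR.  */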
6697 else if (tree_int_cst_sgn (step) == -1)
6699 cond_reduc_op_code = MIN_EXPR;
6700 if (tree_int_cst_sgn (base) == -1)
6701 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6702 else if (tree_int_cst_lt (base,
6703 TYPE_MAX_VALUE (TREE_TYPE (base))))
6704 cond_reduc_val
6705 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6707 else
6709 cond_reduc_op_code = MAX_EXPR;
6710 if (tree_int_cst_sgn (base) == 1)
6711 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6712 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6713 base))
6714 cond_reduc_val
6715 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6717 if (cond_reduc_val)
6719 if (dump_enabled_p ())
6720 dump_printf_loc (MSG_NOTE, vect_location,
6721 "condition expression based on "
6722 "integer induction.\n");
6723 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6724 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6725 = cond_reduc_val;
6726 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6729 else if (cond_reduc_dt == vect_constant_def)
6731 enum vect_def_type cond_initial_dt;
6732 tree cond_initial_val
6733 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6735 gcc_assert (cond_reduc_val != NULL_TREE);
6736 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6737 if (cond_initial_dt == vect_constant_def
6738 && types_compatible_p (TREE_TYPE (cond_initial_val),
6739 TREE_TYPE (cond_reduc_val)))
6741 tree e = fold_binary (LE_EXPR, boolean_type_node,
6742 cond_initial_val, cond_reduc_val);
6743 if (e && (integer_onep (e) || integer_zerop (e)))
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_NOTE, vect_location,
6747 "condition expression based on "
6748 "compile time constant.\n");
6749 /* Record reduction code at analysis stage. */
6750 STMT_VINFO_REDUC_CODE (reduc_info)
6751 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6752 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6758 if (STMT_VINFO_LIVE_P (phi_info))
6759 return false;
6761 if (slp_node)
6762 ncopies = 1;
6763 else
6764 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6766 gcc_assert (ncopies >= 1);
6768 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6770 if (nested_cycle)
6772 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6773 == vect_double_reduction_def);
6774 double_reduc = true;
6777 /* 4.2. Check support for the epilog operation.
6779 If STMT represents a reduction pattern, then the type of the
6780 reduction variable may be different than the type of the rest
6781 of the arguments. For example, consider the case of accumulation
6782 of shorts into an int accumulator; the original code:
6783 S1: int_a = (int) short_a;
6784 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6786 was replaced with:
6787 STMT: int_acc = widen_sum <short_a, int_acc>
6789 This means that:
6790 1. The tree-code that is used to create the vector operation in the
6791 epilog code (that reduces the partial results) is not the
6792 tree-code of STMT, but is rather the tree-code of the original
6793 stmt from the pattern that STMT is replacing. I.e., in the example
6794 above we want to use 'widen_sum' in the loop, but 'plus' in the
6795 epilog.
6796 2. The type (mode) we use to check available target support
6797 for the vector operation to be created in the *epilog*, is
6798 determined by the type of the reduction variable (in the example
6799 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6800 However the type (mode) we use to check available target support
6801 for the vector operation to be created *inside the loop*, is
6802 determined by the type of the other arguments to STMT (in the
6803 example we'd check this: optab_handler (widen_sum_optab,
6804 vect_short_mode)).
6806 This is contrary to "regular" reductions, in which the types of all
6807 the arguments are the same as the type of the reduction variable.
6808 For "regular" reductions we can therefore use the same vector type
6809 (and also the same tree-code) when generating the epilog code and
6810 when generating the code inside the loop. */
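/* As a sketch of the epilog step itself (illustrative, target dependent):
   if the loop ends with a partial-result vector {s0, s1, s2, s3}, the
   epilog combines the lanes into one scalar, either via a direct reduction
   internal function such as .REDUC_PLUS or via a log2(nunits) sequence of
   vector shifts/permutes and adds.  */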
6812 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6813 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6815 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6816 if (reduction_type == TREE_CODE_REDUCTION)
6818 /* Check whether it's ok to change the order of the computation.
6819 Generally, when vectorizing a reduction we change the order of the
6820 computation. This may change the behavior of the program in some
6821 cases, so we need to check that this is ok. One exception is when
6822 vectorizing an outer-loop: the inner-loop is executed sequentially,
6823 and therefore vectorizing reductions in the inner-loop during
6824 outer-loop vectorization is safe. */
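/* For example (illustrative): a float accumulation

     for (i = 0; i < n; i++)
       sum += a[i];

   compiled without -ffast-math must keep the original association, so
   instead of accumulating independent partial sums per lane we use
   FOLD_LEFT_REDUCTION, which folds each vector of elements into the
   scalar accumulator in order inside the loop.  */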
6825 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6827 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6828 is not directly used in stmt. */
6829 if (!only_slp_reduc_chain
6830 && reduc_chain_length != 1)
6832 if (dump_enabled_p ())
6833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6834 "in-order reduction chain without SLP.\n");
6835 return false;
6837 STMT_VINFO_REDUC_TYPE (reduc_info)
6838 = reduction_type = FOLD_LEFT_REDUCTION;
6840 else if (!commutative_tree_code (orig_code)
6841 || !associative_tree_code (orig_code))
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6845 "reduction: not commutative/associative");
6846 return false;
6850 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6851 && ncopies > 1)
6853 if (dump_enabled_p ())
6854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6855 "multiple types in double reduction or condition "
6856 "reduction or fold-left reduction.\n");
6857 return false;
6860 internal_fn reduc_fn = IFN_LAST;
6861 if (reduction_type == TREE_CODE_REDUCTION
6862 || reduction_type == FOLD_LEFT_REDUCTION
6863 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6864 || reduction_type == CONST_COND_REDUCTION)
6866 if (reduction_type == FOLD_LEFT_REDUCTION
6867 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6868 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6870 if (reduc_fn != IFN_LAST
6871 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6872 OPTIMIZE_FOR_SPEED))
6874 if (dump_enabled_p ())
6875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6876 "reduc op not supported by target.\n");
6878 reduc_fn = IFN_LAST;
6881 else
6883 if (!nested_cycle || double_reduc)
6885 if (dump_enabled_p ())
6886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6887 "no reduc code for scalar code.\n");
6889 return false;
6893 else if (reduction_type == COND_REDUCTION)
6895 int scalar_precision
6896 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6897 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6898 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6899 nunits_out);
6901 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6902 OPTIMIZE_FOR_SPEED))
6903 reduc_fn = IFN_REDUC_MAX;
6905 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6907 if (reduction_type != EXTRACT_LAST_REDUCTION
6908 && (!nested_cycle || double_reduc)
6909 && reduc_fn == IFN_LAST
6910 && !nunits_out.is_constant ())
6912 if (dump_enabled_p ())
6913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6914 "missing target support for reduction on"
6915 " variable-length vectors.\n");
6916 return false;
6919 /* For SLP reductions, see if there is a neutral value we can use. */
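/* Roughly (illustrative, see neutral_op_for_slp_reduction): the neutral
   value is 0 for additive codes and BIT_IOR_EXPR/BIT_XOR_EXPR, 1 for
   MULT_EXPR and all-ones for BIT_AND_EXPR, while for MIN_EXPR/MAX_EXPR a
   value is only available when the group forms a reduction chain.  */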
6920 tree neutral_op = NULL_TREE;
6921 if (slp_node)
6922 neutral_op = neutral_op_for_slp_reduction
6923 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6924 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6926 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6928 /* We can't support in-order reductions of code such as this:
6930 for (int i = 0; i < n1; ++i)
6931 for (int j = 0; j < n2; ++j)
6932 l += a[j];
6934 since GCC effectively transforms the loop when vectorizing:
6936 for (int i = 0; i < n1 / VF; ++i)
6937 for (int j = 0; j < n2; ++j)
6938 for (int k = 0; k < VF; ++k)
6939 l += a[j];
6941 which is a reassociation of the original operation. */
6942 if (dump_enabled_p ())
6943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6944 "in-order double reduction not supported.\n");
6946 return false;
6949 if (reduction_type == FOLD_LEFT_REDUCTION
6950 && slp_node
6951 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6953 /* We cannot use in-order reductions in this case because there is
6954 an implicit reassociation of the operations involved. */
6955 if (dump_enabled_p ())
6956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6957 "in-order unchained SLP reductions not supported.\n");
6958 return false;
6961 /* For double reductions, and for SLP reductions with a neutral value,
6962 we construct a variable-length initial vector by loading a vector
6963 full of the neutral value and then shift-and-inserting the start
6964 values into the low-numbered elements. */
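/* For instance (illustrative): for a sum reduction with start value S on a
   length-agnostic vector the initial vector becomes {S, 0, 0, ...}: a
   vector of the neutral value 0 is created and S is shifted into element 0
   via IFN_VEC_SHL_INSERT, hence the support check below.  */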
6965 if ((double_reduc || neutral_op)
6966 && !nunits_out.is_constant ()
6967 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6968 vectype_out, OPTIMIZE_FOR_SPEED))
6970 if (dump_enabled_p ())
6971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6972 "reduction on variable-length vectors requires"
6973 " target support for a vector-shift-and-insert"
6974 " operation.\n");
6975 return false;
6978 /* Check extra constraints for variable-length unchained SLP reductions. */
6979 if (STMT_SLP_TYPE (stmt_info)
6980 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6981 && !nunits_out.is_constant ())
6983 /* We checked above that we could build the initial vector when
6984 there's a neutral element value. Check here for the case in
6985 which each SLP statement has its own initial value and in which
6986 that value needs to be repeated for every instance of the
6987 statement within the initial vector. */
6988 unsigned int group_size = SLP_TREE_LANES (slp_node);
6989 if (!neutral_op
6990 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6991 TREE_TYPE (vectype_out)))
6993 if (dump_enabled_p ())
6994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6995 "unsupported form of SLP reduction for"
6996 " variable-length vectors: cannot build"
6997 " initial vector.\n");
6998 return false;
7000 /* The epilogue code relies on the number of elements being a multiple
7001 of the group size. The duplicate-and-interleave approach to setting
7002 up the initial vector does too. */
7003 if (!multiple_p (nunits_out, group_size))
7005 if (dump_enabled_p ())
7006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7007 "unsupported form of SLP reduction for"
7008 " variable-length vectors: the vector size"
7009 " is not a multiple of the number of results.\n");
7010 return false;
7014 if (reduction_type == COND_REDUCTION)
7016 widest_int ni;
7018 if (! max_loop_iterations (loop, &ni))
7020 if (dump_enabled_p ())
7021 dump_printf_loc (MSG_NOTE, vect_location,
7022 "loop count not known, cannot create cond "
7023 "reduction.\n");
7024 return false;
7026 /* Convert backedges to iterations. */
7027 ni += 1;
7029 /* The additional index will have the same type as the condition. Check
7030 that the iteration count fits into this type less one (the zero slot
7031 is reserved for the case in which there are no matches). */
7032 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7033 if (wi::geu_p (ni, wi::to_widest (max_index)))
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_NOTE, vect_location,
7037 "loop size is greater than data size.\n");
7038 return false;
7042 /* In case the vectorization factor (VF) is bigger than the number
7043 of elements that we can fit in a vectype (nunits), we have to generate
7044 more than one vector stmt, i.e. we need to "unroll" the
7045 vector stmt by a factor VF/nunits. For more details see documentation
7046 in vectorizable_operation. */
7048 /* If the reduction is used in an outer loop we need to generate
7049 VF intermediate results, like so (e.g. for ncopies=2):
7050 r0 = phi (init, r0)
7051 r1 = phi (init, r1)
7052 r0 = x0 + r0;
7053 r1 = x1 + r1;
7054 (i.e. we generate VF results in 2 registers).
7055 In this case we have a separate def-use cycle for each copy, and therefore
7056 for each copy we get the vector def for the reduction variable from the
7057 respective phi node created for this copy.
7059 Otherwise (the reduction is unused in the loop nest), we can combine
7060 together intermediate results, like so (e.g. for ncopies=2):
7061 r = phi (init, r)
7062 r = x0 + r;
7063 r = x1 + r;
7064 (i.e. we generate VF/2 results in a single register).
7065 In this case for each copy we get the vector def for the reduction variable
7066 from the vectorized reduction operation generated in the previous iteration.
7068 This only works when we see both the reduction PHI and its only consumer
7069 in vectorizable_reduction and there are no intermediate stmts
7070 participating. */
7071 if (ncopies > 1
7072 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7073 && reduc_chain_length == 1)
7074 single_defuse_cycle = true;
7076 if (single_defuse_cycle || lane_reduc_code_p)
7078 gcc_assert (code != COND_EXPR);
7080 /* 4. Supportable by target? */
7081 bool ok = true;
7083 /* 4.1. check support for the operation in the loop */
7084 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7085 if (!optab)
7087 if (dump_enabled_p ())
7088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7089 "no optab.\n");
7090 ok = false;
7093 machine_mode vec_mode = TYPE_MODE (vectype_in);
7094 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7096 if (dump_enabled_p ())
7097 dump_printf (MSG_NOTE, "op not supported by target.\n");
7098 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7099 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7100 ok = false;
7101 else
7102 if (dump_enabled_p ())
7103 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7106 /* Worthwhile without SIMD support? */
7107 if (ok
7108 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7109 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7111 if (dump_enabled_p ())
7112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7113 "not worthwhile without SIMD support.\n");
7114 ok = false;
7117 /* lane-reducing operations have to go through vect_transform_reduction.
7118 For the other cases try without the single cycle optimization. */
7119 if (!ok)
7121 if (lane_reduc_code_p)
7122 return false;
7123 else
7124 single_defuse_cycle = false;
7127 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7129 /* If the reduction stmt is one of the patterns that have a lane-reducing
7130 operation embedded we cannot handle the !single_defuse_cycle case. */
7131 if ((ncopies > 1 && ! single_defuse_cycle)
7132 && lane_reduc_code_p)
7134 if (dump_enabled_p ())
7135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7136 "multi def-use cycle not possible for lane-reducing "
7137 "reduction operation\n");
7138 return false;
7141 if (slp_node
7142 && !(!single_defuse_cycle
7143 && code != DOT_PROD_EXPR
7144 && code != WIDEN_SUM_EXPR
7145 && code != SAD_EXPR
7146 && reduction_type != FOLD_LEFT_REDUCTION))
7147 for (i = 0; i < op_type; i++)
7148 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7150 if (dump_enabled_p ())
7151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7152 "incompatible vector types for invariants\n");
7153 return false;
7156 if (slp_node)
7157 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7158 else
7159 vec_num = 1;
7161 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7162 reduction_type, ncopies, cost_vec);
7163 if (dump_enabled_p ()
7164 && reduction_type == FOLD_LEFT_REDUCTION)
7165 dump_printf_loc (MSG_NOTE, vect_location,
7166 "using an in-order (fold-left) reduction.\n");
7167 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7168 /* All reductions but the single-defuse-cycle optimized, lane-reducing
7169 and fold-left ones go through their own vectorizable_* routines. */
7170 if (!single_defuse_cycle
7171 && code != DOT_PROD_EXPR
7172 && code != WIDEN_SUM_EXPR
7173 && code != SAD_EXPR
7174 && reduction_type != FOLD_LEFT_REDUCTION)
7176 stmt_vec_info tem
7177 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7178 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7180 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7181 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7183 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7184 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7186 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7188 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7189 internal_fn cond_fn = get_conditional_internal_fn (code);
7191 if (reduction_type != FOLD_LEFT_REDUCTION
7192 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7193 && (cond_fn == IFN_LAST
7194 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7195 OPTIMIZE_FOR_SPEED)))
7197 if (dump_enabled_p ())
7198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7199 "can't operate on partial vectors because"
7200 " no conditional operation is available.\n");
7201 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7203 else if (reduction_type == FOLD_LEFT_REDUCTION
7204 && reduc_fn == IFN_LAST
7205 && !expand_vec_cond_expr_p (vectype_in,
7206 truth_type_for (vectype_in),
7207 SSA_NAME))
7209 if (dump_enabled_p ())
7210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7211 "can't operate on partial vectors because"
7212 " no conditional operation is available.\n");
7213 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7215 else
7216 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7217 vectype_in, NULL);
7219 return true;
7222 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7223 value. */
7225 bool
7226 vect_transform_reduction (loop_vec_info loop_vinfo,
7227 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7228 gimple **vec_stmt, slp_tree slp_node)
7230 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7231 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7232 int i;
7233 int ncopies;
7234 int vec_num;
7236 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7237 gcc_assert (reduc_info->is_reduc_info);
7239 if (nested_in_vect_loop_p (loop, stmt_info))
7241 loop = loop->inner;
7242 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7245 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7246 enum tree_code code = gimple_assign_rhs_code (stmt);
7247 int op_type = TREE_CODE_LENGTH (code);
7249 /* Flatten RHS. */
7250 tree ops[3];
7251 switch (get_gimple_rhs_class (code))
7253 case GIMPLE_TERNARY_RHS:
7254 ops[2] = gimple_assign_rhs3 (stmt);
7255 /* Fall thru. */
7256 case GIMPLE_BINARY_RHS:
7257 ops[0] = gimple_assign_rhs1 (stmt);
7258 ops[1] = gimple_assign_rhs2 (stmt);
7259 break;
7260 default:
7261 gcc_unreachable ();
7264 /* All uses but the last are expected to be defined in the loop.
7265 The last use is the reduction variable. In case of nested cycle this
7266 assumption is not true: we use reduc_index to record the index of the
7267 reduction variable. */
7268 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7269 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7270 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7271 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7273 if (slp_node)
7275 ncopies = 1;
7276 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7278 else
7280 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7281 vec_num = 1;
7284 internal_fn cond_fn = get_conditional_internal_fn (code);
7285 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7286 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
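/* Two masking strategies may apply when the loop body is masked (a sketch,
   not specific to any target): either the statement is emitted as a
   conditional internal call, e.g.

     new_acc = .COND_ADD (loop_mask, acc, x, acc);

   so that inactive lanes keep the old accumulator value, or, for the codes
   accepted by use_mask_by_cond_expr_p, one input is first blended with a
   VEC_COND_EXPR on the mask and the unconditional operation is used.  */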
7288 /* Transform. */
7289 tree new_temp = NULL_TREE;
7290 auto_vec<tree> vec_oprnds0;
7291 auto_vec<tree> vec_oprnds1;
7292 auto_vec<tree> vec_oprnds2;
7293 tree def0;
7295 if (dump_enabled_p ())
7296 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7298 /* FORNOW: Multiple types are not supported for condition. */
7299 if (code == COND_EXPR)
7300 gcc_assert (ncopies == 1);
7302 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7304 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7305 if (reduction_type == FOLD_LEFT_REDUCTION)
7307 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7308 return vectorize_fold_left_reduction
7309 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7310 reduc_fn, ops, vectype_in, reduc_index, masks);
7313 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7314 gcc_assert (single_defuse_cycle
7315 || code == DOT_PROD_EXPR
7316 || code == WIDEN_SUM_EXPR
7317 || code == SAD_EXPR);
7319 /* Create the destination vector */
7320 tree scalar_dest = gimple_assign_lhs (stmt);
7321 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7323 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7324 single_defuse_cycle && reduc_index == 0
7325 ? NULL_TREE : ops[0], &vec_oprnds0,
7326 single_defuse_cycle && reduc_index == 1
7327 ? NULL_TREE : ops[1], &vec_oprnds1,
7328 op_type == ternary_op
7329 && !(single_defuse_cycle && reduc_index == 2)
7330 ? ops[2] : NULL_TREE, &vec_oprnds2);
7331 if (single_defuse_cycle)
7333 gcc_assert (!slp_node);
7334 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7335 ops[reduc_index],
7336 reduc_index == 0 ? &vec_oprnds0
7337 : (reduc_index == 1 ? &vec_oprnds1
7338 : &vec_oprnds2));
7341 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7343 gimple *new_stmt;
7344 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7345 if (masked_loop_p && !mask_by_cond_expr)
7347 /* Make sure that the reduction accumulator is vop[0]. */
7348 if (reduc_index == 1)
7350 gcc_assert (commutative_tree_code (code));
7351 std::swap (vop[0], vop[1]);
7353 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7354 vectype_in, i);
7355 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7356 vop[0], vop[1], vop[0]);
7357 new_temp = make_ssa_name (vec_dest, call);
7358 gimple_call_set_lhs (call, new_temp);
7359 gimple_call_set_nothrow (call, true);
7360 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7361 new_stmt = call;
7363 else
7365 if (op_type == ternary_op)
7366 vop[2] = vec_oprnds2[i];
7368 if (masked_loop_p && mask_by_cond_expr)
7370 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7371 vectype_in, i);
7372 build_vect_cond_expr (code, vop, mask, gsi);
7375 new_stmt = gimple_build_assign (vec_dest, code,
7376 vop[0], vop[1], vop[2]);
7377 new_temp = make_ssa_name (vec_dest, new_stmt);
7378 gimple_assign_set_lhs (new_stmt, new_temp);
7379 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7382 if (slp_node)
7383 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7384 else if (single_defuse_cycle
7385 && i < ncopies - 1)
7387 if (reduc_index == 0)
7388 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7389 else if (reduc_index == 1)
7390 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7391 else if (reduc_index == 2)
7392 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7394 else
7395 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7398 if (!slp_node)
7399 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7401 return true;
7404 /* Transform phase of a cycle PHI. */
7406 bool
7407 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7408 stmt_vec_info stmt_info, gimple **vec_stmt,
7409 slp_tree slp_node, slp_instance slp_node_instance)
7411 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7412 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7413 int i;
7414 int ncopies;
7415 int j;
7416 bool nested_cycle = false;
7417 int vec_num;
7419 if (nested_in_vect_loop_p (loop, stmt_info))
7421 loop = loop->inner;
7422 nested_cycle = true;
7425 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7426 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7427 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7428 gcc_assert (reduc_info->is_reduc_info);
7430 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7431 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7432 /* Leave the scalar phi in place. */
7433 return true;
7435 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7436 /* For a nested cycle we do not fill the above. */
7437 if (!vectype_in)
7438 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7439 gcc_assert (vectype_in);
7441 if (slp_node)
7443 /* The size vect_schedule_slp_instance computes is off for us. */
7444 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7445 * SLP_TREE_LANES (slp_node), vectype_in);
7446 ncopies = 1;
7448 else
7450 vec_num = 1;
7451 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7454 /* Check whether we should use a single PHI node and accumulate
7455 vectors to one before the backedge. */
7456 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7457 ncopies = 1;
7459 /* Create the destination vector */
7460 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7461 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7462 vectype_out);
7464 /* Get the loop-entry arguments. */
7465 tree vec_initial_def;
7466 auto_vec<tree> vec_initial_defs;
7467 if (slp_node)
7469 vec_initial_defs.reserve (vec_num);
7470 if (nested_cycle)
7472 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7473 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7474 &vec_initial_defs);
7476 else
7478 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7479 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7480 tree neutral_op
7481 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7482 STMT_VINFO_REDUC_CODE (reduc_info),
7483 first != NULL);
7484 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7485 &vec_initial_defs, vec_num,
7486 first != NULL, neutral_op);
7489 else
7491 /* Get at the scalar def before the loop that defines the initial
7492 value of the reduction variable. */
7493 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7494 loop_preheader_edge (loop));
7495 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7496 and we can't use zero for induc_val, use initial_def. Similarly
7497 for REDUC_MIN and initial_def larger than the base. */
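/* E.g. (illustrative): for a MIN_EXPR epilogue the analysis may have
   chosen base + 1, say 101, as the "no match" value; if the scalar initial
   value is an even larger constant such as 200 it is also above every real
   match, so it can serve as the fill value directly and the epilogue no
   longer has to substitute initial_def for the sentinel.  */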
7498 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7500 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7501 if (TREE_CODE (initial_def) == INTEGER_CST
7502 && !integer_zerop (induc_val)
7503 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7504 && tree_int_cst_lt (initial_def, induc_val))
7505 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7506 && tree_int_cst_lt (induc_val, initial_def))))
7508 induc_val = initial_def;
7509 /* Communicate to the epilogue generation that we used
7510 the initial_def. */
7511 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7513 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7514 vec_initial_defs.create (ncopies);
7515 for (i = 0; i < ncopies; ++i)
7516 vec_initial_defs.quick_push (vec_initial_def);
7518 else if (nested_cycle)
7520 /* Do not use an adjustment def as that case is not supported
7521 correctly if ncopies is not one. */
7522 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7523 ncopies, initial_def,
7524 &vec_initial_defs);
7526 else
7528 tree adjustment_def = NULL_TREE;
7529 tree *adjustment_defp = &adjustment_def;
7530 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7531 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7532 adjustment_defp = NULL;
7533 vec_initial_def
7534 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7535 initial_def, adjustment_defp);
7536 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7537 vec_initial_defs.create (ncopies);
7538 for (i = 0; i < ncopies; ++i)
7539 vec_initial_defs.quick_push (vec_initial_def);
7543 /* Generate the reduction PHIs upfront. */
7544 for (i = 0; i < vec_num; i++)
7546 tree vec_init_def = vec_initial_defs[i];
7547 for (j = 0; j < ncopies; j++)
7549 /* Create the reduction-phi that defines the reduction
7550 operand. */
7551 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7553 /* Set the loop-entry arg of the reduction-phi. */
7554 if (j != 0 && nested_cycle)
7555 vec_init_def = vec_initial_defs[j];
7556 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7557 UNKNOWN_LOCATION);
7559 /* The loop-latch arg is set in epilogue processing. */
7561 if (slp_node)
7562 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7563 else
7565 if (j == 0)
7566 *vec_stmt = new_phi;
7567 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7572 return true;
7575 /* Vectorizes LC PHIs. */
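/* (Illustrative) such a PHI has exactly one argument and is what
   loop-closed SSA form places on a loop exit to carry a value defined in
   the loop to its outside uses, e.g.

     x_lcssa = PHI <x_3(exit_edge)>;

   It is vectorized by building the corresponding single-argument vector
   PHI.  */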
7577 bool
7578 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7579 stmt_vec_info stmt_info, gimple **vec_stmt,
7580 slp_tree slp_node)
7582 if (!loop_vinfo
7583 || !is_a <gphi *> (stmt_info->stmt)
7584 || gimple_phi_num_args (stmt_info->stmt) != 1)
7585 return false;
7587 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7588 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7589 return false;
7591 if (!vec_stmt) /* transformation not required. */
7593 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7594 return true;
7597 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7598 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7599 basic_block bb = gimple_bb (stmt_info->stmt);
7600 edge e = single_pred_edge (bb);
7601 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7602 auto_vec<tree> vec_oprnds;
7603 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7604 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7605 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7606 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7608 /* Create the vectorized LC PHI node. */
7609 gphi *new_phi = create_phi_node (vec_dest, bb);
7610 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7611 if (slp_node)
7612 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7613 else
7614 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7616 if (!slp_node)
7617 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7619 return true;
7622 /* Vectorizes PHIs. */
7624 bool
7625 vectorizable_phi (vec_info *,
7626 stmt_vec_info stmt_info, gimple **vec_stmt,
7627 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7629 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7630 return false;
7632 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7633 return false;
7635 tree vectype = SLP_TREE_VECTYPE (slp_node);
7637 if (!vec_stmt) /* transformation not required. */
7639 slp_tree child;
7640 unsigned i;
7641 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7642 if (!child)
7644 if (dump_enabled_p ())
7645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7646 "PHI node with unvectorized backedge def\n");
7647 return false;
7649 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7651 if (dump_enabled_p ())
7652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7653 "incompatible vector types for invariants\n");
7654 return false;
7656 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7657 vector_stmt, stmt_info, vectype, 0, vect_body);
7658 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7659 return true;
7662 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7663 basic_block bb = gimple_bb (stmt_info->stmt);
7664 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7665 auto_vec<gphi *> new_phis;
7666 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7668 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7670 /* Skip not yet vectorized defs. */
7671 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7672 && SLP_TREE_VEC_STMTS (child).is_empty ())
7673 continue;
7675 auto_vec<tree> vec_oprnds;
7676 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7677 if (!new_phis.exists ())
7679 new_phis.create (vec_oprnds.length ());
7680 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7682 /* Create the vectorized PHI node. */
7683 new_phis.quick_push (create_phi_node (vec_dest, bb));
7684 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7687 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7688 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7689 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7691 /* We should have at least one already vectorized child. */
7692 gcc_assert (new_phis.exists ());
7694 return true;
7698 /* Function vect_min_worthwhile_factor.
7700 For a loop where we could vectorize the operation indicated by CODE,
7701 return the minimum vectorization factor that makes it worthwhile
7702 to use generic vectors. */
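/* (Illustrative) "generic vectors" here means open-coding the vector
   operation on word-size integers, e.g. adding four packed shorts with
   plain integer arithmetic; that emulation is only assumed profitable when
   at least the returned number of elements is handled per operation.  */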
7703 static unsigned int
7704 vect_min_worthwhile_factor (enum tree_code code)
7706 switch (code)
7708 case PLUS_EXPR:
7709 case MINUS_EXPR:
7710 case NEGATE_EXPR:
7711 return 4;
7713 case BIT_AND_EXPR:
7714 case BIT_IOR_EXPR:
7715 case BIT_XOR_EXPR:
7716 case BIT_NOT_EXPR:
7717 return 2;
7719 default:
7720 return INT_MAX;
7724 /* Return true if VINFO indicates we are doing loop vectorization and if
7725 it is worth decomposing CODE operations into scalar operations for
7726 that loop's vectorization factor. */
7728 bool
7729 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7731 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7732 unsigned HOST_WIDE_INT value;
7733 return (loop_vinfo
7734 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7735 && value >= vect_min_worthwhile_factor (code));
7738 /* Function vectorizable_induction
7740 Check if STMT_INFO performs an induction computation that can be vectorized.
7741 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7742 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7743 Return true if STMT_INFO is vectorizable in this way. */
7745 bool
7746 vectorizable_induction (loop_vec_info loop_vinfo,
7747 stmt_vec_info stmt_info,
7748 gimple **vec_stmt, slp_tree slp_node,
7749 stmt_vector_for_cost *cost_vec)
7751 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7752 unsigned ncopies;
7753 bool nested_in_vect_loop = false;
7754 class loop *iv_loop;
7755 tree vec_def;
7756 edge pe = loop_preheader_edge (loop);
7757 basic_block new_bb;
7758 tree new_vec, vec_init, vec_step, t;
7759 tree new_name;
7760 gimple *new_stmt;
7761 gphi *induction_phi;
7762 tree induc_def, vec_dest;
7763 tree init_expr, step_expr;
7764 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7765 unsigned i;
7766 tree expr;
7767 gimple_stmt_iterator si;
7769 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7770 if (!phi)
7771 return false;
7773 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7774 return false;
7776 /* Make sure it was recognized as induction computation. */
7777 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7778 return false;
7780 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7781 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7783 if (slp_node)
7784 ncopies = 1;
7785 else
7786 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7787 gcc_assert (ncopies >= 1);
7789 /* FORNOW. These restrictions should be relaxed. */
7790 if (nested_in_vect_loop_p (loop, stmt_info))
7792 imm_use_iterator imm_iter;
7793 use_operand_p use_p;
7794 gimple *exit_phi;
7795 edge latch_e;
7796 tree loop_arg;
7798 if (ncopies > 1)
7800 if (dump_enabled_p ())
7801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7802 "multiple types in nested loop.\n");
7803 return false;
7806 exit_phi = NULL;
7807 latch_e = loop_latch_edge (loop->inner);
7808 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7809 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7811 gimple *use_stmt = USE_STMT (use_p);
7812 if (is_gimple_debug (use_stmt))
7813 continue;
7815 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7817 exit_phi = use_stmt;
7818 break;
7821 if (exit_phi)
7823 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7824 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7825 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7827 if (dump_enabled_p ())
7828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7829 "inner-loop induction only used outside "
7830 "of the outer vectorized loop.\n");
7831 return false;
7835 nested_in_vect_loop = true;
7836 iv_loop = loop->inner;
7838 else
7839 iv_loop = loop;
7840 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7842 if (slp_node && !nunits.is_constant ())
7844 /* The current SLP code creates the step value element-by-element. */
7845 if (dump_enabled_p ())
7846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7847 "SLP induction not supported for variable-length"
7848 " vectors.\n");
7849 return false;
7852 if (!vec_stmt) /* transformation not required. */
7854 unsigned inside_cost = 0, prologue_cost = 0;
7855 if (slp_node)
7857 /* We eventually need to set a vector type on invariant
7858 arguments. */
7859 unsigned j;
7860 slp_tree child;
7861 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7862 if (!vect_maybe_update_slp_op_vectype
7863 (child, SLP_TREE_VECTYPE (slp_node)))
7865 if (dump_enabled_p ())
7866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7867 "incompatible vector types for "
7868 "invariants\n");
7869 return false;
7871 /* loop cost for vec_loop. */
7872 inside_cost
7873 = record_stmt_cost (cost_vec,
7874 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7875 vector_stmt, stmt_info, 0, vect_body);
7876 /* prologue cost for vec_init (if not nested) and step. */
7877 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
7878 scalar_to_vec,
7879 stmt_info, 0, vect_prologue);
7881 else /* if (!slp_node) */
7883 /* loop cost for vec_loop. */
7884 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
7885 stmt_info, 0, vect_body);
7886 /* prologue cost for vec_init and vec_step. */
7887 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
7888 stmt_info, 0, vect_prologue);
7890 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_NOTE, vect_location,
7892 "vect_model_induction_cost: inside_cost = %d, "
7893 "prologue_cost = %d .\n", inside_cost,
7894 prologue_cost);
7896 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7897 DUMP_VECT_SCOPE ("vectorizable_induction");
7898 return true;
7901 /* Transform. */
7903 /* Compute a vector variable, initialized with the first VF values of
7904 the induction variable. E.g., for an iv with IV_PHI='X' and
7905 evolution S, for a vector of 4 units, we want to compute:
7906 [X, X + S, X + 2*S, X + 3*S]. */
7908 if (dump_enabled_p ())
7909 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7911 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7912 gcc_assert (step_expr != NULL_TREE);
7913 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7915 pe = loop_preheader_edge (iv_loop);
7916 /* Find the first insertion point in the BB. */
7917 basic_block bb = gimple_bb (phi);
7918 si = gsi_after_labels (bb);
7920 /* For SLP induction we have to generate several IVs as for example
7921 with group size 3 we need
7922 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
7923 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
7924 if (slp_node)
7926 /* Enforced above. */
7927 unsigned int const_nunits = nunits.to_constant ();
7929 /* The initial values are vectorized, but any lanes > group_size
7930 need adjustment. */
7931 slp_tree init_node
7932 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
7934 /* Gather steps. Since we do not vectorize inductions as
7935 cycles we have to reconstruct the step from SCEV data. */
7936 unsigned group_size = SLP_TREE_LANES (slp_node);
7937 tree *steps = XALLOCAVEC (tree, group_size);
7938 tree *inits = XALLOCAVEC (tree, group_size);
7939 stmt_vec_info phi_info;
7940 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
7942 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
7943 if (!init_node)
7944 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
7945 pe->dest_idx);
7948 /* Now generate the IVs. */
7949 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7950 gcc_assert ((const_nunits * nvects) % group_size == 0);
7951 unsigned nivs;
7952 if (nested_in_vect_loop)
7953 nivs = nvects;
7954 else
7956 /* Compute the number of distinct IVs we need. First reduce
7957 group_size if it is a multiple of const_nunits so we get
7958 one IV for a group_size of 4 but const_nunits 2. */
7959 unsigned group_sizep = group_size;
7960 if (group_sizep % const_nunits == 0)
7961 group_sizep = group_sizep / const_nunits;
7962 nivs = least_common_multiple (group_sizep,
7963 const_nunits) / const_nunits;
7965 tree stept = TREE_TYPE (step_vectype);
7966 tree lupdate_mul = NULL_TREE;
7967 if (!nested_in_vect_loop)
7969 /* The number of iterations covered in one vector iteration. */
7970 unsigned lup_mul = (nvects * const_nunits) / group_size;
7971 lupdate_mul
7972 = build_vector_from_val (step_vectype,
7973 SCALAR_FLOAT_TYPE_P (stept)
7974 ? build_real_from_wide (stept, lup_mul,
7975 UNSIGNED)
7976 : build_int_cstu (stept, lup_mul));
7978 tree peel_mul = NULL_TREE;
7979 gimple_seq init_stmts = NULL;
7980 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
7982 if (SCALAR_FLOAT_TYPE_P (stept))
7983 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
7984 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7985 else
7986 peel_mul = gimple_convert (&init_stmts, stept,
7987 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7988 peel_mul = gimple_build_vector_from_val (&init_stmts,
7989 step_vectype, peel_mul);
7991 unsigned ivn;
7992 auto_vec<tree> vec_steps;
7993 for (ivn = 0; ivn < nivs; ++ivn)
7995 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
7996 tree_vector_builder init_elts (vectype, const_nunits, 1);
7997 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
7998 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8000 /* The scalar steps of the IVs. */
8001 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8002 step_elts.quick_push (elt);
8003 if (!init_node)
8005 /* The scalar inits of the IVs if not vectorized. */
8006 elt = inits[(ivn*const_nunits + eltn) % group_size];
8007 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8008 TREE_TYPE (elt)))
8009 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8010 TREE_TYPE (vectype), elt);
8011 init_elts.quick_push (elt);
8013 /* The number of steps to add to the initial values. */
8014 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8015 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8016 ? build_real_from_wide (stept,
8017 mul_elt, UNSIGNED)
8018 : build_int_cstu (stept, mul_elt));
8020 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8021 vec_step = gimple_convert (&init_stmts, step_vectype, vec_step);
8022 vec_steps.safe_push (vec_step);
8023 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8024 if (peel_mul)
8025 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8026 step_mul, peel_mul);
8027 if (!init_node)
8028 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8030 /* Create the induction-phi that defines the induction-operand. */
8031 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8032 "vec_iv_");
8033 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8034 induc_def = PHI_RESULT (induction_phi);
8036 /* Create the iv update inside the loop */
8037 tree up = vec_step;
8038 if (lupdate_mul)
8039 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8040 vec_step, lupdate_mul);
8041 gimple_seq stmts = NULL;
8042 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8043 vec_def = gimple_build (&stmts,
8044 PLUS_EXPR, step_vectype, vec_def, up);
8045 vec_def = gimple_convert (&stmts, vectype, vec_def);
8046 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8047 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8048 UNKNOWN_LOCATION);
8050 if (init_node)
8051 vec_init = vect_get_slp_vect_def (init_node, ivn);
8052 if (!nested_in_vect_loop
8053 && !integer_zerop (step_mul))
8055 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8056 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8057 vec_step, step_mul);
8058 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8059 vec_def, up);
8060 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8063 /* Set the arguments of the phi node: */
8064 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8066 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8068 if (!nested_in_vect_loop)
8070 /* Fill up to the number of vectors we need for the whole group. */
8071 nivs = least_common_multiple (group_size,
8072 const_nunits) / const_nunits;
8073 for (; ivn < nivs; ++ivn)
8075 SLP_TREE_VEC_STMTS (slp_node)
8076 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8077 vec_steps.safe_push (vec_steps[0]);
8081 /* Re-use IVs when we can. We are generating further vector
8082 stmts by adding VF' * stride to the IVs generated above. */
8083 if (ivn < nvects)
8085 unsigned vfp
8086 = least_common_multiple (group_size, const_nunits) / group_size;
8087 tree lupdate_mul
8088 = build_vector_from_val (step_vectype,
8089 SCALAR_FLOAT_TYPE_P (stept)
8090 ? build_real_from_wide (stept,
8091 vfp, UNSIGNED)
8092 : build_int_cstu (stept, vfp));
8093 for (; ivn < nvects; ++ivn)
8095 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8096 tree def = gimple_get_lhs (iv);
8097 if (ivn < 2*nivs)
8098 vec_steps[ivn - nivs]
8099 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8100 vec_steps[ivn - nivs], lupdate_mul);
8101 gimple_seq stmts = NULL;
8102 def = gimple_convert (&stmts, step_vectype, def);
8103 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8104 def, vec_steps[ivn % nivs]);
8105 def = gimple_convert (&stmts, vectype, def);
8106 if (gimple_code (iv) == GIMPLE_PHI)
8107 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8108 else
8110 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8111 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8113 SLP_TREE_VEC_STMTS (slp_node)
8114 .quick_push (SSA_NAME_DEF_STMT (def));
8118 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8119 gcc_assert (!new_bb);
8121 return true;
8124 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8125 loop_preheader_edge (iv_loop));
8127 gimple_seq stmts = NULL;
8128 if (!nested_in_vect_loop)
8130 /* Convert the initial value to the IV update type. */
8131 tree new_type = TREE_TYPE (step_expr);
8132 init_expr = gimple_convert (&stmts, new_type, init_expr);
8134 /* If we are using the loop mask to "peel" for alignment then we need
8135 to adjust the start value here. */
8136 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8137 if (skip_niters != NULL_TREE)
8139 if (FLOAT_TYPE_P (vectype))
8140 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8141 skip_niters);
8142 else
8143 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8144 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8145 skip_niters, step_expr);
8146 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8147 init_expr, skip_step);
8151 if (stmts)
8153 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8154 gcc_assert (!new_bb);
8157 /* Create the vector that holds the initial_value of the induction. */
8158 if (nested_in_vect_loop)
8160 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8161 been created during vectorization of previous stmts. We obtain it
8162 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8163 auto_vec<tree> vec_inits;
8164 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8165 init_expr, &vec_inits);
8166 vec_init = vec_inits[0];
8167 /* If the initial value is not of proper type, convert it. */
8168 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8170 new_stmt
8171 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8172 vect_simple_var,
8173 "vec_iv_"),
8174 VIEW_CONVERT_EXPR,
8175 build1 (VIEW_CONVERT_EXPR, vectype,
8176 vec_init));
8177 vec_init = gimple_assign_lhs (new_stmt);
8178 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8179 new_stmt);
8180 gcc_assert (!new_bb);
8183 else
8185 /* iv_loop is the loop to be vectorized. Create:
8186 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8187 stmts = NULL;
8188 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8190 unsigned HOST_WIDE_INT const_nunits;
8191 if (nunits.is_constant (&const_nunits))
8193 tree_vector_builder elts (step_vectype, const_nunits, 1);
8194 elts.quick_push (new_name);
8195 for (i = 1; i < const_nunits; i++)
8197 /* Create: new_name_i = new_name + step_expr */
8198 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8199 new_name, step_expr);
8200 elts.quick_push (new_name);
8202 /* Create a vector from [new_name_0, new_name_1, ...,
8203 new_name_nunits-1] */
8204 vec_init = gimple_build_vector (&stmts, &elts);
8206 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8207 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8208 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8209 new_name, step_expr);
8210 else
8212 /* Build:
8213 [base, base, base, ...]
8214 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8215 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8216 gcc_assert (flag_associative_math);
8217 tree index = build_index_vector (step_vectype, 0, 1);
8218 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8219 new_name);
8220 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8221 step_expr);
8222 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8223 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8224 vec_init, step_vec);
8225 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8226 vec_init, base_vec);
8228 vec_init = gimple_convert (&stmts, vectype, vec_init);
8230 if (stmts)
8232 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8233 gcc_assert (!new_bb);
8238 /* Create the vector that holds the step of the induction. */
8239 if (nested_in_vect_loop)
8240 /* iv_loop is nested in the loop to be vectorized. Generate:
8241 vec_step = [S, S, S, S] */
8242 new_name = step_expr;
8243 else
8245 /* iv_loop is the loop to be vectorized. Generate:
8246 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8247 gimple_seq seq = NULL;
8248 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8250 expr = build_int_cst (integer_type_node, vf);
8251 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8253 else
8254 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8255 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8256 expr, step_expr);
8257 if (seq)
8259 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8260 gcc_assert (!new_bb);
8264 t = unshare_expr (new_name);
8265 gcc_assert (CONSTANT_CLASS_P (new_name)
8266 || TREE_CODE (new_name) == SSA_NAME);
8267 new_vec = build_vector_from_val (step_vectype, t);
8268 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8269 new_vec, step_vectype, NULL);
8272 /* Create the following def-use cycle:
8273 loop prolog:
8274 vec_init = ...
8275 vec_step = ...
8276 loop:
8277 vec_iv = PHI <vec_init, vec_loop>
8279 STMT
8281 vec_loop = vec_iv + vec_step; */
8283 /* Create the induction-phi that defines the induction-operand. */
8284 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8285 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8286 induc_def = PHI_RESULT (induction_phi);
8288 /* Create the iv update inside the loop */
8289 stmts = NULL;
8290 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8291 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8292 vec_def = gimple_convert (&stmts, vectype, vec_def);
8293 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8294 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8296 /* Set the arguments of the phi node: */
8297 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8298 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8299 UNKNOWN_LOCATION);
8301 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8302 *vec_stmt = induction_phi;
8304 /* In case the vectorization factor (VF) is bigger than the number
8305 of elements that we can fit in a vectype (nunits), we have to generate
8306 more than one vector stmt, i.e. we need to "unroll" the
8307 vector stmt by a factor VF/nunits. For more details see documentation
8308 in vectorizable_operation. */
8310 if (ncopies > 1)
8312 gimple_seq seq = NULL;
8313 /* FORNOW. This restriction should be relaxed. */
8314 gcc_assert (!nested_in_vect_loop);
8316 /* Create the vector that holds the step of the induction. */
8317 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8319 expr = build_int_cst (integer_type_node, nunits);
8320 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8322 else
8323 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8324 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8325 expr, step_expr);
8326 if (seq)
8328 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8329 gcc_assert (!new_bb);
8332 t = unshare_expr (new_name);
8333 gcc_assert (CONSTANT_CLASS_P (new_name)
8334 || TREE_CODE (new_name) == SSA_NAME);
8335 new_vec = build_vector_from_val (step_vectype, t);
8336 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8337 new_vec, step_vectype, NULL);
8339 vec_def = induc_def;
8340 for (i = 1; i < ncopies; i++)
8342 /* vec_i = vec_prev + vec_step */
8343 gimple_seq stmts = NULL;
8344 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8345 vec_def = gimple_build (&stmts,
8346 PLUS_EXPR, step_vectype, vec_def, vec_step);
8347 vec_def = gimple_convert (&stmts, vectype, vec_def);
8349 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8350 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8351 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8355 if (dump_enabled_p ())
8356 dump_printf_loc (MSG_NOTE, vect_location,
8357 "transform induction: created def-use cycle: %G%G",
8358 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8360 return true;
8363 /* Function vectorizable_live_operation.
8365 STMT_INFO computes a value that is used outside the loop. Check if
8366 it can be supported. */
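/* (Illustrative) the typical case is a scalar streamed out of the loop:

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);

   After vectorization the value of LAST has to be extracted from the
   right lane of the final vector.  */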
8368 bool
8369 vectorizable_live_operation (vec_info *vinfo,
8370 stmt_vec_info stmt_info,
8371 gimple_stmt_iterator *gsi,
8372 slp_tree slp_node, slp_instance slp_node_instance,
8373 int slp_index, bool vec_stmt_p,
8374 stmt_vector_for_cost *cost_vec)
8376 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8377 imm_use_iterator imm_iter;
8378 tree lhs, lhs_type, bitsize, vec_bitsize;
8379 tree vectype = (slp_node
8380 ? SLP_TREE_VECTYPE (slp_node)
8381 : STMT_VINFO_VECTYPE (stmt_info));
8382 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8383 int ncopies;
8384 gimple *use_stmt;
8385 auto_vec<tree> vec_oprnds;
8386 int vec_entry = 0;
8387 poly_uint64 vec_index = 0;
8389 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8391 /* If a stmt of a reduction is live, vectorize it via
8392 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8393 validity so just trigger the transform here. */
8394 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8396 if (!vec_stmt_p)
8397 return true;
8398 if (slp_node)
8400 /* For reduction chains the meta-info is attached to
8401 the group leader. */
8402 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8403 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8404 /* For SLP reductions we vectorize the epilogue for
8405 all involved stmts together. */
8406 else if (slp_index != 0)
8407 return true;
8408 else
8409 /* For SLP reductions the meta-info is attached to
8410 the representative. */
8411 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8413 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8414 gcc_assert (reduc_info->is_reduc_info);
8415 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8416 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8417 return true;
8418 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8419 slp_node_instance);
8420 return true;
8423 /* If STMT is not relevant and it is a simple assignment and its inputs are
8424 invariant then it can remain in place, unvectorized. The original last
8425 scalar value that it computes will be used. */
8426 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8428 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8429 if (dump_enabled_p ())
8430 dump_printf_loc (MSG_NOTE, vect_location,
8431 "statement is simple and uses invariant. Leaving in "
8432 "place.\n");
8433 return true;
8436 if (slp_node)
8437 ncopies = 1;
8438 else
8439 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8441 if (slp_node)
8443 gcc_assert (slp_index >= 0);
8445 /* Get the position of the last occurrence of the scalar within the
8446 concatenation of all the SLP vectors. Calculate which SLP vector it is
8447 in and the lane index within it. */
8448 int num_scalar = SLP_TREE_LANES (slp_node);
8449 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8450 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8452 /* Calculate which vector contains the result, and which lane of
8453 that vector we need. */
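/* For example (purely illustrative numbers): with num_vec == 1, nunits == 4,
   num_scalar == 2 and slp_index == 0, the concatenated lanes look like
   { a0, b0, a1, b1 } and pos is 1 * 4 - 2 + 0 == 2, so the division below
   yields vec_entry == 0 and vec_index == 2: the last 'a' lives in lane 2 of
   the first vector statement.  */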
8454 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8456 if (dump_enabled_p ())
8457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8458 "Cannot determine which vector holds the"
8459 " final result.\n");
8460 return false;
8464 if (!vec_stmt_p)
8466 /* No transformation required. */
8467 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8469 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8470 OPTIMIZE_FOR_SPEED))
8472 if (dump_enabled_p ())
8473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8474 "can't operate on partial vectors "
8475 "because the target doesn't support extract "
8476 "last reduction.\n");
8477 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8479 else if (slp_node)
8481 if (dump_enabled_p ())
8482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8483 "can't operate on partial vectors "
8484 "because an SLP statement is live after "
8485 "the loop.\n");
8486 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8488 else if (ncopies > 1)
8490 if (dump_enabled_p ())
8491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8492 "can't operate on partial vectors "
8493 "because ncopies is greater than 1.\n");
8494 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8496 else
8498 gcc_assert (ncopies == 1 && !slp_node);
8499 vect_record_loop_mask (loop_vinfo,
8500 &LOOP_VINFO_MASKS (loop_vinfo),
8501 1, vectype, NULL);
8504 /* ??? Enable for loop costing as well. */
8505 if (!loop_vinfo)
8506 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8507 0, vect_epilogue);
8508 return true;
8511 /* Use the lhs of the original scalar statement. */
8512 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8513 if (dump_enabled_p ())
8514 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8515 "stmt %G", stmt);
8517 lhs = gimple_get_lhs (stmt);
8518 lhs_type = TREE_TYPE (lhs);
8520 bitsize = vector_element_bits_tree (vectype);
8521 vec_bitsize = TYPE_SIZE (vectype);
8523 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8524 tree vec_lhs, bitstart;
8525 gimple *vec_stmt;
8526 if (slp_node)
8528 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8530 /* Get the correct slp vectorized stmt. */
8531 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8532 vec_lhs = gimple_get_lhs (vec_stmt);
8534 /* Get entry to use. */
8535 bitstart = bitsize_int (vec_index);
8536 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8538 else
8540 /* For multiple copies, get the last copy. */
8541 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8542 vec_lhs = gimple_get_lhs (vec_stmt);
8544 /* Get the last lane in the vector. */
8545 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
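/* For instance, for a V4SI vector VEC_BITSIZE is 128 and BITSIZE is 32, so
   BITSTART becomes 96, the bit offset of the last lane.  */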
8548 if (loop_vinfo)
8550 /* Ensure that the VEC_LHS used by the lane-extraction stmts satisfies the
8551 loop-closed PHI requirement by inserting one PHI node for it. That is:
8552 loop;
8553 BB:
8554 # lhs' = PHI <lhs>
8555 ==>
8556 loop;
8557 BB:
8558 # vec_lhs' = PHI <vec_lhs>
8559 new_tree = lane_extract <vec_lhs', ...>;
8560 lhs' = new_tree; */
8562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8563 basic_block exit_bb = single_exit (loop)->dest;
8564 gcc_assert (single_pred_p (exit_bb));
8566 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8567 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8568 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8570 gimple_seq stmts = NULL;
8571 tree new_tree;
8572 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8574 /* Emit:
8576 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8578 where VEC_LHS is the vectorized live-out result and MASK is
8579 the loop mask for the final iteration. */
8580 gcc_assert (ncopies == 1 && !slp_node);
8581 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8582 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8583 1, vectype, 0);
8584 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8585 mask, vec_lhs_phi);
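/* As an illustrative example: with VEC_LHS == { a, b, c, d } and a final
   loop mask of { 1, 1, 1, 0 }, the EXTRACT_LAST call above yields c, the
   last element whose mask bit is set.  */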
8587 /* Convert the extracted vector element to the scalar type. */
8588 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8590 else
8592 tree bftype = TREE_TYPE (vectype);
8593 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8594 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8595 new_tree = build3 (BIT_FIELD_REF, bftype,
8596 vec_lhs_phi, bitsize, bitstart);
8597 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8598 &stmts, true, NULL_TREE);
8601 if (stmts)
8603 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8604 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8606 /* Remove the existing LCSSA PHI for lhs and assign new_tree to its result instead. */
8607 tree lhs_phi = NULL_TREE;
8608 gimple_stmt_iterator gsi;
8609 for (gsi = gsi_start_phis (exit_bb);
8610 !gsi_end_p (gsi); gsi_next (&gsi))
8612 gimple *phi = gsi_stmt (gsi);
8613 if ((gimple_phi_arg_def (phi, 0) == lhs))
8615 remove_phi_node (&gsi, false);
8616 lhs_phi = gimple_phi_result (phi);
8617 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8618 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8619 break;
8624 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8625 single-argument PHI, just replace all uses of the PHI result; the LCSSA
8626 PHI defining lhs may precede the newly inserted stmt. */
8627 use_operand_p use_p;
8628 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8629 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8630 && !is_gimple_debug (use_stmt))
8632 if (gimple_code (use_stmt) == GIMPLE_PHI
8633 && gimple_phi_num_args (use_stmt) == 1)
8635 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8637 else
8639 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8640 SET_USE (use_p, new_tree);
8642 update_stmt (use_stmt);
8645 else
8647 /* For basic-block vectorization simply insert the lane-extraction. */
8648 tree bftype = TREE_TYPE (vectype);
8649 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8650 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8651 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8652 vec_lhs, bitsize, bitstart);
8653 gimple_seq stmts = NULL;
8654 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8655 &stmts, true, NULL_TREE);
8656 if (TREE_CODE (new_tree) == SSA_NAME
8657 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8658 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8659 if (is_a <gphi *> (vec_stmt))
8661 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8662 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8664 else
8666 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8667 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8670 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8671 single-argument PHI, just replace all uses of the PHI result; the LCSSA
8672 PHI defining lhs may precede the newly inserted stmt. */
8673 use_operand_p use_p;
8674 stmt_vec_info use_stmt_info;
8675 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8676 if (!is_gimple_debug (use_stmt)
8677 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8678 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8680 /* ??? This can happen when the live lane ends up being
8681 used in a vector construction that is code-generated by an
8682 external SLP node (and code generation for that has already
8683 happened). See gcc.dg/vect/bb-slp-47.c.
8684 Doing this is what would happen if that vector CTOR
8685 were not code-generated yet so it is not too bad.
8686 ??? In fact we'd likely want to avoid this situation
8687 in the first place. */
8688 if (TREE_CODE (new_tree) == SSA_NAME
8689 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8690 && gimple_code (use_stmt) != GIMPLE_PHI
8691 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8692 use_stmt))
8694 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8695 gcc_assert (code == CONSTRUCTOR
8696 || code == VIEW_CONVERT_EXPR
8697 || CONVERT_EXPR_CODE_P (code));
8698 if (dump_enabled_p ())
8699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8700 "Using original scalar computation for "
8701 "live lane because use preceeds vector "
8702 "def\n");
8703 continue;
8705 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8706 SET_USE (use_p, new_tree);
8707 update_stmt (use_stmt);
8711 return true;
8714 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8716 static void
8717 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8719 ssa_op_iter op_iter;
8720 imm_use_iterator imm_iter;
8721 def_operand_p def_p;
8722 gimple *ustmt;
8724 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8726 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8728 basic_block bb;
8730 if (!is_gimple_debug (ustmt))
8731 continue;
8733 bb = gimple_bb (ustmt);
8735 if (!flow_bb_inside_loop_p (loop, bb))
8737 if (gimple_debug_bind_p (ustmt))
8739 if (dump_enabled_p ())
8740 dump_printf_loc (MSG_NOTE, vect_location,
8741 "killing debug use\n");
8743 gimple_debug_bind_reset_value (ustmt);
8744 update_stmt (ustmt);
8746 else
8747 gcc_unreachable ();
8753 /* Given a loop represented by LOOP_VINFO, return true if computation of
8754 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8755 otherwise. */
8757 static bool
8758 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8760 /* Constant case. */
8761 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8763 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8764 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8766 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8767 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8768 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8769 return true;
8772 widest_int max;
8773 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8774 /* Check the upper bound of loop niters. */
8775 if (get_max_loop_iterations (loop, &max))
8777 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8778 signop sgn = TYPE_SIGN (type);
8779 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8780 if (max < type_max)
8781 return true;
8783 return false;
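/* For instance, if the IV type is a 32-bit unsigned type and NITERSM1 is
   0xffffffff, then NITERSM1 + 1 wraps around to 0 and the function returns
   false; for any smaller constant NITERSM1 the nitersm1 < niters check
   succeeds and it returns true.  */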
8786 /* Return a mask type with half the number of elements as OLD_TYPE,
8787 given that it should have mode NEW_MODE. */
8789 tree
8790 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8792 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8793 return build_truth_vector_type_for_mode (nunits, new_mode);
8796 /* Return a mask type with twice as many elements as OLD_TYPE,
8797 given that it should have mode NEW_MODE. */
8799 tree
8800 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8802 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8803 return build_truth_vector_type_for_mode (nunits, new_mode);
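/* For example, if OLD_TYPE has 8 boolean elements, vect_halve_mask_nunits
   builds a 4-element mask type in NEW_MODE, while vect_double_mask_nunits
   builds a 16-element one.  These are typically needed when masks have to be
   packed or unpacked to match a different vector element count.  */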
8806 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8807 contain a sequence of NVECTORS masks that each control a vector of type
8808 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8809 these vector masks with the vector version of SCALAR_MASK. */
8811 void
8812 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8813 unsigned int nvectors, tree vectype, tree scalar_mask)
8815 gcc_assert (nvectors != 0);
8816 if (masks->length () < nvectors)
8817 masks->safe_grow_cleared (nvectors, true);
8818 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8819 /* The number of scalars per iteration and the number of vectors are
8820 both compile-time constants. */
8821 unsigned int nscalars_per_iter
8822 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8823 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8825 if (scalar_mask)
8827 scalar_cond_masked_key cond (scalar_mask, nvectors);
8828 loop_vinfo->scalar_cond_masked_set.add (cond);
8831 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8833 rgm->max_nscalars_per_iter = nscalars_per_iter;
8834 rgm->type = truth_type_for (vectype);
8835 rgm->factor = 1;
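/* Worked example (types chosen only for illustration): with a vectorization
   factor of 8, recording nvectors == 2 masks for a V8HI vector type gives
   nscalars_per_iter == 2 * 8 / 8 == 2, so (*masks)[1] describes an rgroup
   whose mask type is truth_type_for (V8HI) and whose max_nscalars_per_iter
   is at least 2; the actual mask SSA names are only created later, in
   vect_get_loop_mask.  */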
8839 /* Given a complete set of masks MASKS, extract mask number INDEX
8840 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8841 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8843 See the comment above vec_loop_masks for more details about the mask
8844 arrangement. */
8846 tree
8847 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8848 unsigned int nvectors, tree vectype, unsigned int index)
8850 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8851 tree mask_type = rgm->type;
8853 /* Populate the rgroup's mask array, if this is the first time we've
8854 used it. */
8855 if (rgm->controls.is_empty ())
8857 rgm->controls.safe_grow_cleared (nvectors, true);
8858 for (unsigned int i = 0; i < nvectors; ++i)
8860 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8861 /* Provide a dummy definition until the real one is available. */
8862 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8863 rgm->controls[i] = mask;
8867 tree mask = rgm->controls[index];
8868 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8869 TYPE_VECTOR_SUBPARTS (vectype)))
8871 /* A loop mask for data type X can be reused for data type Y
8872 if X has N times more elements than Y and if Y's elements
8873 are N times bigger than X's. In this case each sequence
8874 of N elements in the loop mask will be all-zero or all-one.
8875 We can then view-convert the mask so that each sequence of
8876 N elements is replaced by a single element. */
8877 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8878 TYPE_VECTOR_SUBPARTS (vectype)));
8879 gimple_seq seq = NULL;
8880 mask_type = truth_type_for (vectype);
8881 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8882 if (seq)
8883 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8885 return mask;
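/* To illustrate the reuse case above with made-up element counts: a mask
   created for 16 QI elements can also control 8 HI elements, because each
   group of two adjacent mask elements is known to be all-zero or all-one;
   the VIEW_CONVERT_EXPR simply reinterprets those bits as an 8-element
   mask.  */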
8888 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8889 lengths for controlling an operation on VECTYPE. The operation splits
8890 each element of VECTYPE into FACTOR separate subelements, measuring the
8891 length as a number of these subelements. */
8893 void
8894 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8895 unsigned int nvectors, tree vectype, unsigned int factor)
8897 gcc_assert (nvectors != 0);
8898 if (lens->length () < nvectors)
8899 lens->safe_grow_cleared (nvectors, true);
8900 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8902 /* The number of scalars per iteration, the bytes occupied per scalar and
8903 the number of vectors are all compile-time constants. */
8904 unsigned int nscalars_per_iter
8905 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8906 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8908 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8910 /* For now, we only support cases in which all loads and stores fall back
8911 to VnQI or none do. */
8912 gcc_assert (!rgl->max_nscalars_per_iter
8913 || (rgl->factor == 1 && factor == 1)
8914 || (rgl->max_nscalars_per_iter * rgl->factor
8915 == nscalars_per_iter * factor));
8916 rgl->max_nscalars_per_iter = nscalars_per_iter;
8917 rgl->type = vectype;
8918 rgl->factor = factor;
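/* For example (the types are only an illustration), if a V4SI access has to
   fall back to operating on the vector as 16 QI subelements, it is recorded
   with FACTOR == 4 and the length for that rgroup is then measured in bytes
   rather than in SImode elements.  */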
8922 /* Given a complete set of length LENS, extract length number INDEX for an
8923 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8925 tree
8926 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8927 unsigned int nvectors, unsigned int index)
8929 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8931 /* Populate the rgroup's len array, if this is the first time we've
8932 used it. */
8933 if (rgl->controls.is_empty ())
8935 rgl->controls.safe_grow_cleared (nvectors, true);
8936 for (unsigned int i = 0; i < nvectors; ++i)
8938 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8939 gcc_assert (len_type != NULL_TREE);
8940 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8942 /* Provide a dummy definition until the real one is available. */
8943 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8944 rgl->controls[i] = len;
8948 return rgl->controls[index];
8951 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8952 to match the estimated iteration count of the vectorized loop. */
8954 static void
8955 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8957 edge preheader = loop_preheader_edge (loop);
8958 /* Reduce loop iterations by the vectorization factor. */
8959 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8960 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8962 if (freq_h.nonzero_p ())
8964 profile_probability p;
8966 /* Avoid dropping loop body profile counter to 0 because of zero count
8967 in loop's preheader. */
8968 if (!(freq_e == profile_count::zero ()))
8969 freq_e = freq_e.force_nonzero ();
8970 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8971 scale_loop_frequencies (loop, p);
8974 edge exit_e = single_exit (loop);
8975 exit_e->probability = profile_probability::always ()
8976 .apply_scale (1, new_est_niter + 1);
8978 edge exit_l = single_pred_edge (loop->latch);
8979 profile_probability prob = exit_l->probability;
8980 exit_l->probability = exit_e->probability.invert ();
8981 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8982 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
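/* Illustrative example: if NEW_EST_NITER above comes out as 24 (roughly a
   scalar iteration estimate of 99 divided by VF == 4), the exit edge gets
   probability 1/25 and the body frequencies are scaled so that the header
   count corresponds to 25 times the preheader count.  */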
8985 /* For a vectorized stmt DEF_STMT_INFO, adjust the latch edge values of all
8986 vectorized PHIs whose scalar latch value is originally defined by it. */
8988 static void
8989 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8990 stmt_vec_info def_stmt_info)
8992 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8993 if (!def || TREE_CODE (def) != SSA_NAME)
8994 return;
8995 stmt_vec_info phi_info;
8996 imm_use_iterator iter;
8997 use_operand_p use_p;
8998 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8999 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9000 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9001 && (phi_info = loop_vinfo->lookup_stmt (phi))
9002 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9003 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9004 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9006 loop_p loop = gimple_bb (phi)->loop_father;
9007 edge e = loop_latch_edge (loop);
9008 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9010 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9011 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9012 gcc_assert (phi_defs.length () == latch_defs.length ());
9013 for (unsigned i = 0; i < phi_defs.length (); ++i)
9014 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9015 gimple_get_lhs (latch_defs[i]), e,
9016 gimple_phi_arg_location (phi, e->dest_idx));
9021 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9022 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9023 stmt_vec_info. */
9025 static void
9026 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9027 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9029 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9030 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9032 if (dump_enabled_p ())
9033 dump_printf_loc (MSG_NOTE, vect_location,
9034 "------>vectorizing statement: %G", stmt_info->stmt);
9036 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9037 vect_loop_kill_debug_uses (loop, stmt_info);
9039 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9040 && !STMT_VINFO_LIVE_P (stmt_info))
9041 return;
9043 if (STMT_VINFO_VECTYPE (stmt_info))
9045 poly_uint64 nunits
9046 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9047 if (!STMT_SLP_TYPE (stmt_info)
9048 && maybe_ne (nunits, vf)
9049 && dump_enabled_p ())
9050 /* For SLP, VF is set according to the unrolling factor and not to
9051 the vector size, hence for SLP this message is not valid. */
9052 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9055 /* Pure SLP statements have already been vectorized. We still need
9056 to apply loop vectorization to hybrid SLP statements. */
9057 if (PURE_SLP_STMT (stmt_info))
9058 return;
9060 if (dump_enabled_p ())
9061 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9063 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9064 *seen_store = stmt_info;
9067 /* Helper function to pass to simplify_replace_tree so that trees found
9068 in the hash_map are replaced with their corresponding values. */
9070 static tree
9071 find_in_mapping (tree t, void *context)
9073 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9075 tree *value = mapping->get (t);
9076 return value ? *value : t;
9079 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9080 original loop that has now been vectorized.
9082 The inits of the data_references need to be advanced by the number of
9083 iterations of the main loop. This has been computed in vect_do_peeling and
9084 is stored in the parameter ADVANCE. We first restore the data_references'
9085 initial offsets with the values recorded in ORIG_DRS_INIT.
9087 Since the loop_vec_info of this EPILOGUE was constructed for the original
9088 loop, its stmt_vec_infos all point to the original statements. These need
9089 to be updated to point to their corresponding copies as well as the SSA_NAMES
9090 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9092 The data_references' connections also need to be updated. Their
9093 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9094 stmt_vec_infos, their statements need to point to their corresponding copy,
9095 if they are gather loads or scatter stores then their reference needs to be
9096 updated to point to its corresponding copy and finally we set
9097 'base_misaligned' to false as we have already peeled for alignment in the
9098 prologue of the main loop. */
9100 static void
9101 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9103 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9104 auto_vec<gimple *> stmt_worklist;
9105 hash_map<tree,tree> mapping;
9106 gimple *orig_stmt, *new_stmt;
9107 gimple_stmt_iterator epilogue_gsi;
9108 gphi_iterator epilogue_phi_gsi;
9109 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9110 basic_block *epilogue_bbs = get_loop_body (epilogue);
9111 unsigned i;
9113 free (LOOP_VINFO_BBS (epilogue_vinfo));
9114 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9116 /* Advance data_reference's with the number of iterations of the previous
9117 loop and its prologue. */
9118 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9121 /* The EPILOGUE loop is a copy of the original loop, so they share the same
9122 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9123 point to the copied statements. We also create a mapping from each LHS in
9124 the original loop to the corresponding LHS in the EPILOGUE, and worklists to
9125 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9126 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9128 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9129 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9131 new_stmt = epilogue_phi_gsi.phi ();
9133 gcc_assert (gimple_uid (new_stmt) > 0);
9134 stmt_vinfo
9135 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9137 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9138 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9140 mapping.put (gimple_phi_result (orig_stmt),
9141 gimple_phi_result (new_stmt));
9142 /* PHI nodes can not have patterns or related statements. */
9143 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9144 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9147 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9148 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9150 new_stmt = gsi_stmt (epilogue_gsi);
9151 if (is_gimple_debug (new_stmt))
9152 continue;
9154 gcc_assert (gimple_uid (new_stmt) > 0);
9155 stmt_vinfo
9156 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9158 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9159 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9161 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9162 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9164 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9166 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9167 for (gimple_stmt_iterator gsi = gsi_start (seq);
9168 !gsi_end_p (gsi); gsi_next (&gsi))
9169 stmt_worklist.safe_push (gsi_stmt (gsi));
9172 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9173 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9175 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9176 stmt_worklist.safe_push (stmt);
9177 /* Set BB such that the assert in
9178 'get_initial_def_for_reduction' is able to determine that
9179 the BB of the related stmt is inside this loop. */
9180 gimple_set_bb (stmt,
9181 gimple_bb (new_stmt));
9182 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9183 gcc_assert (related_vinfo == NULL
9184 || related_vinfo == stmt_vinfo);
9189 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9190 using the original main loop and thus need to be updated to refer to the
9191 cloned variables used in the epilogue. */
9192 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9194 gimple *stmt = stmt_worklist[i];
9195 tree *new_op;
9197 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9199 tree op = gimple_op (stmt, j);
9200 if ((new_op = mapping.get(op)))
9201 gimple_set_op (stmt, j, *new_op);
9202 else
9204 /* PR92429: The last argument of simplify_replace_tree disables
9205 folding when replacing arguments. This is required as
9206 otherwise you might end up with different statements than the
9207 ones analyzed in vect_loop_analyze, leading to different
9208 vectorization. */
9209 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9210 &find_in_mapping, &mapping, false);
9211 gimple_set_op (stmt, j, op);
9216 struct data_reference *dr;
9217 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9218 FOR_EACH_VEC_ELT (datarefs, i, dr)
9220 orig_stmt = DR_STMT (dr);
9221 gcc_assert (gimple_uid (orig_stmt) > 0);
9222 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9223 /* Data references for gather loads and scatter stores do not use the
9224 updated offset we set using ADVANCE. Instead we have to make sure the
9225 references in the data references point to the corresponding copies of
9226 the originals in the epilogue. */
9227 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9228 == VMAT_GATHER_SCATTER)
9230 DR_REF (dr)
9231 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9232 &find_in_mapping, &mapping);
9233 DR_BASE_ADDRESS (dr)
9234 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9235 &find_in_mapping, &mapping);
9237 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9238 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9239 /* The vector size of the epilogue is smaller than that of the main loop,
9240 so the alignment is either the same or lower. This means the DR will
9241 by definition be aligned. */
9242 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9245 epilogue_vinfo->shared->datarefs_copy.release ();
9246 epilogue_vinfo->shared->save_datarefs ();
9249 /* Function vect_transform_loop.
9251 The analysis phase has determined that the loop is vectorizable.
9252 Vectorize the loop - create vectorized stmts to replace the scalar
9253 stmts in the loop, and update the loop exit condition.
9254 Returns the scalar epilogue loop, if any. */
9256 class loop *
9257 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9259 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9260 class loop *epilogue = NULL;
9261 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9262 int nbbs = loop->num_nodes;
9263 int i;
9264 tree niters_vector = NULL_TREE;
9265 tree step_vector = NULL_TREE;
9266 tree niters_vector_mult_vf = NULL_TREE;
9267 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9268 unsigned int lowest_vf = constant_lower_bound (vf);
9269 gimple *stmt;
9270 bool check_profitability = false;
9271 unsigned int th;
9273 DUMP_VECT_SCOPE ("vec_transform_loop");
9275 loop_vinfo->shared->check_datarefs ();
9277 /* Use the more conservative vectorization threshold. If the number
9278 of iterations is constant, assume the cost check has been performed
9279 by our caller. If the threshold makes all loops that run at least the
9280 (estimated) vectorization factor number of times profitable, then
9281 checking is pointless, too. */
9282 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9283 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9285 if (dump_enabled_p ())
9286 dump_printf_loc (MSG_NOTE, vect_location,
9287 "Profitability threshold is %d loop iterations.\n",
9288 th);
9289 check_profitability = true;
9292 /* Make sure there exists a single-predecessor exit bb. Do this before
9293 versioning. */
9294 edge e = single_exit (loop);
9295 if (! single_pred_p (e->dest))
9297 split_loop_exit_edge (e, true);
9298 if (dump_enabled_p ())
9299 dump_printf (MSG_NOTE, "split exit edge\n");
9302 /* Version the loop first, if required, so the profitability check
9303 comes first. */
9305 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9307 class loop *sloop
9308 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9309 sloop->force_vectorize = false;
9310 check_profitability = false;
9313 /* Make sure there exists a single-predecessor exit bb also on the
9314 scalar loop copy. Do this after versioning but before peeling
9315 so the CFG structure is fine for both the scalar and the if-converted
9316 loop, and so slpeel_duplicate_current_defs_from_edges sees matched
9317 loop-closed PHI nodes on the exit. */
9318 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9320 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9321 if (! single_pred_p (e->dest))
9323 split_loop_exit_edge (e, true);
9324 if (dump_enabled_p ())
9325 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9329 tree niters = vect_build_loop_niters (loop_vinfo);
9330 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9331 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9332 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9333 tree advance;
9334 drs_init_vec orig_drs_init;
9336 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9337 &step_vector, &niters_vector_mult_vf, th,
9338 check_profitability, niters_no_overflow,
9339 &advance);
9341 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9342 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9343 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9344 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9346 if (niters_vector == NULL_TREE)
9348 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9349 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9350 && known_eq (lowest_vf, vf))
9352 niters_vector
9353 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9354 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9355 step_vector = build_one_cst (TREE_TYPE (niters));
9357 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9358 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9359 &step_vector, niters_no_overflow);
9360 else
9361 /* vect_do_peeling subtracted the number of peeled prologue
9362 iterations from LOOP_VINFO_NITERS. */
9363 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9364 &niters_vector, &step_vector,
9365 niters_no_overflow);
9368 /* 1) Make sure the loop header has exactly two entries
9369 2) Make sure we have a preheader basic block. */
9371 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9373 split_edge (loop_preheader_edge (loop));
9375 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9376 /* This will deal with any possible peeling. */
9377 vect_prepare_for_masked_peels (loop_vinfo);
9379 /* Schedule the SLP instances first, then handle loop vectorization
9380 below. */
9381 if (!loop_vinfo->slp_instances.is_empty ())
9383 DUMP_VECT_SCOPE ("scheduling SLP instances");
9384 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9387 /* FORNOW: the vectorizer supports only loops whose body consists
9388 of one basic block (header + empty latch). When the vectorizer
9389 supports more involved loop forms, the order in which the BBs are
9390 traversed will need to be reconsidered. */
9392 for (i = 0; i < nbbs; i++)
9394 basic_block bb = bbs[i];
9395 stmt_vec_info stmt_info;
9397 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9398 gsi_next (&si))
9400 gphi *phi = si.phi ();
9401 if (dump_enabled_p ())
9402 dump_printf_loc (MSG_NOTE, vect_location,
9403 "------>vectorizing phi: %G", phi);
9404 stmt_info = loop_vinfo->lookup_stmt (phi);
9405 if (!stmt_info)
9406 continue;
9408 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9409 vect_loop_kill_debug_uses (loop, stmt_info);
9411 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9412 && !STMT_VINFO_LIVE_P (stmt_info))
9413 continue;
9415 if (STMT_VINFO_VECTYPE (stmt_info)
9416 && (maybe_ne
9417 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9418 && dump_enabled_p ())
9419 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9421 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9422 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9423 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9424 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9425 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9426 && ! PURE_SLP_STMT (stmt_info))
9428 if (dump_enabled_p ())
9429 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9430 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9434 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9435 gsi_next (&si))
9437 gphi *phi = si.phi ();
9438 stmt_info = loop_vinfo->lookup_stmt (phi);
9439 if (!stmt_info)
9440 continue;
9442 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9443 && !STMT_VINFO_LIVE_P (stmt_info))
9444 continue;
9446 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9447 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9448 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9449 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9450 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9451 && ! PURE_SLP_STMT (stmt_info))
9452 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9455 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9456 !gsi_end_p (si);)
9458 stmt = gsi_stmt (si);
9459 /* During vectorization remove existing clobber stmts. */
9460 if (gimple_clobber_p (stmt))
9462 unlink_stmt_vdef (stmt);
9463 gsi_remove (&si, true);
9464 release_defs (stmt);
9466 else
9468 /* Ignore vector stmts created in the outer loop. */
9469 stmt_info = loop_vinfo->lookup_stmt (stmt);
9471 /* vector stmts created in the outer-loop during vectorization of
9472 stmts in an inner-loop may not have a stmt_info, and do not
9473 need to be vectorized. */
9474 stmt_vec_info seen_store = NULL;
9475 if (stmt_info)
9477 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9479 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9480 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9481 !gsi_end_p (subsi); gsi_next (&subsi))
9483 stmt_vec_info pat_stmt_info
9484 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9485 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9486 &si, &seen_store);
9488 stmt_vec_info pat_stmt_info
9489 = STMT_VINFO_RELATED_STMT (stmt_info);
9490 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9491 &seen_store);
9492 maybe_set_vectorized_backedge_value (loop_vinfo,
9493 pat_stmt_info);
9495 else
9497 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9498 &seen_store);
9499 maybe_set_vectorized_backedge_value (loop_vinfo,
9500 stmt_info);
9503 gsi_next (&si);
9504 if (seen_store)
9506 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9507 /* Interleaving. The vectorization of the
9508 interleaving chain was completed - free
9509 all the stores in the chain. */
9510 vect_remove_stores (loop_vinfo,
9511 DR_GROUP_FIRST_ELEMENT (seen_store));
9512 else
9513 /* Free the attached stmt_vec_info and remove the stmt. */
9514 loop_vinfo->remove_stmt (stmt_info);
9519 /* Stub out scalar statements that must not survive vectorization.
9520 Doing this here helps with grouped statements, or statements that
9521 are involved in patterns. */
9522 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9523 !gsi_end_p (gsi); gsi_next (&gsi))
9525 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9526 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9528 tree lhs = gimple_get_lhs (call);
9529 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9531 tree zero = build_zero_cst (TREE_TYPE (lhs));
9532 gimple *new_stmt = gimple_build_assign (lhs, zero);
9533 gsi_replace (&gsi, new_stmt, true);
9537 } /* BBs in loop */
9539 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9540 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9541 if (integer_onep (step_vector))
9542 niters_no_overflow = true;
9543 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9544 niters_vector_mult_vf, !niters_no_overflow);
9546 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9547 scale_profile_for_vect_loop (loop, assumed_vf);
9549 /* True if the final iteration might not handle a full vector's
9550 worth of scalar iterations. */
9551 bool final_iter_may_be_partial
9552 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9553 /* The minimum number of iterations performed by the epilogue. This
9554 is 1 when peeling for gaps because we always need a final scalar
9555 iteration. */
9556 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9557 /* +1 to convert latch counts to loop iteration counts,
9558 -min_epilogue_iters to remove iterations that cannot be performed
9559 by the vector code. */
9560 int bias_for_lowest = 1 - min_epilogue_iters;
9561 int bias_for_assumed = bias_for_lowest;
9562 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9563 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9565 /* When the amount of peeling is known at compile time, the first
9566 iteration will have exactly alignment_npeels active elements.
9567 In the worst case it will have at least one. */
9568 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9569 bias_for_lowest += lowest_vf - min_first_active;
9570 bias_for_assumed += assumed_vf - min_first_active;
9572 /* In these calculations the "- 1" converts loop iteration counts
9573 back to latch counts. */
9574 if (loop->any_upper_bound)
9575 loop->nb_iterations_upper_bound
9576 = (final_iter_may_be_partial
9577 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9578 lowest_vf) - 1
9579 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9580 lowest_vf) - 1);
9581 if (loop->any_likely_upper_bound)
9582 loop->nb_iterations_likely_upper_bound
9583 = (final_iter_may_be_partial
9584 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9585 + bias_for_lowest, lowest_vf) - 1
9586 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9587 + bias_for_lowest, lowest_vf) - 1);
9588 if (loop->any_estimate)
9589 loop->nb_iterations_estimate
9590 = (final_iter_may_be_partial
9591 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9592 assumed_vf) - 1
9593 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9594 assumed_vf) - 1);
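/* A worked example of the bias above: with VF == 4, no peeling for gaps or
   alignment and no partial vectors, min_epilogue_iters == 0 and
   bias_for_lowest == 1, so a scalar latch bound of 99 (100 iterations)
   becomes udiv_floor (99 + 1, 4) - 1 == 24, i.e. 25 vector iterations.  */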
9596 if (dump_enabled_p ())
9598 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9600 dump_printf_loc (MSG_NOTE, vect_location,
9601 "LOOP VECTORIZED\n");
9602 if (loop->inner)
9603 dump_printf_loc (MSG_NOTE, vect_location,
9604 "OUTER LOOP VECTORIZED\n");
9605 dump_printf (MSG_NOTE, "\n");
9607 else
9608 dump_printf_loc (MSG_NOTE, vect_location,
9609 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9610 GET_MODE_NAME (loop_vinfo->vector_mode));
9613 /* Loops vectorized with a variable factor won't benefit from
9614 unrolling/peeling. */
9615 if (!vf.is_constant ())
9617 loop->unroll = 1;
9618 if (dump_enabled_p ())
9619 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9620 " variable-length vectorization factor\n");
9622 /* Free SLP instances here because otherwise stmt reference counting
9623 won't work. */
9624 slp_instance instance;
9625 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9626 vect_free_slp_instance (instance);
9627 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9628 /* Clear the safelen field since its value is invalid after vectorization,
9629 as the vectorized loop can have loop-carried dependencies. */
9630 loop->safelen = 0;
9632 if (epilogue)
9634 update_epilogue_loop_vinfo (epilogue, advance);
9636 epilogue->simduid = loop->simduid;
9637 epilogue->force_vectorize = loop->force_vectorize;
9638 epilogue->dont_vectorize = false;
9641 return epilogue;
9644 /* The code below performs a simple optimization - it reverts
9645 if-conversion for masked stores: if the mask of a store is zero, the
9646 store is skipped, and so are the stored-value producers if possible.
9647 For example,
9648 for (i=0; i<n; i++)
9649 if (c[i])
9651 p1[i] += 1;
9652 p2[i] = p3[i] +2;
9654 this transformation will produce the following semi-hammock:
9656 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9658 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9659 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9660 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9661 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9662 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9663 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9667 void
9668 optimize_mask_stores (class loop *loop)
9670 basic_block *bbs = get_loop_body (loop);
9671 unsigned nbbs = loop->num_nodes;
9672 unsigned i;
9673 basic_block bb;
9674 class loop *bb_loop;
9675 gimple_stmt_iterator gsi;
9676 gimple *stmt;
9677 auto_vec<gimple *> worklist;
9678 auto_purge_vect_location sentinel;
9680 vect_location = find_loop_location (loop);
9681 /* Pick up all masked stores in loop if any. */
9682 for (i = 0; i < nbbs; i++)
9684 bb = bbs[i];
9685 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9686 gsi_next (&gsi))
9688 stmt = gsi_stmt (gsi);
9689 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9690 worklist.safe_push (stmt);
9694 free (bbs);
9695 if (worklist.is_empty ())
9696 return;
9698 /* Loop has masked stores. */
9699 while (!worklist.is_empty ())
9701 gimple *last, *last_store;
9702 edge e, efalse;
9703 tree mask;
9704 basic_block store_bb, join_bb;
9705 gimple_stmt_iterator gsi_to;
9706 tree vdef, new_vdef;
9707 gphi *phi;
9708 tree vectype;
9709 tree zero;
9711 last = worklist.pop ();
9712 mask = gimple_call_arg (last, 2);
9713 bb = gimple_bb (last);
9714 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
9715 to the same loop as if_bb. It could be different from LOOP when a
9716 two-level loop nest is vectorized and the mask store belongs to the
9717 inner loop. */
9718 e = split_block (bb, last);
9719 bb_loop = bb->loop_father;
9720 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9721 join_bb = e->dest;
9722 store_bb = create_empty_bb (bb);
9723 add_bb_to_loop (store_bb, bb_loop);
9724 e->flags = EDGE_TRUE_VALUE;
9725 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9726 /* Put STORE_BB on the unlikely path (taken when the mask is not all-zero). */
9727 efalse->probability = profile_probability::unlikely ();
9728 store_bb->count = efalse->count ();
9729 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9730 if (dom_info_available_p (CDI_DOMINATORS))
9731 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9732 if (dump_enabled_p ())
9733 dump_printf_loc (MSG_NOTE, vect_location,
9734 "Create new block %d to sink mask stores.",
9735 store_bb->index);
9736 /* Create vector comparison with boolean result. */
9737 vectype = TREE_TYPE (mask);
9738 zero = build_zero_cst (vectype);
9739 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9740 gsi = gsi_last_bb (bb);
9741 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9742 /* Create new PHI node for vdef of the last masked store:
9743 .MEM_2 = VDEF <.MEM_1>
9744 will be converted to
9745 .MEM_3 = VDEF <.MEM_1>
9746 and new PHI node will be created in join bb
9747 .MEM_2 = PHI <.MEM_1, .MEM_3>
9749 vdef = gimple_vdef (last);
9750 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9751 gimple_set_vdef (last, new_vdef);
9752 phi = create_phi_node (vdef, join_bb);
9753 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9755 /* Put all masked stores with the same mask to STORE_BB if possible. */
9756 while (true)
9758 gimple_stmt_iterator gsi_from;
9759 gimple *stmt1 = NULL;
9761 /* Move masked store to STORE_BB. */
9762 last_store = last;
9763 gsi = gsi_for_stmt (last);
9764 gsi_from = gsi;
9765 /* Shift GSI to the previous stmt for further traversal. */
9766 gsi_prev (&gsi);
9767 gsi_to = gsi_start_bb (store_bb);
9768 gsi_move_before (&gsi_from, &gsi_to);
9769 /* Setup GSI_TO to the non-empty block start. */
9770 gsi_to = gsi_start_bb (store_bb);
9771 if (dump_enabled_p ())
9772 dump_printf_loc (MSG_NOTE, vect_location,
9773 "Move stmt to created bb\n%G", last);
9774 /* Move all stored value producers if possible. */
9775 while (!gsi_end_p (gsi))
9777 tree lhs;
9778 imm_use_iterator imm_iter;
9779 use_operand_p use_p;
9780 bool res;
9782 /* Skip debug statements. */
9783 if (is_gimple_debug (gsi_stmt (gsi)))
9785 gsi_prev (&gsi);
9786 continue;
9788 stmt1 = gsi_stmt (gsi);
9789 /* Do not consider statements writing to memory or having a
9790 volatile operand. */
9791 if (gimple_vdef (stmt1)
9792 || gimple_has_volatile_ops (stmt1))
9793 break;
9794 gsi_from = gsi;
9795 gsi_prev (&gsi);
9796 lhs = gimple_get_lhs (stmt1);
9797 if (!lhs)
9798 break;
9800 /* LHS of vectorized stmt must be SSA_NAME. */
9801 if (TREE_CODE (lhs) != SSA_NAME)
9802 break;
9804 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9806 /* Remove dead scalar statement. */
9807 if (has_zero_uses (lhs))
9809 gsi_remove (&gsi_from, true);
9810 continue;
9814 /* Check that LHS does not have uses outside of STORE_BB. */
9815 res = true;
9816 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9818 gimple *use_stmt;
9819 use_stmt = USE_STMT (use_p);
9820 if (is_gimple_debug (use_stmt))
9821 continue;
9822 if (gimple_bb (use_stmt) != store_bb)
9824 res = false;
9825 break;
9828 if (!res)
9829 break;
9831 if (gimple_vuse (stmt1)
9832 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9833 break;
9835 /* Can move STMT1 to STORE_BB. */
9836 if (dump_enabled_p ())
9837 dump_printf_loc (MSG_NOTE, vect_location,
9838 "Move stmt to created bb\n%G", stmt1);
9839 gsi_move_before (&gsi_from, &gsi_to);
9840 /* Shift GSI_TO for further insertion. */
9841 gsi_prev (&gsi_to);
9843 /* Put other masked stores with the same mask to STORE_BB. */
9844 if (worklist.is_empty ()
9845 || gimple_call_arg (worklist.last (), 2) != mask
9846 || worklist.last () != stmt1)
9847 break;
9848 last = worklist.pop ();
9850 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9854 /* Decide whether it is possible to use a zero-based induction variable
9855 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9856 the value that the induction variable must be able to hold in order
9857 to ensure that the rgroups eventually have no active vector elements.
9858 Return -1 otherwise. */
9860 widest_int
9861 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9863 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9864 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9865 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9867 /* Calculate the value that the induction variable must be able
9868 to hit in order to ensure that we end the loop with an all-false mask.
9869 This involves adding the maximum number of inactive trailing scalar
9870 iterations. */
9871 widest_int iv_limit = -1;
9872 if (max_loop_iterations (loop, &iv_limit))
9874 if (niters_skip)
9876 /* Add the maximum number of skipped iterations to the
9877 maximum iteration count. */
9878 if (TREE_CODE (niters_skip) == INTEGER_CST)
9879 iv_limit += wi::to_widest (niters_skip);
9880 else
9881 iv_limit += max_vf - 1;
9883 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9884 /* Make a conservatively-correct assumption. */
9885 iv_limit += max_vf - 1;
9887 /* IV_LIMIT is the maximum number of latch iterations, which is also
9888 the maximum in-range IV value. Round this value down to the previous
9889 vector alignment boundary and then add an extra full iteration. */
9890 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9891 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
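/* For example (illustrative numbers only): with a maximum of 99 latch
   iterations, VF == 4, max_vf == 4 and no skipped or peeled iterations,
   the computation above gives (99 & -4) + 4 == 100 as the IV value that
   guarantees an all-false mask.  */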
9893 return iv_limit;
9896 /* For the given rgroup_controls RGC, check whether an induction variable
9897 would ever hit a value that produces a set of all-false masks or zero
9898 lengths before wrapping around. Return true if it's possible to wrap
9899 around before hitting the desirable value, otherwise return false. */
9901 bool
9902 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9904 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9906 if (iv_limit == -1)
9907 return true;
9909 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9910 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9911 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
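/* For instance, with iv_limit == 100 and nitems == 2 the IV must be able to
   represent 200, which needs 8 bits; a 16-bit compare type therefore cannot
   wrap, whereas an 8-bit one could wrap as soon as the product needs 9 or
   more bits.  (Numbers chosen purely for illustration.)  */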
9913 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9914 return true;
9916 return false;