gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69   as if it had been manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133   Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140   Targets that can support different sizes of vectors will, for now, need
141   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144   Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147   machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152   http://gcc.gnu.org/projects/tree-ssa/vectorization.html  */
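/* As a rough sketch (the V8HI addition here is only an illustration), the
   target-support check described above amounts to:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support, the addition cannot be vectorized ...
     else
       ... the target has a V8HImode add, so the stmt may be vectorized ...  */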
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257   loop. For example, when vectorizing a loop that operates on 4-byte elements,
258   on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274         a[i:VF] = b[i:VF] + c[i:VF];
        }  */
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374   FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
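/* For example, an induction variable defined by (SSA names are illustrative)

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 1;

   has the access function {0, +, 1}_loop: INIT is 0 and STEP is 1, which is
   accepted here.  An IV whose step itself evolves in the loop has an
   evolution part that is again a chrec and is rejected as not "simple".  */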
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599   We do that for the loop represented by LOOP_VINFO, and also for its
600   inner-loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623   Reductions in such an inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679       /* If all reduction chain members are well-formed patterns, adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
684 if (STMT_VINFO_IN_PATTERN_P (first))
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
691       /* If not all stmts in the chain are patterns, or if we failed
692          to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
693          it as a regular reduction instead.  */
694 else
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
715 /* Function vect_get_loop_niters.
717   Determine how many iterations the loop executes and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
722 Return the loop exit condition. */
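/* For example, for a loop such as

     for (i = 0; i < n; i++)
       a[i] = 0;

   with n known to be positive, the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS (the number of
   header executions) is n.  */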
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
739 if (!exit)
740 return cond;
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
754 if (may_be_zero)
756 if (COMPARISON_CLASS_P (may_be_zero))
758       /* Try to combine may_be_zero with assumptions; this can simplify
759 computation of niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
771 may_be_zero = NULL_TREE;
773 else if (integer_nonzerop (may_be_zero))
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
779 else
780 return cond;
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
795 return cond;
798 /* Function bb_in_loop_p
800 Used as predicate for dfs order traversal of the loop bbs. */
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853      case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
860 for (unsigned int i = 0; i < nbbs; i++)
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879       /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880          third argument is the #pragma omp simd if (x) condition: when it is 0,
881          the loop shouldn't be vectorized; when it is a non-zero constant, it
882          should be vectorized normally; otherwise the loop is versioned, with
883          the vectorized copy used only if the condition is non-zero at runtime.  */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
902 epilogue_vinfos.create (6);
905 /* Free all levels of rgroup CONTROLS. */
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
920 _loop_vec_info::~_loop_vec_info ()
922 free (bbs);
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
930   /* When we release an epilogue vinfo that we do not intend to use
931 avoid clearing AUX of the main loop which should continue to
932 point to the main loop vinfo since otherwise we'll leak that. */
933 if (loop->aux == this)
934 loop->aux = NULL;
937 /* Return an invariant or register for EXPR and emit necessary
938 computations in the LOOP_VINFO loop preheader. */
940 tree
941 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
943 if (is_gimple_reg (expr)
944 || is_gimple_min_invariant (expr))
945 return expr;
947 if (! loop_vinfo->ivexpr_map)
948 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
949 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
950 if (! cached)
952 gimple_seq stmts = NULL;
953 cached = force_gimple_operand (unshare_expr (expr),
954 &stmts, true, NULL_TREE);
955 if (stmts)
957 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
958 gsi_insert_seq_on_edge_immediate (e, stmts);
961 return cached;
964 /* Return true if we can use CMP_TYPE as the comparison type to produce
965 all masks required to mask LOOP_VINFO. */
967 static bool
968 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
970 rgroup_controls *rgm;
971 unsigned int i;
972 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
973 if (rgm->type != NULL_TREE
974 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
975 cmp_type, rgm->type,
976 OPTIMIZE_FOR_SPEED))
977 return false;
978 return true;
981 /* Calculate the maximum number of scalars per iteration for every
982 rgroup in LOOP_VINFO. */
984 static unsigned int
985 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
987 unsigned int res = 1;
988 unsigned int i;
989 rgroup_controls *rgm;
990 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
991 res = MAX (res, rgm->max_nscalars_per_iter);
992 return res;
995 /* Calculate the minimum precision necessary to represent:
997 MAX_NITERS * FACTOR
999 as an unsigned integer, where MAX_NITERS is the maximum number of
1000 loop header iterations for the original scalar form of LOOP_VINFO. */
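/* For example, if the niter analysis bounds the header iterations by 1000
   and FACTOR is 4 (both values are only illustrative), the product is at
   most 4000, which fits in 12 unsigned bits (2047 < 4000 <= 4095), so this
   returns 12.  */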
1002 static unsigned
1003 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1005 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1007 /* Get the maximum number of iterations that is representable
1008 in the counter type. */
1009 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1010 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1012 /* Get a more refined estimate for the number of iterations. */
1013 widest_int max_back_edges;
1014 if (max_loop_iterations (loop, &max_back_edges))
1015 max_ni = wi::smin (max_ni, max_back_edges + 1);
1017 /* Work out how many bits we need to represent the limit. */
1018 return wi::min_precision (max_ni * factor, UNSIGNED);
1021 /* True if the loop needs peeling or partial vectors when vectorized. */
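/* For example, with a known iteration count of 10, a vectorization factor
   of 4 and no peeling for alignment or gaps (illustrative values), 10 is
   not a multiple of 4, so the 2 leftover iterations need an epilogue or
   partial vectors and this returns true.  */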
1023 static bool
1024 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1026 unsigned HOST_WIDE_INT const_vf;
1027 HOST_WIDE_INT max_niter
1028 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1030 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1031 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1032 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1033 (loop_vinfo));
1035 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1036 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1038 /* Work out the (constant) number of iterations that need to be
1039 peeled for reasons other than niters. */
1040 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1041 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1042 peel_niter += 1;
1043 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1044 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1045 return true;
1047 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1048 /* ??? When peeling for gaps but not alignment, we could
1049 try to check whether the (variable) niters is known to be
1050 VF * N + 1. That's something of a niche case though. */
1051 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1052 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1053 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1054 < (unsigned) exact_log2 (const_vf))
1055 /* In case of versioning, check if the maximum number of
1056 iterations is greater than th. If they are identical,
1057 the epilogue is unnecessary. */
1058 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1059 || ((unsigned HOST_WIDE_INT) max_niter
1060 > (th / const_vf) * const_vf))))
1061 return true;
1063 return false;
1066 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1067 whether we can actually generate the masks required. Return true if so,
1068 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1070 static bool
1071 vect_verify_full_masking (loop_vec_info loop_vinfo)
1073 unsigned int min_ni_width;
1074 unsigned int max_nscalars_per_iter
1075 = vect_get_max_nscalars_per_iter (loop_vinfo);
1077 /* Use a normal loop if there are no statements that need masking.
1078 This only happens in rare degenerate cases: it means that the loop
1079 has no loads, no stores, and no live-out values. */
1080 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1081 return false;
1083 /* Work out how many bits we need to represent the limit. */
1084 min_ni_width
1085 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1087 /* Find a scalar mode for which WHILE_ULT is supported. */
1088 opt_scalar_int_mode cmp_mode_iter;
1089 tree cmp_type = NULL_TREE;
1090 tree iv_type = NULL_TREE;
1091 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1092 unsigned int iv_precision = UINT_MAX;
1094 if (iv_limit != -1)
1095 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1096 UNSIGNED);
1098 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1100 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1101 if (cmp_bits >= min_ni_width
1102 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1104 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1105 if (this_type
1106 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1108 /* Although we could stop as soon as we find a valid mode,
1109 there are at least two reasons why that's not always the
1110 best choice:
1112 - An IV that's Pmode or wider is more likely to be reusable
1113 in address calculations than an IV that's narrower than
1114 Pmode.
1116 - Doing the comparison in IV_PRECISION or wider allows
1117 a natural 0-based IV, whereas using a narrower comparison
1118 type requires mitigations against wrap-around.
1120 Conversely, if the IV limit is variable, doing the comparison
1121 in a wider type than the original type can introduce
1122 unnecessary extensions, so picking the widest valid mode
1123 is not always a good choice either.
1125 Here we prefer the first IV type that's Pmode or wider,
1126 and the first comparison type that's IV_PRECISION or wider.
1127 (The comparison type must be no wider than the IV type,
1128 to avoid extensions in the vector loop.)
1130 ??? We might want to try continuing beyond Pmode for ILP32
1131 targets if CMP_BITS < IV_PRECISION. */
1132 iv_type = this_type;
1133 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1134 cmp_type = this_type;
1135 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1136 break;
1141 if (!cmp_type)
1142 return false;
1144 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1145 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1146 return true;
1149 /* Check whether we can use vector access with length based on precision
1150    comparison. So far, to keep it simple, we only allow the case in which the
1151    precision of the target-supported length is larger than the precision
1152    required by the loop niters.  */
1154 static bool
1155 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1157 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1158 return false;
1160 unsigned int max_nitems_per_iter = 1;
1161 unsigned int i;
1162 rgroup_controls *rgl;
1163 /* Find the maximum number of items per iteration for every rgroup. */
1164 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1166 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1167 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1170 /* Work out how many bits we need to represent the length limit. */
1171 unsigned int min_ni_prec
1172 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1174   /* Now use the maximum of the precisions below for one suitable IV type:
1175 - the IV's natural precision
1176 - the precision needed to hold: the maximum number of scalar
1177 iterations multiplied by the scale factor (min_ni_prec above)
1178 - the Pmode precision
1180 If min_ni_prec is less than the precision of the current niters,
1181      we prefer to still use the niters type.  Prefer to use Pmode and
1182      a wider IV to avoid narrow conversions.  */
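  /* For example, on a 64-bit target (64-bit Pmode) with a 32-bit niters type
     and a min_ni_prec of, say, 12 bits, the maximum of the three is 64, so
     the first supported integer mode of at least 64 bits provides the IV
     type below.  */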
1184 unsigned int ni_prec
1185 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1186 min_ni_prec = MAX (min_ni_prec, ni_prec);
1187 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1189 tree iv_type = NULL_TREE;
1190 opt_scalar_int_mode tmode_iter;
1191 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1193 scalar_mode tmode = tmode_iter.require ();
1194 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1196 /* ??? Do we really want to construct one IV whose precision exceeds
1197 BITS_PER_WORD? */
1198 if (tbits > BITS_PER_WORD)
1199 break;
1201 /* Find the first available standard integral type. */
1202 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1204 iv_type = build_nonstandard_integer_type (tbits, true);
1205 break;
1209 if (!iv_type)
1211 if (dump_enabled_p ())
1212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1213 "can't vectorize with length-based partial vectors"
1214 " because there is no suitable iv type.\n");
1215 return false;
1218 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1219 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1221 return true;
1224 /* Calculate the cost of one scalar iteration of the loop. */
1225 static void
1226 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1228 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1229 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1230 int nbbs = loop->num_nodes, factor;
1231 int innerloop_iters, i;
1233 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1235 /* Gather costs for statements in the scalar loop. */
1237 /* FORNOW. */
1238 innerloop_iters = 1;
1239 if (loop->inner)
1240 innerloop_iters = 50; /* FIXME */
1242 for (i = 0; i < nbbs; i++)
1244 gimple_stmt_iterator si;
1245 basic_block bb = bbs[i];
1247 if (bb->loop_father == loop->inner)
1248 factor = innerloop_iters;
1249 else
1250 factor = 1;
1252 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1254 gimple *stmt = gsi_stmt (si);
1255 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1257 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1258 continue;
1260 /* Skip stmts that are not vectorized inside the loop. */
1261 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1262 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1263 && (!STMT_VINFO_LIVE_P (vstmt_info)
1264 || !VECTORIZABLE_CYCLE_DEF
1265 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1266 continue;
1268 vect_cost_for_stmt kind;
1269 if (STMT_VINFO_DATA_REF (stmt_info))
1271 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1272 kind = scalar_load;
1273 else
1274 kind = scalar_store;
1276 else if (vect_nop_conversion_p (stmt_info))
1277 continue;
1278 else
1279 kind = scalar_stmt;
1281 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1282 factor, kind, stmt_info, 0, vect_prologue);
1286 /* Now accumulate cost. */
1287 void *target_cost_data = init_cost (loop);
1288 stmt_info_for_cost *si;
1289 int j;
1290 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291 j, si)
1292 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1293 si->kind, si->stmt_info, si->vectype,
1294 si->misalign, vect_body);
1295 unsigned dummy, body_cost = 0;
1296 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1297 destroy_cost_data (target_cost_data);
1298 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1302 /* Function vect_analyze_loop_form_1.
1304 Verify that certain CFG restrictions hold, including:
1305 - the loop has a pre-header
1306 - the loop has a single entry and exit
1307 - the loop exit condition is simple enough
1308    - the number of iterations can be analyzed, i.e., a countable loop. The
1309 niter could be analyzed under some assumptions. */
1311 opt_result
1312 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1313 tree *assumptions, tree *number_of_iterationsm1,
1314 tree *number_of_iterations, gcond **inner_loop_cond)
1316 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1318 /* Different restrictions apply when we are considering an inner-most loop,
1319 vs. an outer (nested) loop.
1320 (FORNOW. May want to relax some of these restrictions in the future). */
1322 if (!loop->inner)
1324 /* Inner-most loop. We currently require that the number of BBs is
1325 exactly 2 (the header and latch). Vectorizable inner-most loops
1326 look like this:
1328 (pre-header)
1330 header <--------+
1331 | | |
1332 | +--> latch --+
1334 (exit-bb) */
1336 if (loop->num_nodes != 2)
1337 return opt_result::failure_at (vect_location,
1338 "not vectorized:"
1339 " control flow in loop.\n");
1341 if (empty_block_p (loop->header))
1342 return opt_result::failure_at (vect_location,
1343 "not vectorized: empty loop.\n");
1345 else
1347 class loop *innerloop = loop->inner;
1348 edge entryedge;
1350 /* Nested loop. We currently require that the loop is doubly-nested,
1351 contains a single inner loop, and the number of BBs is exactly 5.
1352 Vectorizable outer-loops look like this:
1354 (pre-header)
1356 header <---+
1358 inner-loop |
1360 tail ------+
1362 (exit-bb)
1364 The inner-loop has the properties expected of inner-most loops
1365 as described above. */
1367 if ((loop->inner)->inner || (loop->inner)->next)
1368 return opt_result::failure_at (vect_location,
1369 "not vectorized:"
1370 " multiple nested loops.\n");
1372 if (loop->num_nodes != 5)
1373 return opt_result::failure_at (vect_location,
1374 "not vectorized:"
1375 " control flow in loop.\n");
1377 entryedge = loop_preheader_edge (innerloop);
1378 if (entryedge->src != loop->header
1379 || !single_exit (innerloop)
1380 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1381 return opt_result::failure_at (vect_location,
1382 "not vectorized:"
1383 " unsupported outerloop form.\n");
1385 /* Analyze the inner-loop. */
1386 tree inner_niterm1, inner_niter, inner_assumptions;
1387 opt_result res
1388 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1389 &inner_assumptions, &inner_niterm1,
1390 &inner_niter, NULL);
1391 if (!res)
1393 if (dump_enabled_p ())
1394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1395 "not vectorized: Bad inner loop.\n");
1396 return res;
1399 /* Don't support analyzing niter under assumptions for inner
1400 loop. */
1401 if (!integer_onep (inner_assumptions))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: Bad inner loop.\n");
1405 if (!expr_invariant_in_loop_p (loop, inner_niter))
1406 return opt_result::failure_at (vect_location,
1407 "not vectorized: inner-loop count not"
1408 " invariant.\n");
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_NOTE, vect_location,
1412 "Considering outer-loop vectorization.\n");
1415 if (!single_exit (loop))
1416 return opt_result::failure_at (vect_location,
1417 "not vectorized: multiple exits.\n");
1418 if (EDGE_COUNT (loop->header->preds) != 2)
1419 return opt_result::failure_at (vect_location,
1420 "not vectorized:"
1421 " too many incoming edges.\n");
1423   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1424 that the loop is represented as a do-while (with a proper if-guard
1425 before the loop if needed), where the loop header contains all the
1426 executable statements, and the latch is empty. */
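  /* For example, a loop originally written as

       for (i = 0; i < n; i++)
         a[i] = b[i];

     is expected here in the guarded do-while form

       if (n > 0)
         do
           {
             a[i] = b[i];
             i = i + 1;
           }
         while (i < n);  */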
1427 if (!empty_block_p (loop->latch)
1428 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1429 return opt_result::failure_at (vect_location,
1430 "not vectorized: latch block not empty.\n");
1432 /* Make sure the exit is not abnormal. */
1433 edge e = single_exit (loop);
1434 if (e->flags & EDGE_ABNORMAL)
1435 return opt_result::failure_at (vect_location,
1436 "not vectorized:"
1437 " abnormal loop exit edge.\n");
1439 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1440 number_of_iterationsm1);
1441 if (!*loop_cond)
1442 return opt_result::failure_at
1443 (vect_location,
1444 "not vectorized: complicated exit condition.\n");
1446 if (integer_zerop (*assumptions)
1447 || !*number_of_iterations
1448 || chrec_contains_undetermined (*number_of_iterations))
1449 return opt_result::failure_at
1450 (*loop_cond,
1451 "not vectorized: number of iterations cannot be computed.\n");
1453 if (integer_zerop (*number_of_iterations))
1454 return opt_result::failure_at
1455 (*loop_cond,
1456 "not vectorized: number of iterations = 0.\n");
1458 return opt_result::success ();
1461 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1463 opt_loop_vec_info
1464 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1466 tree assumptions, number_of_iterations, number_of_iterationsm1;
1467 gcond *loop_cond, *inner_loop_cond = NULL;
1469 opt_result res
1470 = vect_analyze_loop_form_1 (loop, &loop_cond,
1471 &assumptions, &number_of_iterationsm1,
1472 &number_of_iterations, &inner_loop_cond);
1473 if (!res)
1474 return opt_loop_vec_info::propagate_failure (res);
1476 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1477 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1478 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1479 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1480 if (!integer_onep (assumptions))
1482       /* We consider vectorizing this loop by versioning it under
1483 some assumptions. In order to do this, we need to clear
1484 existing information computed by scev and niter analyzer. */
1485 scev_reset_htab ();
1486 free_numbers_of_iterations_estimates (loop);
1487 /* Also set flag for this loop so that following scev and niter
1488 analysis are done under the assumptions. */
1489 loop_constraint_set (loop, LOOP_C_FINITE);
1490 /* Also record the assumptions for versioning. */
1491 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1494 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1496 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_NOTE, vect_location,
1499 "Symbolic number of iterations is ");
1500 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1501 dump_printf (MSG_NOTE, "\n");
1505 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1506 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1507 if (inner_loop_cond)
1509 stmt_vec_info inner_loop_cond_info
1510 = loop_vinfo->lookup_stmt (inner_loop_cond);
1511 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1514 gcc_assert (!loop->aux);
1515 loop->aux = loop_vinfo;
1516 return opt_loop_vec_info::success (loop_vinfo);
1521 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1522    statements, update the vectorization factor.  */
1524 static void
1525 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1527 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1528 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1529 int nbbs = loop->num_nodes;
1530 poly_uint64 vectorization_factor;
1531 int i;
1533 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1535 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1536 gcc_assert (known_ne (vectorization_factor, 0U));
1538 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1539 vectorization factor of the loop is the unrolling factor required by
1540      the SLP instances.  If that unrolling factor is 1, we say that we
1541      perform pure SLP on the loop; cross-iteration parallelism is not
1542 exploited. */
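  /* For example, if the current vectorization factor is 4 and the SLP
     instances require an unrolling factor of 2 (illustrative values), a loop
     containing only SLP stmts ends up with VF 2, while a mixed loop gets the
     common multiple 4.  */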
1543 bool only_slp_in_loop = true;
1544 for (i = 0; i < nbbs; i++)
1546 basic_block bb = bbs[i];
1547 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1548 gsi_next (&si))
1550 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1551 if (!stmt_info)
1552 continue;
1553 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1554 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1555 && !PURE_SLP_STMT (stmt_info))
1556 /* STMT needs both SLP and loop-based vectorization. */
1557 only_slp_in_loop = false;
1559 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1560 gsi_next (&si))
1562 if (is_gimple_debug (gsi_stmt (si)))
1563 continue;
1564 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1565 stmt_info = vect_stmt_to_vectorize (stmt_info);
1566 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1567 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1568 && !PURE_SLP_STMT (stmt_info))
1569 /* STMT needs both SLP and loop-based vectorization. */
1570 only_slp_in_loop = false;
1574 if (only_slp_in_loop)
1576 if (dump_enabled_p ())
1577 dump_printf_loc (MSG_NOTE, vect_location,
1578 "Loop contains only SLP stmts\n");
1579 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1581 else
1583 if (dump_enabled_p ())
1584 dump_printf_loc (MSG_NOTE, vect_location,
1585 "Loop contains SLP and non-SLP stmts\n");
1586 /* Both the vectorization factor and unroll factor have the form
1587 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1588 so they must have a common multiple. */
1589 vectorization_factor
1590 = force_common_multiple (vectorization_factor,
1591 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1594 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1595 if (dump_enabled_p ())
1597 dump_printf_loc (MSG_NOTE, vect_location,
1598 "Updating vectorization factor to ");
1599 dump_dec (MSG_NOTE, vectorization_factor);
1600 dump_printf (MSG_NOTE, ".\n");
1604 /* Return true if STMT_INFO describes a double reduction phi and if
1605 the other phi in the reduction is also relevant for vectorization.
1606 This rejects cases such as:
1608 outer1:
1609 x_1 = PHI <x_3(outer2), ...>;
1612 inner:
1613 x_2 = ...;
1616 outer2:
1617 x_3 = PHI <x_2(inner)>;
1619 if nothing in x_2 or elsewhere makes x_1 relevant. */
1621 static bool
1622 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1624 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1625 return false;
1627 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1630 /* Function vect_analyze_loop_operations.
1632 Scan the loop stmts and make sure they are all vectorizable. */
1634 static opt_result
1635 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1638 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1639 int nbbs = loop->num_nodes;
1640 int i;
1641 stmt_vec_info stmt_info;
1642 bool need_to_vectorize = false;
1643 bool ok;
1645 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1647 auto_vec<stmt_info_for_cost> cost_vec;
1649 for (i = 0; i < nbbs; i++)
1651 basic_block bb = bbs[i];
1653 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1654 gsi_next (&si))
1656 gphi *phi = si.phi ();
1657 ok = true;
1659 stmt_info = loop_vinfo->lookup_stmt (phi);
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1662 if (virtual_operand_p (gimple_phi_result (phi)))
1663 continue;
1665 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1666 (i.e., a phi in the tail of the outer-loop). */
1667 if (! is_loop_header_bb_p (bb))
1669 /* FORNOW: we currently don't support the case that these phis
1670              are not used in the outer loop (unless it is a double reduction,
1671              i.e., this phi is vect_reduction_def), because this case
1672              requires us to actually do something here.  */
1673 if (STMT_VINFO_LIVE_P (stmt_info)
1674 && !vect_active_double_reduction_p (stmt_info))
1675 return opt_result::failure_at (phi,
1676 "Unsupported loop-closed phi"
1677 " in outer-loop.\n");
1679 /* If PHI is used in the outer loop, we check that its operand
1680 is defined in the inner loop. */
1681 if (STMT_VINFO_RELEVANT_P (stmt_info))
1683 tree phi_op;
1685 if (gimple_phi_num_args (phi) != 1)
1686 return opt_result::failure_at (phi, "unsupported phi");
1688 phi_op = PHI_ARG_DEF (phi, 0);
1689 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1690 if (!op_def_info)
1691 return opt_result::failure_at (phi, "unsupported phi\n");
1693 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1694 && (STMT_VINFO_RELEVANT (op_def_info)
1695 != vect_used_in_outer_by_reduction))
1696 return opt_result::failure_at (phi, "unsupported phi\n");
1698 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1699 || (STMT_VINFO_DEF_TYPE (stmt_info)
1700 == vect_double_reduction_def))
1701 && !vectorizable_lc_phi (loop_vinfo,
1702 stmt_info, NULL, NULL))
1703 return opt_result::failure_at (phi, "unsupported phi\n");
1706 continue;
1709 gcc_assert (stmt_info);
1711 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1712 || STMT_VINFO_LIVE_P (stmt_info))
1713 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1714 /* A scalar-dependence cycle that we don't support. */
1715 return opt_result::failure_at (phi,
1716 "not vectorized:"
1717 " scalar dependence cycle.\n");
1719 if (STMT_VINFO_RELEVANT_P (stmt_info))
1721 need_to_vectorize = true;
1722 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1723 && ! PURE_SLP_STMT (stmt_info))
1724 ok = vectorizable_induction (loop_vinfo,
1725 stmt_info, NULL, NULL,
1726 &cost_vec);
1727 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1728 || (STMT_VINFO_DEF_TYPE (stmt_info)
1729 == vect_double_reduction_def)
1730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731 && ! PURE_SLP_STMT (stmt_info))
1732 ok = vectorizable_reduction (loop_vinfo,
1733 stmt_info, NULL, NULL, &cost_vec);
1736 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1737 if (ok
1738 && STMT_VINFO_LIVE_P (stmt_info)
1739 && !PURE_SLP_STMT (stmt_info))
1740 ok = vectorizable_live_operation (loop_vinfo,
1741 stmt_info, NULL, NULL, NULL,
1742 -1, false, &cost_vec);
1744 if (!ok)
1745 return opt_result::failure_at (phi,
1746 "not vectorized: relevant phi not "
1747 "supported: %G",
1748 static_cast <gimple *> (phi));
1751 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752 gsi_next (&si))
1754 gimple *stmt = gsi_stmt (si);
1755 if (!gimple_clobber_p (stmt)
1756 && !is_gimple_debug (stmt))
1758 opt_result res
1759 = vect_analyze_stmt (loop_vinfo,
1760 loop_vinfo->lookup_stmt (stmt),
1761 &need_to_vectorize,
1762 NULL, NULL, &cost_vec);
1763 if (!res)
1764 return res;
1767 } /* bbs */
1769 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1771 /* All operations in the loop are either irrelevant (deal with loop
1772 control, or dead), or only used outside the loop and can be moved
1773 out of the loop (e.g. invariants, inductions). The loop can be
1774 optimized away by scalar optimizations. We're better off not
1775 touching this loop. */
1776 if (!need_to_vectorize)
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_NOTE, vect_location,
1780 "All the computation can be taken out of the loop.\n");
1781 return opt_result::failure_at
1782 (vect_location,
1783 "not vectorized: redundant loop. no profit to vectorize.\n");
1786 return opt_result::success ();
1789 /* Return true if we know that the iteration count is smaller than the
1790 vectorization factor. Return false if it isn't, or if we can't be sure
1791 either way. */
1793 static bool
1794 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1796 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1798 HOST_WIDE_INT max_niter;
1799 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1800 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1801 else
1802 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1804 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1805 return true;
1807 return false;
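/* For illustration (hypothetical values): with an assumed vectorization
   factor of 8, a loop such as

     for (int i = 0; i < 5; i++)   // niter known to be 5 < VF == 8
       a[i] = b[i] + c[i];

   makes this predicate return true, since even one full vector iteration
   would overshoot the scalar iteration count.  */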
1810 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1811 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1812 definitely no, or -1 if it's worth retrying. */
1814 static int
1815 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1817 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1818 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1820 /* Only loops that can handle partially-populated vectors can have iteration
1821 counts less than the vectorization factor. */
1822 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1824 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1826 if (dump_enabled_p ())
1827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828 "not vectorized: iteration count smaller than "
1829 "vectorization factor.\n");
1830 return 0;
1834 /* If using the "very cheap" model, reject cases in which we'd keep
1835 a copy of the scalar code (even if we might be able to vectorize it). */
1836 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1837 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1838 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1839 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1841 if (dump_enabled_p ())
1842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1843 "some scalar iterations would need to be peeled\n");
1844 return 0;
1847 int min_profitable_iters, min_profitable_estimate;
1848 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1849 &min_profitable_estimate);
1851 if (min_profitable_iters < 0)
1853 if (dump_enabled_p ())
1854 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1855 "not vectorized: vectorization not profitable.\n");
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 "not vectorized: vector version will never be "
1859 "profitable.\n");
1860 return -1;
1863 int min_scalar_loop_bound = (param_min_vect_loop_bound
1864 * assumed_vf);
1866 /* Use the cost model only if it is more conservative than the
1867 user-specified threshold. */
1868 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1869 min_profitable_iters);
1871 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1873 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1874 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1876 if (dump_enabled_p ())
1877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1878 "not vectorized: vectorization not profitable.\n");
1879 if (dump_enabled_p ())
1880 dump_printf_loc (MSG_NOTE, vect_location,
1881 "not vectorized: iteration count smaller than user "
1882 "specified loop bound parameter or minimum profitable "
1883 "iterations (whichever is more conservative).\n");
1884 return 0;
1887 /* The static profitability threshold min_profitable_estimate includes
1888 the cost of having to check at runtime whether the scalar loop
1889 should be used instead. If it turns out that we don't need or want
1890 such a check, the threshold we should use for the static estimate
1891 is simply the point at which the vector loop becomes more profitable
1892 than the scalar loop. */
1893 if (min_profitable_estimate > min_profitable_iters
1894 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1896 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1897 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1901 " choice between the scalar and vector loops\n");
1902 min_profitable_estimate = min_profitable_iters;
1905 /* If the vector loop needs multiple iterations to be beneficial then
1906 things are probably too close to call, and the conservative thing
1907 would be to stick with the scalar code. */
1908 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1909 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1911 if (dump_enabled_p ())
1912 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1913 "one iteration of the vector loop would be"
1914 " more expensive than the equivalent number of"
1915 " iterations of the scalar loop\n");
1916 return 0;
1919 HOST_WIDE_INT estimated_niter;
1921 /* If we are vectorizing an epilogue then we know the maximum number of
1922 scalar iterations it will cover is at least one lower than the
1923 vectorization factor of the main loop. */
1924 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1925 estimated_niter
1926 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1927 else
1929 estimated_niter = estimated_stmt_executions_int (loop);
1930 if (estimated_niter == -1)
1931 estimated_niter = likely_max_stmt_executions_int (loop);
1933 if (estimated_niter != -1
1934 && ((unsigned HOST_WIDE_INT) estimated_niter
1935 < MAX (th, (unsigned) min_profitable_estimate)))
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 "not vectorized: estimated iteration count too "
1940 "small.\n");
1941 if (dump_enabled_p ())
1942 dump_printf_loc (MSG_NOTE, vect_location,
1943 "not vectorized: estimated iteration count smaller "
1944 "than specified loop bound parameter or minimum "
1945 "profitable iterations (whichever is more "
1946 "conservative).\n");
1947 return -1;
1950 return 1;
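/* Worked example for the thresholds above (hypothetical numbers): if the
   cost model computes min_profitable_iters == 12 while
   param_min_vect_loop_bound * assumed_vf == 8, then th = MAX (8, 12) == 12,
   and a loop with a known iteration count of 10 is rejected above as not
   profitable.  */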
1953 static opt_result
1954 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1955 vec<data_reference_p> *datarefs,
1956 unsigned int *n_stmts)
1958 *n_stmts = 0;
1959 for (unsigned i = 0; i < loop->num_nodes; i++)
1960 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1961 !gsi_end_p (gsi); gsi_next (&gsi))
1963 gimple *stmt = gsi_stmt (gsi);
1964 if (is_gimple_debug (stmt))
1965 continue;
1966 ++(*n_stmts);
1967 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1968 NULL, 0);
1969 if (!res)
1971 if (is_gimple_call (stmt) && loop->safelen)
1973 tree fndecl = gimple_call_fndecl (stmt), op;
1974 if (fndecl != NULL_TREE)
1976 cgraph_node *node = cgraph_node::get (fndecl);
1977 if (node != NULL && node->simd_clones != NULL)
1979 unsigned int j, n = gimple_call_num_args (stmt);
1980 for (j = 0; j < n; j++)
1982 op = gimple_call_arg (stmt, j);
1983 if (DECL_P (op)
1984 || (REFERENCE_CLASS_P (op)
1985 && get_base_address (op)))
1986 break;
1988 op = gimple_call_lhs (stmt);
1989 /* Ignore #pragma omp declare simd functions
1990 if they don't have data references in the
1991 call stmt itself. */
1992 if (j == n
1993 && !(op
1994 && (DECL_P (op)
1995 || (REFERENCE_CLASS_P (op)
1996 && get_base_address (op)))))
1997 continue;
2001 return res;
2003 /* If dependence analysis will give up due to the limit on the
2004 number of datarefs, stop here and fail fatally. */
2005 if (datarefs->length ()
2006 > (unsigned)param_loop_max_datarefs_for_datadeps)
2007 return opt_result::failure_at (stmt, "exceeded param "
2008 "loop-max-datarefs-for-datadeps\n");
2010 return opt_result::success ();
2013 /* Look for SLP-only access groups and turn each individual access into its own
2014 group. */
2015 static void
2016 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2018 unsigned int i;
2019 struct data_reference *dr;
2021 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2023 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2024 FOR_EACH_VEC_ELT (datarefs, i, dr)
2026 gcc_assert (DR_REF (dr));
2027 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2029 /* Check if the load is a part of an interleaving chain. */
2030 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2032 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2033 unsigned int group_size = DR_GROUP_SIZE (first_element);
2035 /* Check whether this is an SLP-only group. */
2036 if (!STMT_SLP_TYPE (stmt_info)
2037 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2039 /* Dissolve the group. */
2040 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2042 stmt_vec_info vinfo = first_element;
2043 while (vinfo)
2045 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2046 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2047 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2048 DR_GROUP_SIZE (vinfo) = 1;
2049 if (STMT_VINFO_STRIDED_P (first_element))
2050 DR_GROUP_GAP (vinfo) = 0;
2051 else
2052 DR_GROUP_GAP (vinfo) = group_size - 1;
2053 vinfo = next;
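/* Sketch of the effect of the dissolution above, for a hypothetical
   interleaving group of three accesses A, B and C (DR_GROUP_SIZE == 3):

     before:  A -> B -> C   (DR_GROUP_FIRST_ELEMENT == A for all three)
     after:   A    B    C   (each its own group, DR_GROUP_SIZE == 1,
                             DR_GROUP_GAP == group_size - 1 == 2, or 0
                             for strided accesses)  */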
2060 /* Determine if operating on full vectors for LOOP_VINFO might leave
2061 some scalar iterations still to do. If so, decide how we should
2062 handle those scalar iterations. The possibilities are:
2064 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2065 In this case:
2067 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2068 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2069 LOOP_VINFO_PEELING_FOR_NITER == false
2071 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2072 to handle the remaining scalar iterations. In this case:
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2075 LOOP_VINFO_PEELING_FOR_NITER == true
2077 There are two choices:
2079 (2a) Consider vectorizing the epilogue loop at the same VF as the
2080 main loop, but using partial vectors instead of full vectors.
2081 In this case:
2083 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2085 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2086 In this case:
2088 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2090 When FOR_EPILOGUE_P is true, make this determination based on the
2091 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2092 based on the assumption that LOOP_VINFO is the main loop. The caller
2093 has made sure that the number of iterations is set appropriately for
2094 this value of FOR_EPILOGUE_P. */
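/* Illustration of the cases above (hypothetical counts): with VF == 16 and
   1000 scalar iterations, 62 full vector iterations leave 8 scalar
   iterations over.  Case (1) handles those 8 inside the vector loop itself
   using partial vectors; case (2) runs 62 full-vector iterations and
   leaves the remaining 8 to an epilogue loop, which may in turn use
   partial vectors (2a) or a lower VF (2b).  */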
2096 opt_result
2097 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2098 bool for_epilogue_p)
2100 /* Determine whether there would be any scalar iterations left over. */
2101 bool need_peeling_or_partial_vectors_p
2102 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2104 /* Decide whether to vectorize the loop with partial vectors. */
2105 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2106 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2107 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2108 && need_peeling_or_partial_vectors_p)
2110 /* For partial-vector-usage=1, try to push the handling of partial
2111 vectors to the epilogue, with the main loop continuing to operate
2112 on full vectors.
2114 ??? We could then end up failing to use partial vectors if we
2115 decide to peel iterations into a prologue, and if the main loop
2116 then ends up processing fewer than VF iterations. */
2117 if (param_vect_partial_vector_usage == 1
2118 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2119 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2120 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2121 else
2122 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2125 if (dump_enabled_p ())
2127 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "operating on partial vectors%s.\n",
2130 for_epilogue_p ? " for epilogue loop" : "");
2131 else
2132 dump_printf_loc (MSG_NOTE, vect_location,
2133 "operating only on full vectors%s.\n",
2134 for_epilogue_p ? " for epilogue loop" : "");
2137 if (for_epilogue_p)
2139 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2140 gcc_assert (orig_loop_vinfo);
2141 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2142 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2143 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2146 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2147 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2149 /* Check that the loop processes at least one full vector. */
2150 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2151 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2152 if (known_lt (wi::to_widest (scalar_niters), vf))
2153 return opt_result::failure_at (vect_location,
2154 "loop does not have enough iterations"
2155 " to support vectorization.\n");
2157 /* If we need to peel an extra epilogue iteration to handle data
2158 accesses with gaps, check that there are enough scalar iterations
2159 available.
2161 The check above is redundant with this one when peeling for gaps,
2162 but the distinction is useful for diagnostics. */
2163 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2166 return opt_result::failure_at (vect_location,
2167 "loop does not have enough iterations"
2168 " to support peeling for gaps.\n");
2171 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2172 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2173 && need_peeling_or_partial_vectors_p);
2175 return opt_result::success ();
2178 /* Function vect_analyze_loop_2.
2180 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2181 for it. The different analyses will record information in the
2182 loop_vec_info struct. */
2183 static opt_result
2184 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2186 opt_result ok = opt_result::success ();
2187 int res;
2188 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2189 poly_uint64 min_vf = 2;
2190 loop_vec_info orig_loop_vinfo = NULL;
2192 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2193 loop_vec_info of the first vectorized loop. */
2194 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2195 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2196 else
2197 orig_loop_vinfo = loop_vinfo;
2198 gcc_assert (orig_loop_vinfo);
2200 /* The first group of checks is independent of the vector size. */
2201 fatal = true;
2203 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2204 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2205 return opt_result::failure_at (vect_location,
2206 "not vectorized: simd if(0)\n");
2208 /* Find all data references in the loop (which correspond to vdefs/vuses)
2209 and analyze their evolution in the loop. */
2211 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2213 /* Gather the data references and count stmts in the loop. */
2214 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2216 opt_result res
2217 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2218 &LOOP_VINFO_DATAREFS (loop_vinfo),
2219 n_stmts);
2220 if (!res)
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "not vectorized: loop contains function "
2225 "calls or data references that cannot "
2226 "be analyzed\n");
2227 return res;
2229 loop_vinfo->shared->save_datarefs ();
2231 else
2232 loop_vinfo->shared->check_datarefs ();
2234 /* Analyze the data references and also adjust the minimal
2235 vectorization factor according to the loads and stores. */
2237 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2238 if (!ok)
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "bad data references.\n");
2243 return ok;
2246 /* Classify all cross-iteration scalar data-flow cycles.
2247 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2248 vect_analyze_scalar_cycles (loop_vinfo);
2250 vect_pattern_recog (loop_vinfo);
2252 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2254 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2255 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2257 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2258 if (!ok)
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "bad data access.\n");
2263 return ok;
2266 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2268 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2269 if (!ok)
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "unexpected pattern.\n");
2274 return ok;
2277 /* The rest of the analysis below depends on the vector size in some way, so failures past this point are not fatal. */
2278 fatal = false;
2280 /* Analyze data dependences between the data-refs in the loop
2281 and adjust the maximum vectorization factor according to
2282 the dependences.
2283 FORNOW: fail at the first data dependence that we encounter. */
2285 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2286 if (!ok)
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2290 "bad data dependence.\n");
2291 return ok;
2293 if (max_vf != MAX_VECTORIZATION_FACTOR
2294 && maybe_lt (max_vf, min_vf))
2295 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2296 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2298 ok = vect_determine_vectorization_factor (loop_vinfo);
2299 if (!ok)
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "can't determine vectorization factor.\n");
2304 return ok;
2306 if (max_vf != MAX_VECTORIZATION_FACTOR
2307 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2308 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2310 /* Compute the scalar iteration cost. */
2311 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2313 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2315 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2316 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2317 if (!ok)
2318 return ok;
2320 /* If there are any SLP instances mark them as pure_slp. */
2321 bool slp = vect_make_slp_decision (loop_vinfo);
2322 if (slp)
2324 /* Find stmts that need to be both vectorized and SLPed. */
2325 vect_detect_hybrid_slp (loop_vinfo);
2327 /* Update the vectorization factor based on the SLP decision. */
2328 vect_update_vf_for_slp (loop_vinfo);
2330 /* Optimize the SLP graph with the vectorization factor fixed. */
2331 vect_optimize_slp (loop_vinfo);
2333 /* Gather the loads reachable from the SLP graph entries. */
2334 vect_gather_slp_loads (loop_vinfo);
2337 bool saved_can_use_partial_vectors_p
2338 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2340 /* We don't expect to have to roll back to anything other than an empty
2341 set of rgroups. */
2342 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2344 /* This is the point where we can re-start analysis with SLP forced off. */
2345 start_over:
2347 /* Now the vectorization factor is final. */
2348 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2349 gcc_assert (known_ne (vectorization_factor, 0U));
2351 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "vectorization_factor = ");
2355 dump_dec (MSG_NOTE, vectorization_factor);
2356 dump_printf (MSG_NOTE, ", niters = %wd\n",
2357 LOOP_VINFO_INT_NITERS (loop_vinfo));
2360 /* Analyze the alignment of the data-refs in the loop.
2361 Fail if a data reference is found that cannot be vectorized. */
2363 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2364 if (!ok)
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 "bad data alignment.\n");
2369 return ok;
2372 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2373 It is important to call pruning after vect_analyze_data_ref_accesses,
2374 since we use grouping information gathered by interleaving analysis. */
2375 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2376 if (!ok)
2377 return ok;
2379 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2380 vectorization, since we do not want to add extra peeling or
2381 add versioning for alignment. */
2382 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2383 /* This pass will decide on using loop versioning and/or loop peeling in
2384 order to enhance the alignment of data references in the loop. */
2385 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2386 if (!ok)
2387 return ok;
2389 if (slp)
2391 /* Analyze operations in the SLP instances. Note this may
2392 remove unsupported SLP instances which makes the above
2393 SLP kind detection invalid. */
2394 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2395 vect_slp_analyze_operations (loop_vinfo);
2396 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2398 ok = opt_result::failure_at (vect_location,
2399 "unsupported SLP instances\n");
2400 goto again;
2403 /* Check whether any load in ALL SLP instances is possibly permuted. */
2404 slp_tree load_node, slp_root;
2405 unsigned i, x;
2406 slp_instance instance;
2407 bool can_use_lanes = true;
2408 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2410 slp_root = SLP_INSTANCE_TREE (instance);
2411 int group_size = SLP_TREE_LANES (slp_root);
2412 tree vectype = SLP_TREE_VECTYPE (slp_root);
2413 bool loads_permuted = false;
2414 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2416 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2417 continue;
2418 unsigned j;
2419 stmt_vec_info load_info;
2420 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2421 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2423 loads_permuted = true;
2424 break;
2428 /* If the loads and stores can be handled with load/store-lane
2429 instructions record it and move on to the next instance. */
2430 if (loads_permuted
2431 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2432 && vect_store_lanes_supported (vectype, group_size, false))
2434 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2436 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2437 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2438 /* Use SLP for strided accesses (or if we can't
2439 load-lanes). */
2440 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2441 || ! vect_load_lanes_supported
2442 (STMT_VINFO_VECTYPE (stmt_vinfo),
2443 DR_GROUP_SIZE (stmt_vinfo), false))
2444 break;
2447 can_use_lanes
2448 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2450 if (can_use_lanes && dump_enabled_p ())
2451 dump_printf_loc (MSG_NOTE, vect_location,
2452 "SLP instance %p can use load/store-lanes\n",
2453 instance);
2455 else
2457 can_use_lanes = false;
2458 break;
2462 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2463 with SLP disabled. */
2464 if (can_use_lanes)
2466 ok = opt_result::failure_at (vect_location,
2467 "Built SLP cancelled: can use "
2468 "load/store-lanes\n");
2469 if (dump_enabled_p ())
2470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2471 "Built SLP cancelled: all SLP instances support "
2472 "load/store-lanes\n");
2473 goto again;
2477 /* Dissolve SLP-only groups. */
2478 vect_dissolve_slp_only_groups (loop_vinfo);
2480 /* Scan all the remaining operations in the loop that are not subject
2481 to SLP and make sure they are vectorizable. */
2482 ok = vect_analyze_loop_operations (loop_vinfo);
2483 if (!ok)
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2487 "bad operation or unsupported loop bound.\n");
2488 return ok;
2491 /* For now, we don't expect to mix both masking and length approaches for one
2492 loop; disable partial vectors if both are recorded. */
2493 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2494 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2495 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2497 if (dump_enabled_p ())
2498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2499 "can't vectorize a loop with partial vectors"
2500 " because we don't expect to mix different"
2501 " approaches with partial vectors for the"
2502 " same loop.\n");
2503 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2506 /* If we still have the option of using partial vectors,
2507 check whether we can generate the necessary loop controls. */
2508 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2509 && !vect_verify_full_masking (loop_vinfo)
2510 && !vect_verify_loop_lens (loop_vinfo))
2511 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2513 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2514 to be able to handle fewer than VF scalars, or needs to have a lower VF
2515 than the main loop. */
2516 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2517 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2518 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2519 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2520 return opt_result::failure_at (vect_location,
2521 "Vectorization factor too high for"
2522 " epilogue loop.\n");
2524 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2525 assuming that the loop will be used as a main loop. We will redo
2526 this analysis later if we instead decide to use the loop as an
2527 epilogue loop. */
2528 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2529 if (!ok)
2530 return ok;
2532 /* Check the costings of the loop make vectorizing worthwhile. */
2533 res = vect_analyze_loop_costing (loop_vinfo);
2534 if (res < 0)
2536 ok = opt_result::failure_at (vect_location,
2537 "Loop costings may not be worthwhile.\n");
2538 goto again;
2540 if (!res)
2541 return opt_result::failure_at (vect_location,
2542 "Loop costings not worthwhile.\n");
2544 /* If an epilogue loop is required make sure we can create one. */
2545 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2546 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2548 if (dump_enabled_p ())
2549 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2550 if (!vect_can_advance_ivs_p (loop_vinfo)
2551 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2552 single_exit (LOOP_VINFO_LOOP
2553 (loop_vinfo))))
2555 ok = opt_result::failure_at (vect_location,
2556 "not vectorized: can't create required "
2557 "epilog loop\n");
2558 goto again;
2562 /* During peeling, we need to check that the number of loop iterations is
2563 enough for both the peeled prolog loop and the vector loop. This check
2564 can be merged with the threshold check of loop versioning, so
2565 increase the threshold for this case if necessary.
2567 If we are analyzing an epilogue we still want to check what its
2568 versioning threshold would be. If we decide to vectorize the epilogues we
2569 will want to use the lowest versioning threshold of all epilogues and main
2570 loop. This will enable us to enter a vectorized epilogue even when
2571 versioning the loop. We can't simply check whether the epilogue requires
2572 versioning though since we may have skipped some versioning checks when
2573 analyzing the epilogue. For instance, checks for alias versioning will be
2574 skipped when dealing with epilogues as we assume we already checked them
2575 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2576 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2578 poly_uint64 niters_th = 0;
2579 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2581 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2583 /* Niters for peeled prolog loop. */
2584 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2586 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2587 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2588 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2590 else
2591 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2594 /* Niters for at least one iteration of vectorized loop. */
2595 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2596 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2597 /* One additional iteration because of peeling for gap. */
2598 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2599 niters_th += 1;
2601 /* Use the same condition as vect_transform_loop to decide when to use
2602 the cost to determine a versioning threshold. */
2603 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2604 && ordered_p (th, niters_th))
2605 niters_th = ordered_max (poly_uint64 (th), niters_th);
2607 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
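/* Worked example for the computation above (hypothetical values): with
   VF == 4, an unknown misalignment (so TYPE_VECTOR_SUBPARTS - 1 == 3
   prologue iterations), full vectors only and peeling for gaps, the
   threshold is niters_th = 3 + 4 + 1 == 8, possibly raised further to the
   cost-model threshold TH.  */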
2610 gcc_assert (known_eq (vectorization_factor,
2611 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2613 /* Ok to vectorize! */
2614 return opt_result::success ();
2616 again:
2617 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2618 gcc_assert (!ok);
2620 /* Try again with SLP forced off but if we didn't do any SLP there is
2621 no point in re-trying. */
2622 if (!slp)
2623 return ok;
2625 /* If there are reduction chains re-trying will fail anyway. */
2626 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2627 return ok;
2629 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2630 via interleaving or lane instructions. */
2631 slp_instance instance;
2632 slp_tree node;
2633 unsigned i, j;
2634 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2636 stmt_vec_info vinfo;
2637 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2638 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2639 continue;
2640 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2641 unsigned int size = DR_GROUP_SIZE (vinfo);
2642 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2643 if (! vect_store_lanes_supported (vectype, size, false)
2644 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2645 && ! vect_grouped_store_supported (vectype, size))
2646 return opt_result::failure_at (vinfo->stmt,
2647 "unsupported grouped store\n");
2648 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2650 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2651 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2652 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2653 size = DR_GROUP_SIZE (vinfo);
2654 vectype = STMT_VINFO_VECTYPE (vinfo);
2655 if (! vect_load_lanes_supported (vectype, size, false)
2656 && ! vect_grouped_load_supported (vectype, single_element_p,
2657 size))
2658 return opt_result::failure_at (vinfo->stmt,
2659 "unsupported grouped load\n");
2663 if (dump_enabled_p ())
2664 dump_printf_loc (MSG_NOTE, vect_location,
2665 "re-trying with SLP disabled\n");
2667 /* Roll back state appropriately. No SLP this time. */
2668 slp = false;
2670 /* Restore the vectorization factor as it was without SLP. */
2670 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2671 /* Free the SLP instances. */
2672 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2673 vect_free_slp_instance (instance);
2674 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2675 /* Reset SLP type to loop_vect on all stmts. */
2676 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2678 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2679 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2680 !gsi_end_p (si); gsi_next (&si))
2682 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2683 STMT_SLP_TYPE (stmt_info) = loop_vect;
2684 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2685 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2687 /* vectorizable_reduction adjusts reduction stmt def-types,
2688 restore them to that of the PHI. */
2689 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2690 = STMT_VINFO_DEF_TYPE (stmt_info);
2691 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2692 (STMT_VINFO_REDUC_DEF (stmt_info)))
2693 = STMT_VINFO_DEF_TYPE (stmt_info);
2696 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2697 !gsi_end_p (si); gsi_next (&si))
2699 if (is_gimple_debug (gsi_stmt (si)))
2700 continue;
2701 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2702 STMT_SLP_TYPE (stmt_info) = loop_vect;
2703 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2705 stmt_vec_info pattern_stmt_info
2706 = STMT_VINFO_RELATED_STMT (stmt_info);
2707 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2708 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2710 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2711 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2712 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2713 !gsi_end_p (pi); gsi_next (&pi))
2714 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2715 = loop_vect;
2719 /* Free optimized alias test DDRS. */
2720 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2721 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2722 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2723 /* Reset target cost data. */
2724 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2725 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2726 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2727 /* Reset accumulated rgroup information. */
2728 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2729 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2730 /* Reset assorted flags. */
2731 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2732 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2733 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2734 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2735 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2736 = saved_can_use_partial_vectors_p;
2738 goto start_over;
2741 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2742 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2743 OLD_LOOP_VINFO is better unless something specifically indicates
2744 otherwise.
2746 Note that this deliberately isn't a partial order. */
2748 static bool
2749 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2750 loop_vec_info old_loop_vinfo)
2752 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2753 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2755 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2756 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2758 /* Always prefer a VF of loop->simdlen over any other VF. */
2759 if (loop->simdlen)
2761 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2762 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2763 if (new_simdlen_p != old_simdlen_p)
2764 return new_simdlen_p;
2767 /* Limit the VFs to what is likely to be the maximum number of iterations,
2768 to handle cases in which at least one loop_vinfo is fully-masked. */
2769 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2770 if (estimated_max_niter != -1)
2772 if (known_le (estimated_max_niter, new_vf))
2773 new_vf = estimated_max_niter;
2774 if (known_le (estimated_max_niter, old_vf))
2775 old_vf = estimated_max_niter;
2778 /* Check whether the (fractional) cost per scalar iteration is lower
2779 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2780 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2781 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2783 HOST_WIDE_INT est_rel_new_min
2784 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2785 HOST_WIDE_INT est_rel_new_max
2786 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2788 HOST_WIDE_INT est_rel_old_min
2789 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2790 HOST_WIDE_INT est_rel_old_max
2791 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2793 /* Check first if we can make out an unambiguous total order from the minimum
2794 and maximum estimates. */
2795 if (est_rel_new_min < est_rel_old_min
2796 && est_rel_new_max < est_rel_old_max)
2797 return true;
2798 else if (est_rel_old_min < est_rel_new_min
2799 && est_rel_old_max < est_rel_new_max)
2800 return false;
2801 /* When old_loop_vinfo uses a variable vectorization factor,
2802 we know that it has a lower cost for at least one runtime VF.
2803 However, we don't know how likely that VF is.
2805 One option would be to compare the costs for the estimated VFs.
2806 The problem is that that can put too much pressure on the cost
2807 model. E.g. if the estimated VF is also the lowest possible VF,
2808 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2809 for the estimated VF, we'd then choose new_loop_vinfo even
2810 though (a) new_loop_vinfo might not actually be better than
2811 old_loop_vinfo for that VF and (b) it would be significantly
2812 worse at larger VFs.
2814 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2815 no more expensive than old_loop_vinfo even after doubling the
2816 estimated old_loop_vinfo VF. For all but trivial loops, this
2817 ensures that we only pick new_loop_vinfo if it is significantly
2818 better than old_loop_vinfo at the estimated VF. */
2820 if (est_rel_old_min != est_rel_new_min
2821 || est_rel_old_max != est_rel_new_max)
2823 HOST_WIDE_INT est_rel_new_likely
2824 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2825 HOST_WIDE_INT est_rel_old_likely
2826 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2828 return est_rel_new_likely * 2 <= est_rel_old_likely;
2831 /* If there's nothing to choose between the loop bodies, see whether
2832 there's a difference in the prologue and epilogue costs. */
2833 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2834 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2836 return false;
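/* Worked example for the relative-cost test above (hypothetical costs):
   new_inside_cost / new_vf is compared against old_inside_cost / old_vf
   without division, e.g.

     new: inside cost 20, VF 8  ->  rel_new = 20 * 4 == 80
     old: inside cost 12, VF 4  ->  rel_old = 12 * 8 == 96

   so the new loop_vinfo is cheaper per scalar iteration and is preferred
   when both the minimum and maximum estimates agree on the order.  */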
2839 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2840 true if we should. */
2842 static bool
2843 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2844 loop_vec_info old_loop_vinfo)
2846 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2847 return false;
2849 if (dump_enabled_p ())
2850 dump_printf_loc (MSG_NOTE, vect_location,
2851 "***** Preferring vector mode %s to vector mode %s\n",
2852 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2853 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2854 return true;
2857 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2858 try to reanalyze it as a main loop. Return the loop_vinfo on success
2859 and null on failure. */
2861 static loop_vec_info
2862 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2864 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2865 return loop_vinfo;
2867 if (dump_enabled_p ())
2868 dump_printf_loc (MSG_NOTE, vect_location,
2869 "***** Reanalyzing as a main loop with vector mode %s\n",
2870 GET_MODE_NAME (loop_vinfo->vector_mode));
2872 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2873 vec_info_shared *shared = loop_vinfo->shared;
2874 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2875 gcc_assert (main_loop_vinfo);
2877 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2879 bool fatal = false;
2880 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2881 loop->aux = NULL;
2882 if (!res)
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE, vect_location,
2886 "***** Failed to analyze main loop with vector"
2887 " mode %s\n",
2888 GET_MODE_NAME (loop_vinfo->vector_mode));
2889 delete main_loop_vinfo;
2890 return NULL;
2892 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2893 return main_loop_vinfo;
2896 /* Function vect_analyze_loop.
2898 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2899 for it. The different analyses will record information in the
2900 loop_vec_info struct. */
2901 opt_loop_vec_info
2902 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2904 auto_vector_modes vector_modes;
2906 /* Autodetect first vector size we try. */
2907 unsigned int autovec_flags
2908 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2909 loop->simdlen != 0);
2910 unsigned int mode_i = 0;
2912 DUMP_VECT_SCOPE ("analyze_loop_nest");
2914 if (loop_outer (loop)
2915 && loop_vec_info_for_loop (loop_outer (loop))
2916 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2917 return opt_loop_vec_info::failure_at (vect_location,
2918 "outer-loop already vectorized.\n");
2920 if (!find_loop_nest (loop, &shared->loop_nest))
2921 return opt_loop_vec_info::failure_at
2922 (vect_location,
2923 "not vectorized: loop nest containing two or more consecutive inner"
2924 " loops cannot be vectorized\n");
2926 unsigned n_stmts = 0;
2927 machine_mode autodetected_vector_mode = VOIDmode;
2928 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2929 machine_mode next_vector_mode = VOIDmode;
2930 poly_uint64 lowest_th = 0;
2931 unsigned vectorized_loops = 0;
2932 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2933 && !unlimited_cost_model (loop));
2935 bool vect_epilogues = false;
2936 opt_result res = opt_result::success ();
2937 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2938 while (1)
2940 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2941 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2942 if (!loop_vinfo)
2944 if (dump_enabled_p ())
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2946 "bad loop form.\n");
2947 gcc_checking_assert (first_loop_vinfo == NULL);
2948 return loop_vinfo;
2950 loop_vinfo->vector_mode = next_vector_mode;
2952 bool fatal = false;
2954 /* When pick_lowest_cost_p is true, we should in principle iterate
2955 over all the loop_vec_infos that LOOP_VINFO could replace and
2956 try to vectorize LOOP_VINFO under the same conditions.
2957 E.g. when trying to replace an epilogue loop, we should vectorize
2958 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2959 to replace the main loop, we should vectorize LOOP_VINFO as a main
2960 loop too.
2962 However, autovectorize_vector_modes is usually sorted as follows:
2964 - Modes that naturally produce lower VFs usually follow modes that
2965 naturally produce higher VFs.
2967 - When modes naturally produce the same VF, maskable modes
2968 usually follow unmaskable ones, so that the maskable mode
2969 can be used to vectorize the epilogue of the unmaskable mode.
2971 This order is preferred because it leads to the maximum
2972 epilogue vectorization opportunities. Targets should only use
2973 a different order if they want to make wide modes available while
2974 disparaging them relative to earlier, smaller modes. The assumption
2975 in that case is that the wider modes are more expensive in some
2976 way that isn't reflected directly in the costs.
2978 There should therefore be few interesting cases in which
2979 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2980 treated as a standalone loop, and ends up being genuinely cheaper
2981 than FIRST_LOOP_VINFO. */
2982 if (vect_epilogues)
2983 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2985 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2986 if (mode_i == 0)
2987 autodetected_vector_mode = loop_vinfo->vector_mode;
2988 if (dump_enabled_p ())
2990 if (res)
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "***** Analysis succeeded with vector mode %s\n",
2993 GET_MODE_NAME (loop_vinfo->vector_mode));
2994 else
2995 dump_printf_loc (MSG_NOTE, vect_location,
2996 "***** Analysis failed with vector mode %s\n",
2997 GET_MODE_NAME (loop_vinfo->vector_mode));
3000 loop->aux = NULL;
3002 if (!fatal)
3003 while (mode_i < vector_modes.length ()
3004 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3006 if (dump_enabled_p ())
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "***** The result for vector mode %s would"
3009 " be the same\n",
3010 GET_MODE_NAME (vector_modes[mode_i]));
3011 mode_i += 1;
3014 if (res)
3016 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3017 vectorized_loops++;
3019 /* Once we hit the desired simdlen for the first time,
3020 discard any previous attempts. */
3021 if (simdlen
3022 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3024 delete first_loop_vinfo;
3025 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3026 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3027 simdlen = 0;
3029 else if (pick_lowest_cost_p && first_loop_vinfo)
3031 /* Keep trying to roll back vectorization attempts while the
3032 loop_vec_infos they produced were worse than this one. */
3033 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3034 while (!vinfos.is_empty ()
3035 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3037 gcc_assert (vect_epilogues);
3038 delete vinfos.pop ();
3040 if (vinfos.is_empty ()
3041 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3043 loop_vec_info main_loop_vinfo
3044 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3045 if (main_loop_vinfo == loop_vinfo)
3047 delete first_loop_vinfo;
3048 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3050 else if (main_loop_vinfo
3051 && vect_joust_loop_vinfos (main_loop_vinfo,
3052 first_loop_vinfo))
3054 delete first_loop_vinfo;
3055 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3056 delete loop_vinfo;
3057 loop_vinfo
3058 = opt_loop_vec_info::success (main_loop_vinfo);
3060 else
3061 delete main_loop_vinfo;
3065 if (first_loop_vinfo == NULL)
3067 first_loop_vinfo = loop_vinfo;
3068 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3070 else if (vect_epilogues
3071 /* For now only allow one epilogue loop. */
3072 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3074 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3075 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3076 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3077 || maybe_ne (lowest_th, 0U));
3078 /* Keep track of the known smallest versioning
3079 threshold. */
3080 if (ordered_p (lowest_th, th))
3081 lowest_th = ordered_min (lowest_th, th);
3083 else
3085 delete loop_vinfo;
3086 loop_vinfo = opt_loop_vec_info::success (NULL);
3089 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3090 enabled, SIMDUID is not set, it is the innermost loop and we have
3091 either already found the loop's SIMDLEN or there was no SIMDLEN to
3092 begin with.
3093 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3094 vect_epilogues = (!simdlen
3095 && loop->inner == NULL
3096 && param_vect_epilogues_nomask
3097 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3098 && !loop->simduid
3099 /* For now only allow one epilogue loop, but allow
3100 pick_lowest_cost_p to replace it. */
3101 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3102 || pick_lowest_cost_p));
3104 /* Commit to first_loop_vinfo if we have no reason to try
3105 alternatives. */
3106 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3107 break;
3109 else
3111 delete loop_vinfo;
3112 loop_vinfo = opt_loop_vec_info::success (NULL);
3113 if (fatal)
3115 gcc_checking_assert (first_loop_vinfo == NULL);
3116 break;
3120 /* Handle the case where the original loop can use partial
3121 vectorization, but we only want to adopt it for the epilogue.
3122 The retry should be in the same mode as the original. */
3123 if (vect_epilogues
3124 && loop_vinfo
3125 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3127 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3128 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3129 if (dump_enabled_p ())
3130 dump_printf_loc (MSG_NOTE, vect_location,
3131 "***** Re-trying analysis with same vector mode"
3132 " %s for epilogue with partial vectors.\n",
3133 GET_MODE_NAME (loop_vinfo->vector_mode));
3134 continue;
3137 if (mode_i < vector_modes.length ()
3138 && VECTOR_MODE_P (autodetected_vector_mode)
3139 && (related_vector_mode (vector_modes[mode_i],
3140 GET_MODE_INNER (autodetected_vector_mode))
3141 == autodetected_vector_mode)
3142 && (related_vector_mode (autodetected_vector_mode,
3143 GET_MODE_INNER (vector_modes[mode_i]))
3144 == vector_modes[mode_i]))
3146 if (dump_enabled_p ())
3147 dump_printf_loc (MSG_NOTE, vect_location,
3148 "***** Skipping vector mode %s, which would"
3149 " repeat the analysis for %s\n",
3150 GET_MODE_NAME (vector_modes[mode_i]),
3151 GET_MODE_NAME (autodetected_vector_mode));
3152 mode_i += 1;
3155 if (mode_i == vector_modes.length ()
3156 || autodetected_vector_mode == VOIDmode)
3157 break;
3159 /* Try the next biggest vector size. */
3160 next_vector_mode = vector_modes[mode_i++];
3161 if (dump_enabled_p ())
3162 dump_printf_loc (MSG_NOTE, vect_location,
3163 "***** Re-trying analysis with vector mode %s\n",
3164 GET_MODE_NAME (next_vector_mode));
3167 if (first_loop_vinfo)
3169 loop->aux = (loop_vec_info) first_loop_vinfo;
3170 if (dump_enabled_p ())
3171 dump_printf_loc (MSG_NOTE, vect_location,
3172 "***** Choosing vector mode %s\n",
3173 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3174 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3175 return first_loop_vinfo;
3178 return opt_loop_vec_info::propagate_failure (res);
3181 /* Return true if there is an in-order reduction function for CODE, storing
3182 it in *REDUC_FN if so. */
3184 static bool
3185 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3187 switch (code)
3189 case PLUS_EXPR:
3190 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3191 return true;
3193 default:
3194 return false;
3198 /* Function reduction_fn_for_scalar_code
3200 Input:
3201 CODE - tree_code of a reduction operation.
3203 Output:
3204 REDUC_FN - the corresponding internal function to be used to reduce the
3205 vector of partial results into a single scalar result, or IFN_LAST
3206 if the operation is a supported reduction operation, but does not have
3207 such an internal function.
3209 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3211 static bool
3212 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3214 switch (code)
3216 case MAX_EXPR:
3217 *reduc_fn = IFN_REDUC_MAX;
3218 return true;
3220 case MIN_EXPR:
3221 *reduc_fn = IFN_REDUC_MIN;
3222 return true;
3224 case PLUS_EXPR:
3225 *reduc_fn = IFN_REDUC_PLUS;
3226 return true;
3228 case BIT_AND_EXPR:
3229 *reduc_fn = IFN_REDUC_AND;
3230 return true;
3232 case BIT_IOR_EXPR:
3233 *reduc_fn = IFN_REDUC_IOR;
3234 return true;
3236 case BIT_XOR_EXPR:
3237 *reduc_fn = IFN_REDUC_XOR;
3238 return true;
3240 case MULT_EXPR:
3241 case MINUS_EXPR:
3242 *reduc_fn = IFN_LAST;
3243 return true;
3245 default:
3246 return false;
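/* For illustration (hypothetical scalar loop): a reduction such as

     int sum = 0;
     for (int i = 0; i < n; i++)
       sum += a[i];   // PLUS_EXPR

   maps to IFN_REDUC_PLUS above, which collapses the vector of partial sums
   into a single scalar after the vector loop.  MULT_EXPR and MINUS_EXPR
   return IFN_LAST: the reduction is supported, but the final step has no
   dedicated internal function.  */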
3250 /* If there is a neutral value X such that SLP reduction NODE would not
3251 be affected by the introduction of additional X elements, return that X,
3252 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3253 is the vector type that would hold element X. REDUC_CHAIN is true if
3254 the SLP statements perform a single reduction, false if each statement
3255 performs an independent reduction. */
3257 static tree
3258 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3259 tree_code code, bool reduc_chain)
3261 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3262 stmt_vec_info stmt_vinfo = stmts[0];
3263 tree scalar_type = TREE_TYPE (vector_type);
3264 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3265 gcc_assert (loop);
3267 switch (code)
3269 case WIDEN_SUM_EXPR:
3270 case DOT_PROD_EXPR:
3271 case SAD_EXPR:
3272 case PLUS_EXPR:
3273 case MINUS_EXPR:
3274 case BIT_IOR_EXPR:
3275 case BIT_XOR_EXPR:
3276 return build_zero_cst (scalar_type);
3278 case MULT_EXPR:
3279 return build_one_cst (scalar_type);
3281 case BIT_AND_EXPR:
3282 return build_all_ones_cst (scalar_type);
3284 case MAX_EXPR:
3285 case MIN_EXPR:
3286 /* For MIN/MAX the initial values are neutral. A reduction chain
3287 has only a single initial value, so that value is neutral for
3288 all statements. */
3289 if (reduc_chain)
3290 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3291 loop_preheader_edge (loop));
3292 return NULL_TREE;
3294 default:
3295 return NULL_TREE;
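/* Note on the neutral values above: X is chosen so that padding the vector
   with additional X elements leaves the reduction result unchanged, e.g.
   0 for PLUS_EXPR (s + 0 == s), 1 for MULT_EXPR (s * 1 == s) and all-ones
   for BIT_AND_EXPR (s & ~0 == s).  */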
3299 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3300 STMT is printed with a message MSG. */
3302 static void
3303 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3305 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3308 /* Return true if we need an in-order (fold-left) reduction for
3309 operation CODE on type TYPE, i.e. if the reduction must preserve
3310 the original order of evaluation. */
3312 bool
3313 needs_fold_left_reduction_p (tree type, tree_code code)
3315 /* CHECKME: check for !flag_finite_math_only too? */
3316 if (SCALAR_FLOAT_TYPE_P (type))
3317 switch (code)
3319 case MIN_EXPR:
3320 case MAX_EXPR:
3321 return false;
3323 default:
3324 return !flag_associative_math;
3327 if (INTEGRAL_TYPE_P (type))
3329 if (!operation_no_trapping_overflow (type, code))
3330 return true;
3331 return false;
3334 if (SAT_FIXED_POINT_TYPE_P (type))
3335 return true;
3337 return false;
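/* For illustration (hypothetical scalar loop): without -fassociative-math
   a float accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   must preserve the original evaluation order and therefore needs the
   in-order (fold-left) scheme, whereas MIN_EXPR/MAX_EXPR on floats and
   integer operations whose overflow cannot trap do not.  */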
3340 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3341 has a handled computation expression. Store the main reduction
3342 operation in *CODE. */
3344 static bool
3345 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3346 tree loop_arg, enum tree_code *code,
3347 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3349 auto_bitmap visited;
3350 tree lookfor = PHI_RESULT (phi);
3351 ssa_op_iter curri;
3352 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3353 while (USE_FROM_PTR (curr) != loop_arg)
3354 curr = op_iter_next_use (&curri);
3355 curri.i = curri.numops;
3358 path.safe_push (std::make_pair (curri, curr));
3359 tree use = USE_FROM_PTR (curr);
3360 if (use == lookfor)
3361 break;
3362 gimple *def = SSA_NAME_DEF_STMT (use);
3363 if (gimple_nop_p (def)
3364 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3366 pop:
3369 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3370 curri = x.first;
3371 curr = x.second;
3373 curr = op_iter_next_use (&curri);
3374 /* Skip already visited or non-SSA operands (from iterating
3375 over PHI args). */
3376 while (curr != NULL_USE_OPERAND_P
3377 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3378 || ! bitmap_set_bit (visited,
3379 SSA_NAME_VERSION
3380 (USE_FROM_PTR (curr)))));
3382 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3383 if (curr == NULL_USE_OPERAND_P)
3384 break;
3386 else
3388 if (gimple_code (def) == GIMPLE_PHI)
3389 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3390 else
3391 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3392 while (curr != NULL_USE_OPERAND_P
3393 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3394 || ! bitmap_set_bit (visited,
3395 SSA_NAME_VERSION
3396 (USE_FROM_PTR (curr)))))
3397 curr = op_iter_next_use (&curri);
3398 if (curr == NULL_USE_OPERAND_P)
3399 goto pop;
3402 while (1);
3403 if (dump_file && (dump_flags & TDF_DETAILS))
3405 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3406 unsigned i;
3407 std::pair<ssa_op_iter, use_operand_p> *x;
3408 FOR_EACH_VEC_ELT (path, i, x)
3409 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3410 dump_printf (MSG_NOTE, "\n");
3413 /* Check whether the reduction path detected is valid. */
3414 bool fail = path.length () == 0;
3415 bool neg = false;
3416 int sign = -1;
3417 *code = ERROR_MARK;
3418 for (unsigned i = 1; i < path.length (); ++i)
3420 gimple *use_stmt = USE_STMT (path[i].second);
3421 tree op = USE_FROM_PTR (path[i].second);
3422 if (! is_gimple_assign (use_stmt)
3423 /* The following makes sure we can compute the operand index
3424 easily; it also mostly disallows chaining via COND_EXPR condition
3425 operands. */
3426 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3427 && (gimple_num_ops (use_stmt) <= 2
3428 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3429 && (gimple_num_ops (use_stmt) <= 3
3430 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3432 fail = true;
3433 break;
3435 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3436 if (use_code == MINUS_EXPR)
3438 use_code = PLUS_EXPR;
3439 /* Track whether we negate the reduction value each iteration. */
3440 if (gimple_assign_rhs2 (use_stmt) == op)
3441 neg = ! neg;
3443 if (CONVERT_EXPR_CODE_P (use_code)
3444 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3445 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3447 else if (*code == ERROR_MARK)
3449 *code = use_code;
3450 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3452 else if (use_code != *code)
3454 fail = true;
3455 break;
3457 else if ((use_code == MIN_EXPR
3458 || use_code == MAX_EXPR)
3459 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3461 fail = true;
3462 break;
3464 /* Check there's only a single stmt the op is used on. For the
3465 non-value-changing tail and the last stmt, allow out-of-loop uses.
3466 ??? We could relax this and handle arbitrary live stmts by
3467 forcing a scalar epilogue for example. */
3468 imm_use_iterator imm_iter;
3469 gimple *op_use_stmt;
3470 unsigned cnt = 0;
3471 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3472 if (!is_gimple_debug (op_use_stmt)
3473 && (*code != ERROR_MARK
3474 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3476 /* We want to allow x + x but not x < 1 ? x : 2. */
3477 if (is_gimple_assign (op_use_stmt)
3478 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3480 use_operand_p use_p;
3481 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3482 cnt++;
3484 else
3485 cnt++;
3487 if (cnt != 1)
3489 fail = true;
3490 break;
3493 return ! fail && ! neg && *code != ERROR_MARK;
3496 bool
3497 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3498 tree loop_arg, enum tree_code code)
3500 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3501 enum tree_code code_;
3502 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3503 && code_ == code);
3508 /* Function vect_is_simple_reduction
3510 (1) Detect a cross-iteration def-use cycle that represents a simple
3511 reduction computation. We look for the following pattern:
3513 loop_header:
3514 a1 = phi < a0, a2 >
3515 a3 = ...
3516 a2 = operation (a3, a1)
3520 a3 = ...
3521 loop_header:
3522 a1 = phi < a0, a2 >
3523 a2 = operation (a3, a1)
3525 such that:
3526 1. operation is commutative and associative and it is safe to
3527 change the order of the computation
3528 2. no uses for a2 in the loop (a2 is used out of the loop)
3529 3. no uses of a1 in the loop besides the reduction operation
3530 4. no uses of a1 outside the loop.
3532 Conditions 1,4 are tested here.
3533 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3535 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3536 nested cycles.
3538 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3539 reductions:
3541 a1 = phi < a0, a2 >
3542 inner loop (def of a3)
3543 a2 = phi < a3 >
3545 (4) Detect condition expressions, i.e.:
3546 for (int i = 0; i < N; i++)
3547 if (a[i] < val)
3548 ret_val = a[i];
3552 static stmt_vec_info
3553 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3554 bool *double_reduc, bool *reduc_chain_p)
3556 gphi *phi = as_a <gphi *> (phi_info->stmt);
3557 gimple *phi_use_stmt = NULL;
3558 imm_use_iterator imm_iter;
3559 use_operand_p use_p;
3561 *double_reduc = false;
3562 *reduc_chain_p = false;
3563 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3565 tree phi_name = PHI_RESULT (phi);
3566 /* ??? If there are no uses of the PHI result the inner loop reduction
3567 won't be detected as possibly double-reduction by vectorizable_reduction
3568 because that tries to walk the PHI arg from the preheader edge which
3569 can be constant. See PR60382. */
3570 if (has_zero_uses (phi_name))
3571 return NULL;
3572 class loop *loop = (gimple_bb (phi))->loop_father;
3573 unsigned nphi_def_loop_uses = 0;
3574 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3576 gimple *use_stmt = USE_STMT (use_p);
3577 if (is_gimple_debug (use_stmt))
3578 continue;
3580 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "intermediate value used outside loop.\n");
3586 return NULL;
3589 nphi_def_loop_uses++;
3590 phi_use_stmt = use_stmt;
3593 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3594 if (TREE_CODE (latch_def) != SSA_NAME)
3596 if (dump_enabled_p ())
3597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3598 "reduction: not ssa_name: %T\n", latch_def);
3599 return NULL;
3602 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3603 if (!def_stmt_info
3604 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3605 return NULL;
3607 bool nested_in_vect_loop
3608 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3609 unsigned nlatch_def_loop_uses = 0;
3610 auto_vec<gphi *, 3> lcphis;
3611 bool inner_loop_of_double_reduc = false;
3612 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3614 gimple *use_stmt = USE_STMT (use_p);
3615 if (is_gimple_debug (use_stmt))
3616 continue;
3617 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3618 nlatch_def_loop_uses++;
3619 else
3621 /* We can have more than one loop-closed PHI. */
3622 lcphis.safe_push (as_a <gphi *> (use_stmt));
3623 if (nested_in_vect_loop
3624 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3625 == vect_double_reduction_def))
3626 inner_loop_of_double_reduc = true;
3630 /* If we are vectorizing an inner reduction we execute it in the
3631 original order only when we are not dealing with a
3632 double reduction. */
3633 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3635 if (dump_enabled_p ())
3636 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3637 "detected nested cycle: ");
3638 return def_stmt_info;
3641 /* If this isn't a nested cycle, or if the nested cycle reduction value
3642 is used outside of the inner loop, we cannot handle uses of the reduction
3643 value. */
3644 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3646 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3648 "reduction used in loop.\n");
3649 return NULL;
3652 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3653 defined in the inner loop. */
3654 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3656 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3657 if (gimple_phi_num_args (def_stmt) != 1
3658 || TREE_CODE (op1) != SSA_NAME)
3660 if (dump_enabled_p ())
3661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3662 "unsupported phi node definition.\n");
3664 return NULL;
3667 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3668 if (gimple_bb (def1)
3669 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3670 && loop->inner
3671 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3672 && is_gimple_assign (def1)
3673 && is_a <gphi *> (phi_use_stmt)
3674 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3676 if (dump_enabled_p ())
3677 report_vect_op (MSG_NOTE, def_stmt,
3678 "detected double reduction: ");
3680 *double_reduc = true;
3681 return def_stmt_info;
3684 return NULL;
3687 /* Look for the expression computing latch_def from the loop PHI result. */
3688 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3689 enum tree_code code;
3690 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3691 path))
3693 STMT_VINFO_REDUC_CODE (phi_info) = code;
3694 if (code == COND_EXPR && !nested_in_vect_loop)
3695 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3697 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3698 reduction chain for which the additional restriction is that
3699 all operations in the chain are the same. */
3700 auto_vec<stmt_vec_info, 8> reduc_chain;
3701 unsigned i;
3702 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3703 for (i = path.length () - 1; i >= 1; --i)
3705 gimple *stmt = USE_STMT (path[i].second);
3706 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3707 STMT_VINFO_REDUC_IDX (stmt_info)
3708 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3709 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3710 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3711 && (i == 1 || i == path.length () - 1));
3712 if ((stmt_code != code && !leading_conversion)
3713 /* We can only handle the final value in epilogue
3714 generation for reduction chains. */
3715 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3716 is_slp_reduc = false;
3717 /* For reduction chains we support trailing/leading
3718 conversions. We do not store those in the actual chain. */
3719 if (leading_conversion)
3720 continue;
3721 reduc_chain.safe_push (stmt_info);
3723 if (is_slp_reduc && reduc_chain.length () > 1)
3725 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3727 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3728 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3730 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3731 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3733 /* Save the chain for further analysis in SLP detection. */
3734 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3735 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3737 *reduc_chain_p = true;
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "reduction: detected reduction chain\n");
3742 else if (dump_enabled_p ())
3743 dump_printf_loc (MSG_NOTE, vect_location,
3744 "reduction: detected reduction\n");
3746 return def_stmt_info;
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "reduction: unknown pattern\n");
3753 return NULL;
3756 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3757 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3758 or -1 if not known. */
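/* A small worked example with made-up numbers: for an assumed VF of 8,
   NITERS known to be 100 and PEEL_ITERS_PROLOGUE of 3, this returns
   (100 - 3) % 8 = 1.  If peeling for gaps is required and that remainder
   were 0, a full VF (8 iterations) is returned instead; with unknown
   NITERS it falls back to VF / 2 = 4.  */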
3760 static int
3761 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3763 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3764 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_NOTE, vect_location,
3768 "cost model: epilogue peel iters set to vf/2 "
3769 "because loop iterations are unknown .\n");
3770 return assumed_vf / 2;
3772 else
3774 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3775 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3776 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3777 /* If we need to peel for gaps, but no epilogue peeling would otherwise
3778 be required, we have to peel VF iterations. */
3779 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3780 peel_iters_epilogue = assumed_vf;
3781 return peel_iters_epilogue;
3785 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3786 int
3787 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3788 int *peel_iters_epilogue,
3789 stmt_vector_for_cost *scalar_cost_vec,
3790 stmt_vector_for_cost *prologue_cost_vec,
3791 stmt_vector_for_cost *epilogue_cost_vec)
3793 int retval = 0;
3795 *peel_iters_epilogue
3796 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3798 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3800 /* If peeled iterations are known but the number of scalar loop
3801 iterations is unknown, count a taken branch per peeled loop. */
3802 if (peel_iters_prologue > 0)
3803 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3804 NULL, NULL_TREE, 0, vect_prologue);
3805 if (*peel_iters_epilogue > 0)
3806 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3807 NULL, NULL_TREE, 0, vect_epilogue);
3810 stmt_info_for_cost *si;
3811 int j;
3812 if (peel_iters_prologue)
3813 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3814 retval += record_stmt_cost (prologue_cost_vec,
3815 si->count * peel_iters_prologue,
3816 si->kind, si->stmt_info, si->misalign,
3817 vect_prologue);
3818 if (*peel_iters_epilogue)
3819 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3820 retval += record_stmt_cost (epilogue_cost_vec,
3821 si->count * *peel_iters_epilogue,
3822 si->kind, si->stmt_info, si->misalign,
3823 vect_epilogue);
3825 return retval;
3828 /* Function vect_estimate_min_profitable_iters
3830 Return the number of iterations required for the vector version of the
3831 loop to be profitable relative to the cost of the scalar version of the
3832 loop.
3834 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3835 of iterations for vectorization. -1 value means loop vectorization
3836 is not profitable. This returned value may be used for dynamic
3837 profitability check.
3839 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3840 for static check against estimated number of iterations. */
3842 static void
3843 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3844 int *ret_min_profitable_niters,
3845 int *ret_min_profitable_estimate)
3847 int min_profitable_iters;
3848 int min_profitable_estimate;
3849 int peel_iters_prologue;
3850 int peel_iters_epilogue;
3851 unsigned vec_inside_cost = 0;
3852 int vec_outside_cost = 0;
3853 unsigned vec_prologue_cost = 0;
3854 unsigned vec_epilogue_cost = 0;
3855 int scalar_single_iter_cost = 0;
3856 int scalar_outside_cost = 0;
3857 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3858 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3859 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3861 /* Cost model disabled. */
3862 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3864 if (dump_enabled_p ())
3865 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3866 *ret_min_profitable_niters = 0;
3867 *ret_min_profitable_estimate = 0;
3868 return;
3871 /* Requires loop versioning tests to handle misalignment. */
3872 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3874 /* FIXME: Make cost depend on complexity of individual check. */
3875 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3876 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3877 NULL, NULL_TREE, 0, vect_prologue);
3878 if (dump_enabled_p ())
3879 dump_printf (MSG_NOTE,
3880 "cost model: Adding cost of checks for loop "
3881 "versioning to treat misalignment.\n");
3884 /* Requires loop versioning with alias checks. */
3885 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3887 /* FIXME: Make cost depend on complexity of individual check. */
3888 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3889 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3890 NULL, NULL_TREE, 0, vect_prologue);
3891 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3892 if (len)
3893 /* Count LEN - 1 ANDs and LEN comparisons. */
3894 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3895 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3896 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3897 if (len)
3899 /* Count LEN - 1 ANDs and LEN comparisons. */
3900 unsigned int nstmts = len * 2 - 1;
3901 /* +1 for each bias that needs adding. */
3902 for (unsigned int i = 0; i < len; ++i)
3903 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3904 nstmts += 1;
3905 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3906 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3908 if (dump_enabled_p ())
3909 dump_printf (MSG_NOTE,
3910 "cost model: Adding cost of checks for loop "
3911 "versioning aliasing.\n");
3914 /* Requires loop versioning with niter checks. */
3915 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3917 /* FIXME: Make cost depend on complexity of individual check. */
3918 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3919 NULL, NULL_TREE, 0, vect_prologue);
3920 if (dump_enabled_p ())
3921 dump_printf (MSG_NOTE,
3922 "cost model: Adding cost of checks for loop "
3923 "versioning niters.\n");
3926 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3927 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3928 NULL, NULL_TREE, 0, vect_prologue);
3930 /* Count statements in scalar loop. Using this as scalar cost for a single
3931 iteration for now.
3933 TODO: Add outer loop support.
3935 TODO: Consider assigning different costs to different scalar
3936 statements. */
3938 scalar_single_iter_cost
3939 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3941 /* Add additional cost for the peeled instructions in prologue and epilogue
3942 loop. (For fully-masked loops there will be no peeling.)
3944 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3945 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3947 TODO: Build an expression that represents peel_iters for prologue and
3948 epilogue to be used in a run-time test. */
3950 bool prologue_need_br_taken_cost = false;
3951 bool prologue_need_br_not_taken_cost = false;
3953 /* Calculate peel_iters_prologue. */
3954 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3955 peel_iters_prologue = 0;
3956 else if (npeel < 0)
3958 peel_iters_prologue = assumed_vf / 2;
3959 if (dump_enabled_p ())
3960 dump_printf (MSG_NOTE, "cost model: "
3961 "prologue peel iters set to vf/2.\n");
3963 /* If peeled iterations are unknown, count a taken branch and a not taken
3964 branch per peeled loop. Even if scalar loop iterations are known,
3965 vector iterations are not known since peeled prologue iterations are
3966 not known. Hence guards remain the same. */
3967 prologue_need_br_taken_cost = true;
3968 prologue_need_br_not_taken_cost = true;
3970 else
3972 peel_iters_prologue = npeel;
3973 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3974 /* If peeled iterations are known but the number of scalar loop
3975 iterations is unknown, count a taken branch per peeled loop. */
3976 prologue_need_br_taken_cost = true;
3979 bool epilogue_need_br_taken_cost = false;
3980 bool epilogue_need_br_not_taken_cost = false;
3982 /* Calculate peel_iters_epilogue. */
3983 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3984 /* We need to peel exactly one iteration for gaps. */
3985 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3986 else if (npeel < 0)
3988 /* If peeling for alignment is unknown, loop bound of main loop
3989 becomes unknown. */
3990 peel_iters_epilogue = assumed_vf / 2;
3991 if (dump_enabled_p ())
3992 dump_printf (MSG_NOTE, "cost model: "
3993 "epilogue peel iters set to vf/2 because "
3994 "peeling for alignment is unknown.\n");
3996 /* See the same reason above in peel_iters_prologue calculation. */
3997 epilogue_need_br_taken_cost = true;
3998 epilogue_need_br_not_taken_cost = true;
4000 else
4002 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4003 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4004 /* If peeled iterations are known but the number of scalar loop
4005 iterations is unknown, count a taken branch per peeled loop. */
4006 epilogue_need_br_taken_cost = true;
4009 stmt_info_for_cost *si;
4010 int j;
4011 /* Add costs associated with peel_iters_prologue. */
4012 if (peel_iters_prologue)
4013 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4015 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4016 si->count * peel_iters_prologue, si->kind,
4017 si->stmt_info, si->vectype, si->misalign,
4018 vect_prologue);
4021 /* Add costs associated with peel_iters_epilogue. */
4022 if (peel_iters_epilogue)
4023 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4025 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4026 si->count * peel_iters_epilogue, si->kind,
4027 si->stmt_info, si->vectype, si->misalign,
4028 vect_epilogue);
4031 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4033 if (prologue_need_br_taken_cost)
4034 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4035 NULL, NULL_TREE, 0, vect_prologue);
4037 if (prologue_need_br_not_taken_cost)
4038 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4039 cond_branch_not_taken, NULL, NULL_TREE, 0,
4040 vect_prologue);
4042 if (epilogue_need_br_taken_cost)
4043 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4044 NULL, NULL_TREE, 0, vect_epilogue);
4046 if (epilogue_need_br_not_taken_cost)
4047 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4048 cond_branch_not_taken, NULL, NULL_TREE, 0,
4049 vect_epilogue);
4051 /* Take care of special costs for rgroup controls of partial vectors. */
4052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4054 /* Calculate how many masks we need to generate. */
4055 unsigned int num_masks = 0;
4056 rgroup_controls *rgm;
4057 unsigned int num_vectors_m1;
4058 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4059 if (rgm->type)
4060 num_masks += num_vectors_m1 + 1;
4061 gcc_assert (num_masks > 0);
4063 /* In the worst case, we need to generate each mask in the prologue
4064 and in the loop body. One of the loop body mask instructions
4065 replaces the comparison in the scalar loop, and since we don't
4066 count the scalar comparison against the scalar body, we shouldn't
4067 count that vector instruction against the vector body either.
4069 Sometimes we can use unpacks instead of generating prologue
4070 masks and sometimes the prologue mask will fold to a constant,
4071 so the actual prologue cost might be smaller. However, it's
4072 simpler and safer to use the worst-case cost; if this ends up
4073 being the tie-breaker between vectorizing or not, then it's
4074 probably better not to vectorize. */
4075 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4076 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4077 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4078 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4080 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4082 /* Referring to the functions vect_set_loop_condition_partial_vectors
4083 and vect_set_loop_controls_directly, we need to generate each
4084 length in the prologue and in the loop body if required. Although
4085 there are some possible optimizations, we consider the worst case
4086 here. */
4088 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4089 bool need_iterate_p
4090 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4091 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4093 /* Calculate how many statements to be added. */
4094 unsigned int prologue_stmts = 0;
4095 unsigned int body_stmts = 0;
4097 rgroup_controls *rgc;
4098 unsigned int num_vectors_m1;
4099 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4100 if (rgc->type)
4102 /* May need one SHIFT for nitems_total computation. */
4103 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4104 if (nitems != 1 && !niters_known_p)
4105 prologue_stmts += 1;
4107 /* May need one MAX and one MINUS for wrap around. */
4108 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4109 prologue_stmts += 2;
4111 /* Need one MAX and one MINUS for each batch limit except for
4112 the 1st one. */
4113 prologue_stmts += num_vectors_m1 * 2;
4115 unsigned int num_vectors = num_vectors_m1 + 1;
4117 /* Need to set up lengths in prologue, only one MIN required
4118 for each since start index is zero. */
4119 prologue_stmts += num_vectors;
4121 /* Each may need two MINs and one MINUS to update lengths in body
4122 for next iteration. */
4123 if (need_iterate_p)
4124 body_stmts += 3 * num_vectors;
4127 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4128 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4129 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4130 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4133 /* FORNOW: The scalar outside cost is incremented in one of the
4134 following ways:
4136 1. The vectorizer checks for alignment and aliasing and generates
4137 a condition that allows dynamic vectorization. A cost model
4138 check is ANDED with the versioning condition. Hence scalar code
4139 path now has the added cost of the versioning check.
4141 if (cost > th & versioning_check)
4142 jmp to vector code
4144 Hence the run-time scalar cost is incremented by the not-taken branch cost.
4146 2. The vectorizer then checks if a prologue is required. If the
4147 cost model check was not done before during versioning, it has to
4148 be done before the prologue check.
4150 if (cost <= th)
4151 prologue = scalar_iters
4152 if (prologue == 0)
4153 jmp to vector code
4154 else
4155 execute prologue
4156 if (prologue == num_iters)
4157 go to exit
4159 Hence the run-time scalar cost is incremented by a taken branch,
4160 plus a not-taken branch, plus a taken branch cost.
4162 3. The vectorizer then checks if an epilogue is required. If the
4163 cost model check was not done before during prologue check, it
4164 has to be done with the epilogue check.
4166 if (prologue == 0)
4167 jmp to vector code
4168 else
4169 execute prologue
4170 if (prologue == num_iters)
4171 go to exit
4172 vector code:
4173 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4174 jmp to epilogue
4176 Hence the run-time scalar cost should be incremented by 2 taken
4177 branches.
4179 TODO: The back end may reorder the BBs differently and reverse
4180 conditions/branch directions. Change the estimates below to
4181 something more reasonable. */
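/* As a concrete reading of the cases above: a versioned loop (case 1)
   charges the scalar path one not-taken branch for the combined check;
   peeling for alignment by an unknown amount (case 2) charges two taken
   branches plus one not-taken branch; otherwise (case 3) two taken
   branches are charged.  That is what the code below does.  */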
4183 /* If the number of iterations is known and we do not do versioning, we can
4184 decide whether to vectorize at compile time. Hence the scalar version
4185 does not carry cost model guard costs. */
4186 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4187 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4189 /* Cost model check occurs at versioning. */
4190 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4191 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4192 else
4194 /* Cost model check occurs at prologue generation. */
4195 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4196 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4197 + vect_get_stmt_cost (cond_branch_not_taken);
4198 /* Cost model check occurs at epilogue generation. */
4199 else
4200 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4204 /* Complete the target-specific cost calculations. */
4205 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4206 &vec_inside_cost, &vec_epilogue_cost);
4208 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4210 /* Stash the costs so that we can compare two loop_vec_infos. */
4211 loop_vinfo->vec_inside_cost = vec_inside_cost;
4212 loop_vinfo->vec_outside_cost = vec_outside_cost;
4214 if (dump_enabled_p ())
4216 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4217 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4218 vec_inside_cost);
4219 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4220 vec_prologue_cost);
4221 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4222 vec_epilogue_cost);
4223 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4224 scalar_single_iter_cost);
4225 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4226 scalar_outside_cost);
4227 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4228 vec_outside_cost);
4229 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4230 peel_iters_prologue);
4231 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4232 peel_iters_epilogue);
4235 /* Calculate number of iterations required to make the vector version
4236 profitable, relative to the loop bodies only. The following condition
4237 must hold true:
4238 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4239 where
4240 SIC = scalar iteration cost, VIC = vector iteration cost,
4241 VOC = vector outside cost, VF = vectorization factor,
4242 NPEEL = prologue iterations + epilogue iterations,
4243 SOC = scalar outside cost for run time cost model check. */
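/* A worked example with made-up costs: for SIC = 4, VIC = 6, VF = 4,
   NPEEL = 0, SOC = 0 and VOC = 20 the condition becomes
   4 * niters > 6 * (niters / 4) + 20, which first holds at niters = 9,
   so roughly nine scalar iterations are needed before the vector loop
   pays off.  */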
4245 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4246 - vec_inside_cost);
4247 if (saving_per_viter <= 0)
4249 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4250 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4251 "vectorization did not happen for a simd loop");
4253 if (dump_enabled_p ())
4254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4255 "cost model: the vector iteration cost = %d "
4256 "divided by the scalar iteration cost = %d "
4257 "is greater or equal to the vectorization factor = %d"
4258 ".\n",
4259 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4260 *ret_min_profitable_niters = -1;
4261 *ret_min_profitable_estimate = -1;
4262 return;
4265 /* ??? The "if" arm is written to handle all cases; see below for what
4266 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4267 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4269 /* Rewriting the condition above in terms of the number of
4270 vector iterations (vniters) rather than the number of
4271 scalar iterations (niters) gives:
4273 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4275 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4277 For integer N, X and Y when X > 0:
4279 N * X > Y <==> N >= (Y /[floor] X) + 1. */
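/* For example, with Y = 10 and X = 3 this gives N >= 10 / 3 + 1 = 4;
   indeed 4 * 3 = 12 > 10 while 3 * 3 = 9 is not.  */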
4280 int outside_overhead = (vec_outside_cost
4281 - scalar_single_iter_cost * peel_iters_prologue
4282 - scalar_single_iter_cost * peel_iters_epilogue
4283 - scalar_outside_cost);
4284 /* We're only interested in cases that require at least one
4285 vector iteration. */
4286 int min_vec_niters = 1;
4287 if (outside_overhead > 0)
4288 min_vec_niters = outside_overhead / saving_per_viter + 1;
4290 if (dump_enabled_p ())
4291 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4292 min_vec_niters);
4294 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4296 /* Now that we know the minimum number of vector iterations,
4297 find the minimum niters for which the scalar cost is larger:
4299 SIC * niters > VIC * vniters + VOC - SOC
4301 We know that the minimum niters is no more than
4302 vniters * VF + NPEEL, but it might be (and often is) less
4303 than that if a partial vector iteration is cheaper than the
4304 equivalent scalar code. */
4305 int threshold = (vec_inside_cost * min_vec_niters
4306 + vec_outside_cost
4307 - scalar_outside_cost);
4308 if (threshold <= 0)
4309 min_profitable_iters = 1;
4310 else
4311 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4313 else
4314 /* Convert the number of vector iterations into a number of
4315 scalar iterations. */
4316 min_profitable_iters = (min_vec_niters * assumed_vf
4317 + peel_iters_prologue
4318 + peel_iters_epilogue);
4320 else
4322 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4323 * assumed_vf
4324 - vec_inside_cost * peel_iters_prologue
4325 - vec_inside_cost * peel_iters_epilogue);
4326 if (min_profitable_iters <= 0)
4327 min_profitable_iters = 0;
4328 else
4330 min_profitable_iters /= saving_per_viter;
4332 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4333 <= (((int) vec_inside_cost * min_profitable_iters)
4334 + (((int) vec_outside_cost - scalar_outside_cost)
4335 * assumed_vf)))
4336 min_profitable_iters++;
4340 if (dump_enabled_p ())
4341 dump_printf (MSG_NOTE,
4342 " Calculated minimum iters for profitability: %d\n",
4343 min_profitable_iters);
4345 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4346 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4347 /* We want the vectorized loop to execute at least once. */
4348 min_profitable_iters = assumed_vf + peel_iters_prologue;
4349 else if (min_profitable_iters < peel_iters_prologue)
4350 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4351 vectorized loop executes at least once. */
4352 min_profitable_iters = peel_iters_prologue;
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_NOTE, vect_location,
4356 " Runtime profitability threshold = %d\n",
4357 min_profitable_iters);
4359 *ret_min_profitable_niters = min_profitable_iters;
4361 /* Calculate number of iterations required to make the vector version
4362 profitable, relative to the loop bodies only.
4364 The non-vectorized variant costs SIC * niters and it must win over the
4365 vector variant for the expected loop trip count. The following condition must hold true:
4366 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4368 if (vec_outside_cost <= 0)
4369 min_profitable_estimate = 0;
4370 /* ??? This "else if" arm is written to handle all cases; see below for
4371 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4372 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4374 /* This is a repeat of the code above, but with + SOC rather
4375 than - SOC. */
4376 int outside_overhead = (vec_outside_cost
4377 - scalar_single_iter_cost * peel_iters_prologue
4378 - scalar_single_iter_cost * peel_iters_epilogue
4379 + scalar_outside_cost);
4380 int min_vec_niters = 1;
4381 if (outside_overhead > 0)
4382 min_vec_niters = outside_overhead / saving_per_viter + 1;
4384 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4386 int threshold = (vec_inside_cost * min_vec_niters
4387 + vec_outside_cost
4388 + scalar_outside_cost);
4389 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4391 else
4392 min_profitable_estimate = (min_vec_niters * assumed_vf
4393 + peel_iters_prologue
4394 + peel_iters_epilogue);
4396 else
4398 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4399 * assumed_vf
4400 - vec_inside_cost * peel_iters_prologue
4401 - vec_inside_cost * peel_iters_epilogue)
4402 / ((scalar_single_iter_cost * assumed_vf)
4403 - vec_inside_cost);
4405 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4406 if (dump_enabled_p ())
4407 dump_printf_loc (MSG_NOTE, vect_location,
4408 " Static estimate profitability threshold = %d\n",
4409 min_profitable_estimate);
4411 *ret_min_profitable_estimate = min_profitable_estimate;
4414 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4415 vector elements (not bits) for a vector with NELT elements. */
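/* For instance (sketch only), with OFFSET = 2 and NELT = 8 the three
   encoded elements are { 2, 3, 4 }, which the single stepped pattern
   extends to the selector { 2, 3, 4, 5, 6, 7, 8, 9 }; indices 8 and 9
   fall outside the first input and therefore select elements of the
   second vector of the permutation.  */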
4416 static void
4417 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4418 vec_perm_builder *sel)
4420 /* The encoding is a single stepped pattern. Any wrap-around is handled
4421 by vec_perm_indices. */
4422 sel->new_vector (nelt, 1, 3);
4423 for (unsigned int i = 0; i < 3; i++)
4424 sel->quick_push (i + offset);
4427 /* Checks whether the target supports whole-vector shifts for vectors of mode
4428 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4429 it supports vec_perm_const with masks for all necessary shift amounts. */
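/* For example, with a constant NELT of 8 the loop below checks element
   shifts of 4, 2 and 1, roughly the offsets a log2-style whole-vector
   shift reduction epilogue would need.  */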
4430 static bool
4431 have_whole_vector_shift (machine_mode mode)
4433 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4434 return true;
4436 /* Variable-length vectors should be handled via the optab. */
4437 unsigned int nelt;
4438 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4439 return false;
4441 vec_perm_builder sel;
4442 vec_perm_indices indices;
4443 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4445 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4446 indices.new_vector (sel, 2, nelt);
4447 if (!can_vec_perm_const_p (mode, indices, false))
4448 return false;
4450 return true;
4453 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4454 functions. Design better to avoid maintenance issues. */
4456 /* Function vect_model_reduction_cost.
4458 Models cost for a reduction operation, including the vector ops
4459 generated within the strip-mine loop in some cases, the initial
4460 definition before the loop, and the epilogue code that must be generated. */
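/* A rough sketch of the non-nested case, assuming the target provides a
   suitable reduction internal function: a plain integer sum reduction is
   charged one scalar_to_vec in the prologue for the initial definition
   and, in the epilogue, one vector_stmt for the reduction proper plus one
   vec_to_scalar for extracting the scalar result.  */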
4462 static void
4463 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4464 stmt_vec_info stmt_info, internal_fn reduc_fn,
4465 vect_reduction_type reduction_type,
4466 int ncopies, stmt_vector_for_cost *cost_vec)
4468 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4469 enum tree_code code;
4470 optab optab;
4471 tree vectype;
4472 machine_mode mode;
4473 class loop *loop = NULL;
4475 if (loop_vinfo)
4476 loop = LOOP_VINFO_LOOP (loop_vinfo);
4478 /* Condition reductions generate two reductions in the loop. */
4479 if (reduction_type == COND_REDUCTION)
4480 ncopies *= 2;
4482 vectype = STMT_VINFO_VECTYPE (stmt_info);
4483 mode = TYPE_MODE (vectype);
4484 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4486 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4488 if (reduction_type == EXTRACT_LAST_REDUCTION)
4489 /* No extra instructions are needed in the prologue. The loop body
4490 operations are costed in vectorizable_condition. */
4491 inside_cost = 0;
4492 else if (reduction_type == FOLD_LEFT_REDUCTION)
4494 /* No extra instructions needed in the prologue. */
4495 prologue_cost = 0;
4497 if (reduc_fn != IFN_LAST)
4498 /* Count one reduction-like operation per vector. */
4499 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4500 stmt_info, 0, vect_body);
4501 else
4503 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4504 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4505 inside_cost = record_stmt_cost (cost_vec, nelements,
4506 vec_to_scalar, stmt_info, 0,
4507 vect_body);
4508 inside_cost += record_stmt_cost (cost_vec, nelements,
4509 scalar_stmt, stmt_info, 0,
4510 vect_body);
4513 else
4515 /* Add in cost for initial definition.
4516 For cond reduction we have four vectors: initial index, step,
4517 initial result of the data reduction, initial value of the index
4518 reduction. */
4519 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4520 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4521 scalar_to_vec, stmt_info, 0,
4522 vect_prologue);
4525 /* Determine cost of epilogue code.
4527 We have a reduction operator that will reduce the vector in one statement.
4528 Also requires scalar extract. */
4530 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4532 if (reduc_fn != IFN_LAST)
4534 if (reduction_type == COND_REDUCTION)
4536 /* An EQ stmt and a COND_EXPR stmt. */
4537 epilogue_cost += record_stmt_cost (cost_vec, 2,
4538 vector_stmt, stmt_info, 0,
4539 vect_epilogue);
4540 /* Reduction of the max index and a reduction of the found
4541 values. */
4542 epilogue_cost += record_stmt_cost (cost_vec, 2,
4543 vec_to_scalar, stmt_info, 0,
4544 vect_epilogue);
4545 /* A broadcast of the max value. */
4546 epilogue_cost += record_stmt_cost (cost_vec, 1,
4547 scalar_to_vec, stmt_info, 0,
4548 vect_epilogue);
4550 else
4552 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4553 stmt_info, 0, vect_epilogue);
4554 epilogue_cost += record_stmt_cost (cost_vec, 1,
4555 vec_to_scalar, stmt_info, 0,
4556 vect_epilogue);
4559 else if (reduction_type == COND_REDUCTION)
4561 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4562 /* Extraction of scalar elements. */
4563 epilogue_cost += record_stmt_cost (cost_vec,
4564 2 * estimated_nunits,
4565 vec_to_scalar, stmt_info, 0,
4566 vect_epilogue);
4567 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4568 epilogue_cost += record_stmt_cost (cost_vec,
4569 2 * estimated_nunits - 3,
4570 scalar_stmt, stmt_info, 0,
4571 vect_epilogue);
4573 else if (reduction_type == EXTRACT_LAST_REDUCTION
4574 || reduction_type == FOLD_LEFT_REDUCTION)
4575 /* No extra instructions are needed in the epilogue. */
4577 else
4579 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4580 tree bitsize =
4581 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4582 int element_bitsize = tree_to_uhwi (bitsize);
4583 int nelements = vec_size_in_bits / element_bitsize;
4585 if (code == COND_EXPR)
4586 code = MAX_EXPR;
4588 optab = optab_for_tree_code (code, vectype, optab_default);
4590 /* We have a whole vector shift available. */
4591 if (optab != unknown_optab
4592 && VECTOR_MODE_P (mode)
4593 && optab_handler (optab, mode) != CODE_FOR_nothing
4594 && have_whole_vector_shift (mode))
4596 /* Final reduction via vector shifts and the reduction operator.
4597 Also requires scalar extract. */
4598 epilogue_cost += record_stmt_cost (cost_vec,
4599 exact_log2 (nelements) * 2,
4600 vector_stmt, stmt_info, 0,
4601 vect_epilogue);
4602 epilogue_cost += record_stmt_cost (cost_vec, 1,
4603 vec_to_scalar, stmt_info, 0,
4604 vect_epilogue);
4606 else
4607 /* Use extracts and reduction op for final reduction. For N
4608 elements, we have N extracts and N-1 reduction ops. */
4609 epilogue_cost += record_stmt_cost (cost_vec,
4610 nelements + nelements - 1,
4611 vector_stmt, stmt_info, 0,
4612 vect_epilogue);
4616 if (dump_enabled_p ())
4617 dump_printf (MSG_NOTE,
4618 "vect_model_reduction_cost: inside_cost = %d, "
4619 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4620 prologue_cost, epilogue_cost);
4625 /* Function get_initial_def_for_reduction
4627 Input:
4628 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4629 INIT_VAL - the initial value of the reduction variable
4631 Output:
4632 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4633 of the reduction (used for adjusting the epilog - see below).
4634 Return a vector variable, initialized according to the operation that
4635 STMT_VINFO performs. This vector will be used as the initial value
4636 of the vector of partial results.
4638 Option1 (adjust in epilog): Initialize the vector as follows:
4639 add/bit or/xor: [0,0,...,0,0]
4640 mult/bit and: [1,1,...,1,1]
4641 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4642 and when necessary (e.g. add/mult case) let the caller know
4643 that it needs to adjust the result by init_val.
4645 Option2: Initialize the vector as follows:
4646 add/bit or/xor: [init_val,0,0,...,0]
4647 mult/bit and: [init_val,1,1,...,1]
4648 min/max/cond_expr: [init_val,init_val,...,init_val]
4649 and no adjustments are needed.
4651 For example, for the following code:
4653 s = init_val;
4654 for (i=0;i<n;i++)
4655 s = s + a[i];
4657 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4658 For a vector of 4 units, we want to return either [0,0,0,init_val],
4659 or [0,0,0,0] and let the caller know that it needs to adjust
4660 the result at the end by 'init_val'.
4662 FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
4663 is not NULL, because this way the initialization vector is simpler (the
4664 same element in all entries), and Option2 otherwise.
4666 A cost model should help decide between these two schemes. */
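/* Continuing the example above: with init_val = 10 and Option1 the
   returned vector is {0,0,0,0} and *ADJUSTMENT_DEF is set to 10, so the
   caller adds 10 back once the partial sums have been reduced.  */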
4668 static tree
4669 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4670 stmt_vec_info stmt_vinfo,
4671 enum tree_code code, tree init_val,
4672 tree *adjustment_def)
4674 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4675 tree scalar_type = TREE_TYPE (init_val);
4676 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4677 tree def_for_init;
4678 tree init_def;
4679 REAL_VALUE_TYPE real_init_val = dconst0;
4680 int int_init_val = 0;
4681 gimple_seq stmts = NULL;
4683 gcc_assert (vectype);
4685 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4686 || SCALAR_FLOAT_TYPE_P (scalar_type));
4688 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4689 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4691 /* ADJUSTMENT_DEF is NULL when called from
4692 vect_create_epilog_for_reduction to vectorize double reduction. */
4693 if (adjustment_def)
4694 *adjustment_def = NULL;
4696 switch (code)
4698 case WIDEN_SUM_EXPR:
4699 case DOT_PROD_EXPR:
4700 case SAD_EXPR:
4701 case PLUS_EXPR:
4702 case MINUS_EXPR:
4703 case BIT_IOR_EXPR:
4704 case BIT_XOR_EXPR:
4705 case MULT_EXPR:
4706 case BIT_AND_EXPR:
4708 if (code == MULT_EXPR)
4710 real_init_val = dconst1;
4711 int_init_val = 1;
4714 if (code == BIT_AND_EXPR)
4715 int_init_val = -1;
4717 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4718 def_for_init = build_real (scalar_type, real_init_val);
4719 else
4720 def_for_init = build_int_cst (scalar_type, int_init_val);
4722 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4724 /* Option1: the first element is '0' or '1' as well. */
4725 if (!operand_equal_p (def_for_init, init_val, 0))
4726 *adjustment_def = init_val;
4727 init_def = gimple_build_vector_from_val (&stmts, vectype,
4728 def_for_init);
4730 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4732 /* Option2 (variable length): the first element is INIT_VAL. */
4733 init_def = gimple_build_vector_from_val (&stmts, vectype,
4734 def_for_init);
4735 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4736 vectype, init_def, init_val);
4738 else
4740 /* Option2: the first element is INIT_VAL. */
4741 tree_vector_builder elts (vectype, 1, 2);
4742 elts.quick_push (init_val);
4743 elts.quick_push (def_for_init);
4744 init_def = gimple_build_vector (&stmts, &elts);
4747 break;
4749 case MIN_EXPR:
4750 case MAX_EXPR:
4751 case COND_EXPR:
4753 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4754 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4756 break;
4758 default:
4759 gcc_unreachable ();
4762 if (stmts)
4763 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4764 return init_def;
4767 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4768 NUMBER_OF_VECTORS is the number of vector defs to create.
4769 If NEUTRAL_OP is nonnull, introducing extra elements of that
4770 value will not change the result. */
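/* The neutral value is the identity of the reduction operation, e.g. 0
   for a sum or bitwise-or reduction and 1 for a product, so padding a
   vector with it leaves the reduced result unchanged.  */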
4772 static void
4773 get_initial_defs_for_reduction (vec_info *vinfo,
4774 slp_tree slp_node,
4775 vec<tree> *vec_oprnds,
4776 unsigned int number_of_vectors,
4777 bool reduc_chain, tree neutral_op)
4779 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4780 stmt_vec_info stmt_vinfo = stmts[0];
4781 unsigned HOST_WIDE_INT nunits;
4782 unsigned j, number_of_places_left_in_vector;
4783 tree vector_type;
4784 unsigned int group_size = stmts.length ();
4785 unsigned int i;
4786 class loop *loop;
4788 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4790 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4792 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4793 gcc_assert (loop);
4794 edge pe = loop_preheader_edge (loop);
4796 gcc_assert (!reduc_chain || neutral_op);
4798 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4799 created vectors. It is greater than 1 if unrolling is performed.
4801 For example, we have two scalar operands, s1 and s2 (e.g., group of
4802 strided accesses of size two), while NUNITS is four (i.e., four scalars
4803 of this type can be packed in a vector). The output vector will contain
4804 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4805 will be 2).
4807 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4808 vectors containing the operands.
4810 For example, NUNITS is four as before, and the group size is 8
4811 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4812 {s5, s6, s7, s8}. */
4814 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4815 nunits = group_size;
4817 number_of_places_left_in_vector = nunits;
4818 bool constant_p = true;
4819 tree_vector_builder elts (vector_type, nunits, 1);
4820 elts.quick_grow (nunits);
4821 gimple_seq ctor_seq = NULL;
4822 for (j = 0; j < nunits * number_of_vectors; ++j)
4824 tree op;
4825 i = j % group_size;
4826 stmt_vinfo = stmts[i];
4828 /* Get the def before the loop. In a reduction chain we have only
4829 one initial value. Else we have as many initial values as PHIs in the group. */
4830 if (reduc_chain)
4831 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4832 else if (((vec_oprnds->length () + 1) * nunits
4833 - number_of_places_left_in_vector >= group_size)
4834 && neutral_op)
4835 op = neutral_op;
4836 else
4837 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4839 /* Create 'vect_ = {op0,op1,...,opn}'. */
4840 number_of_places_left_in_vector--;
4841 elts[nunits - number_of_places_left_in_vector - 1] = op;
4842 if (!CONSTANT_CLASS_P (op))
4843 constant_p = false;
4845 if (number_of_places_left_in_vector == 0)
4847 tree init;
4848 if (constant_p && !neutral_op
4849 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4850 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4851 /* Build the vector directly from ELTS. */
4852 init = gimple_build_vector (&ctor_seq, &elts);
4853 else if (neutral_op)
4855 /* Build a vector of the neutral value and shift the
4856 other elements into place. */
4857 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4858 neutral_op);
4859 int k = nunits;
4860 while (k > 0 && elts[k - 1] == neutral_op)
4861 k -= 1;
4862 while (k > 0)
4864 k -= 1;
4865 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4866 vector_type, init, elts[k]);
4869 else
4871 /* First time round, duplicate ELTS to fill the
4872 required number of vectors. */
4873 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4874 number_of_vectors, *vec_oprnds);
4875 break;
4877 vec_oprnds->quick_push (init);
4879 number_of_places_left_in_vector = nunits;
4880 elts.new_vector (vector_type, nunits, 1);
4881 elts.quick_grow (nunits);
4882 constant_p = true;
4885 if (ctor_seq != NULL)
4886 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4889 /* For a statement STMT_INFO taking part in a reduction operation return
4890 the stmt_vec_info the meta information is stored on. */
4892 stmt_vec_info
4893 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4895 stmt_info = vect_orig_stmt (stmt_info);
4896 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4897 if (!is_a <gphi *> (stmt_info->stmt)
4898 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4899 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4900 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4901 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4903 if (gimple_phi_num_args (phi) == 1)
4904 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4906 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4908 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4909 stmt_vec_info info
4910 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4911 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4912 stmt_info = info;
4914 return stmt_info;
4917 /* Function vect_create_epilog_for_reduction
4919 Create code at the loop-epilog to finalize the result of a reduction
4920 computation.
4922 STMT_INFO is the scalar reduction stmt that is being vectorized.
4923 SLP_NODE is an SLP node containing a group of reduction statements. The
4924 first one in this group is STMT_INFO.
4925 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
4926 REDUC_INDEX says which rhs operand of STMT_INFO is the reduction phi
4927 (counting from 0).
4929 This function:
4930 1. Completes the reduction def-use cycles.
4931 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4932 by calling the function specified by REDUC_FN if available, or by
4933 other means (whole-vector shifts or a scalar loop).
4934 The function also creates a new phi node at the loop exit to preserve
4935 loop-closed form, as illustrated below.
4937 The flow at the entry to this function:
4939 loop:
4940 vec_def = phi <vec_init, null> # REDUCTION_PHI
4941 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4942 s_loop = scalar_stmt # (scalar) STMT_INFO
4943 loop_exit:
4944 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4945 use <s_out0>
4946 use <s_out0>
4948 The above is transformed by this function into:
4950 loop:
4951 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4952 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4953 s_loop = scalar_stmt # (scalar) STMT_INFO
4954 loop_exit:
4955 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4956 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4957 v_out2 = reduce <v_out1>
4958 s_out3 = extract_field <v_out2, 0>
4959 s_out4 = adjust_result <s_out3>
4960 use <s_out4>
4961 use <s_out4>
4964 static void
4965 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4966 stmt_vec_info stmt_info,
4967 slp_tree slp_node,
4968 slp_instance slp_node_instance)
4970 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4971 gcc_assert (reduc_info->is_reduc_info);
4972 /* For double reductions we need to get at the inner loop reduction
4973 stmt which has the meta info attached. Our stmt_info is that of the
4974 loop-closed PHI of the inner loop which we remember as
4975 def for the reduction PHI generation. */
4976 bool double_reduc = false;
4977 stmt_vec_info rdef_info = stmt_info;
4978 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4980 gcc_assert (!slp_node);
4981 double_reduc = true;
4982 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4983 (stmt_info->stmt, 0));
4984 stmt_info = vect_stmt_to_vectorize (stmt_info);
4986 gphi *reduc_def_stmt
4987 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4988 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4989 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4990 tree vectype;
4991 machine_mode mode;
4992 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4993 basic_block exit_bb;
4994 tree scalar_dest;
4995 tree scalar_type;
4996 gimple *new_phi = NULL, *phi;
4997 gimple_stmt_iterator exit_gsi;
4998 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4999 gimple *epilog_stmt = NULL;
5000 gimple *exit_phi;
5001 tree bitsize;
5002 tree def;
5003 tree orig_name, scalar_result;
5004 imm_use_iterator imm_iter, phi_imm_iter;
5005 use_operand_p use_p, phi_use_p;
5006 gimple *use_stmt;
5007 bool nested_in_vect_loop = false;
5008 auto_vec<gimple *> new_phis;
5009 int j, i;
5010 auto_vec<tree> scalar_results;
5011 unsigned int group_size = 1, k;
5012 auto_vec<gimple *> phis;
5013 bool slp_reduc = false;
5014 bool direct_slp_reduc;
5015 tree new_phi_result;
5016 tree induction_index = NULL_TREE;
5018 if (slp_node)
5019 group_size = SLP_TREE_LANES (slp_node);
5021 if (nested_in_vect_loop_p (loop, stmt_info))
5023 outer_loop = loop;
5024 loop = loop->inner;
5025 nested_in_vect_loop = true;
5026 gcc_assert (!slp_node);
5028 gcc_assert (!nested_in_vect_loop || double_reduc);
5030 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5031 gcc_assert (vectype);
5032 mode = TYPE_MODE (vectype);
5034 tree initial_def = NULL;
5035 tree induc_val = NULL_TREE;
5036 tree adjustment_def = NULL;
5037 if (slp_node)
5039 else
5041 /* Get at the scalar def before the loop, that defines the initial value
5042 of the reduction variable. */
5043 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5044 loop_preheader_edge (loop));
5045 /* Optimize: for induction condition reduction, if we can't use zero
5046 for induc_val, use initial_def. */
5047 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5048 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5049 else if (double_reduc)
5051 else if (nested_in_vect_loop)
5053 else
5054 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5057 unsigned vec_num;
5058 int ncopies;
5059 if (slp_node)
5061 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5062 ncopies = 1;
5064 else
5066 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5067 vec_num = 1;
5068 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5071 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5072 which is updated with the current index of the loop for every match of
5073 the original loop's cond_expr (VEC_STMT). This results in a vector
5074 containing the last time the condition passed for that vector lane.
5075 The first match will be a 1 to allow 0 to be used for non-matching
5076 indexes. If there are no matches at all then the vector will be all
5077 zeroes.
5079 PR92772: This algorithm is broken for architectures that support
5080 masked vectors, but do not provide fold_extract_last. */
5081 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5083 auto_vec<std::pair<tree, bool>, 2> ccompares;
5084 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5085 cond_info = vect_stmt_to_vectorize (cond_info);
5086 while (cond_info != reduc_info)
5088 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5090 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5091 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5092 ccompares.safe_push
5093 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5094 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5096 cond_info
5097 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5098 1 + STMT_VINFO_REDUC_IDX
5099 (cond_info)));
5100 cond_info = vect_stmt_to_vectorize (cond_info);
5102 gcc_assert (ccompares.length () != 0);
5104 tree indx_before_incr, indx_after_incr;
5105 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5106 int scalar_precision
5107 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5108 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5109 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5110 (TYPE_MODE (vectype), cr_index_scalar_type,
5111 TYPE_VECTOR_SUBPARTS (vectype));
5113 /* First we create a simple vector induction variable which starts
5114 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5115 vector size (STEP). */
5117 /* Create a {1,2,3,...} vector. */
5118 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5120 /* Create a vector of the step value. */
5121 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5122 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5124 /* Create an induction variable. */
5125 gimple_stmt_iterator incr_gsi;
5126 bool insert_after;
5127 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5128 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5129 insert_after, &indx_before_incr, &indx_after_incr);
5131 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5132 filled with zeros (VEC_ZERO). */
5134 /* Create a vector of 0s. */
5135 tree zero = build_zero_cst (cr_index_scalar_type);
5136 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5138 /* Create a vector phi node. */
5139 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5140 new_phi = create_phi_node (new_phi_tree, loop->header);
5141 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5142 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5144 /* Now take the condition from the loop's original cond_exprs
5145 and produce a new cond_expr (INDEX_COND_EXPR) which for
5146 every match uses values from the induction variable
5147 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5148 (NEW_PHI_TREE).
5149 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5150 the new cond_expr (INDEX_COND_EXPR). */
5151 gimple_seq stmts = NULL;
5152 for (int i = ccompares.length () - 1; i != -1; --i)
5154 tree ccompare = ccompares[i].first;
5155 if (ccompares[i].second)
5156 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5157 cr_index_vector_type,
5158 ccompare,
5159 indx_before_incr, new_phi_tree);
5160 else
5161 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5162 cr_index_vector_type,
5163 ccompare,
5164 new_phi_tree, indx_before_incr);
5166 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5168 /* Update the phi with the vec cond. */
5169 induction_index = new_phi_tree;
5170 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5171 loop_latch_edge (loop), UNKNOWN_LOCATION);
5174 /* 2. Create epilog code.
5175 The reduction epilog code operates across the elements of the vector
5176 of partial results computed by the vectorized loop.
5177 The reduction epilog code consists of:
5179 step 1: compute the scalar result in a vector (v_out2)
5180 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5181 step 3: adjust the scalar result (s_out3) if needed.
5183 Step 1 can be accomplished using one of the following three schemes:
5184 (scheme 1) using reduc_fn, if available.
5185 (scheme 2) using whole-vector shifts, if available.
5186 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5187 combined.
5189 The overall epilog code looks like this:
5191 s_out0 = phi <s_loop> # original EXIT_PHI
5192 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5193 v_out2 = reduce <v_out1> # step 1
5194 s_out3 = extract_field <v_out2, 0> # step 2
5195 s_out4 = adjust_result <s_out3> # step 3
5197 (step 3 is optional, and steps 1 and 2 may be combined).
5198 Lastly, the uses of s_out0 are replaced by s_out4. */
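/* A rough worked example for a PLUS reduction with 4 lanes: if the loop
   exits with partial sums v_out1 = {3,5,7,9}, step 1 reduces this to 24,
   step 2 extracts that scalar, and step 3 adds back any initial value
   that was kept out of the vectorized accumulation (ADJUSTMENT_DEF),
   e.g. 24 + 10 = 34 for an initial value of 10.  */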
5201 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5202 v_out1 = phi <VECT_DEF>
5203 Store them in NEW_PHIS. */
5204 if (double_reduc)
5205 loop = outer_loop;
5206 exit_bb = single_exit (loop)->dest;
5207 new_phis.create (slp_node ? vec_num : ncopies);
5208 for (unsigned i = 0; i < vec_num; i++)
5210 if (slp_node)
5211 def = vect_get_slp_vect_def (slp_node, i);
5212 else
5213 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5214 for (j = 0; j < ncopies; j++)
5216 tree new_def = copy_ssa_name (def);
5217 phi = create_phi_node (new_def, exit_bb);
5218 if (j == 0)
5219 new_phis.quick_push (phi);
5220 else
5222 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5223 new_phis.quick_push (phi);
5226 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5230 exit_gsi = gsi_after_labels (exit_bb);
5232 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5233 (i.e. when reduc_fn is not available) and in the final adjustment
5234 code (if needed). Also get the original scalar reduction variable as
5235 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5236 represents a reduction pattern), the tree-code and scalar-def are
5237 taken from the original stmt that the pattern-stmt (STMT) replaces.
5238 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5239 are taken from STMT. */
5241 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5242 if (orig_stmt_info != stmt_info)
5244 /* Reduction pattern */
5245 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5246 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5249 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5250 scalar_type = TREE_TYPE (scalar_dest);
5251 scalar_results.create (group_size);
5252 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5253 bitsize = TYPE_SIZE (scalar_type);
5255 /* SLP reduction without reduction chain, e.g.,
5256 # a1 = phi <a2, a0>
5257 # b1 = phi <b2, b0>
5258 a2 = operation (a1)
5259 b2 = operation (b1) */
5260 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5262 /* True if we should implement SLP_REDUC using native reduction operations
5263 instead of scalar operations. */
5264 direct_slp_reduc = (reduc_fn != IFN_LAST
5265 && slp_reduc
5266 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5268 /* In case of reduction chain, e.g.,
5269 # a1 = phi <a3, a0>
5270 a2 = operation (a1)
5271 a3 = operation (a2),
5273 we may end up with more than one vector result. Here we reduce them to
5274 one vector. */
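/* Roughly, with two partial result vectors v0 and v1 and a PLUS
   reduction, the block below converts both to VECTYPE and emits
   first = v0 + v1; FIRST then stands in for the single vector of
   partial results used by the rest of the epilogue.  */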
5275 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5277 gimple_seq stmts = NULL;
5278 tree first_vect = PHI_RESULT (new_phis[0]);
5279 first_vect = gimple_convert (&stmts, vectype, first_vect);
5280 for (k = 1; k < new_phis.length (); k++)
5282 gimple *next_phi = new_phis[k];
5283 tree second_vect = PHI_RESULT (next_phi);
5284 second_vect = gimple_convert (&stmts, vectype, second_vect);
5285 first_vect = gimple_build (&stmts, code, vectype,
5286 first_vect, second_vect);
5288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5290 new_phi_result = first_vect;
5291 new_phis.truncate (0);
5292 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5294 /* Likewise if we couldn't use a single def-use cycle. */
5295 else if (ncopies > 1)
5297 gimple_seq stmts = NULL;
5298 tree first_vect = PHI_RESULT (new_phis[0]);
5299 first_vect = gimple_convert (&stmts, vectype, first_vect);
5300 for (int k = 1; k < ncopies; ++k)
5302 tree second_vect = PHI_RESULT (new_phis[k]);
5303 second_vect = gimple_convert (&stmts, vectype, second_vect);
5304 first_vect = gimple_build (&stmts, code, vectype,
5305 first_vect, second_vect);
5307 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5308 new_phi_result = first_vect;
5309 new_phis.truncate (0);
5310 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5312 else
5313 new_phi_result = PHI_RESULT (new_phis[0]);
5315 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5316 && reduc_fn != IFN_LAST)
5318 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5319 various data values where the condition matched and another vector
5320 (INDUCTION_INDEX) containing all the indexes of those matches. We
5321 need to extract the last matching index (which will be the index with
5322 highest value) and use this to index into the data vector.
5323 For the case where there were no matches, the data vector will contain
5324 all default values and the index vector will be all zeros. */
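/* For example (hypothetical values): with INDUCTION_INDEX = {0,2,0,8}
   and data {d0,d1,d2,d3}, REDUC_MAX over the indexes gives 8, the
   comparison selects only lane 3, the VEC_COND yields {0,0,0,d3}, and
   the final (unsigned) REDUC_MAX extracts d3.  */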
5326 /* Get various versions of the type of the vector of indexes. */
5327 tree index_vec_type = TREE_TYPE (induction_index);
5328 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5329 tree index_scalar_type = TREE_TYPE (index_vec_type);
5330 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5332 /* Get an unsigned integer version of the type of the data vector. */
5333 int scalar_precision
5334 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5335 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5336 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5337 vectype);
5339 /* First we need to create a vector (ZERO_VEC) of zeros and another
5340 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5341 can create using a MAX reduction and then expanding.
5342 In the case where the loop never made any matches, the max index will
5343 be zero. */
5345 /* Vector of {0, 0, 0,...}. */
5346 tree zero_vec = build_zero_cst (vectype);
5348 gimple_seq stmts = NULL;
5349 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5350 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5352 /* Find maximum value from the vector of found indexes. */
5353 tree max_index = make_ssa_name (index_scalar_type);
5354 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5355 1, induction_index);
5356 gimple_call_set_lhs (max_index_stmt, max_index);
5357 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5359 /* Vector of {max_index, max_index, max_index,...}. */
5360 tree max_index_vec = make_ssa_name (index_vec_type);
5361 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5362 max_index);
5363 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5364 max_index_vec_rhs);
5365 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5367 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5368 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5369 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5370 otherwise. Only one value should match, resulting in a vector
5371 (VEC_COND) with one data value and the rest zeros.
5372 In the case where the loop never made any matches, every index will
5373 match, resulting in a vector with all data values (which will all be
5374 the default value). */
5376 /* Compare the max index vector to the vector of found indexes to find
5377 the position of the max value. */
5378 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5379 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5380 induction_index,
5381 max_index_vec);
5382 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5384 /* Use the compare to choose either values from the data vector or
5385 zero. */
5386 tree vec_cond = make_ssa_name (vectype);
5387 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5388 vec_compare, new_phi_result,
5389 zero_vec);
5390 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5392 /* Finally we need to extract the data value from the vector (VEC_COND)
5393 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5394 reduction, but because this doesn't exist we can use a MAX reduction
5395 instead: the unselected lanes are all zero, so viewed as unsigned the
5396 maximum is the selected value. The data might be signed or a float, so we need to cast it first.
5397 In the case where the loop never made any matches, the data values are
5398 all identical, and so will reduce down correctly. */
5400 /* Make the matched data values unsigned. */
5401 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5402 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5403 vec_cond);
5404 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5405 VIEW_CONVERT_EXPR,
5406 vec_cond_cast_rhs);
5407 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5409 /* Reduce down to a scalar value. */
5410 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5411 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5412 1, vec_cond_cast);
5413 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5414 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5416 /* Convert the reduced value back to the result type and set as the
5417 result. */
5418 stmts = NULL;
5419 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5420 data_reduc);
5421 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5422 scalar_results.safe_push (new_temp);
5424 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5425 && reduc_fn == IFN_LAST)
5427 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5428 idx = 0;
5429 idx_val = induction_index[0];
5430 val = data_reduc[0];
5431 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5432 if (induction_index[i] > idx_val)
5433 val = data_reduc[i], idx_val = induction_index[i];
5434 return val; */
5436 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5437 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5438 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5439 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5440 /* Enforced by vectorizable_reduction, which ensures we have target
5441 support before allowing a conditional reduction on variable-length
5442 vectors. */
5443 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5444 tree idx_val = NULL_TREE, val = NULL_TREE;
5445 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5447 tree old_idx_val = idx_val;
5448 tree old_val = val;
5449 idx_val = make_ssa_name (idx_eltype);
5450 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5451 build3 (BIT_FIELD_REF, idx_eltype,
5452 induction_index,
5453 bitsize_int (el_size),
5454 bitsize_int (off)));
5455 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5456 val = make_ssa_name (data_eltype);
5457 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5458 build3 (BIT_FIELD_REF,
5459 data_eltype,
5460 new_phi_result,
5461 bitsize_int (el_size),
5462 bitsize_int (off)));
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464 if (off != 0)
5466 tree new_idx_val = idx_val;
5467 if (off != v_size - el_size)
5469 new_idx_val = make_ssa_name (idx_eltype);
5470 epilog_stmt = gimple_build_assign (new_idx_val,
5471 MAX_EXPR, idx_val,
5472 old_idx_val);
5473 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5475 tree new_val = make_ssa_name (data_eltype);
5476 epilog_stmt = gimple_build_assign (new_val,
5477 COND_EXPR,
5478 build2 (GT_EXPR,
5479 boolean_type_node,
5480 idx_val,
5481 old_idx_val),
5482 val, old_val);
5483 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5484 idx_val = new_idx_val;
5485 val = new_val;
5488 /* Convert the reduced value back to the result type and set as the
5489 result. */
5490 gimple_seq stmts = NULL;
5491 val = gimple_convert (&stmts, scalar_type, val);
5492 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5493 scalar_results.safe_push (val);
5496 /* 2.3 Create the reduction code, using one of the three schemes described
5497 above. In SLP we simply need to extract all the elements from the
5498 vector (without reducing them), so we use scalar shifts. */
5499 else if (reduc_fn != IFN_LAST && !slp_reduc)
5501 tree tmp;
5502 tree vec_elem_type;
5504 /* Case 1: Create:
5505 v_out2 = reduc_expr <v_out1> */
5507 if (dump_enabled_p ())
5508 dump_printf_loc (MSG_NOTE, vect_location,
5509 "Reduce using direct vector reduction.\n");
5511 gimple_seq stmts = NULL;
5512 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5513 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5514 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5515 vec_elem_type, new_phi_result);
5516 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5517 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5519 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5520 && induc_val)
5522 /* Earlier we set the initial value to be a vector of induc_val
5523 values. Check the result and if it is induc_val then replace
5524 it with the original initial value, unless induc_val is
5525 the same as initial_def already. */
5526 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5527 induc_val);
5529 tmp = make_ssa_name (new_scalar_dest);
5530 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5531 initial_def, new_temp);
5532 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5533 new_temp = tmp;
5536 scalar_results.safe_push (new_temp);
5538 else if (direct_slp_reduc)
5540 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5541 with the elements for other SLP statements replaced with the
5542 neutral value. We can then do a normal reduction on each vector. */
5544 /* Enforced by vectorizable_reduction. */
5545 gcc_assert (new_phis.length () == 1);
5546 gcc_assert (pow2p_hwi (group_size));
5548 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5549 vec<stmt_vec_info> orig_phis
5550 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5551 gimple_seq seq = NULL;
5553 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5554 and the same element size as VECTYPE. */
5555 tree index = build_index_vector (vectype, 0, 1);
5556 tree index_type = TREE_TYPE (index);
5557 tree index_elt_type = TREE_TYPE (index_type);
5558 tree mask_type = truth_type_for (index_type);
5560 /* Create a vector that, for each element, identifies which of
5561 the REDUC_GROUP_SIZE results should use it. */
5562 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5563 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5564 build_vector_from_val (index_type, index_mask));
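/* For example, with a group size of 2 and an 8-element vector this
   yields {0,1,0,1,0,1,0,1}: even elements belong to the first SLP
   statement and odd elements to the second.  */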
5566 /* Get a neutral vector value. This is simply a splat of the neutral
5567 scalar value if we have one, otherwise the initial scalar value
5568 is itself a neutral value. */
5569 tree vector_identity = NULL_TREE;
5570 tree neutral_op = NULL_TREE;
5571 if (slp_node)
5573 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5574 neutral_op
5575 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5576 vectype, code, first != NULL);
5578 if (neutral_op)
5579 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5580 neutral_op);
5581 for (unsigned int i = 0; i < group_size; ++i)
5583 /* If there's no universal neutral value, we can use the
5584 initial scalar value from the original PHI. This is used
5585 for MIN and MAX reduction, for example. */
5586 if (!neutral_op)
5588 tree scalar_value
5589 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5590 loop_preheader_edge (loop));
5591 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5592 scalar_value);
5593 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5594 scalar_value);
5597 /* Calculate the equivalent of:
5599 sel[j] = (index[j] == i);
5601 which selects the elements of NEW_PHI_RESULT that should
5602 be included in the result. */
5603 tree compare_val = build_int_cst (index_elt_type, i);
5604 compare_val = build_vector_from_val (index_type, compare_val);
5605 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5606 index, compare_val);
5608 /* Calculate the equivalent of:
5610 vec = sel ? new_phi_result : vector_identity;
5612 VEC is now suitable for a full vector reduction. */
5613 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5614 sel, new_phi_result, vector_identity);
5616 /* Do the reduction and convert it to the appropriate type. */
5617 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5618 TREE_TYPE (vectype), vec);
5619 scalar = gimple_convert (&seq, scalar_type, scalar);
5620 scalar_results.safe_push (scalar);
5622 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5624 else
5626 bool reduce_with_shift;
5627 tree vec_temp;
5629 gcc_assert (slp_reduc || new_phis.length () == 1);
5631 /* See if the target wants to do the final (shift) reduction
5632 in a vector mode of smaller size and first reduce upper/lower
5633 halves against each other. */
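/* For example, a target may prefer to first extract the two halves of
   an 8-element vector, combine them with CODE, and only then do the
   final shift or scalar reduction on 4 elements.  */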
5634 enum machine_mode mode1 = mode;
5635 tree stype = TREE_TYPE (vectype);
5636 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5637 unsigned nunits1 = nunits;
5638 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5639 && new_phis.length () == 1)
5641 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5642 /* For SLP reductions we have to make sure lanes match up, but
5643 since we're doing an individual-element final reduction, reducing
5644 the vector width here is even more important.
5645 ??? We can also separate lanes with permutes, for the common
5646 case of power-of-two group-size odd/even extracts would work. */
5647 if (slp_reduc && nunits != nunits1)
5649 nunits1 = least_common_multiple (nunits1, group_size);
5650 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5653 if (!slp_reduc
5654 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5655 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5657 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5658 stype, nunits1);
5659 reduce_with_shift = have_whole_vector_shift (mode1);
5660 if (!VECTOR_MODE_P (mode1))
5661 reduce_with_shift = false;
5662 else
5664 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5665 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5666 reduce_with_shift = false;
5669 /* First reduce the vector to the desired vector size we should
5670 do shift reduction on by combining upper and lower halves. */
5671 new_temp = new_phi_result;
5672 while (nunits > nunits1)
5674 nunits /= 2;
5675 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5676 stype, nunits);
5677 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5679 /* The target has to make sure we support lowpart/highpart
5680 extraction, either via direct vector extract or through
5681 integer mode punning.
5682 tree dst1, dst2;
5683 if (convert_optab_handler (vec_extract_optab,
5684 TYPE_MODE (TREE_TYPE (new_temp)),
5685 TYPE_MODE (vectype1))
5686 != CODE_FOR_nothing)
5688 /* Extract sub-vectors directly once vec_extract becomes
5689 a conversion optab. */
5690 dst1 = make_ssa_name (vectype1);
5691 epilog_stmt
5692 = gimple_build_assign (dst1, BIT_FIELD_REF,
5693 build3 (BIT_FIELD_REF, vectype1,
5694 new_temp, TYPE_SIZE (vectype1),
5695 bitsize_int (0)));
5696 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5697 dst2 = make_ssa_name (vectype1);
5698 epilog_stmt
5699 = gimple_build_assign (dst2, BIT_FIELD_REF,
5700 build3 (BIT_FIELD_REF, vectype1,
5701 new_temp, TYPE_SIZE (vectype1),
5702 bitsize_int (bitsize)));
5703 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5705 else
5707 /* Extract via punning to an appropriately sized integer mode
5708 vector. */
5709 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5710 tree etype = build_vector_type (eltype, 2);
5711 gcc_assert (convert_optab_handler (vec_extract_optab,
5712 TYPE_MODE (etype),
5713 TYPE_MODE (eltype))
5714 != CODE_FOR_nothing);
5715 tree tem = make_ssa_name (etype);
5716 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5717 build1 (VIEW_CONVERT_EXPR,
5718 etype, new_temp));
5719 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5720 new_temp = tem;
5721 tem = make_ssa_name (eltype);
5722 epilog_stmt
5723 = gimple_build_assign (tem, BIT_FIELD_REF,
5724 build3 (BIT_FIELD_REF, eltype,
5725 new_temp, TYPE_SIZE (eltype),
5726 bitsize_int (0)));
5727 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5728 dst1 = make_ssa_name (vectype1);
5729 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5730 build1 (VIEW_CONVERT_EXPR,
5731 vectype1, tem));
5732 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5733 tem = make_ssa_name (eltype);
5734 epilog_stmt
5735 = gimple_build_assign (tem, BIT_FIELD_REF,
5736 build3 (BIT_FIELD_REF, eltype,
5737 new_temp, TYPE_SIZE (eltype),
5738 bitsize_int (bitsize)));
5739 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5740 dst2 = make_ssa_name (vectype1);
5741 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5742 build1 (VIEW_CONVERT_EXPR,
5743 vectype1, tem));
5744 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5747 new_temp = make_ssa_name (vectype1);
5748 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5749 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5750 new_phis[0] = epilog_stmt;
5753 if (reduce_with_shift && !slp_reduc)
5755 int element_bitsize = tree_to_uhwi (bitsize);
5756 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5757 for variable-length vectors and also requires direct target support
5758 for loop reductions. */
5759 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5760 int nelements = vec_size_in_bits / element_bitsize;
5761 vec_perm_builder sel;
5762 vec_perm_indices indices;
5764 int elt_offset;
5766 tree zero_vec = build_zero_cst (vectype1);
5767 /* Case 2: Create:
5768 for (offset = nelements/2; offset >= 1; offset/=2)
5770 Create: va' = vec_shift <va, offset>
5771 Create: va = vop <va, va'>
5772 } */
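/* For instance, reducing {1,2,3,4} with PLUS: shifting by 2 gives
   {3,4,0,0} and adding yields {4,6,3,4}; shifting by 1 gives {6,3,4,0}
   and adding yields {10,9,7,4}, so the full sum (10) ends up in the
   element extracted below.  */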
5774 tree rhs;
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE, vect_location,
5778 "Reduce using vector shifts\n");
5780 gimple_seq stmts = NULL;
5781 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5782 for (elt_offset = nelements / 2;
5783 elt_offset >= 1;
5784 elt_offset /= 2)
5786 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5787 indices.new_vector (sel, 2, nelements);
5788 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5789 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5790 new_temp, zero_vec, mask);
5791 new_temp = gimple_build (&stmts, code,
5792 vectype1, new_name, new_temp);
5794 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5796 /* 2.4 Extract the final scalar result. Create:
5797 s_out3 = extract_field <v_out2, bitpos> */
5799 if (dump_enabled_p ())
5800 dump_printf_loc (MSG_NOTE, vect_location,
5801 "extract scalar result\n");
5803 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5804 bitsize, bitsize_zero_node);
5805 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5806 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5807 gimple_assign_set_lhs (epilog_stmt, new_temp);
5808 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5809 scalar_results.safe_push (new_temp);
5811 else
5813 /* Case 3: Create:
5814 s = extract_field <v_out2, 0>
5815 for (offset = element_size;
5816 offset < vector_size;
5817 offset += element_size;)
5819 Create: s' = extract_field <v_out2, offset>
5820 Create: s = op <s, s'> // For non SLP cases
5821 } */
5823 if (dump_enabled_p ())
5824 dump_printf_loc (MSG_NOTE, vect_location,
5825 "Reduce using scalar code.\n");
5827 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5828 int element_bitsize = tree_to_uhwi (bitsize);
5829 tree compute_type = TREE_TYPE (vectype);
5830 gimple_seq stmts = NULL;
5831 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5833 int bit_offset;
5834 if (gimple_code (new_phi) == GIMPLE_PHI)
5835 vec_temp = PHI_RESULT (new_phi);
5836 else
5837 vec_temp = gimple_assign_lhs (new_phi);
5838 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5839 vec_temp, bitsize, bitsize_zero_node);
5841 /* In SLP we don't need to apply the reduction operation, so we just
5842 collect s' values in SCALAR_RESULTS. */
5843 if (slp_reduc)
5844 scalar_results.safe_push (new_temp);
5846 for (bit_offset = element_bitsize;
5847 bit_offset < vec_size_in_bits;
5848 bit_offset += element_bitsize)
5850 tree bitpos = bitsize_int (bit_offset);
5851 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5852 compute_type, vec_temp,
5853 bitsize, bitpos);
5854 if (slp_reduc)
5856 /* In SLP we don't need to apply the reduction operation, so
5857 we just collect s' values in SCALAR_RESULTS. */
5858 new_temp = new_name;
5859 scalar_results.safe_push (new_name);
5861 else
5862 new_temp = gimple_build (&stmts, code, compute_type,
5863 new_name, new_temp);
5867 /* The only case where we need to reduce scalar results in SLP is
5868 unrolling. If the size of SCALAR_RESULTS is greater than
5869 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5870 REDUC_GROUP_SIZE. */
5871 if (slp_reduc)
5873 tree res, first_res, new_res;
5875 /* Reduce multiple scalar results in case of SLP unrolling. */
5876 for (j = group_size; scalar_results.iterate (j, &res);
5877 j++)
5879 first_res = scalar_results[j % group_size];
5880 new_res = gimple_build (&stmts, code, compute_type,
5881 first_res, res);
5882 scalar_results[j % group_size] = new_res;
5884 for (k = 0; k < group_size; k++)
5885 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5886 scalar_results[k]);
5888 else
5890 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5891 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5892 scalar_results.safe_push (new_temp);
5895 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5898 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5899 && induc_val)
5901 /* Earlier we set the initial value to be a vector of induc_val
5902 values. Check the result and if it is induc_val then replace
5903 it with the original initial value, unless induc_val is
5904 the same as initial_def already. */
5905 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5906 induc_val);
5908 tree tmp = make_ssa_name (new_scalar_dest);
5909 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5910 initial_def, new_temp);
5911 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5912 scalar_results[0] = tmp;
5916 /* 2.5 Adjust the final result by the initial value of the reduction
5917 variable. (When such adjustment is not needed, then
5918 'adjustment_def' is zero). For example, if code is PLUS we create:
5919 new_temp = loop_exit_def + adjustment_def */
5921 if (adjustment_def)
5923 gcc_assert (!slp_reduc);
5924 gimple_seq stmts = NULL;
5925 if (nested_in_vect_loop)
5927 new_phi = new_phis[0];
5928 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5929 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5930 new_temp = gimple_build (&stmts, code, vectype,
5931 PHI_RESULT (new_phi), adjustment_def);
5933 else
5935 new_temp = scalar_results[0];
5936 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5937 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5938 new_temp = gimple_build (&stmts, code, scalar_type,
5939 new_temp, adjustment_def);
5942 epilog_stmt = gimple_seq_last_stmt (stmts);
5943 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5944 if (nested_in_vect_loop)
5946 if (!double_reduc)
5947 scalar_results.quick_push (new_temp);
5948 else
5949 scalar_results[0] = new_temp;
5951 else
5952 scalar_results[0] = new_temp;
5954 new_phis[0] = epilog_stmt;
5957 if (double_reduc)
5958 loop = loop->inner;
5960 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5961 phis with new adjusted scalar results, i.e., replace use <s_out0>
5962 with use <s_out4>.
5964 Transform:
5965 loop_exit:
5966 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5967 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5968 v_out2 = reduce <v_out1>
5969 s_out3 = extract_field <v_out2, 0>
5970 s_out4 = adjust_result <s_out3>
5971 use <s_out0>
5972 use <s_out0>
5974 into:
5976 loop_exit:
5977 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5978 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5979 v_out2 = reduce <v_out1>
5980 s_out3 = extract_field <v_out2, 0>
5981 s_out4 = adjust_result <s_out3>
5982 use <s_out4>
5983 use <s_out4> */
5986 /* In an SLP reduction chain we reduce the vector results into one vector
5987 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5988 LHS of the last stmt in the reduction chain, since we are looking for
5989 the loop exit phi node. */
5990 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5992 stmt_vec_info dest_stmt_info
5993 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5994 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5995 group_size = 1;
5998 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5999 case REDUC_GROUP_SIZE is greater than the vectorization factor).
6000 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
6001 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
6002 correspond to the first vector stmt, etc.
6003 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
6004 if (group_size > new_phis.length ())
6005 gcc_assert (!(group_size % new_phis.length ()));
6007 for (k = 0; k < group_size; k++)
6009 if (slp_reduc)
6011 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6013 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
6014 /* SLP statements can't participate in patterns. */
6015 gcc_assert (!orig_stmt_info);
6016 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6019 if (nested_in_vect_loop)
6021 if (double_reduc)
6022 loop = outer_loop;
6023 else
6024 gcc_unreachable ();
6027 phis.create (3);
6028 /* Find the loop-closed-use at the loop exit of the original scalar
6029 result. (The reduction result is expected to have two immediate uses,
6030 one at the latch block, and one at the loop exit). For double
6031 reductions we are looking for exit phis of the outer loop. */
6032 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6034 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6036 if (!is_gimple_debug (USE_STMT (use_p)))
6037 phis.safe_push (USE_STMT (use_p));
6039 else
6041 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6043 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6045 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6047 if (!flow_bb_inside_loop_p (loop,
6048 gimple_bb (USE_STMT (phi_use_p)))
6049 && !is_gimple_debug (USE_STMT (phi_use_p)))
6050 phis.safe_push (USE_STMT (phi_use_p));
6056 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6058 /* Replace the uses: */
6059 orig_name = PHI_RESULT (exit_phi);
6060 scalar_result = scalar_results[k];
6061 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6063 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6064 SET_USE (use_p, scalar_result);
6065 update_stmt (use_stmt);
6069 phis.release ();
6073 /* Return a vector of type VECTYPE that is equal to the vector select
6074 operation "MASK ? VEC : IDENTITY". Insert the select statements
6075 before GSI. */
6077 static tree
6078 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6079 tree vec, tree identity)
6081 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6082 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6083 mask, vec, identity);
6084 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6085 return cond;
6088 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6089 order, starting with LHS. Insert the extraction statements before GSI and
6090 associate the new scalar SSA names with variable SCALAR_DEST.
6091 Return the SSA name for the result. */
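/* For example, for a 4-element VECTOR_RHS and PLUS_EXPR this emits
   lhs = (((lhs + v[0]) + v[1]) + v[2]) + v[3], extracting each element
   with a BIT_FIELD_REF.  */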
6093 static tree
6094 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6095 tree_code code, tree lhs, tree vector_rhs)
6097 tree vectype = TREE_TYPE (vector_rhs);
6098 tree scalar_type = TREE_TYPE (vectype);
6099 tree bitsize = TYPE_SIZE (scalar_type);
6100 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6101 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6103 for (unsigned HOST_WIDE_INT bit_offset = 0;
6104 bit_offset < vec_size_in_bits;
6105 bit_offset += element_bitsize)
6107 tree bitpos = bitsize_int (bit_offset);
6108 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6109 bitsize, bitpos);
6111 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6112 rhs = make_ssa_name (scalar_dest, stmt);
6113 gimple_assign_set_lhs (stmt, rhs);
6114 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6116 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6117 tree new_name = make_ssa_name (scalar_dest, stmt);
6118 gimple_assign_set_lhs (stmt, new_name);
6119 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6120 lhs = new_name;
6122 return lhs;
6125 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6126 type of the vector input. */
6128 static internal_fn
6129 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6131 internal_fn mask_reduc_fn;
6133 switch (reduc_fn)
6135 case IFN_FOLD_LEFT_PLUS:
6136 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6137 break;
6139 default:
6140 return IFN_LAST;
6143 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6144 OPTIMIZE_FOR_SPEED))
6145 return mask_reduc_fn;
6146 return IFN_LAST;
6149 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6150 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6151 statement. CODE is the operation performed by STMT_INFO and OPS are
6152 its scalar operands. REDUC_INDEX is the index of the operand in
6153 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6154 implements in-order reduction, or IFN_LAST if we should open-code it.
6155 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6156 that should be used to control the operation in a fully-masked loop. */
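/* For instance, with IFN_FOLD_LEFT_PLUS in a fully-masked loop this
   emits one call per vector def of the form
   reduc_var = .MASK_FOLD_LEFT_PLUS (reduc_var, vec_def, mask);
   which accumulates the active lanes strictly in the original scalar
   order.  */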
6158 static bool
6159 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6160 stmt_vec_info stmt_info,
6161 gimple_stmt_iterator *gsi,
6162 gimple **vec_stmt, slp_tree slp_node,
6163 gimple *reduc_def_stmt,
6164 tree_code code, internal_fn reduc_fn,
6165 tree ops[3], tree vectype_in,
6166 int reduc_index, vec_loop_masks *masks)
6168 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6169 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6170 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6172 int ncopies;
6173 if (slp_node)
6174 ncopies = 1;
6175 else
6176 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6178 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6179 gcc_assert (ncopies == 1);
6180 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6182 if (slp_node)
6183 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6184 TYPE_VECTOR_SUBPARTS (vectype_in)));
6186 tree op0 = ops[1 - reduc_index];
6188 int group_size = 1;
6189 stmt_vec_info scalar_dest_def_info;
6190 auto_vec<tree> vec_oprnds0;
6191 if (slp_node)
6193 auto_vec<vec<tree> > vec_defs (2);
6194 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6195 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6196 vec_defs[0].release ();
6197 vec_defs[1].release ();
6198 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6199 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6201 else
6203 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6204 op0, &vec_oprnds0);
6205 scalar_dest_def_info = stmt_info;
6208 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6209 tree scalar_type = TREE_TYPE (scalar_dest);
6210 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6212 int vec_num = vec_oprnds0.length ();
6213 gcc_assert (vec_num == 1 || slp_node);
6214 tree vec_elem_type = TREE_TYPE (vectype_out);
6215 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6217 tree vector_identity = NULL_TREE;
6218 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6219 vector_identity = build_zero_cst (vectype_out);
6221 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6222 int i;
6223 tree def0;
6224 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6226 gimple *new_stmt;
6227 tree mask = NULL_TREE;
6228 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6229 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6231 /* Handle MINUS by adding the negative. */
6232 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6234 tree negated = make_ssa_name (vectype_out);
6235 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6236 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6237 def0 = negated;
6240 if (mask && mask_reduc_fn == IFN_LAST)
6241 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6242 vector_identity);
6244 /* On the first iteration the input is simply the scalar phi
6245 result, and for subsequent iterations it is the output of
6246 the preceding operation. */
6247 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6249 if (mask && mask_reduc_fn != IFN_LAST)
6250 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6251 def0, mask);
6252 else
6253 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6254 def0);
6255 /* For chained SLP reductions the output of the previous reduction
6256 operation serves as the input of the next. For the final statement
6257 the output cannot be a temporary - we reuse the original
6258 scalar destination of the last statement. */
6259 if (i != vec_num - 1)
6261 gimple_set_lhs (new_stmt, scalar_dest_var);
6262 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6263 gimple_set_lhs (new_stmt, reduc_var);
6266 else
6268 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6269 reduc_var, def0);
6270 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6271 /* Remove the statement, so that we can use the same code paths
6272 as for statements that we've just created. */
6273 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6274 gsi_remove (&tmp_gsi, true);
6277 if (i == vec_num - 1)
6279 gimple_set_lhs (new_stmt, scalar_dest);
6280 vect_finish_replace_stmt (loop_vinfo,
6281 scalar_dest_def_info,
6282 new_stmt);
6284 else
6285 vect_finish_stmt_generation (loop_vinfo,
6286 scalar_dest_def_info,
6287 new_stmt, gsi);
6289 if (slp_node)
6290 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6291 else
6293 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6294 *vec_stmt = new_stmt;
6298 return true;
6301 /* Function is_nonwrapping_integer_induction.
6303 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6304 does not cause overflow. */
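/* In other words: require BASE and STEP to be INTEGER_CSTs and check
   that BASE + STEP * the maximum iteration count still fits in the
   precision of the induction's type (assumed to hold when overflow is
   undefined for that type).  */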
6306 static bool
6307 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6309 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6310 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6311 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6312 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6313 widest_int ni, max_loop_value, lhs_max;
6314 wi::overflow_type overflow = wi::OVF_NONE;
6316 /* Make sure the loop is integer based. */
6317 if (TREE_CODE (base) != INTEGER_CST
6318 || TREE_CODE (step) != INTEGER_CST)
6319 return false;
6321 /* Check that the max size of the loop will not wrap. */
6323 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6324 return true;
6326 if (! max_stmt_executions (loop, &ni))
6327 return false;
6329 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6330 &overflow);
6331 if (overflow)
6332 return false;
6334 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6335 TYPE_SIGN (lhs_type), &overflow);
6336 if (overflow)
6337 return false;
6339 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6340 <= TYPE_PRECISION (lhs_type));
6343 /* Check if masking can be supported by inserting a conditional expression.
6344 CODE is the code for the operation. COND_FN is the conditional internal
6345 function, if it exists. VECTYPE_IN is the type of the vector input. */
6346 static bool
6347 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6348 tree vectype_in)
6350 if (cond_fn != IFN_LAST
6351 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6352 OPTIMIZE_FOR_SPEED))
6353 return false;
6355 switch (code)
6357 case DOT_PROD_EXPR:
6358 case SAD_EXPR:
6359 return true;
6361 default:
6362 return false;
6366 /* Insert a conditional expression to enable masked vectorization. CODE is the
6367 code for the operation. VOP is the array of operands. MASK is the loop
6368 mask. GSI is a statement iterator used to place the new conditional
6369 expression. */
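/* For example, for DOT_PROD_EXPR the second operand becomes
   mask ? op1 : 0, so inactive lanes add 0 to the accumulator; for
   SAD_EXPR it becomes mask ? op1 : op0, so inactive lanes contribute
   |op0 - op0| = 0.  */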
6370 static void
6371 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6372 gimple_stmt_iterator *gsi)
6374 switch (code)
6376 case DOT_PROD_EXPR:
6378 tree vectype = TREE_TYPE (vop[1]);
6379 tree zero = build_zero_cst (vectype);
6380 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6381 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6382 mask, vop[1], zero);
6383 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6384 vop[1] = masked_op1;
6385 break;
6388 case SAD_EXPR:
6390 tree vectype = TREE_TYPE (vop[1]);
6391 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6392 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6393 mask, vop[1], vop[0]);
6394 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6395 vop[1] = masked_op1;
6396 break;
6399 default:
6400 gcc_unreachable ();
6404 /* Function vectorizable_reduction.
6406 Check if STMT_INFO performs a reduction operation that can be vectorized.
6407 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6408 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6409 Return true if STMT_INFO is vectorizable in this way.
6411 This function also handles reduction idioms (patterns) that have been
6412 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6413 may be of this form:
6414 X = pattern_expr (arg0, arg1, ..., X)
6415 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6416 sequence that had been detected and replaced by the pattern-stmt
6417 (STMT_INFO).
6419 This function also handles reduction of condition expressions, for example:
6420 for (int i = 0; i < N; i++)
6421 if (a[i] < value)
6422 last = a[i];
6423 This is handled by vectorizing the loop and creating an additional vector
6424 containing the loop indexes for which "a[i] < value" was true. In the
6425 function epilogue this is reduced to a single max value and then used to
6426 index into the vector of results.
6428 In some cases of reduction patterns, the type of the reduction variable X is
6429 different than the type of the other arguments of STMT_INFO.
6430 In such cases, the vectype that is used when transforming STMT_INFO into
6431 a vector stmt is different than the vectype that is used to determine the
6432 vectorization factor, because it consists of a different number of elements
6433 than the actual number of elements that are being operated upon in parallel.
6435 For example, consider an accumulation of shorts into an int accumulator.
6436 On some targets it's possible to vectorize this pattern operating on 8
6437 shorts at a time (hence, the vectype for purposes of determining the
6438 vectorization factor should be V8HI); on the other hand, the vectype that
6439 is used to create the vector form is actually V4SI (the type of the result).
6441 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6442 indicates what is the actual level of parallelism (V8HI in the example), so
6443 that the right vectorization factor would be derived. This vectype
6444 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6445 be used to create the vectorized stmt. The right vectype for the vectorized
6446 stmt is obtained from the type of the result X:
6447 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6449 This means that, contrary to "regular" reductions (or "regular" stmts in
6450 general), the following equation:
6451 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6452 does *NOT* necessarily hold for reduction patterns. */
6454 bool
6455 vectorizable_reduction (loop_vec_info loop_vinfo,
6456 stmt_vec_info stmt_info, slp_tree slp_node,
6457 slp_instance slp_node_instance,
6458 stmt_vector_for_cost *cost_vec)
6460 tree scalar_dest;
6461 tree vectype_in = NULL_TREE;
6462 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6463 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6464 stmt_vec_info cond_stmt_vinfo = NULL;
6465 tree scalar_type;
6466 int i;
6467 int ncopies;
6468 bool single_defuse_cycle = false;
6469 bool nested_cycle = false;
6470 bool double_reduc = false;
6471 int vec_num;
6472 tree tem;
6473 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6474 tree cond_reduc_val = NULL_TREE;
6476 /* Make sure it was already recognized as a reduction computation. */
6477 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6478 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6479 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6480 return false;
6482 /* The stmt we store reduction analysis meta on. */
6483 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6484 reduc_info->is_reduc_info = true;
6486 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6488 if (is_a <gphi *> (stmt_info->stmt))
6490 if (slp_node)
6492 /* We eventually need to set a vector type on invariant
6493 arguments. */
6494 unsigned j;
6495 slp_tree child;
6496 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6497 if (!vect_maybe_update_slp_op_vectype
6498 (child, SLP_TREE_VECTYPE (slp_node)))
6500 if (dump_enabled_p ())
6501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502 "incompatible vector types for "
6503 "invariants\n");
6504 return false;
6507 /* Analysis for double-reduction is done on the outer
6508 loop PHI, nested cycles have no further restrictions. */
6509 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6511 else
6512 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6513 return true;
6516 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6517 stmt_vec_info phi_info = stmt_info;
6518 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6519 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6521 if (!is_a <gphi *> (stmt_info->stmt))
6523 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6524 return true;
6526 if (slp_node)
6528 slp_node_instance->reduc_phis = slp_node;
6529 /* ??? We're leaving slp_node to point to the PHIs; we only
6530 need it to get at the number of vector stmts which wasn't
6531 yet initialized for the instance root. */
6533 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6534 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6535 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6537 use_operand_p use_p;
6538 gimple *use_stmt;
6539 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6540 &use_p, &use_stmt);
6541 gcc_assert (res);
6542 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6543 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6547 /* PHIs should not participate in patterns. */
6548 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6549 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6551 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6552 and compute the reduction chain length. Discover the real
6553 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6554 tree reduc_def
6555 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6556 loop_latch_edge
6557 (gimple_bb (reduc_def_phi)->loop_father));
6558 unsigned reduc_chain_length = 0;
6559 bool only_slp_reduc_chain = true;
6560 stmt_info = NULL;
6561 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6562 while (reduc_def != PHI_RESULT (reduc_def_phi))
6564 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6565 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6566 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6568 if (dump_enabled_p ())
6569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6570 "reduction chain broken by patterns.\n");
6571 return false;
6573 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6574 only_slp_reduc_chain = false;
6575 /* ??? For epilogue generation live members of the chain need
6576 to point back to the PHI via their original stmt for
6577 info_for_reduction to work. */
6578 if (STMT_VINFO_LIVE_P (vdef))
6579 STMT_VINFO_REDUC_DEF (def) = phi_info;
6580 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6581 if (!assign)
6583 if (dump_enabled_p ())
6584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6585 "reduction chain includes calls.\n");
6586 return false;
6588 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6590 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6591 TREE_TYPE (gimple_assign_rhs1 (assign))))
6593 if (dump_enabled_p ())
6594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6595 "conversion in the reduction chain.\n");
6596 return false;
6599 else if (!stmt_info)
6600 /* First non-conversion stmt. */
6601 stmt_info = vdef;
6602 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6603 reduc_chain_length++;
6604 if (!stmt_info && slp_node)
6605 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6607 /* PHIs should not participate in patterns. */
6608 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6610 if (nested_in_vect_loop_p (loop, stmt_info))
6612 loop = loop->inner;
6613 nested_cycle = true;
6616 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6617 element. */
6618 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6620 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6621 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6623 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6624 gcc_assert (slp_node
6625 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6627 /* 1. Is vectorizable reduction? */
6628 /* Not supportable if the reduction variable is used in the loop, unless
6629 it's a reduction chain. */
6630 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6631 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6632 return false;
6634 /* Reductions that are not used even in an enclosing outer-loop
6635 are expected to be "live" (used out of the loop). */
6636 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6637 && !STMT_VINFO_LIVE_P (stmt_info))
6638 return false;
6640 /* 2. Has this been recognized as a reduction pattern?
6642 Check if STMT represents a pattern that has been recognized
6643 in earlier analysis stages. For stmts that represent a pattern,
6644 the STMT_VINFO_RELATED_STMT field records the last stmt in
6645 the original sequence that constitutes the pattern. */
6647 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6648 if (orig_stmt_info)
6650 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6651 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6654 /* 3. Check the operands of the operation. The first operands are defined
6655 inside the loop body. The last operand is the reduction variable,
6656 which is defined by the loop-header-phi. */
6658 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6659 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6660 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6661 enum tree_code code = gimple_assign_rhs_code (stmt);
6662 bool lane_reduc_code_p
6663 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6664 int op_type = TREE_CODE_LENGTH (code);
6666 scalar_dest = gimple_assign_lhs (stmt);
6667 scalar_type = TREE_TYPE (scalar_dest);
6668 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6669 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6670 return false;
6672 /* Do not try to vectorize bit-precision reductions. */
6673 if (!type_has_mode_precision_p (scalar_type))
6674 return false;
6676 /* For lane-reducing ops we're reducing the number of reduction PHIs
6677 which means the only use of that may be in the lane-reducing operation. */
6678 if (lane_reduc_code_p
6679 && reduc_chain_length != 1
6680 && !only_slp_reduc_chain)
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "lane-reducing reduction with extra stmts.\n");
6685 return false;
6688 /* All uses but the last are expected to be defined in the loop.
6689 The last use is the reduction variable. In case of nested cycle this
6690 assumption is not true: we use reduc_index to record the index of the
6691 reduction variable. */
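     /* For example (illustrative): in s_3 = x_4 + s_2, where s_2 flows
	from the loop-header PHI, STMT_VINFO_REDUC_IDX is 1.  */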
6692 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6693 /* We need to skip an extra operand for COND_EXPRs with embedded
6694 comparison. */
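     /* For example (illustrative): in
	  r_1 = a_2 < b_3 ? x_4 : r_5;
	gimple_assign_rhs1 is the comparison a_2 < b_3 itself, so the
	value operands start one position later.  */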
6695 unsigned opno_adjust = 0;
6696 if (code == COND_EXPR
6697 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6698 opno_adjust = 1;
6699 for (i = 0; i < op_type; i++)
6701 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6702 if (i == 0 && code == COND_EXPR)
6703 continue;
6705 stmt_vec_info def_stmt_info;
6706 enum vect_def_type dt;
6707 tree op;
6708 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6709 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6710 &def_stmt_info))
6712 if (dump_enabled_p ())
6713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6714 "use not simple.\n");
6715 return false;
6717 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6718 continue;
6720 /* There should be only one cycle def in the stmt, the one
6721 leading to reduc_def. */
6722 if (VECTORIZABLE_CYCLE_DEF (dt))
6723 return false;
6725 /* To properly compute ncopies we are interested in the widest
6726 non-reduction input type in case we're looking at a widening
6727 accumulation that we later handle in vect_transform_reduction. */
6728 if (lane_reduc_code_p
6729 && tem
6730 && (!vectype_in
6731 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6732 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6733 vectype_in = tem;
6735 if (code == COND_EXPR)
6737 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6738 if (dt == vect_constant_def)
6740 cond_reduc_dt = dt;
6741 cond_reduc_val = op;
6743 if (dt == vect_induction_def
6744 && def_stmt_info
6745 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6747 cond_reduc_dt = dt;
6748 cond_stmt_vinfo = def_stmt_info;
6752 if (!vectype_in)
6753 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6754 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6756 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6757 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6758 /* If we have a condition reduction, see if we can simplify it further. */
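     /* An illustrative condition reduction (sketch):
	  for (i = 0; i < n; i++)
	    if (a[i] < k)
	      last = b[i];
	where LAST keeps the value from the last iteration whose
	condition held.  */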
6759 if (v_reduc_type == COND_REDUCTION)
6761 if (slp_node)
6762 return false;
6764 /* When the condition itself uses the reduction value, fail. */
6765 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "condition depends on previous iteration\n");
6770 return false;
6773 if (reduc_chain_length == 1
6774 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6775 vectype_in, OPTIMIZE_FOR_SPEED))
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "optimizing condition reduction with"
6780 " FOLD_EXTRACT_LAST.\n");
6781 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6783 else if (cond_reduc_dt == vect_induction_def)
6785 tree base
6786 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6787 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6789 gcc_assert (TREE_CODE (base) == INTEGER_CST
6790 && TREE_CODE (step) == INTEGER_CST);
6791 cond_reduc_val = NULL_TREE;
6792 enum tree_code cond_reduc_op_code = ERROR_MARK;
6793 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6794 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6796 /* Find a suitable value: below base for MAX_EXPR, above base for
6797 MIN_EXPR; punt for now if base is the minimum value of the type
6798 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6799 else if (tree_int_cst_sgn (step) == -1)
6801 cond_reduc_op_code = MIN_EXPR;
6802 if (tree_int_cst_sgn (base) == -1)
6803 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6804 else if (tree_int_cst_lt (base,
6805 TYPE_MAX_VALUE (TREE_TYPE (base))))
6806 cond_reduc_val
6807 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6809 else
6811 cond_reduc_op_code = MAX_EXPR;
6812 if (tree_int_cst_sgn (base) == 1)
6813 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6814 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6815 base))
6816 cond_reduc_val
6817 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6819 if (cond_reduc_val)
6821 if (dump_enabled_p ())
6822 dump_printf_loc (MSG_NOTE, vect_location,
6823 "condition expression based on "
6824 "integer induction.\n");
6825 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6826 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6827 = cond_reduc_val;
6828 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6831 else if (cond_reduc_dt == vect_constant_def)
6833 enum vect_def_type cond_initial_dt;
6834 tree cond_initial_val
6835 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6837 gcc_assert (cond_reduc_val != NULL_TREE);
6838 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6839 if (cond_initial_dt == vect_constant_def
6840 && types_compatible_p (TREE_TYPE (cond_initial_val),
6841 TREE_TYPE (cond_reduc_val)))
6843 tree e = fold_binary (LE_EXPR, boolean_type_node,
6844 cond_initial_val, cond_reduc_val);
6845 if (e && (integer_onep (e) || integer_zerop (e)))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_NOTE, vect_location,
6849 "condition expression based on "
6850 "compile time constant.\n");
6851 /* Record reduction code at analysis stage. */
6852 STMT_VINFO_REDUC_CODE (reduc_info)
6853 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6854 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6860 if (STMT_VINFO_LIVE_P (phi_info))
6861 return false;
6863 if (slp_node)
6864 ncopies = 1;
6865 else
6866 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6868 gcc_assert (ncopies >= 1);
6870 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6872 if (nested_cycle)
6874 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6875 == vect_double_reduction_def);
6876 double_reduc = true;
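     /* An illustrative double reduction (sketch): when vectorizing the
	outer loop of
	  for (i = 0; i < n; i++)
	    for (j = 0; j < m; j++)
	      sum += a[i][j];
	the inner-loop cycle for SUM feeds an enclosing cycle of the
	outer loop.  */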
6879 /* 4.2. Check support for the epilog operation.
6881 If STMT represents a reduction pattern, then the type of the
6882 reduction variable may be different than the type of the rest
6883 of the arguments. For example, consider the case of accumulation
6884 of shorts into an int accumulator. The original code:
6885 S1: int_a = (int) short_a;
6886 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6888 was replaced with:
6889 STMT: int_acc = widen_sum <short_a, int_acc>
6891 This means that:
6892 1. The tree-code that is used to create the vector operation in the
6893 epilog code (that reduces the partial results) is not the
6894 tree-code of STMT, but is rather the tree-code of the original
6895 stmt from the pattern that STMT is replacing. I.e., in the example
6896 above we want to use 'widen_sum' in the loop, but 'plus' in the
6897 epilog.
6898 2. The type (mode) we use to check available target support
6899 for the vector operation to be created in the *epilog*, is
6900 determined by the type of the reduction variable (in the example
6901 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6902 However the type (mode) we use to check available target support
6903 for the vector operation to be created *inside the loop*, is
6904 determined by the type of the other arguments to STMT (in the
6905 example we'd check this: optab_handler (widen_sum_optab,
6906 vect_short_mode)).
6908 This is contrary to "regular" reductions, in which the types of all
6909 the arguments are the same as the type of the reduction variable.
6910 For "regular" reductions we can therefore use the same vector type
6911 (and also the same tree-code) when generating the epilog code and
6912 when generating the code inside the loop. */
6914 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6915 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6917 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6918 if (reduction_type == TREE_CODE_REDUCTION)
6920 /* Check whether it's ok to change the order of the computation.
6921 Generally, when vectorizing a reduction we change the order of the
6922 computation. This may change the behavior of the program in some
6923 cases, so we need to check that this is ok. One exception is when
6924 vectorizing an outer-loop: the inner-loop is executed sequentially,
6925 and therefore vectorizing reductions in the inner-loop during
6926 outer-loop vectorization is safe. Likewise when we are vectorizing
6927 a series of reductions using SLP and the VF is one, the reductions
6928 are performed in scalar order. */
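     /* For example (illustrative): for
	  double s = 0.0;
	  for (i = 0; i < n; i++)
	    s += a[i];
	reassociating the additions can change the rounded result, so
	without -ffast-math style flags the sum must be computed in
	order (FOLD_LEFT_REDUCTION below).  */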
6929 if (slp_node
6930 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6931 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6933 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6935 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6936 is not directly used in stmt. */
6937 if (!only_slp_reduc_chain
6938 && reduc_chain_length != 1)
6940 if (dump_enabled_p ())
6941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6942 "in-order reduction chain without SLP.\n");
6943 return false;
6945 STMT_VINFO_REDUC_TYPE (reduc_info)
6946 = reduction_type = FOLD_LEFT_REDUCTION;
6948 else if (!commutative_tree_code (orig_code)
6949 || !associative_tree_code (orig_code))
6951 if (dump_enabled_p ())
6952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6953 "reduction: not commutative/associative.\n");
6954 return false;
6958 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6959 && ncopies > 1)
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6963 "multiple types in double reduction or condition "
6964 "reduction or fold-left reduction.\n");
6965 return false;
6968 internal_fn reduc_fn = IFN_LAST;
6969 if (reduction_type == TREE_CODE_REDUCTION
6970 || reduction_type == FOLD_LEFT_REDUCTION
6971 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6972 || reduction_type == CONST_COND_REDUCTION)
6974 if (reduction_type == FOLD_LEFT_REDUCTION
6975 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6976 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6978 if (reduc_fn != IFN_LAST
6979 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6980 OPTIMIZE_FOR_SPEED))
6982 if (dump_enabled_p ())
6983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6984 "reduc op not supported by target.\n");
6986 reduc_fn = IFN_LAST;
6989 else
6991 if (!nested_cycle || double_reduc)
6993 if (dump_enabled_p ())
6994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6995 "no reduc code for scalar code.\n");
6997 return false;
7001 else if (reduction_type == COND_REDUCTION)
7003 int scalar_precision
7004 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7005 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7006 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7007 vectype_out);
7009 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7010 OPTIMIZE_FOR_SPEED))
7011 reduc_fn = IFN_REDUC_MAX;
7013 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7015 if (reduction_type != EXTRACT_LAST_REDUCTION
7016 && (!nested_cycle || double_reduc)
7017 && reduc_fn == IFN_LAST
7018 && !nunits_out.is_constant ())
7020 if (dump_enabled_p ())
7021 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7022 "missing target support for reduction on"
7023 " variable-length vectors.\n");
7024 return false;
7027 /* For SLP reductions, see if there is a neutral value we can use. */
7028 tree neutral_op = NULL_TREE;
7029 if (slp_node)
7030 neutral_op = neutral_op_for_slp_reduction
7031 (slp_node_instance->reduc_phis, vectype_out, orig_code,
7032 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7034 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7036 /* We can't support in-order reductions of code such as this:
7038 for (int i = 0; i < n1; ++i)
7039 for (int j = 0; j < n2; ++j)
7040 l += a[j];
7042 since GCC effectively transforms the loop when vectorizing:
7044 for (int i = 0; i < n1 / VF; ++i)
7045 for (int j = 0; j < n2; ++j)
7046 for (int k = 0; k < VF; ++k)
7047 l += a[j];
7049 which is a reassociation of the original operation. */
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "in-order double reduction not supported.\n");
7054 return false;
7057 if (reduction_type == FOLD_LEFT_REDUCTION
7058 && slp_node
7059 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7061 /* We cannot use in-order reductions in this case because there is
7062 an implicit reassociation of the operations involved. */
7063 if (dump_enabled_p ())
7064 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7065 "in-order unchained SLP reductions not supported.\n");
7066 return false;
7069 /* For double reductions, and for SLP reductions with a neutral value,
7070 we construct a variable-length initial vector by loading a vector
7071 full of the neutral value and then shift-and-inserting the start
7072 values into the low-numbered elements. */
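     /* For example (sketch): for an add reduction with start value S and
	neutral value 0 on a variable-length vector we load { 0, ..., 0 }
	and use IFN_VEC_SHL_INSERT to place S in a low-numbered lane,
	giving { S, 0, ..., 0 } as the initial value of the PHI.  */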
7073 if ((double_reduc || neutral_op)
7074 && !nunits_out.is_constant ()
7075 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7076 vectype_out, OPTIMIZE_FOR_SPEED))
7078 if (dump_enabled_p ())
7079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7080 "reduction on variable-length vectors requires"
7081 " target support for a vector-shift-and-insert"
7082 " operation.\n");
7083 return false;
7086 /* Check extra constraints for variable-length unchained SLP reductions. */
7087 if (STMT_SLP_TYPE (stmt_info)
7088 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7089 && !nunits_out.is_constant ())
7091 /* We checked above that we could build the initial vector when
7092 there's a neutral element value. Check here for the case in
7093 which each SLP statement has its own initial value and in which
7094 that value needs to be repeated for every instance of the
7095 statement within the initial vector. */
7096 unsigned int group_size = SLP_TREE_LANES (slp_node);
7097 if (!neutral_op
7098 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7099 TREE_TYPE (vectype_out)))
7101 if (dump_enabled_p ())
7102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7103 "unsupported form of SLP reduction for"
7104 " variable-length vectors: cannot build"
7105 " initial vector.\n");
7106 return false;
7108 /* The epilogue code relies on the number of elements being a multiple
7109 of the group size. The duplicate-and-interleave approach to setting
7110 up the initial vector does too. */
7111 if (!multiple_p (nunits_out, group_size))
7113 if (dump_enabled_p ())
7114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7115 "unsupported form of SLP reduction for"
7116 " variable-length vectors: the vector size"
7117 " is not a multiple of the number of results.\n");
7118 return false;
7122 if (reduction_type == COND_REDUCTION)
7124 widest_int ni;
7126 if (! max_loop_iterations (loop, &ni))
7128 if (dump_enabled_p ())
7129 dump_printf_loc (MSG_NOTE, vect_location,
7130 "loop count not known, cannot create cond "
7131 "reduction.\n");
7132 return false;
7134 /* Convert backedges to iterations. */
7135 ni += 1;
7137 /* The additional index will have the same type as the condition. Check
7138 that the loop iteration count fits into this type less one (because
7139 the zero slot is reserved for when there are no matches). */
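     /* Illustrative sketch: with a VF of 4 the extra index vector counts
	{ 1, 2, 3, 4 }, { 5, 6, 7, 8 }, ...; index 0 is reserved to mean
	"no iteration matched", which is why the iteration count must fit
	into the index type less one.  */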
7140 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7141 if (wi::geu_p (ni, wi::to_widest (max_index)))
7143 if (dump_enabled_p ())
7144 dump_printf_loc (MSG_NOTE, vect_location,
7145 "loop size is greater than data size.\n");
7146 return false;
7150 /* In case the vectorization factor (VF) is bigger than the number
7151 of elements that we can fit in a vectype (nunits), we have to generate
7152 more than one vector stmt - i.e. we need to "unroll" the
7153 vector stmt by a factor VF/nunits. For more details see documentation
7154 in vectorizable_operation. */
7156 /* If the reduction is used in an outer loop we need to generate
7157 VF intermediate results, like so (e.g. for ncopies=2):
7158 r0 = phi (init, r0)
7159 r1 = phi (init, r1)
7160 r0 = x0 + r0;
7161 r1 = x1 + r1;
7162 (i.e. we generate VF results in 2 registers).
7163 In this case we have a separate def-use cycle for each copy, and therefore
7164 for each copy we get the vector def for the reduction variable from the
7165 respective phi node created for this copy.
7167 Otherwise (the reduction is unused in the loop nest), we can combine
7168 together intermediate results, like so (e.g. for ncopies=2):
7169 r = phi (init, r)
7170 r = x0 + r;
7171 r = x1 + r;
7172 (i.e. we generate VF/2 results in a single register).
7173 In this case for each copy we get the vector def for the reduction variable
7174 from the vectorized reduction operation generated in the previous iteration.
7176 This only works when we see both the reduction PHI and its only consumer
7177 in vectorizable_reduction and there are no intermediate stmts
7178 participating. */
7179 if (ncopies > 1
7180 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7181 && reduc_chain_length == 1)
7182 single_defuse_cycle = true;
7184 if (single_defuse_cycle || lane_reduc_code_p)
7186 gcc_assert (code != COND_EXPR);
7188 /* 4. Supportable by target? */
7189 bool ok = true;
7191 /* 4.1. check support for the operation in the loop */
7192 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7193 if (!optab)
7195 if (dump_enabled_p ())
7196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7197 "no optab.\n");
7198 ok = false;
7201 machine_mode vec_mode = TYPE_MODE (vectype_in);
7202 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7204 if (dump_enabled_p ())
7205 dump_printf (MSG_NOTE, "op not supported by target.\n");
7206 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7207 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7208 ok = false;
7209 else
7210 if (dump_enabled_p ())
7211 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7214 /* Worthwhile without SIMD support? */
7215 if (ok
7216 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7217 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7219 if (dump_enabled_p ())
7220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7221 "not worthwhile without SIMD support.\n");
7222 ok = false;
7225 /* lane-reducing operations have to go through vect_transform_reduction.
7226 For the other cases try without the single cycle optimization. */
7227 if (!ok)
7229 if (lane_reduc_code_p)
7230 return false;
7231 else
7232 single_defuse_cycle = false;
7235 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7237 /* If the reduction stmt is one of the patterns that have lane
7238 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7239 if ((ncopies > 1 && ! single_defuse_cycle)
7240 && lane_reduc_code_p)
7242 if (dump_enabled_p ())
7243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7244 "multi def-use cycle not possible for lane-reducing "
7245 "reduction operation\n");
7246 return false;
7249 if (slp_node
7250 && !(!single_defuse_cycle
7251 && code != DOT_PROD_EXPR
7252 && code != WIDEN_SUM_EXPR
7253 && code != SAD_EXPR
7254 && reduction_type != FOLD_LEFT_REDUCTION))
7255 for (i = 0; i < op_type; i++)
7256 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7258 if (dump_enabled_p ())
7259 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7260 "incompatible vector types for invariants\n");
7261 return false;
7264 if (slp_node)
7265 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7266 else
7267 vec_num = 1;
7269 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7270 reduction_type, ncopies, cost_vec);
7271 /* Cost the reduction op inside the loop if transformed via
7272 vect_transform_reduction. Otherwise this is costed by the
7273 separate vectorizable_* routines. */
7274 if (single_defuse_cycle
7275 || code == DOT_PROD_EXPR
7276 || code == WIDEN_SUM_EXPR
7277 || code == SAD_EXPR)
7278 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7280 if (dump_enabled_p ()
7281 && reduction_type == FOLD_LEFT_REDUCTION)
7282 dump_printf_loc (MSG_NOTE, vect_location,
7283 "using an in-order (fold-left) reduction.\n");
7284 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7285 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7286 reductions go through their own vectorizable_* routines. */
7287 if (!single_defuse_cycle
7288 && code != DOT_PROD_EXPR
7289 && code != WIDEN_SUM_EXPR
7290 && code != SAD_EXPR
7291 && reduction_type != FOLD_LEFT_REDUCTION)
7293 stmt_vec_info tem
7294 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7295 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7297 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7298 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7300 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7301 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7303 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7305 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7306 internal_fn cond_fn = get_conditional_internal_fn (code);
7308 if (reduction_type != FOLD_LEFT_REDUCTION
7309 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7310 && (cond_fn == IFN_LAST
7311 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7312 OPTIMIZE_FOR_SPEED)))
7314 if (dump_enabled_p ())
7315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316 "can't operate on partial vectors because"
7317 " no conditional operation is available.\n");
7318 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7320 else if (reduction_type == FOLD_LEFT_REDUCTION
7321 && reduc_fn == IFN_LAST
7322 && !expand_vec_cond_expr_p (vectype_in,
7323 truth_type_for (vectype_in),
7324 SSA_NAME))
7326 if (dump_enabled_p ())
7327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7328 "can't operate on partial vectors because"
7329 " no conditional operation is available.\n");
7330 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7332 else
7333 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7334 vectype_in, NULL);
7336 return true;
7339 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7340 value. */
7342 bool
7343 vect_transform_reduction (loop_vec_info loop_vinfo,
7344 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7345 gimple **vec_stmt, slp_tree slp_node)
7347 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7348 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7349 int i;
7350 int ncopies;
7351 int vec_num;
7353 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7354 gcc_assert (reduc_info->is_reduc_info);
7356 if (nested_in_vect_loop_p (loop, stmt_info))
7358 loop = loop->inner;
7359 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7362 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7363 enum tree_code code = gimple_assign_rhs_code (stmt);
7364 int op_type = TREE_CODE_LENGTH (code);
7366 /* Flatten RHS. */
7367 tree ops[3];
7368 switch (get_gimple_rhs_class (code))
7370 case GIMPLE_TERNARY_RHS:
7371 ops[2] = gimple_assign_rhs3 (stmt);
7372 /* Fall thru. */
7373 case GIMPLE_BINARY_RHS:
7374 ops[0] = gimple_assign_rhs1 (stmt);
7375 ops[1] = gimple_assign_rhs2 (stmt);
7376 break;
7377 default:
7378 gcc_unreachable ();
7381 /* All uses but the last are expected to be defined in the loop.
7382 The last use is the reduction variable. In case of nested cycle this
7383 assumption is not true: we use reduc_index to record the index of the
7384 reduction variable. */
7385 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7386 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7387 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7388 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7390 if (slp_node)
7392 ncopies = 1;
7393 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7395 else
7397 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7398 vec_num = 1;
7401 internal_fn cond_fn = get_conditional_internal_fn (code);
7402 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7403 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
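  /* Two masking strategies are used below (an illustrative summary):
     either the statement is emitted as a conditional internal function,
     e.g. new = IFN_COND_ADD (loop_mask, accum, x, accum), or, when
     use_mask_by_cond_expr_p holds, an input operand is first blended
     with a neutral value through a VEC_COND_EXPR and the plain
     operation is kept.  */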
7405 /* Transform. */
7406 tree new_temp = NULL_TREE;
7407 auto_vec<tree> vec_oprnds0;
7408 auto_vec<tree> vec_oprnds1;
7409 auto_vec<tree> vec_oprnds2;
7410 tree def0;
7412 if (dump_enabled_p ())
7413 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7415 /* FORNOW: Multiple types are not supported for condition. */
7416 if (code == COND_EXPR)
7417 gcc_assert (ncopies == 1);
7419 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7421 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7422 if (reduction_type == FOLD_LEFT_REDUCTION)
7424 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7425 return vectorize_fold_left_reduction
7426 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7427 reduc_fn, ops, vectype_in, reduc_index, masks);
7430 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7431 gcc_assert (single_defuse_cycle
7432 || code == DOT_PROD_EXPR
7433 || code == WIDEN_SUM_EXPR
7434 || code == SAD_EXPR);
7436 /* Create the destination vector */
7437 tree scalar_dest = gimple_assign_lhs (stmt);
7438 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7440 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7441 single_defuse_cycle && reduc_index == 0
7442 ? NULL_TREE : ops[0], &vec_oprnds0,
7443 single_defuse_cycle && reduc_index == 1
7444 ? NULL_TREE : ops[1], &vec_oprnds1,
7445 op_type == ternary_op
7446 && !(single_defuse_cycle && reduc_index == 2)
7447 ? ops[2] : NULL_TREE, &vec_oprnds2);
7448 if (single_defuse_cycle)
7450 gcc_assert (!slp_node);
7451 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7452 ops[reduc_index],
7453 reduc_index == 0 ? &vec_oprnds0
7454 : (reduc_index == 1 ? &vec_oprnds1
7455 : &vec_oprnds2));
7458 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7460 gimple *new_stmt;
7461 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7462 if (masked_loop_p && !mask_by_cond_expr)
7464 /* Make sure that the reduction accumulator is vop[0]. */
7465 if (reduc_index == 1)
7467 gcc_assert (commutative_tree_code (code));
7468 std::swap (vop[0], vop[1]);
7470 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7471 vectype_in, i);
7472 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7473 vop[0], vop[1], vop[0]);
7474 new_temp = make_ssa_name (vec_dest, call);
7475 gimple_call_set_lhs (call, new_temp);
7476 gimple_call_set_nothrow (call, true);
7477 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7478 new_stmt = call;
7480 else
7482 if (op_type == ternary_op)
7483 vop[2] = vec_oprnds2[i];
7485 if (masked_loop_p && mask_by_cond_expr)
7487 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7488 vectype_in, i);
7489 build_vect_cond_expr (code, vop, mask, gsi);
7492 new_stmt = gimple_build_assign (vec_dest, code,
7493 vop[0], vop[1], vop[2]);
7494 new_temp = make_ssa_name (vec_dest, new_stmt);
7495 gimple_assign_set_lhs (new_stmt, new_temp);
7496 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7499 if (slp_node)
7500 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7501 else if (single_defuse_cycle
7502 && i < ncopies - 1)
7504 if (reduc_index == 0)
7505 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7506 else if (reduc_index == 1)
7507 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7508 else if (reduc_index == 2)
7509 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7511 else
7512 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7515 if (!slp_node)
7516 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7518 return true;
7521 /* Transform phase of a cycle PHI. */
7523 bool
7524 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7525 stmt_vec_info stmt_info, gimple **vec_stmt,
7526 slp_tree slp_node, slp_instance slp_node_instance)
7528 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7529 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7530 int i;
7531 int ncopies;
7532 int j;
7533 bool nested_cycle = false;
7534 int vec_num;
7536 if (nested_in_vect_loop_p (loop, stmt_info))
7538 loop = loop->inner;
7539 nested_cycle = true;
7542 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7543 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7544 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7545 gcc_assert (reduc_info->is_reduc_info);
7547 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7548 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7549 /* Leave the scalar phi in place. */
7550 return true;
7552 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7553 /* For a nested cycle we do not fill the above. */
7554 if (!vectype_in)
7555 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7556 gcc_assert (vectype_in);
7558 if (slp_node)
7560 /* The size vect_schedule_slp_instance computes is off for us. */
7561 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7562 * SLP_TREE_LANES (slp_node), vectype_in);
7563 ncopies = 1;
7565 else
7567 vec_num = 1;
7568 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7571 /* Check whether we should use a single PHI node and accumulate
7572 vectors to one before the backedge. */
7573 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7574 ncopies = 1;
7576 /* Create the destination vector */
7577 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7578 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7579 vectype_out);
7581 /* Get the loop-entry arguments. */
7582 tree vec_initial_def;
7583 auto_vec<tree> vec_initial_defs;
7584 if (slp_node)
7586 vec_initial_defs.reserve (vec_num);
7587 if (nested_cycle)
7589 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7590 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7591 &vec_initial_defs);
7593 else
7595 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7596 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7597 tree neutral_op
7598 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7599 STMT_VINFO_REDUC_CODE (reduc_info),
7600 first != NULL);
7601 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7602 &vec_initial_defs, vec_num,
7603 first != NULL, neutral_op);
7606 else
7608 /* Get at the scalar def before the loop, that defines the initial
7609 value of the reduction variable. */
7610 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7611 loop_preheader_edge (loop));
7612 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7613 and we can't use zero for induc_val, use initial_def. Similarly
7614 for REDUC_MIN and initial_def larger than the base. */
7615 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7617 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7618 if (TREE_CODE (initial_def) == INTEGER_CST
7619 && !integer_zerop (induc_val)
7620 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7621 && tree_int_cst_lt (initial_def, induc_val))
7622 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7623 && tree_int_cst_lt (induc_val, initial_def))))
7625 induc_val = initial_def;
7626 /* Communicate that we used the initial_def to epilogue
7627 generation. */
7628 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7630 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7631 vec_initial_defs.create (ncopies);
7632 for (i = 0; i < ncopies; ++i)
7633 vec_initial_defs.quick_push (vec_initial_def);
7635 else if (nested_cycle)
7637 /* Do not use an adjustment def as that case is not supported
7638 correctly if ncopies is not one. */
7639 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7640 ncopies, initial_def,
7641 &vec_initial_defs);
7643 else
7645 tree adjustment_def = NULL_TREE;
7646 tree *adjustment_defp = &adjustment_def;
7647 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7648 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7649 adjustment_defp = NULL;
7650 vec_initial_def
7651 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7652 initial_def, adjustment_defp);
7653 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7654 vec_initial_defs.create (ncopies);
7655 for (i = 0; i < ncopies; ++i)
7656 vec_initial_defs.quick_push (vec_initial_def);
7660 /* Generate the reduction PHIs upfront. */
7661 for (i = 0; i < vec_num; i++)
7663 tree vec_init_def = vec_initial_defs[i];
7664 for (j = 0; j < ncopies; j++)
7666 /* Create the reduction-phi that defines the reduction
7667 operand. */
7668 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7670 /* Set the loop-entry arg of the reduction-phi. */
7671 if (j != 0 && nested_cycle)
7672 vec_init_def = vec_initial_defs[j];
7673 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7674 UNKNOWN_LOCATION);
7676 /* The loop-latch arg is set in epilogue processing. */
7678 if (slp_node)
7679 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7680 else
7682 if (j == 0)
7683 *vec_stmt = new_phi;
7684 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7689 return true;
7692 /* Vectorizes LC PHIs. */
7694 bool
7695 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7696 stmt_vec_info stmt_info, gimple **vec_stmt,
7697 slp_tree slp_node)
7699 if (!loop_vinfo
7700 || !is_a <gphi *> (stmt_info->stmt)
7701 || gimple_phi_num_args (stmt_info->stmt) != 1)
7702 return false;
7704 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7705 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7706 return false;
7708 if (!vec_stmt) /* transformation not required. */
7710 /* Deal with copies from externs or constants that are disguised as
7711 loop-closed PHI nodes (PR97886). */
7712 if (slp_node
7713 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7714 SLP_TREE_VECTYPE (slp_node)))
7716 if (dump_enabled_p ())
7717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7718 "incompatible vector types for invariants\n");
7719 return false;
7721 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7722 return true;
7725 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7726 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7727 basic_block bb = gimple_bb (stmt_info->stmt);
7728 edge e = single_pred_edge (bb);
7729 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7730 auto_vec<tree> vec_oprnds;
7731 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7732 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7733 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7734 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7736 /* Create the vectorized LC PHI node. */
7737 gphi *new_phi = create_phi_node (vec_dest, bb);
7738 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7739 if (slp_node)
7740 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7741 else
7742 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7744 if (!slp_node)
7745 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7747 return true;
7750 /* Vectorizes PHIs. */
7752 bool
7753 vectorizable_phi (vec_info *,
7754 stmt_vec_info stmt_info, gimple **vec_stmt,
7755 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7757 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7758 return false;
7760 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7761 return false;
7763 tree vectype = SLP_TREE_VECTYPE (slp_node);
7765 if (!vec_stmt) /* transformation not required. */
7767 slp_tree child;
7768 unsigned i;
7769 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7770 if (!child)
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 "PHI node with unvectorized backedge def\n");
7775 return false;
7777 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7779 if (dump_enabled_p ())
7780 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7781 "incompatible vector types for invariants\n");
7782 return false;
7784 /* For single-argument PHIs assume coalescing which means zero cost
7785 for the scalar and the vector PHIs. This avoids artificially
7786 favoring the vector path (but may pessimize it in some cases). */
7787 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7788 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7789 vector_stmt, stmt_info, vectype, 0, vect_body);
7790 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7791 return true;
7794 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7795 basic_block bb = gimple_bb (stmt_info->stmt);
7796 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7797 auto_vec<gphi *> new_phis;
7798 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7800 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7802 /* Skip not yet vectorized defs. */
7803 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7804 && SLP_TREE_VEC_STMTS (child).is_empty ())
7805 continue;
7807 auto_vec<tree> vec_oprnds;
7808 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7809 if (!new_phis.exists ())
7811 new_phis.create (vec_oprnds.length ());
7812 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7814 /* Create the vectorized LC PHI node. */
7815 new_phis.quick_push (create_phi_node (vec_dest, bb));
7816 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7819 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7820 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7821 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7823 /* We should have at least one already vectorized child. */
7824 gcc_assert (new_phis.exists ());
7826 return true;
7830 /* Function vect_min_worthwhile_factor.
7832 For a loop where we could vectorize the operation indicated by CODE,
7833 return the minimum vectorization factor that makes it worthwhile
7834 to use generic vectors. */
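/* One plausible illustration (not a statement of the original rationale):
   two 32-bit lanes packed in a 64-bit word can be ANDed with a single
   scalar AND, so bitwise codes pay off already at a factor of 2, whereas
   emulated additions need extra code to keep carries from crossing lane
   boundaries and are only assumed worthwhile from a factor of 4.  */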
7835 static unsigned int
7836 vect_min_worthwhile_factor (enum tree_code code)
7838 switch (code)
7840 case PLUS_EXPR:
7841 case MINUS_EXPR:
7842 case NEGATE_EXPR:
7843 return 4;
7845 case BIT_AND_EXPR:
7846 case BIT_IOR_EXPR:
7847 case BIT_XOR_EXPR:
7848 case BIT_NOT_EXPR:
7849 return 2;
7851 default:
7852 return INT_MAX;
7856 /* Return true if VINFO indicates we are doing loop vectorization and if
7857 it is worth decomposing CODE operations into scalar operations for
7858 that loop's vectorization factor. */
7860 bool
7861 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7863 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7864 unsigned HOST_WIDE_INT value;
7865 return (loop_vinfo
7866 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7867 && value >= vect_min_worthwhile_factor (code));
7870 /* Function vectorizable_induction
7872 Check if STMT_INFO performs an induction computation that can be vectorized.
7873 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7874 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7875 Return true if STMT_INFO is vectorizable in this way. */
7877 bool
7878 vectorizable_induction (loop_vec_info loop_vinfo,
7879 stmt_vec_info stmt_info,
7880 gimple **vec_stmt, slp_tree slp_node,
7881 stmt_vector_for_cost *cost_vec)
7883 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7884 unsigned ncopies;
7885 bool nested_in_vect_loop = false;
7886 class loop *iv_loop;
7887 tree vec_def;
7888 edge pe = loop_preheader_edge (loop);
7889 basic_block new_bb;
7890 tree new_vec, vec_init, vec_step, t;
7891 tree new_name;
7892 gimple *new_stmt;
7893 gphi *induction_phi;
7894 tree induc_def, vec_dest;
7895 tree init_expr, step_expr;
7896 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7897 unsigned i;
7898 tree expr;
7899 gimple_stmt_iterator si;
7901 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7902 if (!phi)
7903 return false;
7905 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7906 return false;
7908 /* Make sure it was recognized as induction computation. */
7909 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7910 return false;
7912 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7913 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7915 if (slp_node)
7916 ncopies = 1;
7917 else
7918 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7919 gcc_assert (ncopies >= 1);
7921 /* FORNOW. These restrictions should be relaxed. */
7922 if (nested_in_vect_loop_p (loop, stmt_info))
7924 imm_use_iterator imm_iter;
7925 use_operand_p use_p;
7926 gimple *exit_phi;
7927 edge latch_e;
7928 tree loop_arg;
7930 if (ncopies > 1)
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 "multiple types in nested loop.\n");
7935 return false;
7938 exit_phi = NULL;
7939 latch_e = loop_latch_edge (loop->inner);
7940 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7941 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7943 gimple *use_stmt = USE_STMT (use_p);
7944 if (is_gimple_debug (use_stmt))
7945 continue;
7947 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7949 exit_phi = use_stmt;
7950 break;
7953 if (exit_phi)
7955 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7956 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7957 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7959 if (dump_enabled_p ())
7960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7961 "inner-loop induction only used outside "
7962 "of the outer vectorized loop.\n");
7963 return false;
7967 nested_in_vect_loop = true;
7968 iv_loop = loop->inner;
7970 else
7971 iv_loop = loop;
7972 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7974 if (slp_node && !nunits.is_constant ())
7976 /* The current SLP code creates the step value element-by-element. */
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "SLP induction not supported for variable-length"
7980 " vectors.\n");
7981 return false;
7984 if (!vec_stmt) /* transformation not required. */
7986 unsigned inside_cost = 0, prologue_cost = 0;
7987 if (slp_node)
7989 /* We eventually need to set a vector type on invariant
7990 arguments. */
7991 unsigned j;
7992 slp_tree child;
7993 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7994 if (!vect_maybe_update_slp_op_vectype
7995 (child, SLP_TREE_VECTYPE (slp_node)))
7997 if (dump_enabled_p ())
7998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7999 "incompatible vector types for "
8000 "invariants\n");
8001 return false;
8003 /* loop cost for vec_loop. */
8004 inside_cost
8005 = record_stmt_cost (cost_vec,
8006 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8007 vector_stmt, stmt_info, 0, vect_body);
8008 /* prologue cost for vec_init (if not nested) and step. */
8009 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8010 scalar_to_vec,
8011 stmt_info, 0, vect_prologue);
8013 else /* if (!slp_node) */
8015 /* loop cost for vec_loop. */
8016 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8017 stmt_info, 0, vect_body);
8018 /* prologue cost for vec_init and vec_step. */
8019 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8020 stmt_info, 0, vect_prologue);
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_NOTE, vect_location,
8024 "vect_model_induction_cost: inside_cost = %d, "
8025 "prologue_cost = %d .\n", inside_cost,
8026 prologue_cost);
8028 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8029 DUMP_VECT_SCOPE ("vectorizable_induction");
8030 return true;
8033 /* Transform. */
8035 /* Compute a vector variable, initialized with the first VF values of
8036 the induction variable. E.g., for an iv with IV_PHI='X' and
8037 evolution S, for a vector of 4 units, we want to compute:
8038 [X, X + S, X + 2*S, X + 3*S]. */
8040 if (dump_enabled_p ())
8041 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8043 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8044 gcc_assert (step_expr != NULL_TREE);
8045 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8047 pe = loop_preheader_edge (iv_loop);
8048 /* Find the first insertion point in the BB. */
8049 basic_block bb = gimple_bb (phi);
8050 si = gsi_after_labels (bb);
8052 /* For SLP induction we have to generate several IVs as for example
8053 with group size 3 we need
8054 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8055 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8056 if (slp_node)
8058 /* Enforced above. */
8059 unsigned int const_nunits = nunits.to_constant ();
8061 /* The initial values are vectorized, but any lanes > group_size
8062 need adjustment. */
8063 slp_tree init_node
8064 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8066 /* Gather steps. Since we do not vectorize inductions as
8067 cycles we have to reconstruct the step from SCEV data. */
8068 unsigned group_size = SLP_TREE_LANES (slp_node);
8069 tree *steps = XALLOCAVEC (tree, group_size);
8070 tree *inits = XALLOCAVEC (tree, group_size);
8071 stmt_vec_info phi_info;
8072 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8074 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8075 if (!init_node)
8076 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8077 pe->dest_idx);
8080 /* Now generate the IVs. */
8081 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8082 gcc_assert ((const_nunits * nvects) % group_size == 0);
8083 unsigned nivs;
8084 if (nested_in_vect_loop)
8085 nivs = nvects;
8086 else
8088 /* Compute the number of distinct IVs we need. First reduce
8089 group_size if it is a multiple of const_nunits so we get
8090 one IV for a group_size of 4 but const_nunits 2. */
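      /* For the group-size-3 example above with const_nunits 4 this
	 gives nivs = lcm (3, 4) / 4 = 3; for group_size 4 and
	 const_nunits 2, group_sizep becomes 2 and
	 nivs = lcm (2, 2) / 2 = 1.  */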
8091 unsigned group_sizep = group_size;
8092 if (group_sizep % const_nunits == 0)
8093 group_sizep = group_sizep / const_nunits;
8094 nivs = least_common_multiple (group_sizep,
8095 const_nunits) / const_nunits;
8097 tree stept = TREE_TYPE (step_vectype);
8098 tree lupdate_mul = NULL_TREE;
8099 if (!nested_in_vect_loop)
8101 /* The number of iterations covered in one vector iteration. */
8102 unsigned lup_mul = (nvects * const_nunits) / group_size;
8103 lupdate_mul
8104 = build_vector_from_val (step_vectype,
8105 SCALAR_FLOAT_TYPE_P (stept)
8106 ? build_real_from_wide (stept, lup_mul,
8107 UNSIGNED)
8108 : build_int_cstu (stept, lup_mul));
8110 tree peel_mul = NULL_TREE;
8111 gimple_seq init_stmts = NULL;
8112 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8114 if (SCALAR_FLOAT_TYPE_P (stept))
8115 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8116 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8117 else
8118 peel_mul = gimple_convert (&init_stmts, stept,
8119 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8120 peel_mul = gimple_build_vector_from_val (&init_stmts,
8121 step_vectype, peel_mul);
8123 unsigned ivn;
8124 auto_vec<tree> vec_steps;
8125 for (ivn = 0; ivn < nivs; ++ivn)
8127 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8128 tree_vector_builder init_elts (vectype, const_nunits, 1);
8129 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8130 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8132 /* The scalar steps of the IVs. */
8133 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8134 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8135 step_elts.quick_push (elt);
8136 if (!init_node)
8138 /* The scalar inits of the IVs if not vectorized. */
8139 elt = inits[(ivn*const_nunits + eltn) % group_size];
8140 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8141 TREE_TYPE (elt)))
8142 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8143 TREE_TYPE (vectype), elt);
8144 init_elts.quick_push (elt);
8146 /* The number of steps to add to the initial values. */
8147 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8148 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8149 ? build_real_from_wide (stept,
8150 mul_elt, UNSIGNED)
8151 : build_int_cstu (stept, mul_elt));
8153 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8154 vec_steps.safe_push (vec_step);
8155 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8156 if (peel_mul)
8157 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8158 step_mul, peel_mul);
8159 if (!init_node)
8160 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8162 /* Create the induction-phi that defines the induction-operand. */
8163 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8164 "vec_iv_");
8165 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8166 induc_def = PHI_RESULT (induction_phi);
8168 /* Create the iv update inside the loop */
8169 tree up = vec_step;
8170 if (lupdate_mul)
8171 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8172 vec_step, lupdate_mul);
8173 gimple_seq stmts = NULL;
8174 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8175 vec_def = gimple_build (&stmts,
8176 PLUS_EXPR, step_vectype, vec_def, up);
8177 vec_def = gimple_convert (&stmts, vectype, vec_def);
8178 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8179 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8180 UNKNOWN_LOCATION);
8182 if (init_node)
8183 vec_init = vect_get_slp_vect_def (init_node, ivn);
8184 if (!nested_in_vect_loop
8185 && !integer_zerop (step_mul))
8187 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8188 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8189 vec_step, step_mul);
8190 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8191 vec_def, up);
8192 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8195 /* Set the arguments of the phi node: */
8196 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8198 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8200 if (!nested_in_vect_loop)
8202 /* Fill up to the number of vectors we need for the whole group. */
8203 nivs = least_common_multiple (group_size,
8204 const_nunits) / const_nunits;
8205 for (; ivn < nivs; ++ivn)
8207 SLP_TREE_VEC_STMTS (slp_node)
8208 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8209 vec_steps.safe_push (vec_steps[0]);
8213 /* Re-use IVs when we can. We are generating further vector
8214 stmts by adding VF' * stride to the IVs generated above. */
8215 if (ivn < nvects)
8217 unsigned vfp
8218 = least_common_multiple (group_size, const_nunits) / group_size;
8219 tree lupdate_mul
8220 = build_vector_from_val (step_vectype,
8221 SCALAR_FLOAT_TYPE_P (stept)
8222 ? build_real_from_wide (stept,
8223 vfp, UNSIGNED)
8224 : build_int_cstu (stept, vfp));
8225 for (; ivn < nvects; ++ivn)
8227 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8228 tree def = gimple_get_lhs (iv);
8229 if (ivn < 2*nivs)
8230 vec_steps[ivn - nivs]
8231 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8232 vec_steps[ivn - nivs], lupdate_mul);
8233 gimple_seq stmts = NULL;
8234 def = gimple_convert (&stmts, step_vectype, def);
8235 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8236 def, vec_steps[ivn % nivs]);
8237 def = gimple_convert (&stmts, vectype, def);
8238 if (gimple_code (iv) == GIMPLE_PHI)
8239 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8240 else
8242 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8243 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8245 SLP_TREE_VEC_STMTS (slp_node)
8246 .quick_push (SSA_NAME_DEF_STMT (def));
8250 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8251 gcc_assert (!new_bb);
8253 return true;
8256 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8257 loop_preheader_edge (iv_loop));
8259 gimple_seq stmts = NULL;
8260 if (!nested_in_vect_loop)
8262 /* Convert the initial value to the IV update type. */
8263 tree new_type = TREE_TYPE (step_expr);
8264 init_expr = gimple_convert (&stmts, new_type, init_expr);
8266 /* If we are using the loop mask to "peel" for alignment then we need
8267 to adjust the start value here. */
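      /* Illustrative sketch: if the masked first iteration skips
	 skip_niters == 3 scalar iterations, the start value becomes
	 init - 3 * step, so that lane 3, the first active lane, still
	 sees the original initial value once the per-lane step offsets
	 are added.  */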
8268 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8269 if (skip_niters != NULL_TREE)
8271 if (FLOAT_TYPE_P (vectype))
8272 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8273 skip_niters);
8274 else
8275 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8276 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8277 skip_niters, step_expr);
8278 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8279 init_expr, skip_step);
8283 if (stmts)
8285 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8286 gcc_assert (!new_bb);
8289 /* Create the vector that holds the initial_value of the induction. */
8290 if (nested_in_vect_loop)
8292 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8293 been created during vectorization of previous stmts. We obtain it
8294 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8295 auto_vec<tree> vec_inits;
8296 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8297 init_expr, &vec_inits);
8298 vec_init = vec_inits[0];
8299 /* If the initial value is not of proper type, convert it. */
8300 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8302 new_stmt
8303 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8304 vect_simple_var,
8305 "vec_iv_"),
8306 VIEW_CONVERT_EXPR,
8307 build1 (VIEW_CONVERT_EXPR, vectype,
8308 vec_init));
8309 vec_init = gimple_assign_lhs (new_stmt);
8310 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8311 new_stmt);
8312 gcc_assert (!new_bb);
8315 else
8317 /* iv_loop is the loop to be vectorized. Create:
8318 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8319 stmts = NULL;
8320 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8322 unsigned HOST_WIDE_INT const_nunits;
8323 if (nunits.is_constant (&const_nunits))
8325 tree_vector_builder elts (step_vectype, const_nunits, 1);
8326 elts.quick_push (new_name);
8327 for (i = 1; i < const_nunits; i++)
8329 /* Create: new_name_i = new_name + step_expr */
8330 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8331 new_name, step_expr);
8332 elts.quick_push (new_name);
8334 /* Create a vector from [new_name_0, new_name_1, ...,
8335 new_name_nunits-1] */
8336 vec_init = gimple_build_vector (&stmts, &elts);
8338 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8339 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8340 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8341 new_name, step_expr);
8342 else
8344 /* Build:
8345 [base, base, base, ...]
8346 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8347 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8348 gcc_assert (flag_associative_math);
8349 tree index = build_index_vector (step_vectype, 0, 1);
8350 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8351 new_name);
8352 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8353 step_expr);
8354 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8355 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8356 vec_init, step_vec);
8357 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8358 vec_init, base_vec);
8360 vec_init = gimple_convert (&stmts, vectype, vec_init);
8362 if (stmts)
8364 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8365 gcc_assert (!new_bb);
8370 /* Create the vector that holds the step of the induction. */
8371 if (nested_in_vect_loop)
8372 /* iv_loop is nested in the loop to be vectorized. Generate:
8373 vec_step = [S, S, S, S] */
8374 new_name = step_expr;
8375 else
8377 /* iv_loop is the loop to be vectorized. Generate:
8378 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8379 gimple_seq seq = NULL;
8380 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8382 expr = build_int_cst (integer_type_node, vf);
8383 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8385 else
8386 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8387 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8388 expr, step_expr);
8389 if (seq)
8391 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8392 gcc_assert (!new_bb);
8396 t = unshare_expr (new_name);
8397 gcc_assert (CONSTANT_CLASS_P (new_name)
8398 || TREE_CODE (new_name) == SSA_NAME);
8399 new_vec = build_vector_from_val (step_vectype, t);
8400 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8401 new_vec, step_vectype, NULL);
8404 /* Create the following def-use cycle:
8405 loop prolog:
8406 vec_init = ...
8407 vec_step = ...
8408 loop:
8409 vec_iv = PHI <vec_init, vec_loop>
8411 STMT
8413 vec_loop = vec_iv + vec_step; */
8415 /* Create the induction-phi that defines the induction-operand. */
8416 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8417 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8418 induc_def = PHI_RESULT (induction_phi);
8420 /* Create the iv update inside the loop */
8421 stmts = NULL;
8422 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8423 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8424 vec_def = gimple_convert (&stmts, vectype, vec_def);
8425 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8426 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8428 /* Set the arguments of the phi node: */
8429 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8430 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8431 UNKNOWN_LOCATION);
8433 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8434 *vec_stmt = induction_phi;
8436 /* In case the vectorization factor (VF) is bigger than the number
8437 of elements that we can fit in a vectype (nunits), we have to generate
8438 more than one vector stmt, i.e. we need to "unroll" the
8439 vector stmt by a factor of VF/nunits. For more details see the
8440 documentation in vectorizable_operation. */
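/* For example, a VF of 8 with 4-element vectors needs ncopies = 2; the
   second copy is offset from the first by nunits * S, which is exactly
   the step that is built below. */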
8442 if (ncopies > 1)
8444 gimple_seq seq = NULL;
8445 /* FORNOW. This restriction should be relaxed. */
8446 gcc_assert (!nested_in_vect_loop);
8448 /* Create the vector that holds the step of the induction. */
8449 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8451 expr = build_int_cst (integer_type_node, nunits);
8452 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8454 else
8455 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8456 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8457 expr, step_expr);
8458 if (seq)
8460 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8461 gcc_assert (!new_bb);
8464 t = unshare_expr (new_name);
8465 gcc_assert (CONSTANT_CLASS_P (new_name)
8466 || TREE_CODE (new_name) == SSA_NAME);
8467 new_vec = build_vector_from_val (step_vectype, t);
8468 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8469 new_vec, step_vectype, NULL);
8471 vec_def = induc_def;
8472 for (i = 1; i < ncopies; i++)
8474 /* vec_i = vec_prev + vec_step */
8475 gimple_seq stmts = NULL;
8476 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8477 vec_def = gimple_build (&stmts,
8478 PLUS_EXPR, step_vectype, vec_def, vec_step);
8479 vec_def = gimple_convert (&stmts, vectype, vec_def);
8481 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8482 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8483 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8487 if (dump_enabled_p ())
8488 dump_printf_loc (MSG_NOTE, vect_location,
8489 "transform induction: created def-use cycle: %G%G",
8490 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8492 return true;
8495 /* Function vectorizable_live_operation.
8497 STMT_INFO computes a value that is used outside the loop. Check if
8498 it can be supported. */
8500 bool
8501 vectorizable_live_operation (vec_info *vinfo,
8502 stmt_vec_info stmt_info,
8503 gimple_stmt_iterator *gsi,
8504 slp_tree slp_node, slp_instance slp_node_instance,
8505 int slp_index, bool vec_stmt_p,
8506 stmt_vector_for_cost *cost_vec)
8508 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8509 imm_use_iterator imm_iter;
8510 tree lhs, lhs_type, bitsize;
8511 tree vectype = (slp_node
8512 ? SLP_TREE_VECTYPE (slp_node)
8513 : STMT_VINFO_VECTYPE (stmt_info));
8514 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8515 int ncopies;
8516 gimple *use_stmt;
8517 auto_vec<tree> vec_oprnds;
8518 int vec_entry = 0;
8519 poly_uint64 vec_index = 0;
8521 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8523 /* If a stmt of a reduction is live, vectorize it via
8524 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8525 validity so just trigger the transform here. */
8526 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8528 if (!vec_stmt_p)
8529 return true;
8530 if (slp_node)
8532 /* For reduction chains the meta-info is attached to
8533 the group leader. */
8534 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8535 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8536 /* For SLP reductions we vectorize the epilogue for
8537 all involved stmts together. */
8538 else if (slp_index != 0)
8539 return true;
8540 else
8541 /* For SLP reductions the meta-info is attached to
8542 the representative. */
8543 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8545 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8546 gcc_assert (reduc_info->is_reduc_info);
8547 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8548 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8549 return true;
8550 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8551 slp_node_instance);
8552 return true;
8555 /* If STMT is not relevant and it is a simple assignment and its inputs are
8556 invariant then it can remain in place, unvectorized. The original last
8557 scalar value that it computes will be used. */
8558 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8560 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8561 if (dump_enabled_p ())
8562 dump_printf_loc (MSG_NOTE, vect_location,
8563 "statement is simple and uses invariant. Leaving in "
8564 "place.\n");
8565 return true;
8568 if (slp_node)
8569 ncopies = 1;
8570 else
8571 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8573 if (slp_node)
8575 gcc_assert (slp_index >= 0);
8577 /* Get the last occurrence of the scalar index from the concatenation of
8578 all the slp vectors. Calculate which slp vector it is and the index
8579 within. */
8580 int num_scalar = SLP_TREE_LANES (slp_node);
8581 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8582 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
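/* For example, two vectors of four lanes holding six scalar lanes give
   pos = 2*4 - 6 + slp_index; slp_index 5 yields pos 7, which the
   division below splits into vec_entry 1 and vec_index 3. */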
8584 /* Calculate which vector contains the result, and which lane of
8585 that vector we need. */
8586 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8588 if (dump_enabled_p ())
8589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8590 "Cannot determine which vector holds the"
8591 " final result.\n");
8592 return false;
8596 if (!vec_stmt_p)
8598 /* No transformation required. */
8599 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8601 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8602 OPTIMIZE_FOR_SPEED))
8604 if (dump_enabled_p ())
8605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8606 "can't operate on partial vectors "
8607 "because the target doesn't support extract "
8608 "last reduction.\n");
8609 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8611 else if (slp_node)
8613 if (dump_enabled_p ())
8614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8615 "can't operate on partial vectors "
8616 "because an SLP statement is live after "
8617 "the loop.\n");
8618 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8620 else if (ncopies > 1)
8622 if (dump_enabled_p ())
8623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8624 "can't operate on partial vectors "
8625 "because ncopies is greater than 1.\n");
8626 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8628 else
8630 gcc_assert (ncopies == 1 && !slp_node);
8631 vect_record_loop_mask (loop_vinfo,
8632 &LOOP_VINFO_MASKS (loop_vinfo),
8633 1, vectype, NULL);
8636 /* ??? Enable for loop costing as well. */
8637 if (!loop_vinfo)
8638 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8639 0, vect_epilogue);
8640 return true;
8643 /* Use the lhs of the original scalar statement. */
8644 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8645 if (dump_enabled_p ())
8646 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8647 "stmt %G", stmt);
8649 lhs = gimple_get_lhs (stmt);
8650 lhs_type = TREE_TYPE (lhs);
8652 bitsize = vector_element_bits_tree (vectype);
8654 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8655 tree vec_lhs, bitstart;
8656 gimple *vec_stmt;
8657 if (slp_node)
8659 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8661 /* Get the correct slp vectorized stmt. */
8662 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8663 vec_lhs = gimple_get_lhs (vec_stmt);
8665 /* Get entry to use. */
8666 bitstart = bitsize_int (vec_index);
8667 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8669 else
8671 /* For multiple copies, get the last copy. */
8672 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8673 vec_lhs = gimple_get_lhs (vec_stmt);
8675 /* Get the last lane in the vector. */
8676 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
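/* For example, a four-element vector of 32-bit elements gives
   bitstart = 32 * 3 = 96, i.e. the start of the last lane. */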
8679 if (loop_vinfo)
8681 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8682 loop-closed PHI requirement, insert one phi node for it. It looks like:
8683 loop;
8685 # lhs' = PHI <lhs>
8687 loop;
8689 # vec_lhs' = PHI <vec_lhs>
8690 new_tree = lane_extract <vec_lhs', ...>;
8691 lhs' = new_tree; */
8693 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8694 basic_block exit_bb = single_exit (loop)->dest;
8695 gcc_assert (single_pred_p (exit_bb));
8697 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8698 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8699 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8701 gimple_seq stmts = NULL;
8702 tree new_tree;
8703 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8705 /* Emit:
8707 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8709 where VEC_LHS is the vectorized live-out result and MASK is
8710 the loop mask for the final iteration. */
8711 gcc_assert (ncopies == 1 && !slp_node);
8712 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8713 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8714 1, vectype, 0);
8715 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8716 mask, vec_lhs_phi);
8718 /* Convert the extracted vector element to the scalar type. */
8719 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8721 else
8723 tree bftype = TREE_TYPE (vectype);
8724 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8725 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8726 new_tree = build3 (BIT_FIELD_REF, bftype,
8727 vec_lhs_phi, bitsize, bitstart);
8728 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8729 &stmts, true, NULL_TREE);
8732 if (stmts)
8734 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8735 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8737 /* Remove the existing phi node for lhs and create a copy assignment from new_tree. */
8738 tree lhs_phi = NULL_TREE;
8739 gimple_stmt_iterator gsi;
8740 for (gsi = gsi_start_phis (exit_bb);
8741 !gsi_end_p (gsi); gsi_next (&gsi))
8743 gimple *phi = gsi_stmt (gsi);
8744 if (gimple_phi_arg_def (phi, 0) == lhs)
8746 remove_phi_node (&gsi, false);
8747 lhs_phi = gimple_phi_result (phi);
8748 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8749 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8750 break;
8755 /* Replace uses of lhs with the newly computed result. If the use stmt is
8756 a single-argument PHI, just replace all uses of the PHI result. This is
8757 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8758 use_operand_p use_p;
8759 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8760 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8761 && !is_gimple_debug (use_stmt))
8763 if (gimple_code (use_stmt) == GIMPLE_PHI
8764 && gimple_phi_num_args (use_stmt) == 1)
8766 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8768 else
8770 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8771 SET_USE (use_p, new_tree);
8773 update_stmt (use_stmt);
8776 else
8778 /* For basic-block vectorization simply insert the lane-extraction. */
8779 tree bftype = TREE_TYPE (vectype);
8780 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8781 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8782 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8783 vec_lhs, bitsize, bitstart);
8784 gimple_seq stmts = NULL;
8785 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8786 &stmts, true, NULL_TREE);
8787 if (TREE_CODE (new_tree) == SSA_NAME
8788 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8789 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8790 if (is_a <gphi *> (vec_stmt))
8792 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8793 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8795 else
8797 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8798 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
9801 /* Replace uses of lhs with the newly computed result. If the use stmt is
9802 a single-argument PHI, just replace all uses of the PHI result. This is
9803 necessary because the LC SSA PHI defining lhs may appear before the newly inserted stmt. */
8804 use_operand_p use_p;
8805 stmt_vec_info use_stmt_info;
8806 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8807 if (!is_gimple_debug (use_stmt)
8808 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8809 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8811 /* ??? This can happen when the live lane ends up being
8812 used in a vector construction code-generated by an
8813 external SLP node (and code-generation for that already
8814 happened). See gcc.dg/vect/bb-slp-47.c.
8815 Doing this is what would happen if that vector CTOR
8816 were not code-generated yet so it is not too bad.
8817 ??? In fact we'd likely want to avoid this situation
8818 in the first place. */
8819 if (TREE_CODE (new_tree) == SSA_NAME
8820 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8821 && gimple_code (use_stmt) != GIMPLE_PHI
8822 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8823 use_stmt))
8825 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8826 gcc_assert (code == CONSTRUCTOR
8827 || code == VIEW_CONVERT_EXPR
8828 || CONVERT_EXPR_CODE_P (code));
8829 if (dump_enabled_p ())
8830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8831 "Using original scalar computation for "
8832 "live lane because use precedes vector "
8833 "def\n");
8834 continue;
8836 /* ??? It can also happen that we end up pulling a def into
8837 a loop where replacing out-of-loop uses would require
8838 a new LC SSA PHI node. Retain the original scalar in
8839 those cases as well. PR98064. */
8840 if (TREE_CODE (new_tree) == SSA_NAME
8841 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8842 && (gimple_bb (use_stmt)->loop_father
8843 != gimple_bb (vec_stmt)->loop_father)
8844 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8845 gimple_bb (use_stmt)->loop_father))
8847 if (dump_enabled_p ())
8848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8849 "Using original scalar computation for "
8850 "live lane because there is an out-of-loop "
8851 "definition for it\n");
8852 continue;
8854 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8855 SET_USE (use_p, new_tree);
8856 update_stmt (use_stmt);
8860 return true;
8863 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8865 static void
8866 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8868 ssa_op_iter op_iter;
8869 imm_use_iterator imm_iter;
8870 def_operand_p def_p;
8871 gimple *ustmt;
8873 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8875 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8877 basic_block bb;
8879 if (!is_gimple_debug (ustmt))
8880 continue;
8882 bb = gimple_bb (ustmt);
8884 if (!flow_bb_inside_loop_p (loop, bb))
8886 if (gimple_debug_bind_p (ustmt))
8888 if (dump_enabled_p ())
8889 dump_printf_loc (MSG_NOTE, vect_location,
8890 "killing debug use\n");
8892 gimple_debug_bind_reset_value (ustmt);
8893 update_stmt (ustmt);
8895 else
8896 gcc_unreachable ();
8902 /* Given loop represented by LOOP_VINFO, return true if computation of
8903 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8904 otherwise. */
8906 static bool
8907 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8909 /* Constant case. */
8910 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8912 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8913 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8915 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8916 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8917 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8918 return true;
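/* Otherwise fall back to the loop's iteration bound: if the maximum
   number of latch iterations is strictly below the maximum value of
   the niters type, NITERSM1 + 1 cannot wrap; e.g. for an unsigned
   32-bit niters type a bound of at most 0xfffffffe latch iterations
   suffices. */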
8921 widest_int max;
8922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8923 /* Check the upper bound of loop niters. */
8924 if (get_max_loop_iterations (loop, &max))
8926 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8927 signop sgn = TYPE_SIGN (type);
8928 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8929 if (max < type_max)
8930 return true;
8932 return false;
8935 /* Return a mask type with half the number of elements as OLD_TYPE,
8936 given that it should have mode NEW_MODE. */
8938 tree
8939 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8941 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8942 return build_truth_vector_type_for_mode (nunits, new_mode);
8945 /* Return a mask type with twice as many elements as OLD_TYPE,
8946 given that it should have mode NEW_MODE. */
8948 tree
8949 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8951 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8952 return build_truth_vector_type_for_mode (nunits, new_mode);
8955 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8956 contain a sequence of NVECTORS masks that each control a vector of type
8957 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8958 these vector masks with the vector version of SCALAR_MASK. */
8960 void
8961 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8962 unsigned int nvectors, tree vectype, tree scalar_mask)
8964 gcc_assert (nvectors != 0);
8965 if (masks->length () < nvectors)
8966 masks->safe_grow_cleared (nvectors, true);
8967 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8968 /* The number of scalars per iteration and the number of vectors are
8969 both compile-time constants. */
8970 unsigned int nscalars_per_iter
8971 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8972 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
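/* For example, with a vectorization factor of 8, an rgroup of two
   8-element vectors controls 2 scalars per iteration. */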
8974 if (scalar_mask)
8976 scalar_cond_masked_key cond (scalar_mask, nvectors);
8977 loop_vinfo->scalar_cond_masked_set.add (cond);
8980 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8982 rgm->max_nscalars_per_iter = nscalars_per_iter;
8983 rgm->type = truth_type_for (vectype);
8984 rgm->factor = 1;
8988 /* Given a complete set of masks MASKS, extract mask number INDEX
8989 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8990 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8992 See the comment above vec_loop_masks for more details about the mask
8993 arrangement. */
8995 tree
8996 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8997 unsigned int nvectors, tree vectype, unsigned int index)
8999 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9000 tree mask_type = rgm->type;
9002 /* Populate the rgroup's mask array, if this is the first time we've
9003 used it. */
9004 if (rgm->controls.is_empty ())
9006 rgm->controls.safe_grow_cleared (nvectors, true);
9007 for (unsigned int i = 0; i < nvectors; ++i)
9009 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9010 /* Provide a dummy definition until the real one is available. */
9011 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9012 rgm->controls[i] = mask;
9016 tree mask = rgm->controls[index];
9017 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9018 TYPE_VECTOR_SUBPARTS (vectype)))
9020 /* A loop mask for data type X can be reused for data type Y
9021 if X has N times more elements than Y and if Y's elements
9022 are N times bigger than X's. In this case each sequence
9023 of N elements in the loop mask will be all-zero or all-one.
9024 We can then view-convert the mask so that each sequence of
9025 N elements is replaced by a single element. */
9026 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9027 TYPE_VECTOR_SUBPARTS (vectype)));
9028 gimple_seq seq = NULL;
9029 mask_type = truth_type_for (vectype);
9030 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9031 if (seq)
9032 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9034 return mask;
9037 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9038 lengths for controlling an operation on VECTYPE. The operation splits
9039 each element of VECTYPE into FACTOR separate subelements, measuring the
9040 length as a number of these subelements. */
9042 void
9043 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9044 unsigned int nvectors, tree vectype, unsigned int factor)
9046 gcc_assert (nvectors != 0);
9047 if (lens->length () < nvectors)
9048 lens->safe_grow_cleared (nvectors, true);
9049 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9051 /* The number of scalars per iteration, the bytes occupied per scalar
9052 and the number of vectors are all compile-time constants. */
9053 unsigned int nscalars_per_iter
9054 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9055 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9057 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9059 /* For now, we only support cases in which all loads and stores fall back
9060 to VnQI or none do. */
9061 gcc_assert (!rgl->max_nscalars_per_iter
9062 || (rgl->factor == 1 && factor == 1)
9063 || (rgl->max_nscalars_per_iter * rgl->factor
9064 == nscalars_per_iter * factor));
9065 rgl->max_nscalars_per_iter = nscalars_per_iter;
9066 rgl->type = vectype;
9067 rgl->factor = factor;
9071 /* Given a complete set of lengths LENS, extract length number INDEX for an
9072 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9074 tree
9075 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9076 unsigned int nvectors, unsigned int index)
9078 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9080 /* Populate the rgroup's len array, if this is the first time we've
9081 used it. */
9082 if (rgl->controls.is_empty ())
9084 rgl->controls.safe_grow_cleared (nvectors, true);
9085 for (unsigned int i = 0; i < nvectors; ++i)
9087 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9088 gcc_assert (len_type != NULL_TREE);
9089 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9091 /* Provide a dummy definition until the real one is available. */
9092 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9093 rgl->controls[i] = len;
9097 return rgl->controls[index];
9100 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
9101 using the estimated number of iterations. */
9103 static void
9104 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9106 edge preheader = loop_preheader_edge (loop);
9107 /* Reduce loop iterations by the vectorization factor. */
9108 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
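/* For example, a loop previously estimated to iterate about 100 times
   that is vectorized by a factor of 4 is now expected to iterate
   roughly 25 times. */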
9109 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9111 if (freq_h.nonzero_p ())
9113 profile_probability p;
9115 /* Avoid dropping loop body profile counter to 0 because of zero count
9116 in loop's preheader. */
9117 if (!(freq_e == profile_count::zero ()))
9118 freq_e = freq_e.force_nonzero ();
9119 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9120 scale_loop_frequencies (loop, p);
9123 edge exit_e = single_exit (loop);
9124 exit_e->probability = profile_probability::always ()
9125 .apply_scale (1, new_est_niter + 1);
9127 edge exit_l = single_pred_edge (loop->latch);
9128 profile_probability prob = exit_l->probability;
9129 exit_l->probability = exit_e->probability.invert ();
9130 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9131 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9134 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9135 latch edge values originally defined by it. */
9137 static void
9138 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9139 stmt_vec_info def_stmt_info)
9141 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9142 if (!def || TREE_CODE (def) != SSA_NAME)
9143 return;
9144 stmt_vec_info phi_info;
9145 imm_use_iterator iter;
9146 use_operand_p use_p;
9147 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9148 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9149 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9150 && (phi_info = loop_vinfo->lookup_stmt (phi))
9151 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9152 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9153 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9155 loop_p loop = gimple_bb (phi)->loop_father;
9156 edge e = loop_latch_edge (loop);
9157 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9159 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9160 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9161 gcc_assert (phi_defs.length () == latch_defs.length ());
9162 for (unsigned i = 0; i < phi_defs.length (); ++i)
9163 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9164 gimple_get_lhs (latch_defs[i]), e,
9165 gimple_phi_arg_location (phi, e->dest_idx));
9170 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9171 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9172 stmt_vec_info. */
9174 static bool
9175 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9176 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9178 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9179 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9181 if (dump_enabled_p ())
9182 dump_printf_loc (MSG_NOTE, vect_location,
9183 "------>vectorizing statement: %G", stmt_info->stmt);
9185 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9186 vect_loop_kill_debug_uses (loop, stmt_info);
9188 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9189 && !STMT_VINFO_LIVE_P (stmt_info))
9190 return false;
9192 if (STMT_VINFO_VECTYPE (stmt_info))
9194 poly_uint64 nunits
9195 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9196 if (!STMT_SLP_TYPE (stmt_info)
9197 && maybe_ne (nunits, vf)
9198 && dump_enabled_p ())
9199 /* For SLP VF is set according to unrolling factor, and not
9200 to vector size, hence for SLP this print is not valid. */
9201 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9204 /* Pure SLP statements have already been vectorized. We still need
9205 to apply loop vectorization to hybrid SLP statements. */
9206 if (PURE_SLP_STMT (stmt_info))
9207 return false;
9209 if (dump_enabled_p ())
9210 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9212 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9213 *seen_store = stmt_info;
9215 return true;
9218 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9219 in the hash_map with their corresponding values. */
9221 static tree
9222 find_in_mapping (tree t, void *context)
9224 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9226 tree *value = mapping->get (t);
9227 return value ? *value : t;
9230 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9231 original loop that has now been vectorized.
9233 The inits of the data_references need to be advanced by the number of
9234 iterations of the main loop. This has been computed in vect_do_peeling and
9235 is stored in parameter ADVANCE. We first restore the data_references'
9236 initial offset with the values recorded in ORIG_DRS_INIT.
9238 Since the loop_vec_info of this EPILOGUE was constructed for the original
9239 loop, its stmt_vec_infos all point to the original statements. These need
9240 to be updated to point to their corresponding copies as well as the SSA_NAMES
9241 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9243 The data_references' connections also need to be updated. Their
9244 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9245 stmt_vec_infos, their statements need to point to their corresponding copy,
9246 and if they are gather loads or scatter stores then their references need
9247 to be updated to point to their corresponding copies. Finally we set
9248 'base_misaligned' to false as we have already peeled for alignment in the
9249 prologue of the main loop. */
9251 static void
9252 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9254 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9255 auto_vec<gimple *> stmt_worklist;
9256 hash_map<tree,tree> mapping;
9257 gimple *orig_stmt, *new_stmt;
9258 gimple_stmt_iterator epilogue_gsi;
9259 gphi_iterator epilogue_phi_gsi;
9260 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9261 basic_block *epilogue_bbs = get_loop_body (epilogue);
9262 unsigned i;
9264 free (LOOP_VINFO_BBS (epilogue_vinfo));
9265 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9267 /* Advance the data_references by the number of iterations of the previous
9268 loop and its prologue. */
9269 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9272 /* The EPILOGUE loop is a copy of the original loop so they share the same
9273 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9274 point to the copied statements. We also create a mapping of all LHS' in
9275 the original loop and all the LHS' in the EPILOGUE and create worklists to
9276 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9277 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9279 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9280 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9282 new_stmt = epilogue_phi_gsi.phi ();
9284 gcc_assert (gimple_uid (new_stmt) > 0);
9285 stmt_vinfo
9286 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9288 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9289 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9291 mapping.put (gimple_phi_result (orig_stmt),
9292 gimple_phi_result (new_stmt));
9293 /* PHI nodes cannot have patterns or related statements. */
9294 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9295 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9298 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9299 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9301 new_stmt = gsi_stmt (epilogue_gsi);
9302 if (is_gimple_debug (new_stmt))
9303 continue;
9305 gcc_assert (gimple_uid (new_stmt) > 0);
9306 stmt_vinfo
9307 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9309 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9310 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9312 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9313 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9315 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9317 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9318 for (gimple_stmt_iterator gsi = gsi_start (seq);
9319 !gsi_end_p (gsi); gsi_next (&gsi))
9320 stmt_worklist.safe_push (gsi_stmt (gsi));
9323 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9324 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9326 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9327 stmt_worklist.safe_push (stmt);
9328 /* Set BB such that the assert in
9329 'get_initial_def_for_reduction' is able to determine that
9330 the BB of the related stmt is inside this loop. */
9331 gimple_set_bb (stmt,
9332 gimple_bb (new_stmt));
9333 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9334 gcc_assert (related_vinfo == NULL
9335 || related_vinfo == stmt_vinfo);
9340 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9341 using the original main loop and thus need to be updated to refer to the
9342 cloned variables used in the epilogue. */
9343 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9345 gimple *stmt = stmt_worklist[i];
9346 tree *new_op;
9348 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9350 tree op = gimple_op (stmt, j);
9351 if ((new_op = mapping.get(op)))
9352 gimple_set_op (stmt, j, *new_op);
9353 else
9355 /* PR92429: The last argument of simplify_replace_tree disables
9356 folding when replacing arguments. This is required as
9357 otherwise you might end up with different statements than the
9358 ones analyzed in vect_loop_analyze, leading to different
9359 vectorization. */
9360 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9361 &find_in_mapping, &mapping, false);
9362 gimple_set_op (stmt, j, op);
9367 struct data_reference *dr;
9368 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9369 FOR_EACH_VEC_ELT (datarefs, i, dr)
9371 orig_stmt = DR_STMT (dr);
9372 gcc_assert (gimple_uid (orig_stmt) > 0);
9373 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9374 /* Data references for gather loads and scatter stores do not use the
9375 updated offset we set using ADVANCE. Instead we have to make sure the
9376 references in the data references point to the corresponding copies of
9377 the originals in the epilogue. */
9378 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9379 == VMAT_GATHER_SCATTER)
9381 DR_REF (dr)
9382 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9383 &find_in_mapping, &mapping);
9384 DR_BASE_ADDRESS (dr)
9385 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9386 &find_in_mapping, &mapping);
9388 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9389 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9390 /* The vector size of the epilogue is smaller than that of the main loop,
9391 so the alignment requirement is either the same or lower. This means
9392 the dr will by definition be aligned. */
9393 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9396 epilogue_vinfo->shared->datarefs_copy.release ();
9397 epilogue_vinfo->shared->save_datarefs ();
9400 /* Function vect_transform_loop.
9402 The analysis phase has determined that the loop is vectorizable.
9403 Vectorize the loop: create vectorized stmts to replace the scalar
9404 stmts in the loop, and update the loop exit condition.
9405 Returns the scalar epilogue loop, if any. */
9407 class loop *
9408 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9410 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9411 class loop *epilogue = NULL;
9412 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9413 int nbbs = loop->num_nodes;
9414 int i;
9415 tree niters_vector = NULL_TREE;
9416 tree step_vector = NULL_TREE;
9417 tree niters_vector_mult_vf = NULL_TREE;
9418 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9419 unsigned int lowest_vf = constant_lower_bound (vf);
9420 gimple *stmt;
9421 bool check_profitability = false;
9422 unsigned int th;
9424 DUMP_VECT_SCOPE ("vec_transform_loop");
9426 loop_vinfo->shared->check_datarefs ();
9428 /* Use the more conservative vectorization threshold. If the number
9429 of iterations is constant, assume the cost check has been performed
9430 by our caller. If the threshold makes all loops profitable that
9431 run at least the (estimated) vectorization factor number of times,
9432 checking is pointless, too. */
9433 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9434 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9436 if (dump_enabled_p ())
9437 dump_printf_loc (MSG_NOTE, vect_location,
9438 "Profitability threshold is %d loop iterations.\n",
9439 th);
9440 check_profitability = true;
9443 /* Make sure there exists a single-predecessor exit bb. Do this before
9444 versioning. */
9445 edge e = single_exit (loop);
9446 if (! single_pred_p (e->dest))
9448 split_loop_exit_edge (e, true);
9449 if (dump_enabled_p ())
9450 dump_printf (MSG_NOTE, "split exit edge\n");
9453 /* Version the loop first, if required, so the profitability check
9454 comes first. */
9456 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9458 class loop *sloop
9459 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9460 sloop->force_vectorize = false;
9461 check_profitability = false;
9464 /* Make sure there exists a single-predecessor exit bb also on the
9465 scalar loop copy. Do this after versioning but before peeling
9466 so CFG structure is fine for both scalar and if-converted loop
9467 to make slpeel_duplicate_current_defs_from_edges face matched
9468 loop closed PHI nodes on the exit. */
9469 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9471 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9472 if (! single_pred_p (e->dest))
9474 split_loop_exit_edge (e, true);
9475 if (dump_enabled_p ())
9476 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9480 tree niters = vect_build_loop_niters (loop_vinfo);
9481 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9482 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9483 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9484 tree advance;
9485 drs_init_vec orig_drs_init;
9487 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9488 &step_vector, &niters_vector_mult_vf, th,
9489 check_profitability, niters_no_overflow,
9490 &advance);
9492 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9493 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9494 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9495 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9497 if (niters_vector == NULL_TREE)
9499 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9500 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9501 && known_eq (lowest_vf, vf))
9503 niters_vector
9504 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9505 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9506 step_vector = build_one_cst (TREE_TYPE (niters));
9508 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9509 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9510 &step_vector, niters_no_overflow);
9511 else
9512 /* vect_do_peeling subtracted the number of peeled prologue
9513 iterations from LOOP_VINFO_NITERS. */
9514 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9515 &niters_vector, &step_vector,
9516 niters_no_overflow);
9519 /* 1) Make sure the loop header has exactly two entries
9520 2) Make sure we have a preheader basic block. */
9522 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9524 split_edge (loop_preheader_edge (loop));
9526 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9527 /* This will deal with any possible peeling. */
9528 vect_prepare_for_masked_peels (loop_vinfo);
9530 /* Schedule the SLP instances first, then handle loop vectorization
9531 below. */
9532 if (!loop_vinfo->slp_instances.is_empty ())
9534 DUMP_VECT_SCOPE ("scheduling SLP instances");
9535 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9538 /* FORNOW: the vectorizer supports only loops whose body consists
9539 of one basic block (header + empty latch). When the vectorizer
9540 supports more involved loop forms, the order in which the BBs are
9541 traversed will need to be reconsidered. */
9543 for (i = 0; i < nbbs; i++)
9545 basic_block bb = bbs[i];
9546 stmt_vec_info stmt_info;
9548 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9549 gsi_next (&si))
9551 gphi *phi = si.phi ();
9552 if (dump_enabled_p ())
9553 dump_printf_loc (MSG_NOTE, vect_location,
9554 "------>vectorizing phi: %G", phi);
9555 stmt_info = loop_vinfo->lookup_stmt (phi);
9556 if (!stmt_info)
9557 continue;
9559 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9560 vect_loop_kill_debug_uses (loop, stmt_info);
9562 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9563 && !STMT_VINFO_LIVE_P (stmt_info))
9564 continue;
9566 if (STMT_VINFO_VECTYPE (stmt_info)
9567 && (maybe_ne
9568 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9569 && dump_enabled_p ())
9570 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9572 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9573 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9574 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9575 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9576 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9577 && ! PURE_SLP_STMT (stmt_info))
9579 if (dump_enabled_p ())
9580 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9581 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9585 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9586 gsi_next (&si))
9588 gphi *phi = si.phi ();
9589 stmt_info = loop_vinfo->lookup_stmt (phi);
9590 if (!stmt_info)
9591 continue;
9593 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9594 && !STMT_VINFO_LIVE_P (stmt_info))
9595 continue;
9597 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9598 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9599 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9601 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9602 && ! PURE_SLP_STMT (stmt_info))
9603 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9606 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9607 !gsi_end_p (si);)
9609 stmt = gsi_stmt (si);
9610 /* During vectorization remove existing clobber stmts. */
9611 if (gimple_clobber_p (stmt))
9613 unlink_stmt_vdef (stmt);
9614 gsi_remove (&si, true);
9615 release_defs (stmt);
9617 else
9619 /* Ignore vector stmts created in the outer loop. */
9620 stmt_info = loop_vinfo->lookup_stmt (stmt);
9622 /* vector stmts created in the outer-loop during vectorization of
9623 stmts in an inner-loop may not have a stmt_info, and do not
9624 need to be vectorized. */
9625 stmt_vec_info seen_store = NULL;
9626 if (stmt_info)
9628 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9630 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9631 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9632 !gsi_end_p (subsi); gsi_next (&subsi))
9634 stmt_vec_info pat_stmt_info
9635 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9636 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9637 &si, &seen_store);
9639 stmt_vec_info pat_stmt_info
9640 = STMT_VINFO_RELATED_STMT (stmt_info);
9641 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9642 &si, &seen_store))
9643 maybe_set_vectorized_backedge_value (loop_vinfo,
9644 pat_stmt_info);
9646 else
9648 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9649 &seen_store))
9650 maybe_set_vectorized_backedge_value (loop_vinfo,
9651 stmt_info);
9654 gsi_next (&si);
9655 if (seen_store)
9657 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9658 /* Interleaving. The vectorization of the
9659 interleaving chain was completed;
9660 free all the stores in the chain. */
9661 vect_remove_stores (loop_vinfo,
9662 DR_GROUP_FIRST_ELEMENT (seen_store));
9663 else
9664 /* Free the attached stmt_vec_info and remove the stmt. */
9665 loop_vinfo->remove_stmt (stmt_info);
9670 /* Stub out scalar statements that must not survive vectorization.
9671 Doing this here helps with grouped statements, or statements that
9672 are involved in patterns. */
9673 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9674 !gsi_end_p (gsi); gsi_next (&gsi))
9676 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9677 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9679 tree lhs = gimple_get_lhs (call);
9680 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9682 tree zero = build_zero_cst (TREE_TYPE (lhs));
9683 gimple *new_stmt = gimple_build_assign (lhs, zero);
9684 gsi_replace (&gsi, new_stmt, true);
9688 } /* BBs in loop */
9690 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9691 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9692 if (integer_onep (step_vector))
9693 niters_no_overflow = true;
9694 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9695 niters_vector_mult_vf, !niters_no_overflow);
9697 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9698 scale_profile_for_vect_loop (loop, assumed_vf);
9700 /* True if the final iteration might not handle a full vector's
9701 worth of scalar iterations. */
9702 bool final_iter_may_be_partial
9703 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9704 /* The minimum number of iterations performed by the epilogue. This
9705 is 1 when peeling for gaps because we always need a final scalar
9706 iteration. */
9707 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9708 /* +1 to convert latch counts to loop iteration counts,
9709 -min_epilogue_iters to remove iterations that cannot be performed
9710 by the vector code. */
9711 int bias_for_lowest = 1 - min_epilogue_iters;
9712 int bias_for_assumed = bias_for_lowest;
9713 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9714 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9716 /* When the amount of peeling is known at compile time, the first
9717 iteration will have exactly alignment_npeels active elements.
9718 In the worst case it will have at least one. */
9719 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9720 bias_for_lowest += lowest_vf - min_first_active;
9721 bias_for_assumed += assumed_vf - min_first_active;
9723 /* In these calculations the "- 1" converts loop iteration counts
9724 back to latch counts. */
9725 if (loop->any_upper_bound)
9726 loop->nb_iterations_upper_bound
9727 = (final_iter_may_be_partial
9728 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9729 lowest_vf) - 1
9730 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9731 lowest_vf) - 1);
9732 if (loop->any_likely_upper_bound)
9733 loop->nb_iterations_likely_upper_bound
9734 = (final_iter_may_be_partial
9735 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9736 + bias_for_lowest, lowest_vf) - 1
9737 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9738 + bias_for_lowest, lowest_vf) - 1);
9739 if (loop->any_estimate)
9740 loop->nb_iterations_estimate
9741 = (final_iter_may_be_partial
9742 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9743 assumed_vf) - 1
9744 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9745 assumed_vf) - 1);
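/* For example, assuming no peeling for gaps or alignment (so the bias
   is 1) and a full final vector iteration, an upper bound of 103 latch
   iterations with lowest_vf 4 becomes floor ((103 + 1) / 4) - 1 = 25
   latch iterations of the vector loop. */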
9747 if (dump_enabled_p ())
9749 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9751 dump_printf_loc (MSG_NOTE, vect_location,
9752 "LOOP VECTORIZED\n");
9753 if (loop->inner)
9754 dump_printf_loc (MSG_NOTE, vect_location,
9755 "OUTER LOOP VECTORIZED\n");
9756 dump_printf (MSG_NOTE, "\n");
9758 else
9759 dump_printf_loc (MSG_NOTE, vect_location,
9760 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9761 GET_MODE_NAME (loop_vinfo->vector_mode));
9764 /* Loops vectorized with a variable factor won't benefit from
9765 unrolling/peeling. */
9766 if (!vf.is_constant ())
9768 loop->unroll = 1;
9769 if (dump_enabled_p ())
9770 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9771 " variable-length vectorization factor\n");
9773 /* Free SLP instances here because otherwise stmt reference counting
9774 won't work. */
9775 slp_instance instance;
9776 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9777 vect_free_slp_instance (instance);
9778 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9779 /* Clear the safelen field since its value is invalid after vectorization:
9780 the vectorized loop can have loop-carried dependencies. */
9781 loop->safelen = 0;
9783 if (epilogue)
9785 update_epilogue_loop_vinfo (epilogue, advance);
9787 epilogue->simduid = loop->simduid;
9788 epilogue->force_vectorize = loop->force_vectorize;
9789 epilogue->dont_vectorize = false;
9792 return epilogue;
9795 /* The code below tries to perform a simple optimization: revert
9796 if-conversion for masked stores, i.e. if the mask of a store is zero,
9797 do not perform it and, if possible, skip all stored-value producers too.
9798 For example,
9799 for (i=0; i<n; i++)
9800 if (c[i])
9802 p1[i] += 1;
9803 p2[i] = p3[i] +2;
9805 this transformation will produce the following semi-hammock:
9807 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9809 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9810 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9811 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9812 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9813 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9814 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9818 void
9819 optimize_mask_stores (class loop *loop)
9821 basic_block *bbs = get_loop_body (loop);
9822 unsigned nbbs = loop->num_nodes;
9823 unsigned i;
9824 basic_block bb;
9825 class loop *bb_loop;
9826 gimple_stmt_iterator gsi;
9827 gimple *stmt;
9828 auto_vec<gimple *> worklist;
9829 auto_purge_vect_location sentinel;
9831 vect_location = find_loop_location (loop);
9832 /* Pick up all masked stores in loop if any. */
9833 for (i = 0; i < nbbs; i++)
9835 bb = bbs[i];
9836 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9837 gsi_next (&gsi))
9839 stmt = gsi_stmt (gsi);
9840 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9841 worklist.safe_push (stmt);
9845 free (bbs);
9846 if (worklist.is_empty ())
9847 return;
9849 /* Loop has masked stores. */
9850 while (!worklist.is_empty ())
9852 gimple *last, *last_store;
9853 edge e, efalse;
9854 tree mask;
9855 basic_block store_bb, join_bb;
9856 gimple_stmt_iterator gsi_to;
9857 tree vdef, new_vdef;
9858 gphi *phi;
9859 tree vectype;
9860 tree zero;
9862 last = worklist.pop ();
9863 mask = gimple_call_arg (last, 2);
9864 bb = gimple_bb (last);
9865 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9866 to the same loop as if_bb. It could be different from LOOP when a
9867 two-level loop nest is vectorized and the mask_store belongs to the
9868 inner one. */
9869 e = split_block (bb, last);
9870 bb_loop = bb->loop_father;
9871 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9872 join_bb = e->dest;
9873 store_bb = create_empty_bb (bb);
9874 add_bb_to_loop (store_bb, bb_loop);
9875 e->flags = EDGE_TRUE_VALUE;
9876 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9877 /* Put STORE_BB on the unlikely path. */
9878 efalse->probability = profile_probability::unlikely ();
9879 store_bb->count = efalse->count ();
9880 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9881 if (dom_info_available_p (CDI_DOMINATORS))
9882 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9883 if (dump_enabled_p ())
9884 dump_printf_loc (MSG_NOTE, vect_location,
9885 "Create new block %d to sink mask stores.",
9886 store_bb->index);
9887 /* Create vector comparison with boolean result. */
9888 vectype = TREE_TYPE (mask);
9889 zero = build_zero_cst (vectype);
9890 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9891 gsi = gsi_last_bb (bb);
9892 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9893 /* Create new PHI node for vdef of the last masked store:
9894 .MEM_2 = VDEF <.MEM_1>
9895 will be converted to
9896 .MEM.3 = VDEF <.MEM_1>
9897 and new PHI node will be created in join bb
9898 .MEM_2 = PHI <.MEM_1, .MEM_3>
9900 vdef = gimple_vdef (last);
9901 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9902 gimple_set_vdef (last, new_vdef);
9903 phi = create_phi_node (vdef, join_bb);
9904 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9906 /* Put all masked stores with the same mask to STORE_BB if possible. */
9907 while (true)
9909 gimple_stmt_iterator gsi_from;
9910 gimple *stmt1 = NULL;
9912 /* Move masked store to STORE_BB. */
9913 last_store = last;
9914 gsi = gsi_for_stmt (last);
9915 gsi_from = gsi;
9916 /* Shift GSI to the previous stmt for further traversal. */
9917 gsi_prev (&gsi);
9918 gsi_to = gsi_start_bb (store_bb);
9919 gsi_move_before (&gsi_from, &gsi_to);
9920 /* Setup GSI_TO to the non-empty block start. */
9921 gsi_to = gsi_start_bb (store_bb);
9922 if (dump_enabled_p ())
9923 dump_printf_loc (MSG_NOTE, vect_location,
9924 "Move stmt to created bb\n%G", last);
9925 /* Move all stored value producers if possible. */
9926 while (!gsi_end_p (gsi))
9928 tree lhs;
9929 imm_use_iterator imm_iter;
9930 use_operand_p use_p;
9931 bool res;
9933 /* Skip debug statements. */
9934 if (is_gimple_debug (gsi_stmt (gsi)))
9936 gsi_prev (&gsi);
9937 continue;
9939 stmt1 = gsi_stmt (gsi);
9940 /* Do not consider statements writing to memory or having
9941 volatile operand. */
9942 if (gimple_vdef (stmt1)
9943 || gimple_has_volatile_ops (stmt1))
9944 break;
9945 gsi_from = gsi;
9946 gsi_prev (&gsi);
9947 lhs = gimple_get_lhs (stmt1);
9948 if (!lhs)
9949 break;
9951 /* LHS of vectorized stmt must be SSA_NAME. */
9952 if (TREE_CODE (lhs) != SSA_NAME)
9953 break;
9955 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9957 /* Remove dead scalar statement. */
9958 if (has_zero_uses (lhs))
9960 gsi_remove (&gsi_from, true);
9961 continue;
9965 /* Check that LHS does not have uses outside of STORE_BB. */
9966 res = true;
9967 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9969 gimple *use_stmt;
9970 use_stmt = USE_STMT (use_p);
9971 if (is_gimple_debug (use_stmt))
9972 continue;
9973 if (gimple_bb (use_stmt) != store_bb)
9975 res = false;
9976 break;
9979 if (!res)
9980 break;
9982 if (gimple_vuse (stmt1)
9983 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9984 break;
9986 /* Can move STMT1 to STORE_BB. */
9987 if (dump_enabled_p ())
9988 dump_printf_loc (MSG_NOTE, vect_location,
9989 "Move stmt to created bb\n%G", stmt1);
9990 gsi_move_before (&gsi_from, &gsi_to);
9991 /* Shift GSI_TO for further insertion. */
9992 gsi_prev (&gsi_to);
9994 /* Put other masked stores with the same mask to STORE_BB. */
9995 if (worklist.is_empty ()
9996 || gimple_call_arg (worklist.last (), 2) != mask
9997 || worklist.last () != stmt1)
9998 break;
9999 last = worklist.pop ();
10001 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10005 /* Decide whether it is possible to use a zero-based induction variable
10006 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10007 the value that the induction variable must be able to hold in order
10008 to ensure that the rgroups eventually have no active vector elements.
10009 Return -1 otherwise. */
10011 widest_int
10012 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10014 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10015 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10016 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10018 /* Calculate the value that the induction variable must be able
10019 to hit in order to ensure that we end the loop with an all-false mask.
10020 This involves adding the maximum number of inactive trailing scalar
10021 iterations. */
10022 widest_int iv_limit = -1;
10023 if (max_loop_iterations (loop, &iv_limit))
10025 if (niters_skip)
10027 /* Add the maximum number of skipped iterations to the
10028 maximum iteration count. */
10029 if (TREE_CODE (niters_skip) == INTEGER_CST)
10030 iv_limit += wi::to_widest (niters_skip);
10031 else
10032 iv_limit += max_vf - 1;
10034 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10035 /* Make a conservatively-correct assumption. */
10036 iv_limit += max_vf - 1;
10038 /* IV_LIMIT is the maximum number of latch iterations, which is also
10039 the maximum in-range IV value. Round this value down to the previous
10040 vector alignment boundary and then add an extra full iteration. */
10041 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10042 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
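/* For example, with a constant VF of 4 (so max_vf is also 4) and a
   maximum of 10 latch iterations this gives (10 & -4) + 4 = 12. */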
10044 return iv_limit;
10047 /* For the given rgroup_controls RGC, check whether an induction variable
10048 would ever hit a value that produces a set of all-false masks or zero
10049 lengths before wrapping around. Return true if it's possible to wrap
10050 around before hitting the desirable value, otherwise return false. */
10052 bool
10053 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10055 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10057 if (iv_limit == -1)
10058 return true;
10060 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10061 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10062 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
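/* For example, an iv_limit of 255 with 2 items per iteration needs
   min_precision (510) = 9 bits, so an 8-bit compare type could wrap. */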
10064 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10065 return true;
10067 return false;