AArch64: Add cost table for Cortex-A76
[official-gcc.git] / gcc / tree-vect-loop.c
blob856bbfebf7ca2345b8983cb26b303a859bff113c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
69 as if it was manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS which base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs, are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors, for now will need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations which vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4byte elements,
258 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variables in the loop is
375 considered a polynomial evolution. */
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also to its
600 inner-loop, if exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
684 if (STMT_VINFO_IN_PATTERN_P (first))
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
694 else
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
715 /* Function vect_get_loop_niters.
717 Determine how many iterations the loop is executed and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
722 Return the loop exit condition. */
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
739 if (!exit)
740 return cond;
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
754 if (may_be_zero)
756 if (COMPARISON_CLASS_P (may_be_zero))
758 /* Try to combine may_be_zero with assumptions, this can simplify
759 computation of niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
771 may_be_zero = NULL_TREE;
773 else if (integer_nonzerop (may_be_zero))
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
779 else
780 return cond;
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
795 return cond;
798 /* Function bb_in_loop_p
800 Used as predicate for dfs order traversal of the loop bbs. */
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would the same
854 as reversed postorder traversal, so we are safe. */
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
860 for (unsigned int i = 0; i < nbbs; i++)
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition, when 0,
881 loop shouldn't be vectorized, when non-zero constant, it should
882 be vectorized normally, otherwise versioned with vectorized loop
883 done if the condition is non-zero at runtime. */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
902 epilogue_vinfos.create (6);
905 /* Free all levels of rgroup CONTROLS. */
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
920 _loop_vec_info::~_loop_vec_info ()
922 free (bbs);
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
930 loop->aux = NULL;
933 /* Return an invariant or register for EXPR and emit necessary
934 computations in the LOOP_VINFO loop preheader. */
936 tree
937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
939 if (is_gimple_reg (expr)
940 || is_gimple_min_invariant (expr))
941 return expr;
943 if (! loop_vinfo->ivexpr_map)
944 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
945 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
946 if (! cached)
948 gimple_seq stmts = NULL;
949 cached = force_gimple_operand (unshare_expr (expr),
950 &stmts, true, NULL_TREE);
951 if (stmts)
953 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
954 gsi_insert_seq_on_edge_immediate (e, stmts);
957 return cached;
960 /* Return true if we can use CMP_TYPE as the comparison type to produce
961 all masks required to mask LOOP_VINFO. */
963 static bool
964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
966 rgroup_controls *rgm;
967 unsigned int i;
968 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
969 if (rgm->type != NULL_TREE
970 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
971 cmp_type, rgm->type,
972 OPTIMIZE_FOR_SPEED))
973 return false;
974 return true;
977 /* Calculate the maximum number of scalars per iteration for every
978 rgroup in LOOP_VINFO. */
980 static unsigned int
981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
983 unsigned int res = 1;
984 unsigned int i;
985 rgroup_controls *rgm;
986 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
987 res = MAX (res, rgm->max_nscalars_per_iter);
988 return res;
991 /* Calculate the minimum precision necessary to represent:
993 MAX_NITERS * FACTOR
995 as an unsigned integer, where MAX_NITERS is the maximum number of
996 loop header iterations for the original scalar form of LOOP_VINFO. */
998 static unsigned
999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1003 /* Get the maximum number of iterations that is representable
1004 in the counter type. */
1005 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1006 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1008 /* Get a more refined estimate for the number of iterations. */
1009 widest_int max_back_edges;
1010 if (max_loop_iterations (loop, &max_back_edges))
1011 max_ni = wi::smin (max_ni, max_back_edges + 1);
1013 /* Work out how many bits we need to represent the limit. */
1014 return wi::min_precision (max_ni * factor, UNSIGNED);
1017 /* True if the loop needs peeling or partial vectors when vectorized. */
1019 static bool
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1022 unsigned HOST_WIDE_INT const_vf;
1023 HOST_WIDE_INT max_niter
1024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1026 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1027 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1028 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1029 (loop_vinfo));
1031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1034 /* Work out the (constant) number of iterations that need to be
1035 peeled for reasons other than niters. */
1036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1038 peel_niter += 1;
1039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1041 return true;
1043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1044 /* ??? When peeling for gaps but not alignment, we could
1045 try to check whether the (variable) niters is known to be
1046 VF * N + 1. That's something of a niche case though. */
1047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1050 < (unsigned) exact_log2 (const_vf))
1051 /* In case of versioning, check if the maximum number of
1052 iterations is greater than th. If they are identical,
1053 the epilogue is unnecessary. */
1054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1055 || ((unsigned HOST_WIDE_INT) max_niter
1056 > (th / const_vf) * const_vf))))
1057 return true;
1059 return false;
1062 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1063 whether we can actually generate the masks required. Return true if so,
1064 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1066 static bool
1067 vect_verify_full_masking (loop_vec_info loop_vinfo)
1069 unsigned int min_ni_width;
1070 unsigned int max_nscalars_per_iter
1071 = vect_get_max_nscalars_per_iter (loop_vinfo);
1073 /* Use a normal loop if there are no statements that need masking.
1074 This only happens in rare degenerate cases: it means that the loop
1075 has no loads, no stores, and no live-out values. */
1076 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1077 return false;
1079 /* Work out how many bits we need to represent the limit. */
1080 min_ni_width
1081 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1083 /* Find a scalar mode for which WHILE_ULT is supported. */
1084 opt_scalar_int_mode cmp_mode_iter;
1085 tree cmp_type = NULL_TREE;
1086 tree iv_type = NULL_TREE;
1087 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1088 unsigned int iv_precision = UINT_MAX;
1090 if (iv_limit != -1)
1091 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1092 UNSIGNED);
1094 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1096 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1097 if (cmp_bits >= min_ni_width
1098 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1100 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1101 if (this_type
1102 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1104 /* Although we could stop as soon as we find a valid mode,
1105 there are at least two reasons why that's not always the
1106 best choice:
1108 - An IV that's Pmode or wider is more likely to be reusable
1109 in address calculations than an IV that's narrower than
1110 Pmode.
1112 - Doing the comparison in IV_PRECISION or wider allows
1113 a natural 0-based IV, whereas using a narrower comparison
1114 type requires mitigations against wrap-around.
1116 Conversely, if the IV limit is variable, doing the comparison
1117 in a wider type than the original type can introduce
1118 unnecessary extensions, so picking the widest valid mode
1119 is not always a good choice either.
1121 Here we prefer the first IV type that's Pmode or wider,
1122 and the first comparison type that's IV_PRECISION or wider.
1123 (The comparison type must be no wider than the IV type,
1124 to avoid extensions in the vector loop.)
1126 ??? We might want to try continuing beyond Pmode for ILP32
1127 targets if CMP_BITS < IV_PRECISION. */
1128 iv_type = this_type;
1129 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1130 cmp_type = this_type;
1131 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1132 break;
1137 if (!cmp_type)
1138 return false;
1140 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1141 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1142 return true;
1145 /* Check whether we can use vector access with length based on precison
1146 comparison. So far, to keep it simple, we only allow the case that the
1147 precision of the target supported length is larger than the precision
1148 required by loop niters. */
1150 static bool
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1153 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1154 return false;
1156 unsigned int max_nitems_per_iter = 1;
1157 unsigned int i;
1158 rgroup_controls *rgl;
1159 /* Find the maximum number of items per iteration for every rgroup. */
1160 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1162 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1163 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1166 /* Work out how many bits we need to represent the length limit. */
1167 unsigned int min_ni_prec
1168 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1170 /* Now use the maximum of below precisions for one suitable IV type:
1171 - the IV's natural precision
1172 - the precision needed to hold: the maximum number of scalar
1173 iterations multiplied by the scale factor (min_ni_prec above)
1174 - the Pmode precision
1176 If min_ni_prec is less than the precision of the current niters,
1177 we perfer to still use the niters type. Prefer to use Pmode and
1178 wider IV to avoid narrow conversions. */
1180 unsigned int ni_prec
1181 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1182 min_ni_prec = MAX (min_ni_prec, ni_prec);
1183 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1185 tree iv_type = NULL_TREE;
1186 opt_scalar_int_mode tmode_iter;
1187 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1189 scalar_mode tmode = tmode_iter.require ();
1190 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1192 /* ??? Do we really want to construct one IV whose precision exceeds
1193 BITS_PER_WORD? */
1194 if (tbits > BITS_PER_WORD)
1195 break;
1197 /* Find the first available standard integral type. */
1198 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1200 iv_type = build_nonstandard_integer_type (tbits, true);
1201 break;
1205 if (!iv_type)
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "can't vectorize with length-based partial vectors"
1210 " because there is no suitable iv type.\n");
1211 return false;
1214 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1215 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1217 return true;
1220 /* Calculate the cost of one scalar iteration of the loop. */
1221 static void
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1225 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1226 int nbbs = loop->num_nodes, factor;
1227 int innerloop_iters, i;
1229 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1231 /* Gather costs for statements in the scalar loop. */
1233 /* FORNOW. */
1234 innerloop_iters = 1;
1235 if (loop->inner)
1236 innerloop_iters = 50; /* FIXME */
1238 for (i = 0; i < nbbs; i++)
1240 gimple_stmt_iterator si;
1241 basic_block bb = bbs[i];
1243 if (bb->loop_father == loop->inner)
1244 factor = innerloop_iters;
1245 else
1246 factor = 1;
1248 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1250 gimple *stmt = gsi_stmt (si);
1251 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1253 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254 continue;
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1258 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1259 && (!STMT_VINFO_LIVE_P (vstmt_info)
1260 || !VECTORIZABLE_CYCLE_DEF
1261 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1262 continue;
1264 vect_cost_for_stmt kind;
1265 if (STMT_VINFO_DATA_REF (stmt_info))
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1268 kind = scalar_load;
1269 else
1270 kind = scalar_store;
1272 else if (vect_nop_conversion_p (stmt_info))
1273 continue;
1274 else
1275 kind = scalar_stmt;
1277 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 factor, kind, stmt_info, 0, vect_prologue);
1282 /* Now accumulate cost. */
1283 void *target_cost_data = init_cost (loop);
1284 stmt_info_for_cost *si;
1285 int j;
1286 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1287 j, si)
1288 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1289 si->kind, si->stmt_info, si->vectype,
1290 si->misalign, vect_body);
1291 unsigned dummy, body_cost = 0;
1292 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1293 destroy_cost_data (target_cost_data);
1294 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1298 /* Function vect_analyze_loop_form_1.
1300 Verify that certain CFG restrictions hold, including:
1301 - the loop has a pre-header
1302 - the loop has a single entry and exit
1303 - the loop exit condition is simple enough
1304 - the number of iterations can be analyzed, i.e, a countable loop. The
1305 niter could be analyzed under some assumptions. */
1307 opt_result
1308 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1309 tree *assumptions, tree *number_of_iterationsm1,
1310 tree *number_of_iterations, gcond **inner_loop_cond)
1312 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1314 /* Different restrictions apply when we are considering an inner-most loop,
1315 vs. an outer (nested) loop.
1316 (FORNOW. May want to relax some of these restrictions in the future). */
1318 if (!loop->inner)
1320 /* Inner-most loop. We currently require that the number of BBs is
1321 exactly 2 (the header and latch). Vectorizable inner-most loops
1322 look like this:
1324 (pre-header)
1326 header <--------+
1327 | | |
1328 | +--> latch --+
1330 (exit-bb) */
1332 if (loop->num_nodes != 2)
1333 return opt_result::failure_at (vect_location,
1334 "not vectorized:"
1335 " control flow in loop.\n");
1337 if (empty_block_p (loop->header))
1338 return opt_result::failure_at (vect_location,
1339 "not vectorized: empty loop.\n");
1341 else
1343 class loop *innerloop = loop->inner;
1344 edge entryedge;
1346 /* Nested loop. We currently require that the loop is doubly-nested,
1347 contains a single inner loop, and the number of BBs is exactly 5.
1348 Vectorizable outer-loops look like this:
1350 (pre-header)
1352 header <---+
1354 inner-loop |
1356 tail ------+
1358 (exit-bb)
1360 The inner-loop has the properties expected of inner-most loops
1361 as described above. */
1363 if ((loop->inner)->inner || (loop->inner)->next)
1364 return opt_result::failure_at (vect_location,
1365 "not vectorized:"
1366 " multiple nested loops.\n");
1368 if (loop->num_nodes != 5)
1369 return opt_result::failure_at (vect_location,
1370 "not vectorized:"
1371 " control flow in loop.\n");
1373 entryedge = loop_preheader_edge (innerloop);
1374 if (entryedge->src != loop->header
1375 || !single_exit (innerloop)
1376 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " unsupported outerloop form.\n");
1381 /* Analyze the inner-loop. */
1382 tree inner_niterm1, inner_niter, inner_assumptions;
1383 opt_result res
1384 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1385 &inner_assumptions, &inner_niterm1,
1386 &inner_niter, NULL);
1387 if (!res)
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391 "not vectorized: Bad inner loop.\n");
1392 return res;
1395 /* Don't support analyzing niter under assumptions for inner
1396 loop. */
1397 if (!integer_onep (inner_assumptions))
1398 return opt_result::failure_at (vect_location,
1399 "not vectorized: Bad inner loop.\n");
1401 if (!expr_invariant_in_loop_p (loop, inner_niter))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: inner-loop count not"
1404 " invariant.\n");
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Considering outer-loop vectorization.\n");
1411 if (!single_exit (loop))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: multiple exits.\n");
1414 if (EDGE_COUNT (loop->header->preds) != 2)
1415 return opt_result::failure_at (vect_location,
1416 "not vectorized:"
1417 " too many incoming edges.\n");
1419 /* We assume that the loop exit condition is at the end of the loop. i.e,
1420 that the loop is represented as a do-while (with a proper if-guard
1421 before the loop if needed), where the loop header contains all the
1422 executable statements, and the latch is empty. */
1423 if (!empty_block_p (loop->latch)
1424 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized: latch block not empty.\n");
1428 /* Make sure the exit is not abnormal. */
1429 edge e = single_exit (loop);
1430 if (e->flags & EDGE_ABNORMAL)
1431 return opt_result::failure_at (vect_location,
1432 "not vectorized:"
1433 " abnormal loop exit edge.\n");
1435 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1436 number_of_iterationsm1);
1437 if (!*loop_cond)
1438 return opt_result::failure_at
1439 (vect_location,
1440 "not vectorized: complicated exit condition.\n");
1442 if (integer_zerop (*assumptions)
1443 || !*number_of_iterations
1444 || chrec_contains_undetermined (*number_of_iterations))
1445 return opt_result::failure_at
1446 (*loop_cond,
1447 "not vectorized: number of iterations cannot be computed.\n");
1449 if (integer_zerop (*number_of_iterations))
1450 return opt_result::failure_at
1451 (*loop_cond,
1452 "not vectorized: number of iterations = 0.\n");
1454 return opt_result::success ();
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1459 opt_loop_vec_info
1460 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1462 tree assumptions, number_of_iterations, number_of_iterationsm1;
1463 gcond *loop_cond, *inner_loop_cond = NULL;
1465 opt_result res
1466 = vect_analyze_loop_form_1 (loop, &loop_cond,
1467 &assumptions, &number_of_iterationsm1,
1468 &number_of_iterations, &inner_loop_cond);
1469 if (!res)
1470 return opt_loop_vec_info::propagate_failure (res);
1472 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1473 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1474 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1475 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1476 if (!integer_onep (assumptions))
1478 /* We consider to vectorize this loop by versioning it under
1479 some assumptions. In order to do this, we need to clear
1480 existing information computed by scev and niter analyzer. */
1481 scev_reset_htab ();
1482 free_numbers_of_iterations_estimates (loop);
1483 /* Also set flag for this loop so that following scev and niter
1484 analysis are done under the assumptions. */
1485 loop_constraint_set (loop, LOOP_C_FINITE);
1486 /* Also record the assumptions for versioning. */
1487 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1492 if (dump_enabled_p ())
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497 dump_printf (MSG_NOTE, "\n");
1501 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1502 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503 if (inner_loop_cond)
1505 stmt_vec_info inner_loop_cond_info
1506 = loop_vinfo->lookup_stmt (inner_loop_cond);
1507 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1510 gcc_assert (!loop->aux);
1511 loop->aux = loop_vinfo;
1512 return opt_loop_vec_info::success (loop_vinfo);
1517 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1518 statements update the vectorization factor. */
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1523 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 poly_uint64 vectorization_factor;
1527 int i;
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1531 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532 gcc_assert (known_ne (vectorization_factor, 0U));
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say, that we
1537 perform pure SLP on loop - cross iteration parallelism is not
1538 exploited. */
1539 bool only_slp_in_loop = true;
1540 for (i = 0; i < nbbs; i++)
1542 basic_block bb = bbs[i];
1543 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544 gsi_next (&si))
1546 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547 if (!stmt_info)
1548 continue;
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551 && !PURE_SLP_STMT (stmt_info))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop = false;
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1558 if (is_gimple_debug (gsi_stmt (si)))
1559 continue;
1560 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561 stmt_info = vect_stmt_to_vectorize (stmt_info);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564 && !PURE_SLP_STMT (stmt_info))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop = false;
1570 if (only_slp_in_loop)
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1577 else
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE, vectorization_factor);
1596 dump_printf (MSG_NOTE, ".\n");
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1604 outer1:
1605 x_1 = PHI <x_3(outer2), ...>;
1608 inner:
1609 x_2 = ...;
1612 outer2:
1613 x_3 = PHI <x_2(inner)>;
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621 return false;
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1626 /* Function vect_analyze_loop_operations.
1628 Scan the loop stmts and make sure they are all vectorizable. */
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1643 auto_vec<stmt_info_for_cost> cost_vec;
1645 for (i = 0; i < nbbs; i++)
1647 basic_block bb = bbs[i];
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1652 gphi *phi = si.phi ();
1653 ok = true;
1655 stmt_info = loop_vinfo->lookup_stmt (phi);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outerloop (unless it is double reduction,
1667 i.e., this phi is vect_reduction_def), cause this case
1668 requires to actually do something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && !vect_active_double_reduction_p (stmt_info))
1671 return opt_result::failure_at (phi,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info))
1679 tree phi_op;
1681 if (gimple_phi_num_args (phi) != 1)
1682 return opt_result::failure_at (phi, "unsupported phi");
1684 phi_op = PHI_ARG_DEF (phi, 0);
1685 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686 if (!op_def_info)
1687 return opt_result::failure_at (phi, "unsupported phi\n");
1689 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info)
1691 != vect_used_in_outer_by_reduction))
1692 return opt_result::failure_at (phi, "unsupported phi\n");
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info)
1696 == vect_double_reduction_def))
1697 && !vectorizable_lc_phi (loop_vinfo,
1698 stmt_info, NULL, NULL))
1699 return opt_result::failure_at (phi, "unsupported phi\n");
1702 continue;
1705 gcc_assert (stmt_info);
1707 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info))
1709 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi,
1712 "not vectorized:"
1713 " scalar dependence cycle.\n");
1715 if (STMT_VINFO_RELEVANT_P (stmt_info))
1717 need_to_vectorize = true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info))
1720 ok = vectorizable_induction (loop_vinfo,
1721 stmt_info, NULL, NULL,
1722 &cost_vec);
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info)
1725 == vect_double_reduction_def)
1726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_reduction (loop_vinfo,
1729 stmt_info, NULL, NULL, &cost_vec);
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1733 if (ok
1734 && STMT_VINFO_LIVE_P (stmt_info)
1735 && !PURE_SLP_STMT (stmt_info))
1736 ok = vectorizable_live_operation (loop_vinfo,
1737 stmt_info, NULL, NULL, NULL,
1738 -1, false, &cost_vec);
1740 if (!ok)
1741 return opt_result::failure_at (phi,
1742 "not vectorized: relevant phi not "
1743 "supported: %G",
1744 static_cast <gimple *> (phi));
1747 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748 gsi_next (&si))
1750 gimple *stmt = gsi_stmt (si);
1751 if (!gimple_clobber_p (stmt)
1752 && !is_gimple_debug (stmt))
1754 opt_result res
1755 = vect_analyze_stmt (loop_vinfo,
1756 loop_vinfo->lookup_stmt (stmt),
1757 &need_to_vectorize,
1758 NULL, NULL, &cost_vec);
1759 if (!res)
1760 return res;
1763 } /* bbs */
1765 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1767 /* All operations in the loop are either irrelevant (deal with loop
1768 control, or dead), or only used outside the loop and can be moved
1769 out of the loop (e.g. invariants, inductions). The loop can be
1770 optimized away by scalar optimizations. We're better off not
1771 touching this loop. */
1772 if (!need_to_vectorize)
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1778 (vect_location,
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1782 return opt_result::success ();
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1803 return false;
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1830 int min_profitable_iters, min_profitable_estimate;
1831 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1832 &min_profitable_estimate);
1834 if (min_profitable_iters < 0)
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "not vectorized: vector version will never be "
1842 "profitable.\n");
1843 return -1;
1846 int min_scalar_loop_bound = (param_min_vect_loop_bound
1847 * assumed_vf);
1849 /* Use the cost model only if it is more conservative than user specified
1850 threshold. */
1851 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1852 min_profitable_iters);
1854 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1856 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1857 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "not vectorized: iteration count smaller than user "
1865 "specified loop bound parameter or minimum profitable "
1866 "iterations (whichever is more conservative).\n");
1867 return 0;
1870 /* The static profitablity threshold min_profitable_estimate includes
1871 the cost of having to check at runtime whether the scalar loop
1872 should be used instead. If it turns out that we don't need or want
1873 such a check, the threshold we should use for the static estimate
1874 is simply the point at which the vector loop becomes more profitable
1875 than the scalar loop. */
1876 if (min_profitable_estimate > min_profitable_iters
1877 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1878 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1879 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1884 " choice between the scalar and vector loops\n");
1885 min_profitable_estimate = min_profitable_iters;
1888 HOST_WIDE_INT estimated_niter;
1890 /* If we are vectorizing an epilogue then we know the maximum number of
1891 scalar iterations it will cover is at least one lower than the
1892 vectorization factor of the main loop. */
1893 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1894 estimated_niter
1895 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1896 else
1898 estimated_niter = estimated_stmt_executions_int (loop);
1899 if (estimated_niter == -1)
1900 estimated_niter = likely_max_stmt_executions_int (loop);
1902 if (estimated_niter != -1
1903 && ((unsigned HOST_WIDE_INT) estimated_niter
1904 < MAX (th, (unsigned) min_profitable_estimate)))
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "not vectorized: estimated iteration count too "
1909 "small.\n");
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "not vectorized: estimated iteration count smaller "
1913 "than specified loop bound parameter or minimum "
1914 "profitable iterations (whichever is more "
1915 "conservative).\n");
1916 return -1;
1919 return 1;
1922 static opt_result
1923 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1924 vec<data_reference_p> *datarefs,
1925 unsigned int *n_stmts)
1927 *n_stmts = 0;
1928 for (unsigned i = 0; i < loop->num_nodes; i++)
1929 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1930 !gsi_end_p (gsi); gsi_next (&gsi))
1932 gimple *stmt = gsi_stmt (gsi);
1933 if (is_gimple_debug (stmt))
1934 continue;
1935 ++(*n_stmts);
1936 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1937 NULL, 0);
1938 if (!res)
1940 if (is_gimple_call (stmt) && loop->safelen)
1942 tree fndecl = gimple_call_fndecl (stmt), op;
1943 if (fndecl != NULL_TREE)
1945 cgraph_node *node = cgraph_node::get (fndecl);
1946 if (node != NULL && node->simd_clones != NULL)
1948 unsigned int j, n = gimple_call_num_args (stmt);
1949 for (j = 0; j < n; j++)
1951 op = gimple_call_arg (stmt, j);
1952 if (DECL_P (op)
1953 || (REFERENCE_CLASS_P (op)
1954 && get_base_address (op)))
1955 break;
1957 op = gimple_call_lhs (stmt);
1958 /* Ignore #pragma omp declare simd functions
1959 if they don't have data references in the
1960 call stmt itself. */
1961 if (j == n
1962 && !(op
1963 && (DECL_P (op)
1964 || (REFERENCE_CLASS_P (op)
1965 && get_base_address (op)))))
1966 continue;
1970 return res;
1972 /* If dependence analysis will give up due to the limit on the
1973 number of datarefs stop here and fail fatally. */
1974 if (datarefs->length ()
1975 > (unsigned)param_loop_max_datarefs_for_datadeps)
1976 return opt_result::failure_at (stmt, "exceeded param "
1977 "loop-max-datarefs-for-datadeps\n");
1979 return opt_result::success ();
1982 /* Look for SLP-only access groups and turn each individual access into its own
1983 group. */
1984 static void
1985 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1987 unsigned int i;
1988 struct data_reference *dr;
1990 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1992 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1993 FOR_EACH_VEC_ELT (datarefs, i, dr)
1995 gcc_assert (DR_REF (dr));
1996 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1998 /* Check if the load is a part of an interleaving chain. */
1999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2001 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2002 unsigned int group_size = DR_GROUP_SIZE (first_element);
2004 /* Check if SLP-only groups. */
2005 if (!STMT_SLP_TYPE (stmt_info)
2006 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2008 /* Dissolve the group. */
2009 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2011 stmt_vec_info vinfo = first_element;
2012 while (vinfo)
2014 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2015 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2016 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2017 DR_GROUP_SIZE (vinfo) = 1;
2018 if (STMT_VINFO_STRIDED_P (first_element))
2019 DR_GROUP_GAP (vinfo) = 0;
2020 else
2021 DR_GROUP_GAP (vinfo) = group_size - 1;
2022 vinfo = next;
2029 /* Determine if operating on full vectors for LOOP_VINFO might leave
2030 some scalar iterations still to do. If so, decide how we should
2031 handle those scalar iterations. The possibilities are:
2033 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2034 In this case:
2036 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2037 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2038 LOOP_VINFO_PEELING_FOR_NITER == false
2040 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2041 to handle the remaining scalar iterations. In this case:
2043 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2044 LOOP_VINFO_PEELING_FOR_NITER == true
2046 There are two choices:
2048 (2a) Consider vectorizing the epilogue loop at the same VF as the
2049 main loop, but using partial vectors instead of full vectors.
2050 In this case:
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2054 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2055 In this case:
2057 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2059 When FOR_EPILOGUE_P is true, make this determination based on the
2060 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2061 based on the assumption that LOOP_VINFO is the main loop. The caller
2062 has made sure that the number of iterations is set appropriately for
2063 this value of FOR_EPILOGUE_P. */
2065 opt_result
2066 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2067 bool for_epilogue_p)
2069 /* Determine whether there would be any scalar iterations left over. */
2070 bool need_peeling_or_partial_vectors_p
2071 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2073 /* Decide whether to vectorize the loop with partial vectors. */
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2075 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2076 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2077 && need_peeling_or_partial_vectors_p)
2079 /* For partial-vector-usage=1, try to push the handling of partial
2080 vectors to the epilogue, with the main loop continuing to operate
2081 on full vectors.
2083 ??? We could then end up failing to use partial vectors if we
2084 decide to peel iterations into a prologue, and if the main loop
2085 then ends up processing fewer than VF iterations. */
2086 if (param_vect_partial_vector_usage == 1
2087 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2088 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2089 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2090 else
2091 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2094 if (dump_enabled_p ())
2096 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "operating on partial vectors%s.\n",
2099 for_epilogue_p ? " for epilogue loop" : "");
2100 else
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "operating only on full vectors%s.\n",
2103 for_epilogue_p ? " for epilogue loop" : "");
2106 if (for_epilogue_p)
2108 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2109 gcc_assert (orig_loop_vinfo);
2110 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2111 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2112 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2115 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2116 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2118 /* Check that the loop processes at least one full vector. */
2119 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2120 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2121 if (known_lt (wi::to_widest (scalar_niters), vf))
2122 return opt_result::failure_at (vect_location,
2123 "loop does not have enough iterations"
2124 " to support vectorization.\n");
2126 /* If we need to peel an extra epilogue iteration to handle data
2127 accesses with gaps, check that there are enough scalar iterations
2128 available.
2130 The check above is redundant with this one when peeling for gaps,
2131 but the distinction is useful for diagnostics. */
2132 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2135 return opt_result::failure_at (vect_location,
2136 "loop does not have enough iterations"
2137 " to support peeling for gaps.\n");
2140 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2141 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2142 && need_peeling_or_partial_vectors_p);
2144 return opt_result::success ();
2147 /* Function vect_analyze_loop_2.
2149 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2150 for it. The different analyses will record information in the
2151 loop_vec_info struct. */
2152 static opt_result
2153 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2155 opt_result ok = opt_result::success ();
2156 int res;
2157 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2158 poly_uint64 min_vf = 2;
2159 loop_vec_info orig_loop_vinfo = NULL;
2161 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2162 loop_vec_info of the first vectorized loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2164 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165 else
2166 orig_loop_vinfo = loop_vinfo;
2167 gcc_assert (orig_loop_vinfo);
2169 /* The first group of checks is independent of the vector size. */
2170 fatal = true;
2172 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2173 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2174 return opt_result::failure_at (vect_location,
2175 "not vectorized: simd if(0)\n");
2177 /* Find all data references in the loop (which correspond to vdefs/vuses)
2178 and analyze their evolution in the loop. */
2180 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2182 /* Gather the data references and count stmts in the loop. */
2183 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2185 opt_result res
2186 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2187 &LOOP_VINFO_DATAREFS (loop_vinfo),
2188 n_stmts);
2189 if (!res)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "not vectorized: loop contains function "
2194 "calls or data references that cannot "
2195 "be analyzed\n");
2196 return res;
2198 loop_vinfo->shared->save_datarefs ();
2200 else
2201 loop_vinfo->shared->check_datarefs ();
2203 /* Analyze the data references and also adjust the minimal
2204 vectorization factor according to the loads and stores. */
2206 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2207 if (!ok)
2209 if (dump_enabled_p ())
2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211 "bad data references.\n");
2212 return ok;
2215 /* Classify all cross-iteration scalar data-flow cycles.
2216 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2217 vect_analyze_scalar_cycles (loop_vinfo);
2219 vect_pattern_recog (loop_vinfo);
2221 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2223 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2224 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2226 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2227 if (!ok)
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231 "bad data access.\n");
2232 return ok;
2235 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2237 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2238 if (!ok)
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "unexpected pattern.\n");
2243 return ok;
2246 /* While the rest of the analysis below depends on it in some way. */
2247 fatal = false;
2249 /* Analyze data dependences between the data-refs in the loop
2250 and adjust the maximum vectorization factor according to
2251 the dependences.
2252 FORNOW: fail at the first data dependence that we encounter. */
2254 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2255 if (!ok)
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad data dependence.\n");
2260 return ok;
2262 if (max_vf != MAX_VECTORIZATION_FACTOR
2263 && maybe_lt (max_vf, min_vf))
2264 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2265 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2267 ok = vect_determine_vectorization_factor (loop_vinfo);
2268 if (!ok)
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "can't determine vectorization factor.\n");
2273 return ok;
2275 if (max_vf != MAX_VECTORIZATION_FACTOR
2276 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2277 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2279 /* Compute the scalar iteration cost. */
2280 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2282 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2284 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2285 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2286 if (!ok)
2287 return ok;
2289 /* If there are any SLP instances mark them as pure_slp. */
2290 bool slp = vect_make_slp_decision (loop_vinfo);
2291 if (slp)
2293 /* Find stmts that need to be both vectorized and SLPed. */
2294 vect_detect_hybrid_slp (loop_vinfo);
2296 /* Update the vectorization factor based on the SLP decision. */
2297 vect_update_vf_for_slp (loop_vinfo);
2299 /* Optimize the SLP graph with the vectorization factor fixed. */
2300 vect_optimize_slp (loop_vinfo);
2302 /* Gather the loads reachable from the SLP graph entries. */
2303 vect_gather_slp_loads (loop_vinfo);
2306 bool saved_can_use_partial_vectors_p
2307 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2309 /* We don't expect to have to roll back to anything other than an empty
2310 set of rgroups. */
2311 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2313 /* This is the point where we can re-start analysis with SLP forced off. */
2314 start_over:
2316 /* Now the vectorization factor is final. */
2317 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2318 gcc_assert (known_ne (vectorization_factor, 0U));
2320 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "vectorization_factor = ");
2324 dump_dec (MSG_NOTE, vectorization_factor);
2325 dump_printf (MSG_NOTE, ", niters = %wd\n",
2326 LOOP_VINFO_INT_NITERS (loop_vinfo));
2329 /* Analyze the alignment of the data-refs in the loop.
2330 Fail if a data reference is found that cannot be vectorized. */
2332 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2333 if (!ok)
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "bad data alignment.\n");
2338 return ok;
2341 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2342 It is important to call pruning after vect_analyze_data_ref_accesses,
2343 since we use grouping information gathered by interleaving analysis. */
2344 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2345 if (!ok)
2346 return ok;
2348 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2349 vectorization, since we do not want to add extra peeling or
2350 add versioning for alignment. */
2351 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2352 /* This pass will decide on using loop versioning and/or loop peeling in
2353 order to enhance the alignment of data references in the loop. */
2354 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2355 if (!ok)
2356 return ok;
2358 if (slp)
2360 /* Analyze operations in the SLP instances. Note this may
2361 remove unsupported SLP instances which makes the above
2362 SLP kind detection invalid. */
2363 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2364 vect_slp_analyze_operations (loop_vinfo);
2365 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2367 ok = opt_result::failure_at (vect_location,
2368 "unsupported SLP instances\n");
2369 goto again;
2372 /* Check whether any load in ALL SLP instances is possibly permuted. */
2373 slp_tree load_node, slp_root;
2374 unsigned i, x;
2375 slp_instance instance;
2376 bool can_use_lanes = true;
2377 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2379 slp_root = SLP_INSTANCE_TREE (instance);
2380 int group_size = SLP_TREE_LANES (slp_root);
2381 tree vectype = SLP_TREE_VECTYPE (slp_root);
2382 bool loads_permuted = false;
2383 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2385 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2386 continue;
2387 unsigned j;
2388 stmt_vec_info load_info;
2389 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2390 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2392 loads_permuted = true;
2393 break;
2397 /* If the loads and stores can be handled with load/store-lane
2398 instructions record it and move on to the next instance. */
2399 if (loads_permuted
2400 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2401 && vect_store_lanes_supported (vectype, group_size, false))
2403 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2405 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2406 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2407 /* Use SLP for strided accesses (or if we can't
2408 load-lanes). */
2409 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2410 || ! vect_load_lanes_supported
2411 (STMT_VINFO_VECTYPE (stmt_vinfo),
2412 DR_GROUP_SIZE (stmt_vinfo), false))
2413 break;
2416 can_use_lanes
2417 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2419 if (can_use_lanes && dump_enabled_p ())
2420 dump_printf_loc (MSG_NOTE, vect_location,
2421 "SLP instance %p can use load/store-lanes\n",
2422 instance);
2424 else
2426 can_use_lanes = false;
2427 break;
2431 /* If all SLP instances can use load/store-lanes abort SLP and try again
2432 with SLP disabled. */
2433 if (can_use_lanes)
2435 ok = opt_result::failure_at (vect_location,
2436 "Built SLP cancelled: can use "
2437 "load/store-lanes\n");
2438 if (dump_enabled_p ())
2439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2440 "Built SLP cancelled: all SLP instances support "
2441 "load/store-lanes\n");
2442 goto again;
2446 /* Dissolve SLP-only groups. */
2447 vect_dissolve_slp_only_groups (loop_vinfo);
2449 /* Scan all the remaining operations in the loop that are not subject
2450 to SLP and make sure they are vectorizable. */
2451 ok = vect_analyze_loop_operations (loop_vinfo);
2452 if (!ok)
2454 if (dump_enabled_p ())
2455 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2456 "bad operation or unsupported loop bound.\n");
2457 return ok;
2460 /* For now, we don't expect to mix both masking and length approaches for one
2461 loop, disable it if both are recorded. */
2462 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2463 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2464 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2466 if (dump_enabled_p ())
2467 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468 "can't vectorize a loop with partial vectors"
2469 " because we don't expect to mix different"
2470 " approaches with partial vectors for the"
2471 " same loop.\n");
2472 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2475 /* If we still have the option of using partial vectors,
2476 check whether we can generate the necessary loop controls. */
2477 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2478 && !vect_verify_full_masking (loop_vinfo)
2479 && !vect_verify_loop_lens (loop_vinfo))
2480 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2482 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2483 to be able to handle fewer than VF scalars, or needs to have a lower VF
2484 than the main loop. */
2485 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2486 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2487 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2488 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2489 return opt_result::failure_at (vect_location,
2490 "Vectorization factor too high for"
2491 " epilogue loop.\n");
2493 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2494 assuming that the loop will be used as a main loop. We will redo
2495 this analysis later if we instead decide to use the loop as an
2496 epilogue loop. */
2497 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2498 if (!ok)
2499 return ok;
2501 /* Check the costings of the loop make vectorizing worthwhile. */
2502 res = vect_analyze_loop_costing (loop_vinfo);
2503 if (res < 0)
2505 ok = opt_result::failure_at (vect_location,
2506 "Loop costings may not be worthwhile.\n");
2507 goto again;
2509 if (!res)
2510 return opt_result::failure_at (vect_location,
2511 "Loop costings not worthwhile.\n");
2513 /* If an epilogue loop is required make sure we can create one. */
2514 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2515 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2517 if (dump_enabled_p ())
2518 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2519 if (!vect_can_advance_ivs_p (loop_vinfo)
2520 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2521 single_exit (LOOP_VINFO_LOOP
2522 (loop_vinfo))))
2524 ok = opt_result::failure_at (vect_location,
2525 "not vectorized: can't create required "
2526 "epilog loop\n");
2527 goto again;
2531 /* During peeling, we need to check if number of loop iterations is
2532 enough for both peeled prolog loop and vector loop. This check
2533 can be merged along with threshold check of loop versioning, so
2534 increase threshold for this case if necessary.
2536 If we are analyzing an epilogue we still want to check what its
2537 versioning threshold would be. If we decide to vectorize the epilogues we
2538 will want to use the lowest versioning threshold of all epilogues and main
2539 loop. This will enable us to enter a vectorized epilogue even when
2540 versioning the loop. We can't simply check whether the epilogue requires
2541 versioning though since we may have skipped some versioning checks when
2542 analyzing the epilogue. For instance, checks for alias versioning will be
2543 skipped when dealing with epilogues as we assume we already checked them
2544 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2545 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2547 poly_uint64 niters_th = 0;
2548 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2550 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2552 /* Niters for peeled prolog loop. */
2553 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2555 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2556 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2557 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2559 else
2560 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2563 /* Niters for at least one iteration of vectorized loop. */
2564 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2565 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2566 /* One additional iteration because of peeling for gap. */
2567 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2568 niters_th += 1;
2570 /* Use the same condition as vect_transform_loop to decide when to use
2571 the cost to determine a versioning threshold. */
2572 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2573 && ordered_p (th, niters_th))
2574 niters_th = ordered_max (poly_uint64 (th), niters_th);
2576 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2579 gcc_assert (known_eq (vectorization_factor,
2580 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2582 /* Ok to vectorize! */
2583 return opt_result::success ();
2585 again:
2586 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2587 gcc_assert (!ok);
2589 /* Try again with SLP forced off but if we didn't do any SLP there is
2590 no point in re-trying. */
2591 if (!slp)
2592 return ok;
2594 /* If there are reduction chains re-trying will fail anyway. */
2595 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2596 return ok;
2598 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2599 via interleaving or lane instructions. */
2600 slp_instance instance;
2601 slp_tree node;
2602 unsigned i, j;
2603 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2605 stmt_vec_info vinfo;
2606 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2607 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2608 continue;
2609 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2610 unsigned int size = DR_GROUP_SIZE (vinfo);
2611 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2612 if (! vect_store_lanes_supported (vectype, size, false)
2613 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2614 && ! vect_grouped_store_supported (vectype, size))
2615 return opt_result::failure_at (vinfo->stmt,
2616 "unsupported grouped store\n");
2617 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2619 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2620 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2621 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2622 size = DR_GROUP_SIZE (vinfo);
2623 vectype = STMT_VINFO_VECTYPE (vinfo);
2624 if (! vect_load_lanes_supported (vectype, size, false)
2625 && ! vect_grouped_load_supported (vectype, single_element_p,
2626 size))
2627 return opt_result::failure_at (vinfo->stmt,
2628 "unsupported grouped load\n");
2632 if (dump_enabled_p ())
2633 dump_printf_loc (MSG_NOTE, vect_location,
2634 "re-trying with SLP disabled\n");
2636 /* Roll back state appropriately. No SLP this time. */
2637 slp = false;
2638 /* Restore vectorization factor as it were without SLP. */
2639 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2640 /* Free the SLP instances. */
2641 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2642 vect_free_slp_instance (instance);
2643 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2644 /* Reset SLP type to loop_vect on all stmts. */
2645 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2647 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2648 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2649 !gsi_end_p (si); gsi_next (&si))
2651 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2652 STMT_SLP_TYPE (stmt_info) = loop_vect;
2653 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2654 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2656 /* vectorizable_reduction adjusts reduction stmt def-types,
2657 restore them to that of the PHI. */
2658 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2659 = STMT_VINFO_DEF_TYPE (stmt_info);
2660 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2661 (STMT_VINFO_REDUC_DEF (stmt_info)))
2662 = STMT_VINFO_DEF_TYPE (stmt_info);
2665 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2666 !gsi_end_p (si); gsi_next (&si))
2668 if (is_gimple_debug (gsi_stmt (si)))
2669 continue;
2670 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2671 STMT_SLP_TYPE (stmt_info) = loop_vect;
2672 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2674 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2675 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2676 STMT_SLP_TYPE (stmt_info) = loop_vect;
2677 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2678 !gsi_end_p (pi); gsi_next (&pi))
2679 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2680 = loop_vect;
2684 /* Free optimized alias test DDRS. */
2685 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2686 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2687 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2688 /* Reset target cost data. */
2689 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2690 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2691 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2692 /* Reset accumulated rgroup information. */
2693 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2694 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2695 /* Reset assorted flags. */
2696 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2697 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2698 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2699 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2700 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2701 = saved_can_use_partial_vectors_p;
2703 goto start_over;
2706 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2707 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2708 OLD_LOOP_VINFO is better unless something specifically indicates
2709 otherwise.
2711 Note that this deliberately isn't a partial order. */
2713 static bool
2714 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2715 loop_vec_info old_loop_vinfo)
2717 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2718 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2720 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2721 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2723 /* Always prefer a VF of loop->simdlen over any other VF. */
2724 if (loop->simdlen)
2726 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2727 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2728 if (new_simdlen_p != old_simdlen_p)
2729 return new_simdlen_p;
2732 /* Limit the VFs to what is likely to be the maximum number of iterations,
2733 to handle cases in which at least one loop_vinfo is fully-masked. */
2734 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2735 if (estimated_max_niter != -1)
2737 if (known_le (estimated_max_niter, new_vf))
2738 new_vf = estimated_max_niter;
2739 if (known_le (estimated_max_niter, old_vf))
2740 old_vf = estimated_max_niter;
2743 /* Check whether the (fractional) cost per scalar iteration is lower
2744 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2745 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2746 * poly_widest_int (old_vf));
2747 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2748 * poly_widest_int (new_vf));
2749 if (maybe_lt (rel_old, rel_new))
2751 /* When old_loop_vinfo uses a variable vectorization factor,
2752 we know that it has a lower cost for at least one runtime VF.
2753 However, we don't know how likely that VF is.
2755 One option would be to compare the costs for the estimated VFs.
2756 The problem is that that can put too much pressure on the cost
2757 model. E.g. if the estimated VF is also the lowest possible VF,
2758 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2759 for the estimated VF, we'd then choose new_loop_vinfo even
2760 though (a) new_loop_vinfo might not actually be better than
2761 old_loop_vinfo for that VF and (b) it would be significantly
2762 worse at larger VFs.
2764 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2765 no more expensive than old_loop_vinfo even after doubling the
2766 estimated old_loop_vinfo VF. For all but trivial loops, this
2767 ensures that we only pick new_loop_vinfo if it is significantly
2768 better than old_loop_vinfo at the estimated VF. */
2769 if (rel_new.is_constant ())
2770 return false;
2772 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2773 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2774 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2775 * widest_int (old_estimated_vf));
2776 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2777 * widest_int (new_estimated_vf));
2778 return estimated_rel_new * 2 <= estimated_rel_old;
2780 if (known_lt (rel_new, rel_old))
2781 return true;
2783 /* If there's nothing to choose between the loop bodies, see whether
2784 there's a difference in the prologue and epilogue costs. */
2785 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2786 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2788 return false;
2791 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2792 true if we should. */
2794 static bool
2795 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2796 loop_vec_info old_loop_vinfo)
2798 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2799 return false;
2801 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_NOTE, vect_location,
2803 "***** Preferring vector mode %s to vector mode %s\n",
2804 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2805 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2806 return true;
2809 /* Function vect_analyze_loop.
2811 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2812 for it. The different analyses will record information in the
2813 loop_vec_info struct. */
2814 opt_loop_vec_info
2815 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2817 auto_vector_modes vector_modes;
2819 /* Autodetect first vector size we try. */
2820 unsigned int autovec_flags
2821 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2822 loop->simdlen != 0);
2823 unsigned int mode_i = 0;
2825 DUMP_VECT_SCOPE ("analyze_loop_nest");
2827 if (loop_outer (loop)
2828 && loop_vec_info_for_loop (loop_outer (loop))
2829 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2830 return opt_loop_vec_info::failure_at (vect_location,
2831 "outer-loop already vectorized.\n");
2833 if (!find_loop_nest (loop, &shared->loop_nest))
2834 return opt_loop_vec_info::failure_at
2835 (vect_location,
2836 "not vectorized: loop nest containing two or more consecutive inner"
2837 " loops cannot be vectorized\n");
2839 unsigned n_stmts = 0;
2840 machine_mode autodetected_vector_mode = VOIDmode;
2841 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2842 machine_mode next_vector_mode = VOIDmode;
2843 poly_uint64 lowest_th = 0;
2844 unsigned vectorized_loops = 0;
2845 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2846 && !unlimited_cost_model (loop));
2848 bool vect_epilogues = false;
2849 opt_result res = opt_result::success ();
2850 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2851 while (1)
2853 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2854 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2855 if (!loop_vinfo)
2857 if (dump_enabled_p ())
2858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2859 "bad loop form.\n");
2860 gcc_checking_assert (first_loop_vinfo == NULL);
2861 return loop_vinfo;
2863 loop_vinfo->vector_mode = next_vector_mode;
2865 bool fatal = false;
2867 /* When pick_lowest_cost_p is true, we should in principle iterate
2868 over all the loop_vec_infos that LOOP_VINFO could replace and
2869 try to vectorize LOOP_VINFO under the same conditions.
2870 E.g. when trying to replace an epilogue loop, we should vectorize
2871 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2872 to replace the main loop, we should vectorize LOOP_VINFO as a main
2873 loop too.
2875 However, autovectorize_vector_modes is usually sorted as follows:
2877 - Modes that naturally produce lower VFs usually follow modes that
2878 naturally produce higher VFs.
2880 - When modes naturally produce the same VF, maskable modes
2881 usually follow unmaskable ones, so that the maskable mode
2882 can be used to vectorize the epilogue of the unmaskable mode.
2884 This order is preferred because it leads to the maximum
2885 epilogue vectorization opportunities. Targets should only use
2886 a different order if they want to make wide modes available while
2887 disparaging them relative to earlier, smaller modes. The assumption
2888 in that case is that the wider modes are more expensive in some
2889 way that isn't reflected directly in the costs.
2891 There should therefore be few interesting cases in which
2892 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2893 treated as a standalone loop, and ends up being genuinely cheaper
2894 than FIRST_LOOP_VINFO. */
2895 if (vect_epilogues)
2896 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2898 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2899 if (mode_i == 0)
2900 autodetected_vector_mode = loop_vinfo->vector_mode;
2901 if (dump_enabled_p ())
2903 if (res)
2904 dump_printf_loc (MSG_NOTE, vect_location,
2905 "***** Analysis succeeded with vector mode %s\n",
2906 GET_MODE_NAME (loop_vinfo->vector_mode));
2907 else
2908 dump_printf_loc (MSG_NOTE, vect_location,
2909 "***** Analysis failed with vector mode %s\n",
2910 GET_MODE_NAME (loop_vinfo->vector_mode));
2913 loop->aux = NULL;
2915 if (!fatal)
2916 while (mode_i < vector_modes.length ()
2917 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2919 if (dump_enabled_p ())
2920 dump_printf_loc (MSG_NOTE, vect_location,
2921 "***** The result for vector mode %s would"
2922 " be the same\n",
2923 GET_MODE_NAME (vector_modes[mode_i]));
2924 mode_i += 1;
2927 if (res)
2929 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2930 vectorized_loops++;
2932 /* Once we hit the desired simdlen for the first time,
2933 discard any previous attempts. */
2934 if (simdlen
2935 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2937 delete first_loop_vinfo;
2938 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2939 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2940 simdlen = 0;
2942 else if (pick_lowest_cost_p && first_loop_vinfo)
2944 /* Keep trying to roll back vectorization attempts while the
2945 loop_vec_infos they produced were worse than this one. */
2946 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2947 while (!vinfos.is_empty ()
2948 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2950 gcc_assert (vect_epilogues);
2951 delete vinfos.pop ();
2953 if (vinfos.is_empty ()
2954 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2956 delete first_loop_vinfo;
2957 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2958 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2962 if (first_loop_vinfo == NULL)
2964 first_loop_vinfo = loop_vinfo;
2965 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2967 else if (vect_epilogues
2968 /* For now only allow one epilogue loop. */
2969 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2971 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2972 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2973 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2974 || maybe_ne (lowest_th, 0U));
2975 /* Keep track of the known smallest versioning
2976 threshold. */
2977 if (ordered_p (lowest_th, th))
2978 lowest_th = ordered_min (lowest_th, th);
2980 else
2982 delete loop_vinfo;
2983 loop_vinfo = opt_loop_vec_info::success (NULL);
2986 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2987 enabled, SIMDUID is not set, it is the innermost loop and we have
2988 either already found the loop's SIMDLEN or there was no SIMDLEN to
2989 begin with.
2990 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2991 vect_epilogues = (!simdlen
2992 && loop->inner == NULL
2993 && param_vect_epilogues_nomask
2994 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2995 && !loop->simduid
2996 /* For now only allow one epilogue loop, but allow
2997 pick_lowest_cost_p to replace it. */
2998 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2999 || pick_lowest_cost_p));
3001 /* Commit to first_loop_vinfo if we have no reason to try
3002 alternatives. */
3003 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3004 break;
3006 else
3008 delete loop_vinfo;
3009 loop_vinfo = opt_loop_vec_info::success (NULL);
3010 if (fatal)
3012 gcc_checking_assert (first_loop_vinfo == NULL);
3013 break;
3017 /* Handle the case that the original loop can use partial
3018 vectorization, but want to only adopt it for the epilogue.
3019 The retry should be in the same mode as original. */
3020 if (vect_epilogues
3021 && loop_vinfo
3022 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3024 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3025 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3026 if (dump_enabled_p ())
3027 dump_printf_loc (MSG_NOTE, vect_location,
3028 "***** Re-trying analysis with same vector mode"
3029 " %s for epilogue with partial vectors.\n",
3030 GET_MODE_NAME (loop_vinfo->vector_mode));
3031 continue;
3034 if (mode_i < vector_modes.length ()
3035 && VECTOR_MODE_P (autodetected_vector_mode)
3036 && (related_vector_mode (vector_modes[mode_i],
3037 GET_MODE_INNER (autodetected_vector_mode))
3038 == autodetected_vector_mode)
3039 && (related_vector_mode (autodetected_vector_mode,
3040 GET_MODE_INNER (vector_modes[mode_i]))
3041 == vector_modes[mode_i]))
3043 if (dump_enabled_p ())
3044 dump_printf_loc (MSG_NOTE, vect_location,
3045 "***** Skipping vector mode %s, which would"
3046 " repeat the analysis for %s\n",
3047 GET_MODE_NAME (vector_modes[mode_i]),
3048 GET_MODE_NAME (autodetected_vector_mode));
3049 mode_i += 1;
3052 if (mode_i == vector_modes.length ()
3053 || autodetected_vector_mode == VOIDmode)
3054 break;
3056 /* Try the next biggest vector size. */
3057 next_vector_mode = vector_modes[mode_i++];
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 "***** Re-trying analysis with vector mode %s\n",
3061 GET_MODE_NAME (next_vector_mode));
3064 if (first_loop_vinfo)
3066 loop->aux = (loop_vec_info) first_loop_vinfo;
3067 if (dump_enabled_p ())
3068 dump_printf_loc (MSG_NOTE, vect_location,
3069 "***** Choosing vector mode %s\n",
3070 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3071 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3072 return first_loop_vinfo;
3075 return opt_loop_vec_info::propagate_failure (res);
3078 /* Return true if there is an in-order reduction function for CODE, storing
3079 it in *REDUC_FN if so. */
3081 static bool
3082 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3084 switch (code)
3086 case PLUS_EXPR:
3087 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3088 return true;
3090 default:
3091 return false;
3095 /* Function reduction_fn_for_scalar_code
3097 Input:
3098 CODE - tree_code of a reduction operations.
3100 Output:
3101 REDUC_FN - the corresponding internal function to be used to reduce the
3102 vector of partial results into a single scalar result, or IFN_LAST
3103 if the operation is a supported reduction operation, but does not have
3104 such an internal function.
3106 Return FALSE if CODE currently cannot be vectorized as reduction. */
3108 static bool
3109 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3111 switch (code)
3113 case MAX_EXPR:
3114 *reduc_fn = IFN_REDUC_MAX;
3115 return true;
3117 case MIN_EXPR:
3118 *reduc_fn = IFN_REDUC_MIN;
3119 return true;
3121 case PLUS_EXPR:
3122 *reduc_fn = IFN_REDUC_PLUS;
3123 return true;
3125 case BIT_AND_EXPR:
3126 *reduc_fn = IFN_REDUC_AND;
3127 return true;
3129 case BIT_IOR_EXPR:
3130 *reduc_fn = IFN_REDUC_IOR;
3131 return true;
3133 case BIT_XOR_EXPR:
3134 *reduc_fn = IFN_REDUC_XOR;
3135 return true;
3137 case MULT_EXPR:
3138 case MINUS_EXPR:
3139 *reduc_fn = IFN_LAST;
3140 return true;
3142 default:
3143 return false;
3147 /* If there is a neutral value X such that SLP reduction NODE would not
3148 be affected by the introduction of additional X elements, return that X,
3149 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3150 is the vector type that would hold element X. REDUC_CHAIN is true if
3151 the SLP statements perform a single reduction, false if each statement
3152 performs an independent reduction. */
3154 static tree
3155 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3156 tree_code code, bool reduc_chain)
3158 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3159 stmt_vec_info stmt_vinfo = stmts[0];
3160 tree scalar_type = TREE_TYPE (vector_type);
3161 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3162 gcc_assert (loop);
3164 switch (code)
3166 case WIDEN_SUM_EXPR:
3167 case DOT_PROD_EXPR:
3168 case SAD_EXPR:
3169 case PLUS_EXPR:
3170 case MINUS_EXPR:
3171 case BIT_IOR_EXPR:
3172 case BIT_XOR_EXPR:
3173 return build_zero_cst (scalar_type);
3175 case MULT_EXPR:
3176 return build_one_cst (scalar_type);
3178 case BIT_AND_EXPR:
3179 return build_all_ones_cst (scalar_type);
3181 case MAX_EXPR:
3182 case MIN_EXPR:
3183 /* For MIN/MAX the initial values are neutral. A reduction chain
3184 has only a single initial value, so that value is neutral for
3185 all statements. */
3186 if (reduc_chain)
3187 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3188 loop_preheader_edge (loop));
3189 return NULL_TREE;
3191 default:
3192 return NULL_TREE;
3196 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3197 STMT is printed with a message MSG. */
3199 static void
3200 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3202 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3205 /* Return true if we need an in-order reduction for operation CODE
3206 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3207 overflow must wrap. */
3209 bool
3210 needs_fold_left_reduction_p (tree type, tree_code code)
3212 /* CHECKME: check for !flag_finite_math_only too? */
3213 if (SCALAR_FLOAT_TYPE_P (type))
3214 switch (code)
3216 case MIN_EXPR:
3217 case MAX_EXPR:
3218 return false;
3220 default:
3221 return !flag_associative_math;
3224 if (INTEGRAL_TYPE_P (type))
3226 if (!operation_no_trapping_overflow (type, code))
3227 return true;
3228 return false;
3231 if (SAT_FIXED_POINT_TYPE_P (type))
3232 return true;
3234 return false;
3237 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3238 has a handled computation expression. Store the main reduction
3239 operation in *CODE. */
3241 static bool
3242 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3243 tree loop_arg, enum tree_code *code,
3244 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3246 auto_bitmap visited;
3247 tree lookfor = PHI_RESULT (phi);
3248 ssa_op_iter curri;
3249 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3250 while (USE_FROM_PTR (curr) != loop_arg)
3251 curr = op_iter_next_use (&curri);
3252 curri.i = curri.numops;
3255 path.safe_push (std::make_pair (curri, curr));
3256 tree use = USE_FROM_PTR (curr);
3257 if (use == lookfor)
3258 break;
3259 gimple *def = SSA_NAME_DEF_STMT (use);
3260 if (gimple_nop_p (def)
3261 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3263 pop:
3266 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3267 curri = x.first;
3268 curr = x.second;
3270 curr = op_iter_next_use (&curri);
3271 /* Skip already visited or non-SSA operands (from iterating
3272 over PHI args). */
3273 while (curr != NULL_USE_OPERAND_P
3274 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3275 || ! bitmap_set_bit (visited,
3276 SSA_NAME_VERSION
3277 (USE_FROM_PTR (curr)))));
3279 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3280 if (curr == NULL_USE_OPERAND_P)
3281 break;
3283 else
3285 if (gimple_code (def) == GIMPLE_PHI)
3286 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3287 else
3288 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3289 while (curr != NULL_USE_OPERAND_P
3290 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3291 || ! bitmap_set_bit (visited,
3292 SSA_NAME_VERSION
3293 (USE_FROM_PTR (curr)))))
3294 curr = op_iter_next_use (&curri);
3295 if (curr == NULL_USE_OPERAND_P)
3296 goto pop;
3299 while (1);
3300 if (dump_file && (dump_flags & TDF_DETAILS))
3302 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3303 unsigned i;
3304 std::pair<ssa_op_iter, use_operand_p> *x;
3305 FOR_EACH_VEC_ELT (path, i, x)
3306 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3307 dump_printf (MSG_NOTE, "\n");
3310 /* Check whether the reduction path detected is valid. */
3311 bool fail = path.length () == 0;
3312 bool neg = false;
3313 int sign = -1;
3314 *code = ERROR_MARK;
3315 for (unsigned i = 1; i < path.length (); ++i)
3317 gimple *use_stmt = USE_STMT (path[i].second);
3318 tree op = USE_FROM_PTR (path[i].second);
3319 if (! is_gimple_assign (use_stmt)
3320 /* The following make sure we can compute the operand index
3321 easily plus it mostly disallows chaining via COND_EXPR condition
3322 operands. */
3323 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3324 && (gimple_num_ops (use_stmt) <= 2
3325 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3326 && (gimple_num_ops (use_stmt) <= 3
3327 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3329 fail = true;
3330 break;
3332 /* Check there's only a single stmt the op is used on. For the
3333 not value-changing tail and the last stmt allow out-of-loop uses.
3334 ??? We could relax this and handle arbitrary live stmts by
3335 forcing a scalar epilogue for example. */
3336 imm_use_iterator imm_iter;
3337 gimple *op_use_stmt;
3338 unsigned cnt = 0;
3339 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3340 if (!is_gimple_debug (op_use_stmt)
3341 && (*code != ERROR_MARK
3342 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3344 /* We want to allow x + x but not x < 1 ? x : 2. */
3345 if (is_gimple_assign (op_use_stmt)
3346 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3348 use_operand_p use_p;
3349 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3350 cnt++;
3352 else
3353 cnt++;
3355 if (cnt != 1)
3357 fail = true;
3358 break;
3360 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3361 if (use_code == MINUS_EXPR)
3363 use_code = PLUS_EXPR;
3364 /* Track whether we negate the reduction value each iteration. */
3365 if (gimple_assign_rhs2 (use_stmt) == op)
3366 neg = ! neg;
3368 if (CONVERT_EXPR_CODE_P (use_code)
3369 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3370 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3372 else if (*code == ERROR_MARK)
3374 *code = use_code;
3375 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3377 else if (use_code != *code)
3379 fail = true;
3380 break;
3382 else if ((use_code == MIN_EXPR
3383 || use_code == MAX_EXPR)
3384 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3386 fail = true;
3387 break;
3390 return ! fail && ! neg && *code != ERROR_MARK;
3393 bool
3394 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3395 tree loop_arg, enum tree_code code)
3397 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3398 enum tree_code code_;
3399 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3400 && code_ == code);
3405 /* Function vect_is_simple_reduction
3407 (1) Detect a cross-iteration def-use cycle that represents a simple
3408 reduction computation. We look for the following pattern:
3410 loop_header:
3411 a1 = phi < a0, a2 >
3412 a3 = ...
3413 a2 = operation (a3, a1)
3417 a3 = ...
3418 loop_header:
3419 a1 = phi < a0, a2 >
3420 a2 = operation (a3, a1)
3422 such that:
3423 1. operation is commutative and associative and it is safe to
3424 change the order of the computation
3425 2. no uses for a2 in the loop (a2 is used out of the loop)
3426 3. no uses of a1 in the loop besides the reduction operation
3427 4. no uses of a1 outside the loop.
3429 Conditions 1,4 are tested here.
3430 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3432 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3433 nested cycles.
3435 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3436 reductions:
3438 a1 = phi < a0, a2 >
3439 inner loop (def of a3)
3440 a2 = phi < a3 >
3442 (4) Detect condition expressions, ie:
3443 for (int i = 0; i < N; i++)
3444 if (a[i] < val)
3445 ret_val = a[i];
3449 static stmt_vec_info
3450 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3451 bool *double_reduc, bool *reduc_chain_p)
3453 gphi *phi = as_a <gphi *> (phi_info->stmt);
3454 gimple *phi_use_stmt = NULL;
3455 imm_use_iterator imm_iter;
3456 use_operand_p use_p;
3458 *double_reduc = false;
3459 *reduc_chain_p = false;
3460 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3462 tree phi_name = PHI_RESULT (phi);
3463 /* ??? If there are no uses of the PHI result the inner loop reduction
3464 won't be detected as possibly double-reduction by vectorizable_reduction
3465 because that tries to walk the PHI arg from the preheader edge which
3466 can be constant. See PR60382. */
3467 if (has_zero_uses (phi_name))
3468 return NULL;
3469 class loop *loop = (gimple_bb (phi))->loop_father;
3470 unsigned nphi_def_loop_uses = 0;
3471 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3473 gimple *use_stmt = USE_STMT (use_p);
3474 if (is_gimple_debug (use_stmt))
3475 continue;
3477 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3479 if (dump_enabled_p ())
3480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3481 "intermediate value used outside loop.\n");
3483 return NULL;
3486 nphi_def_loop_uses++;
3487 phi_use_stmt = use_stmt;
3490 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3491 if (TREE_CODE (latch_def) != SSA_NAME)
3493 if (dump_enabled_p ())
3494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3495 "reduction: not ssa_name: %T\n", latch_def);
3496 return NULL;
3499 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3500 if (!def_stmt_info
3501 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3502 return NULL;
3504 bool nested_in_vect_loop
3505 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3506 unsigned nlatch_def_loop_uses = 0;
3507 auto_vec<gphi *, 3> lcphis;
3508 bool inner_loop_of_double_reduc = false;
3509 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3511 gimple *use_stmt = USE_STMT (use_p);
3512 if (is_gimple_debug (use_stmt))
3513 continue;
3514 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3515 nlatch_def_loop_uses++;
3516 else
3518 /* We can have more than one loop-closed PHI. */
3519 lcphis.safe_push (as_a <gphi *> (use_stmt));
3520 if (nested_in_vect_loop
3521 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3522 == vect_double_reduction_def))
3523 inner_loop_of_double_reduc = true;
3527 /* If we are vectorizing an inner reduction we are executing that
3528 in the original order only in case we are not dealing with a
3529 double reduction. */
3530 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3532 if (dump_enabled_p ())
3533 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3534 "detected nested cycle: ");
3535 return def_stmt_info;
3538 /* If this isn't a nested cycle or if the nested cycle reduction value
3539 is used ouside of the inner loop we cannot handle uses of the reduction
3540 value. */
3541 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3543 if (dump_enabled_p ())
3544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3545 "reduction used in loop.\n");
3546 return NULL;
3549 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3550 defined in the inner loop. */
3551 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3553 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3554 if (gimple_phi_num_args (def_stmt) != 1
3555 || TREE_CODE (op1) != SSA_NAME)
3557 if (dump_enabled_p ())
3558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3559 "unsupported phi node definition.\n");
3561 return NULL;
3564 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3565 if (gimple_bb (def1)
3566 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3567 && loop->inner
3568 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3569 && is_gimple_assign (def1)
3570 && is_a <gphi *> (phi_use_stmt)
3571 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3573 if (dump_enabled_p ())
3574 report_vect_op (MSG_NOTE, def_stmt,
3575 "detected double reduction: ");
3577 *double_reduc = true;
3578 return def_stmt_info;
3581 return NULL;
3584 /* Look for the expression computing latch_def from then loop PHI result. */
3585 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3586 enum tree_code code;
3587 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3588 path))
3590 STMT_VINFO_REDUC_CODE (phi_info) = code;
3591 if (code == COND_EXPR && !nested_in_vect_loop)
3592 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3594 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3595 reduction chain for which the additional restriction is that
3596 all operations in the chain are the same. */
3597 auto_vec<stmt_vec_info, 8> reduc_chain;
3598 unsigned i;
3599 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3600 for (i = path.length () - 1; i >= 1; --i)
3602 gimple *stmt = USE_STMT (path[i].second);
3603 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3604 STMT_VINFO_REDUC_IDX (stmt_info)
3605 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3606 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3607 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3608 && (i == 1 || i == path.length () - 1));
3609 if ((stmt_code != code && !leading_conversion)
3610 /* We can only handle the final value in epilogue
3611 generation for reduction chains. */
3612 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3613 is_slp_reduc = false;
3614 /* For reduction chains we support a trailing/leading
3615 conversions. We do not store those in the actual chain. */
3616 if (leading_conversion)
3617 continue;
3618 reduc_chain.safe_push (stmt_info);
3620 if (is_slp_reduc && reduc_chain.length () > 1)
3622 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3624 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3625 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3627 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3628 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3630 /* Save the chain for further analysis in SLP detection. */
3631 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3632 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3634 *reduc_chain_p = true;
3635 if (dump_enabled_p ())
3636 dump_printf_loc (MSG_NOTE, vect_location,
3637 "reduction: detected reduction chain\n");
3639 else if (dump_enabled_p ())
3640 dump_printf_loc (MSG_NOTE, vect_location,
3641 "reduction: detected reduction\n");
3643 return def_stmt_info;
3646 if (dump_enabled_p ())
3647 dump_printf_loc (MSG_NOTE, vect_location,
3648 "reduction: unknown pattern\n");
3650 return NULL;
3653 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3654 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3655 or -1 if not known. */
3657 static int
3658 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3660 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3661 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3663 if (dump_enabled_p ())
3664 dump_printf_loc (MSG_NOTE, vect_location,
3665 "cost model: epilogue peel iters set to vf/2 "
3666 "because loop iterations are unknown .\n");
3667 return assumed_vf / 2;
3669 else
3671 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3672 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3673 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3674 /* If we need to peel for gaps, but no peeling is required, we have to
3675 peel VF iterations. */
3676 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3677 peel_iters_epilogue = assumed_vf;
3678 return peel_iters_epilogue;
3682 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3684 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3685 int *peel_iters_epilogue,
3686 stmt_vector_for_cost *scalar_cost_vec,
3687 stmt_vector_for_cost *prologue_cost_vec,
3688 stmt_vector_for_cost *epilogue_cost_vec)
3690 int retval = 0;
3692 *peel_iters_epilogue
3693 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3695 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3697 /* If peeled iterations are known but number of scalar loop
3698 iterations are unknown, count a taken branch per peeled loop. */
3699 if (peel_iters_prologue > 0)
3700 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3701 NULL, NULL_TREE, 0, vect_prologue);
3702 if (*peel_iters_epilogue > 0)
3703 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3704 NULL, NULL_TREE, 0, vect_epilogue);
3707 stmt_info_for_cost *si;
3708 int j;
3709 if (peel_iters_prologue)
3710 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3711 retval += record_stmt_cost (prologue_cost_vec,
3712 si->count * peel_iters_prologue,
3713 si->kind, si->stmt_info, si->misalign,
3714 vect_prologue);
3715 if (*peel_iters_epilogue)
3716 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3717 retval += record_stmt_cost (epilogue_cost_vec,
3718 si->count * *peel_iters_epilogue,
3719 si->kind, si->stmt_info, si->misalign,
3720 vect_epilogue);
3722 return retval;
3725 /* Function vect_estimate_min_profitable_iters
3727 Return the number of iterations required for the vector version of the
3728 loop to be profitable relative to the cost of the scalar version of the
3729 loop.
3731 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3732 of iterations for vectorization. -1 value means loop vectorization
3733 is not profitable. This returned value may be used for dynamic
3734 profitability check.
3736 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3737 for static check against estimated number of iterations. */
3739 static void
3740 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3741 int *ret_min_profitable_niters,
3742 int *ret_min_profitable_estimate)
3744 int min_profitable_iters;
3745 int min_profitable_estimate;
3746 int peel_iters_prologue;
3747 int peel_iters_epilogue;
3748 unsigned vec_inside_cost = 0;
3749 int vec_outside_cost = 0;
3750 unsigned vec_prologue_cost = 0;
3751 unsigned vec_epilogue_cost = 0;
3752 int scalar_single_iter_cost = 0;
3753 int scalar_outside_cost = 0;
3754 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3755 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3756 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3758 /* Cost model disabled. */
3759 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3761 if (dump_enabled_p ())
3762 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3763 *ret_min_profitable_niters = 0;
3764 *ret_min_profitable_estimate = 0;
3765 return;
3768 /* Requires loop versioning tests to handle misalignment. */
3769 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3771 /* FIXME: Make cost depend on complexity of individual check. */
3772 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3773 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3774 NULL, NULL_TREE, 0, vect_prologue);
3775 if (dump_enabled_p ())
3776 dump_printf (MSG_NOTE,
3777 "cost model: Adding cost of checks for loop "
3778 "versioning to treat misalignment.\n");
3781 /* Requires loop versioning with alias checks. */
3782 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3784 /* FIXME: Make cost depend on complexity of individual check. */
3785 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3786 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3787 NULL, NULL_TREE, 0, vect_prologue);
3788 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3789 if (len)
3790 /* Count LEN - 1 ANDs and LEN comparisons. */
3791 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3792 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3793 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3794 if (len)
3796 /* Count LEN - 1 ANDs and LEN comparisons. */
3797 unsigned int nstmts = len * 2 - 1;
3798 /* +1 for each bias that needs adding. */
3799 for (unsigned int i = 0; i < len; ++i)
3800 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3801 nstmts += 1;
3802 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3803 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3805 if (dump_enabled_p ())
3806 dump_printf (MSG_NOTE,
3807 "cost model: Adding cost of checks for loop "
3808 "versioning aliasing.\n");
3811 /* Requires loop versioning with niter checks. */
3812 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3814 /* FIXME: Make cost depend on complexity of individual check. */
3815 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3816 NULL, NULL_TREE, 0, vect_prologue);
3817 if (dump_enabled_p ())
3818 dump_printf (MSG_NOTE,
3819 "cost model: Adding cost of checks for loop "
3820 "versioning niters.\n");
3823 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3824 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3825 NULL, NULL_TREE, 0, vect_prologue);
3827 /* Count statements in scalar loop. Using this as scalar cost for a single
3828 iteration for now.
3830 TODO: Add outer loop support.
3832 TODO: Consider assigning different costs to different scalar
3833 statements. */
3835 scalar_single_iter_cost
3836 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3838 /* Add additional cost for the peeled instructions in prologue and epilogue
3839 loop. (For fully-masked loops there will be no peeling.)
3841 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3842 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3844 TODO: Build an expression that represents peel_iters for prologue and
3845 epilogue to be used in a run-time test. */
3847 bool prologue_need_br_taken_cost = false;
3848 bool prologue_need_br_not_taken_cost = false;
3850 /* Calculate peel_iters_prologue. */
3851 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3852 peel_iters_prologue = 0;
3853 else if (npeel < 0)
3855 peel_iters_prologue = assumed_vf / 2;
3856 if (dump_enabled_p ())
3857 dump_printf (MSG_NOTE, "cost model: "
3858 "prologue peel iters set to vf/2.\n");
3860 /* If peeled iterations are unknown, count a taken branch and a not taken
3861 branch per peeled loop. Even if scalar loop iterations are known,
3862 vector iterations are not known since peeled prologue iterations are
3863 not known. Hence guards remain the same. */
3864 prologue_need_br_taken_cost = true;
3865 prologue_need_br_not_taken_cost = true;
3867 else
3869 peel_iters_prologue = npeel;
3870 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3871 /* If peeled iterations are known but number of scalar loop
3872 iterations are unknown, count a taken branch per peeled loop. */
3873 prologue_need_br_taken_cost = true;
3876 bool epilogue_need_br_taken_cost = false;
3877 bool epilogue_need_br_not_taken_cost = false;
3879 /* Calculate peel_iters_epilogue. */
3880 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3881 /* We need to peel exactly one iteration for gaps. */
3882 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3883 else if (npeel < 0)
3885 /* If peeling for alignment is unknown, loop bound of main loop
3886 becomes unknown. */
3887 peel_iters_epilogue = assumed_vf / 2;
3888 if (dump_enabled_p ())
3889 dump_printf (MSG_NOTE, "cost model: "
3890 "epilogue peel iters set to vf/2 because "
3891 "peeling for alignment is unknown.\n");
3893 /* See the same reason above in peel_iters_prologue calculation. */
3894 epilogue_need_br_taken_cost = true;
3895 epilogue_need_br_not_taken_cost = true;
3897 else
3899 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3900 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3901 /* If peeled iterations are known but number of scalar loop
3902 iterations are unknown, count a taken branch per peeled loop. */
3903 epilogue_need_br_taken_cost = true;
3906 stmt_info_for_cost *si;
3907 int j;
3908 /* Add costs associated with peel_iters_prologue. */
3909 if (peel_iters_prologue)
3910 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3912 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3913 si->count * peel_iters_prologue, si->kind,
3914 si->stmt_info, si->vectype, si->misalign,
3915 vect_prologue);
3918 /* Add costs associated with peel_iters_epilogue. */
3919 if (peel_iters_epilogue)
3920 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3922 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3923 si->count * peel_iters_epilogue, si->kind,
3924 si->stmt_info, si->vectype, si->misalign,
3925 vect_epilogue);
3928 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3930 if (prologue_need_br_taken_cost)
3931 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3932 NULL, NULL_TREE, 0, vect_prologue);
3934 if (prologue_need_br_not_taken_cost)
3935 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3936 cond_branch_not_taken, NULL, NULL_TREE, 0,
3937 vect_prologue);
3939 if (epilogue_need_br_taken_cost)
3940 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3941 NULL, NULL_TREE, 0, vect_epilogue);
3943 if (epilogue_need_br_not_taken_cost)
3944 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3945 cond_branch_not_taken, NULL, NULL_TREE, 0,
3946 vect_epilogue);
3948 /* Take care of special costs for rgroup controls of partial vectors. */
3949 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3951 /* Calculate how many masks we need to generate. */
3952 unsigned int num_masks = 0;
3953 rgroup_controls *rgm;
3954 unsigned int num_vectors_m1;
3955 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3956 if (rgm->type)
3957 num_masks += num_vectors_m1 + 1;
3958 gcc_assert (num_masks > 0);
3960 /* In the worst case, we need to generate each mask in the prologue
3961 and in the loop body. One of the loop body mask instructions
3962 replaces the comparison in the scalar loop, and since we don't
3963 count the scalar comparison against the scalar body, we shouldn't
3964 count that vector instruction against the vector body either.
3966 Sometimes we can use unpacks instead of generating prologue
3967 masks and sometimes the prologue mask will fold to a constant,
3968 so the actual prologue cost might be smaller. However, it's
3969 simpler and safer to use the worst-case cost; if this ends up
3970 being the tie-breaker between vectorizing or not, then it's
3971 probably better not to vectorize. */
3972 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3973 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3974 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3975 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3977 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3979 /* Referring to the functions vect_set_loop_condition_partial_vectors
3980 and vect_set_loop_controls_directly, we need to generate each
3981 length in the prologue and in the loop body if required. Although
3982 there are some possible optimizations, we consider the worst case
3983 here. */
3985 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3986 bool need_iterate_p
3987 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3988 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3990 /* Calculate how many statements to be added. */
3991 unsigned int prologue_stmts = 0;
3992 unsigned int body_stmts = 0;
3994 rgroup_controls *rgc;
3995 unsigned int num_vectors_m1;
3996 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3997 if (rgc->type)
3999 /* May need one SHIFT for nitems_total computation. */
4000 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4001 if (nitems != 1 && !niters_known_p)
4002 prologue_stmts += 1;
4004 /* May need one MAX and one MINUS for wrap around. */
4005 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4006 prologue_stmts += 2;
4008 /* Need one MAX and one MINUS for each batch limit excepting for
4009 the 1st one. */
4010 prologue_stmts += num_vectors_m1 * 2;
4012 unsigned int num_vectors = num_vectors_m1 + 1;
4014 /* Need to set up lengths in prologue, only one MIN required
4015 for each since start index is zero. */
4016 prologue_stmts += num_vectors;
4018 /* Each may need two MINs and one MINUS to update lengths in body
4019 for next iteration. */
4020 if (need_iterate_p)
4021 body_stmts += 3 * num_vectors;
4024 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4025 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4026 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4027 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4030 /* FORNOW: The scalar outside cost is incremented in one of the
4031 following ways:
4033 1. The vectorizer checks for alignment and aliasing and generates
4034 a condition that allows dynamic vectorization. A cost model
4035 check is ANDED with the versioning condition. Hence scalar code
4036 path now has the added cost of the versioning check.
4038 if (cost > th & versioning_check)
4039 jmp to vector code
4041 Hence run-time scalar is incremented by not-taken branch cost.
4043 2. The vectorizer then checks if a prologue is required. If the
4044 cost model check was not done before during versioning, it has to
4045 be done before the prologue check.
4047 if (cost <= th)
4048 prologue = scalar_iters
4049 if (prologue == 0)
4050 jmp to vector code
4051 else
4052 execute prologue
4053 if (prologue == num_iters)
4054 go to exit
4056 Hence the run-time scalar cost is incremented by a taken branch,
4057 plus a not-taken branch, plus a taken branch cost.
4059 3. The vectorizer then checks if an epilogue is required. If the
4060 cost model check was not done before during prologue check, it
4061 has to be done with the epilogue check.
4063 if (prologue == 0)
4064 jmp to vector code
4065 else
4066 execute prologue
4067 if (prologue == num_iters)
4068 go to exit
4069 vector code:
4070 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4071 jmp to epilogue
4073 Hence the run-time scalar cost should be incremented by 2 taken
4074 branches.
4076 TODO: The back end may reorder the BBS's differently and reverse
4077 conditions/branch directions. Change the estimates below to
4078 something more reasonable. */
4080 /* If the number of iterations is known and we do not do versioning, we can
4081 decide whether to vectorize at compile time. Hence the scalar version
4082 do not carry cost model guard costs. */
4083 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4084 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4086 /* Cost model check occurs at versioning. */
4087 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4088 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4089 else
4091 /* Cost model check occurs at prologue generation. */
4092 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4093 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4094 + vect_get_stmt_cost (cond_branch_not_taken);
4095 /* Cost model check occurs at epilogue generation. */
4096 else
4097 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4101 /* Complete the target-specific cost calculations. */
4102 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4103 &vec_inside_cost, &vec_epilogue_cost);
4105 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4107 /* Stash the costs so that we can compare two loop_vec_infos. */
4108 loop_vinfo->vec_inside_cost = vec_inside_cost;
4109 loop_vinfo->vec_outside_cost = vec_outside_cost;
4111 if (dump_enabled_p ())
4113 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4114 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4115 vec_inside_cost);
4116 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4117 vec_prologue_cost);
4118 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4119 vec_epilogue_cost);
4120 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4121 scalar_single_iter_cost);
4122 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4123 scalar_outside_cost);
4124 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4125 vec_outside_cost);
4126 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4127 peel_iters_prologue);
4128 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4129 peel_iters_epilogue);
4132 /* Calculate number of iterations required to make the vector version
4133 profitable, relative to the loop bodies only. The following condition
4134 must hold true:
4135 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4136 where
4137 SIC = scalar iteration cost, VIC = vector iteration cost,
4138 VOC = vector outside cost, VF = vectorization factor,
4139 NPEEL = prologue iterations + epilogue iterations,
4140 SOC = scalar outside cost for run time cost model check. */
4142 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4143 - vec_inside_cost);
4144 if (saving_per_viter <= 0)
4146 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4147 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4148 "vectorization did not happen for a simd loop");
4150 if (dump_enabled_p ())
4151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4152 "cost model: the vector iteration cost = %d "
4153 "divided by the scalar iteration cost = %d "
4154 "is greater or equal to the vectorization factor = %d"
4155 ".\n",
4156 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4157 *ret_min_profitable_niters = -1;
4158 *ret_min_profitable_estimate = -1;
4159 return;
4162 /* ??? The "if" arm is written to handle all cases; see below for what
4163 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4164 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4166 /* Rewriting the condition above in terms of the number of
4167 vector iterations (vniters) rather than the number of
4168 scalar iterations (niters) gives:
4170 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4172 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4174 For integer N, X and Y when X > 0:
4176 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4177 int outside_overhead = (vec_outside_cost
4178 - scalar_single_iter_cost * peel_iters_prologue
4179 - scalar_single_iter_cost * peel_iters_epilogue
4180 - scalar_outside_cost);
4181 /* We're only interested in cases that require at least one
4182 vector iteration. */
4183 int min_vec_niters = 1;
4184 if (outside_overhead > 0)
4185 min_vec_niters = outside_overhead / saving_per_viter + 1;
4187 if (dump_enabled_p ())
4188 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4189 min_vec_niters);
4191 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4193 /* Now that we know the minimum number of vector iterations,
4194 find the minimum niters for which the scalar cost is larger:
4196 SIC * niters > VIC * vniters + VOC - SOC
4198 We know that the minimum niters is no more than
4199 vniters * VF + NPEEL, but it might be (and often is) less
4200 than that if a partial vector iteration is cheaper than the
4201 equivalent scalar code. */
4202 int threshold = (vec_inside_cost * min_vec_niters
4203 + vec_outside_cost
4204 - scalar_outside_cost);
4205 if (threshold <= 0)
4206 min_profitable_iters = 1;
4207 else
4208 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4210 else
4211 /* Convert the number of vector iterations into a number of
4212 scalar iterations. */
4213 min_profitable_iters = (min_vec_niters * assumed_vf
4214 + peel_iters_prologue
4215 + peel_iters_epilogue);
4217 else
4219 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4220 * assumed_vf
4221 - vec_inside_cost * peel_iters_prologue
4222 - vec_inside_cost * peel_iters_epilogue);
4223 if (min_profitable_iters <= 0)
4224 min_profitable_iters = 0;
4225 else
4227 min_profitable_iters /= saving_per_viter;
4229 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4230 <= (((int) vec_inside_cost * min_profitable_iters)
4231 + (((int) vec_outside_cost - scalar_outside_cost)
4232 * assumed_vf)))
4233 min_profitable_iters++;
4237 if (dump_enabled_p ())
4238 dump_printf (MSG_NOTE,
4239 " Calculated minimum iters for profitability: %d\n",
4240 min_profitable_iters);
4242 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4243 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4244 /* We want the vectorized loop to execute at least once. */
4245 min_profitable_iters = assumed_vf + peel_iters_prologue;
4246 else if (min_profitable_iters < peel_iters_prologue)
4247 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4248 vectorized loop executes at least once. */
4249 min_profitable_iters = peel_iters_prologue;
4251 if (dump_enabled_p ())
4252 dump_printf_loc (MSG_NOTE, vect_location,
4253 " Runtime profitability threshold = %d\n",
4254 min_profitable_iters);
4256 *ret_min_profitable_niters = min_profitable_iters;
4258 /* Calculate number of iterations required to make the vector version
4259 profitable, relative to the loop bodies only.
4261 Non-vectorized variant is SIC * niters and it must win over vector
4262 variant on the expected loop trip count. The following condition must hold true:
4263 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4265 if (vec_outside_cost <= 0)
4266 min_profitable_estimate = 0;
4267 /* ??? This "else if" arm is written to handle all cases; see below for
4268 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4269 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4271 /* This is a repeat of the code above, but with + SOC rather
4272 than - SOC. */
4273 int outside_overhead = (vec_outside_cost
4274 - scalar_single_iter_cost * peel_iters_prologue
4275 - scalar_single_iter_cost * peel_iters_epilogue
4276 + scalar_outside_cost);
4277 int min_vec_niters = 1;
4278 if (outside_overhead > 0)
4279 min_vec_niters = outside_overhead / saving_per_viter + 1;
4281 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4283 int threshold = (vec_inside_cost * min_vec_niters
4284 + vec_outside_cost
4285 + scalar_outside_cost);
4286 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4288 else
4289 min_profitable_estimate = (min_vec_niters * assumed_vf
4290 + peel_iters_prologue
4291 + peel_iters_epilogue);
4293 else
4295 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4296 * assumed_vf
4297 - vec_inside_cost * peel_iters_prologue
4298 - vec_inside_cost * peel_iters_epilogue)
4299 / ((scalar_single_iter_cost * assumed_vf)
4300 - vec_inside_cost);
4302 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4303 if (dump_enabled_p ())
4304 dump_printf_loc (MSG_NOTE, vect_location,
4305 " Static estimate profitability threshold = %d\n",
4306 min_profitable_estimate);
4308 *ret_min_profitable_estimate = min_profitable_estimate;
4311 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4312 vector elements (not bits) for a vector with NELT elements. */
4313 static void
4314 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4315 vec_perm_builder *sel)
4317 /* The encoding is a single stepped pattern. Any wrap-around is handled
4318 by vec_perm_indices. */
4319 sel->new_vector (nelt, 1, 3);
4320 for (unsigned int i = 0; i < 3; i++)
4321 sel->quick_push (i + offset);
4324 /* Checks whether the target supports whole-vector shifts for vectors of mode
4325 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4326 it supports vec_perm_const with masks for all necessary shift amounts. */
4327 static bool
4328 have_whole_vector_shift (machine_mode mode)
4330 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4331 return true;
4333 /* Variable-length vectors should be handled via the optab. */
4334 unsigned int nelt;
4335 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4336 return false;
4338 vec_perm_builder sel;
4339 vec_perm_indices indices;
4340 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4342 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4343 indices.new_vector (sel, 2, nelt);
4344 if (!can_vec_perm_const_p (mode, indices, false))
4345 return false;
4347 return true;
4350 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4351 functions. Design better to avoid maintenance issues. */
4353 /* Function vect_model_reduction_cost.
4355 Models cost for a reduction operation, including the vector ops
4356 generated within the strip-mine loop, the initial definition before
4357 the loop, and the epilogue code that must be generated. */
4359 static void
4360 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4361 stmt_vec_info stmt_info, internal_fn reduc_fn,
4362 vect_reduction_type reduction_type,
4363 int ncopies, stmt_vector_for_cost *cost_vec)
4365 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4366 enum tree_code code;
4367 optab optab;
4368 tree vectype;
4369 machine_mode mode;
4370 class loop *loop = NULL;
4372 if (loop_vinfo)
4373 loop = LOOP_VINFO_LOOP (loop_vinfo);
4375 /* Condition reductions generate two reductions in the loop. */
4376 if (reduction_type == COND_REDUCTION)
4377 ncopies *= 2;
4379 vectype = STMT_VINFO_VECTYPE (stmt_info);
4380 mode = TYPE_MODE (vectype);
4381 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4383 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4385 if (reduction_type == EXTRACT_LAST_REDUCTION)
4386 /* No extra instructions are needed in the prologue. The loop body
4387 operations are costed in vectorizable_condition. */
4388 inside_cost = 0;
4389 else if (reduction_type == FOLD_LEFT_REDUCTION)
4391 /* No extra instructions needed in the prologue. */
4392 prologue_cost = 0;
4394 if (reduc_fn != IFN_LAST)
4395 /* Count one reduction-like operation per vector. */
4396 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4397 stmt_info, 0, vect_body);
4398 else
4400 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4401 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4402 inside_cost = record_stmt_cost (cost_vec, nelements,
4403 vec_to_scalar, stmt_info, 0,
4404 vect_body);
4405 inside_cost += record_stmt_cost (cost_vec, nelements,
4406 scalar_stmt, stmt_info, 0,
4407 vect_body);
4410 else
4412 /* Add in cost for initial definition.
4413 For cond reduction we have four vectors: initial index, step,
4414 initial result of the data reduction, initial value of the index
4415 reduction. */
4416 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4417 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4418 scalar_to_vec, stmt_info, 0,
4419 vect_prologue);
4421 /* Cost of reduction op inside loop. */
4422 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4423 stmt_info, 0, vect_body);
4426 /* Determine cost of epilogue code.
4428 We have a reduction operator that will reduce the vector in one statement.
4429 Also requires scalar extract. */
4431 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4433 if (reduc_fn != IFN_LAST)
4435 if (reduction_type == COND_REDUCTION)
4437 /* An EQ stmt and an COND_EXPR stmt. */
4438 epilogue_cost += record_stmt_cost (cost_vec, 2,
4439 vector_stmt, stmt_info, 0,
4440 vect_epilogue);
4441 /* Reduction of the max index and a reduction of the found
4442 values. */
4443 epilogue_cost += record_stmt_cost (cost_vec, 2,
4444 vec_to_scalar, stmt_info, 0,
4445 vect_epilogue);
4446 /* A broadcast of the max value. */
4447 epilogue_cost += record_stmt_cost (cost_vec, 1,
4448 scalar_to_vec, stmt_info, 0,
4449 vect_epilogue);
4451 else
4453 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4454 stmt_info, 0, vect_epilogue);
4455 epilogue_cost += record_stmt_cost (cost_vec, 1,
4456 vec_to_scalar, stmt_info, 0,
4457 vect_epilogue);
4460 else if (reduction_type == COND_REDUCTION)
4462 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4463 /* Extraction of scalar elements. */
4464 epilogue_cost += record_stmt_cost (cost_vec,
4465 2 * estimated_nunits,
4466 vec_to_scalar, stmt_info, 0,
4467 vect_epilogue);
4468 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4469 epilogue_cost += record_stmt_cost (cost_vec,
4470 2 * estimated_nunits - 3,
4471 scalar_stmt, stmt_info, 0,
4472 vect_epilogue);
4474 else if (reduction_type == EXTRACT_LAST_REDUCTION
4475 || reduction_type == FOLD_LEFT_REDUCTION)
4476 /* No extra instructions need in the epilogue. */
4478 else
4480 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4481 tree bitsize =
4482 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4483 int element_bitsize = tree_to_uhwi (bitsize);
4484 int nelements = vec_size_in_bits / element_bitsize;
4486 if (code == COND_EXPR)
4487 code = MAX_EXPR;
4489 optab = optab_for_tree_code (code, vectype, optab_default);
4491 /* We have a whole vector shift available. */
4492 if (optab != unknown_optab
4493 && VECTOR_MODE_P (mode)
4494 && optab_handler (optab, mode) != CODE_FOR_nothing
4495 && have_whole_vector_shift (mode))
4497 /* Final reduction via vector shifts and the reduction operator.
4498 Also requires scalar extract. */
4499 epilogue_cost += record_stmt_cost (cost_vec,
4500 exact_log2 (nelements) * 2,
4501 vector_stmt, stmt_info, 0,
4502 vect_epilogue);
4503 epilogue_cost += record_stmt_cost (cost_vec, 1,
4504 vec_to_scalar, stmt_info, 0,
4505 vect_epilogue);
4507 else
4508 /* Use extracts and reduction op for final reduction. For N
4509 elements, we have N extracts and N-1 reduction ops. */
4510 epilogue_cost += record_stmt_cost (cost_vec,
4511 nelements + nelements - 1,
4512 vector_stmt, stmt_info, 0,
4513 vect_epilogue);
4517 if (dump_enabled_p ())
4518 dump_printf (MSG_NOTE,
4519 "vect_model_reduction_cost: inside_cost = %d, "
4520 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4521 prologue_cost, epilogue_cost);
4526 /* Function get_initial_def_for_reduction
4528 Input:
4529 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4530 INIT_VAL - the initial value of the reduction variable
4532 Output:
4533 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4534 of the reduction (used for adjusting the epilog - see below).
4535 Return a vector variable, initialized according to the operation that
4536 STMT_VINFO performs. This vector will be used as the initial value
4537 of the vector of partial results.
4539 Option1 (adjust in epilog): Initialize the vector as follows:
4540 add/bit or/xor: [0,0,...,0,0]
4541 mult/bit and: [1,1,...,1,1]
4542 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4543 and when necessary (e.g. add/mult case) let the caller know
4544 that it needs to adjust the result by init_val.
4546 Option2: Initialize the vector as follows:
4547 add/bit or/xor: [init_val,0,0,...,0]
4548 mult/bit and: [init_val,1,1,...,1]
4549 min/max/cond_expr: [init_val,init_val,...,init_val]
4550 and no adjustments are needed.
4552 For example, for the following code:
4554 s = init_val;
4555 for (i=0;i<n;i++)
4556 s = s + a[i];
4558 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4559 For a vector of 4 units, we want to return either [0,0,0,init_val],
4560 or [0,0,0,0] and let the caller know that it needs to adjust
4561 the result at the end by 'init_val'.
4563 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4564 initialization vector is simpler (same element in all entries), if
4565 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4567 A cost model should help decide between these two schemes. */
4569 static tree
4570 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4571 stmt_vec_info stmt_vinfo,
4572 enum tree_code code, tree init_val,
4573 tree *adjustment_def)
4575 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4576 tree scalar_type = TREE_TYPE (init_val);
4577 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4578 tree def_for_init;
4579 tree init_def;
4580 REAL_VALUE_TYPE real_init_val = dconst0;
4581 int int_init_val = 0;
4582 gimple_seq stmts = NULL;
4584 gcc_assert (vectype);
4586 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4587 || SCALAR_FLOAT_TYPE_P (scalar_type));
4589 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4590 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4592 /* ADJUSTMENT_DEF is NULL when called from
4593 vect_create_epilog_for_reduction to vectorize double reduction. */
4594 if (adjustment_def)
4595 *adjustment_def = NULL;
4597 switch (code)
4599 case WIDEN_SUM_EXPR:
4600 case DOT_PROD_EXPR:
4601 case SAD_EXPR:
4602 case PLUS_EXPR:
4603 case MINUS_EXPR:
4604 case BIT_IOR_EXPR:
4605 case BIT_XOR_EXPR:
4606 case MULT_EXPR:
4607 case BIT_AND_EXPR:
4609 if (code == MULT_EXPR)
4611 real_init_val = dconst1;
4612 int_init_val = 1;
4615 if (code == BIT_AND_EXPR)
4616 int_init_val = -1;
4618 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4619 def_for_init = build_real (scalar_type, real_init_val);
4620 else
4621 def_for_init = build_int_cst (scalar_type, int_init_val);
4623 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4625 /* Option1: the first element is '0' or '1' as well. */
4626 if (!operand_equal_p (def_for_init, init_val, 0))
4627 *adjustment_def = init_val;
4628 init_def = gimple_build_vector_from_val (&stmts, vectype,
4629 def_for_init);
4631 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4633 /* Option2 (variable length): the first element is INIT_VAL. */
4634 init_def = gimple_build_vector_from_val (&stmts, vectype,
4635 def_for_init);
4636 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4637 vectype, init_def, init_val);
4639 else
4641 /* Option2: the first element is INIT_VAL. */
4642 tree_vector_builder elts (vectype, 1, 2);
4643 elts.quick_push (init_val);
4644 elts.quick_push (def_for_init);
4645 init_def = gimple_build_vector (&stmts, &elts);
4648 break;
4650 case MIN_EXPR:
4651 case MAX_EXPR:
4652 case COND_EXPR:
4654 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4655 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4657 break;
4659 default:
4660 gcc_unreachable ();
4663 if (stmts)
4664 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4665 return init_def;
4668 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4669 NUMBER_OF_VECTORS is the number of vector defs to create.
4670 If NEUTRAL_OP is nonnull, introducing extra elements of that
4671 value will not change the result. */
4673 static void
4674 get_initial_defs_for_reduction (vec_info *vinfo,
4675 slp_tree slp_node,
4676 vec<tree> *vec_oprnds,
4677 unsigned int number_of_vectors,
4678 bool reduc_chain, tree neutral_op)
4680 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4681 stmt_vec_info stmt_vinfo = stmts[0];
4682 unsigned HOST_WIDE_INT nunits;
4683 unsigned j, number_of_places_left_in_vector;
4684 tree vector_type;
4685 unsigned int group_size = stmts.length ();
4686 unsigned int i;
4687 class loop *loop;
4689 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4691 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4693 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4694 gcc_assert (loop);
4695 edge pe = loop_preheader_edge (loop);
4697 gcc_assert (!reduc_chain || neutral_op);
4699 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4700 created vectors. It is greater than 1 if unrolling is performed.
4702 For example, we have two scalar operands, s1 and s2 (e.g., group of
4703 strided accesses of size two), while NUNITS is four (i.e., four scalars
4704 of this type can be packed in a vector). The output vector will contain
4705 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4706 will be 2).
4708 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4709 vectors containing the operands.
4711 For example, NUNITS is four as before, and the group size is 8
4712 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4713 {s5, s6, s7, s8}. */
4715 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4716 nunits = group_size;
4718 number_of_places_left_in_vector = nunits;
4719 bool constant_p = true;
4720 tree_vector_builder elts (vector_type, nunits, 1);
4721 elts.quick_grow (nunits);
4722 gimple_seq ctor_seq = NULL;
4723 for (j = 0; j < nunits * number_of_vectors; ++j)
4725 tree op;
4726 i = j % group_size;
4727 stmt_vinfo = stmts[i];
4729 /* Get the def before the loop. In reduction chain we have only
4730 one initial value. Else we have as many as PHIs in the group. */
4731 if (reduc_chain)
4732 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4733 else if (((vec_oprnds->length () + 1) * nunits
4734 - number_of_places_left_in_vector >= group_size)
4735 && neutral_op)
4736 op = neutral_op;
4737 else
4738 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4740 /* Create 'vect_ = {op0,op1,...,opn}'. */
4741 number_of_places_left_in_vector--;
4742 elts[nunits - number_of_places_left_in_vector - 1] = op;
4743 if (!CONSTANT_CLASS_P (op))
4744 constant_p = false;
4746 if (number_of_places_left_in_vector == 0)
4748 tree init;
4749 if (constant_p && !neutral_op
4750 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4751 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4752 /* Build the vector directly from ELTS. */
4753 init = gimple_build_vector (&ctor_seq, &elts);
4754 else if (neutral_op)
4756 /* Build a vector of the neutral value and shift the
4757 other elements into place. */
4758 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4759 neutral_op);
4760 int k = nunits;
4761 while (k > 0 && elts[k - 1] == neutral_op)
4762 k -= 1;
4763 while (k > 0)
4765 k -= 1;
4766 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4767 vector_type, init, elts[k]);
4770 else
4772 /* First time round, duplicate ELTS to fill the
4773 required number of vectors. */
4774 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4775 number_of_vectors, *vec_oprnds);
4776 break;
4778 vec_oprnds->quick_push (init);
4780 number_of_places_left_in_vector = nunits;
4781 elts.new_vector (vector_type, nunits, 1);
4782 elts.quick_grow (nunits);
4783 constant_p = true;
4786 if (ctor_seq != NULL)
4787 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4790 /* For a statement STMT_INFO taking part in a reduction operation return
4791 the stmt_vec_info the meta information is stored on. */
4793 stmt_vec_info
4794 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4796 stmt_info = vect_orig_stmt (stmt_info);
4797 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4798 if (!is_a <gphi *> (stmt_info->stmt)
4799 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4800 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4801 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4802 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4804 if (gimple_phi_num_args (phi) == 1)
4805 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4807 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4809 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4810 stmt_vec_info info
4811 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4812 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4813 stmt_info = info;
4815 return stmt_info;
4818 /* Function vect_create_epilog_for_reduction
4820 Create code at the loop-epilog to finalize the result of a reduction
4821 computation.
4823 STMT_INFO is the scalar reduction stmt that is being vectorized.
4824 SLP_NODE is an SLP node containing a group of reduction statements. The
4825 first one in this group is STMT_INFO.
4826 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4827 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4828 (counting from 0)
4830 This function:
4831 1. Completes the reduction def-use cycles.
4832 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4833 by calling the function specified by REDUC_FN if available, or by
4834 other means (whole-vector shifts or a scalar loop).
4835 The function also creates a new phi node at the loop exit to preserve
4836 loop-closed form, as illustrated below.
4838 The flow at the entry to this function:
4840 loop:
4841 vec_def = phi <vec_init, null> # REDUCTION_PHI
4842 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4843 s_loop = scalar_stmt # (scalar) STMT_INFO
4844 loop_exit:
4845 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4846 use <s_out0>
4847 use <s_out0>
4849 The above is transformed by this function into:
4851 loop:
4852 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4853 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4854 s_loop = scalar_stmt # (scalar) STMT_INFO
4855 loop_exit:
4856 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4857 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4858 v_out2 = reduce <v_out1>
4859 s_out3 = extract_field <v_out2, 0>
4860 s_out4 = adjust_result <s_out3>
4861 use <s_out4>
4862 use <s_out4>
4865 static void
4866 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4867 stmt_vec_info stmt_info,
4868 slp_tree slp_node,
4869 slp_instance slp_node_instance)
4871 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4872 gcc_assert (reduc_info->is_reduc_info);
4873 /* For double reductions we need to get at the inner loop reduction
4874 stmt which has the meta info attached. Our stmt_info is that of the
4875 loop-closed PHI of the inner loop which we remember as
4876 def for the reduction PHI generation. */
4877 bool double_reduc = false;
4878 stmt_vec_info rdef_info = stmt_info;
4879 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4881 gcc_assert (!slp_node);
4882 double_reduc = true;
4883 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4884 (stmt_info->stmt, 0));
4885 stmt_info = vect_stmt_to_vectorize (stmt_info);
4887 gphi *reduc_def_stmt
4888 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4889 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4890 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4891 tree vectype;
4892 machine_mode mode;
4893 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4894 basic_block exit_bb;
4895 tree scalar_dest;
4896 tree scalar_type;
4897 gimple *new_phi = NULL, *phi;
4898 gimple_stmt_iterator exit_gsi;
4899 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4900 gimple *epilog_stmt = NULL;
4901 gimple *exit_phi;
4902 tree bitsize;
4903 tree def;
4904 tree orig_name, scalar_result;
4905 imm_use_iterator imm_iter, phi_imm_iter;
4906 use_operand_p use_p, phi_use_p;
4907 gimple *use_stmt;
4908 bool nested_in_vect_loop = false;
4909 auto_vec<gimple *> new_phis;
4910 int j, i;
4911 auto_vec<tree> scalar_results;
4912 unsigned int group_size = 1, k;
4913 auto_vec<gimple *> phis;
4914 bool slp_reduc = false;
4915 bool direct_slp_reduc;
4916 tree new_phi_result;
4917 tree induction_index = NULL_TREE;
4919 if (slp_node)
4920 group_size = SLP_TREE_LANES (slp_node);
4922 if (nested_in_vect_loop_p (loop, stmt_info))
4924 outer_loop = loop;
4925 loop = loop->inner;
4926 nested_in_vect_loop = true;
4927 gcc_assert (!slp_node);
4929 gcc_assert (!nested_in_vect_loop || double_reduc);
4931 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4932 gcc_assert (vectype);
4933 mode = TYPE_MODE (vectype);
4935 tree initial_def = NULL;
4936 tree induc_val = NULL_TREE;
4937 tree adjustment_def = NULL;
4938 if (slp_node)
4940 else
4942 /* Get at the scalar def before the loop, that defines the initial value
4943 of the reduction variable. */
4944 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4945 loop_preheader_edge (loop));
4946 /* Optimize: for induction condition reduction, if we can't use zero
4947 for induc_val, use initial_def. */
4948 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4949 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4950 else if (double_reduc)
4952 else if (nested_in_vect_loop)
4954 else
4955 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4958 unsigned vec_num;
4959 int ncopies;
4960 if (slp_node)
4962 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4963 ncopies = 1;
4965 else
4967 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4968 vec_num = 1;
4969 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4972 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4973 which is updated with the current index of the loop for every match of
4974 the original loop's cond_expr (VEC_STMT). This results in a vector
4975 containing the last time the condition passed for that vector lane.
4976 The first match will be a 1 to allow 0 to be used for non-matching
4977 indexes. If there are no matches at all then the vector will be all
4978 zeroes.
4980 PR92772: This algorithm is broken for architectures that support
4981 masked vectors, but do not provide fold_extract_last. */
4982 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4984 auto_vec<std::pair<tree, bool>, 2> ccompares;
4985 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4986 cond_info = vect_stmt_to_vectorize (cond_info);
4987 while (cond_info != reduc_info)
4989 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4991 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4992 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4993 ccompares.safe_push
4994 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4995 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4997 cond_info
4998 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4999 1 + STMT_VINFO_REDUC_IDX
5000 (cond_info)));
5001 cond_info = vect_stmt_to_vectorize (cond_info);
5003 gcc_assert (ccompares.length () != 0);
5005 tree indx_before_incr, indx_after_incr;
5006 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5007 int scalar_precision
5008 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5009 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5010 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5011 (TYPE_MODE (vectype), cr_index_scalar_type,
5012 TYPE_VECTOR_SUBPARTS (vectype));
5014 /* First we create a simple vector induction variable which starts
5015 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5016 vector size (STEP). */
5018 /* Create a {1,2,3,...} vector. */
5019 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5021 /* Create a vector of the step value. */
5022 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5023 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5025 /* Create an induction variable. */
5026 gimple_stmt_iterator incr_gsi;
5027 bool insert_after;
5028 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5029 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5030 insert_after, &indx_before_incr, &indx_after_incr);
5032 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5033 filled with zeros (VEC_ZERO). */
5035 /* Create a vector of 0s. */
5036 tree zero = build_zero_cst (cr_index_scalar_type);
5037 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5039 /* Create a vector phi node. */
5040 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5041 new_phi = create_phi_node (new_phi_tree, loop->header);
5042 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5043 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5045 /* Now take the condition from the loops original cond_exprs
5046 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5047 every match uses values from the induction variable
5048 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5049 (NEW_PHI_TREE).
5050 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5051 the new cond_expr (INDEX_COND_EXPR). */
5052 gimple_seq stmts = NULL;
5053 for (int i = ccompares.length () - 1; i != -1; --i)
5055 tree ccompare = ccompares[i].first;
5056 if (ccompares[i].second)
5057 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5058 cr_index_vector_type,
5059 ccompare,
5060 indx_before_incr, new_phi_tree);
5061 else
5062 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5063 cr_index_vector_type,
5064 ccompare,
5065 new_phi_tree, indx_before_incr);
5067 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5069 /* Update the phi with the vec cond. */
5070 induction_index = new_phi_tree;
5071 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5072 loop_latch_edge (loop), UNKNOWN_LOCATION);
5075 /* 2. Create epilog code.
5076 The reduction epilog code operates across the elements of the vector
5077 of partial results computed by the vectorized loop.
5078 The reduction epilog code consists of:
5080 step 1: compute the scalar result in a vector (v_out2)
5081 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5082 step 3: adjust the scalar result (s_out3) if needed.
5084 Step 1 can be accomplished using one the following three schemes:
5085 (scheme 1) using reduc_fn, if available.
5086 (scheme 2) using whole-vector shifts, if available.
5087 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5088 combined.
5090 The overall epilog code looks like this:
5092 s_out0 = phi <s_loop> # original EXIT_PHI
5093 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5094 v_out2 = reduce <v_out1> # step 1
5095 s_out3 = extract_field <v_out2, 0> # step 2
5096 s_out4 = adjust_result <s_out3> # step 3
5098 (step 3 is optional, and steps 1 and 2 may be combined).
5099 Lastly, the uses of s_out0 are replaced by s_out4. */
5102 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5103 v_out1 = phi <VECT_DEF>
5104 Store them in NEW_PHIS. */
5105 if (double_reduc)
5106 loop = outer_loop;
5107 exit_bb = single_exit (loop)->dest;
5108 new_phis.create (slp_node ? vec_num : ncopies);
5109 for (unsigned i = 0; i < vec_num; i++)
5111 if (slp_node)
5112 def = vect_get_slp_vect_def (slp_node, i);
5113 else
5114 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5115 for (j = 0; j < ncopies; j++)
5117 tree new_def = copy_ssa_name (def);
5118 phi = create_phi_node (new_def, exit_bb);
5119 if (j == 0)
5120 new_phis.quick_push (phi);
5121 else
5123 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5124 new_phis.quick_push (phi);
5127 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5131 exit_gsi = gsi_after_labels (exit_bb);
5133 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5134 (i.e. when reduc_fn is not available) and in the final adjustment
5135 code (if needed). Also get the original scalar reduction variable as
5136 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5137 represents a reduction pattern), the tree-code and scalar-def are
5138 taken from the original stmt that the pattern-stmt (STMT) replaces.
5139 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5140 are taken from STMT. */
5142 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5143 if (orig_stmt_info != stmt_info)
5145 /* Reduction pattern */
5146 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5147 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5150 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5151 scalar_type = TREE_TYPE (scalar_dest);
5152 scalar_results.create (group_size);
5153 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5154 bitsize = TYPE_SIZE (scalar_type);
5156 /* SLP reduction without reduction chain, e.g.,
5157 # a1 = phi <a2, a0>
5158 # b1 = phi <b2, b0>
5159 a2 = operation (a1)
5160 b2 = operation (b1) */
5161 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5163 /* True if we should implement SLP_REDUC using native reduction operations
5164 instead of scalar operations. */
5165 direct_slp_reduc = (reduc_fn != IFN_LAST
5166 && slp_reduc
5167 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5169 /* In case of reduction chain, e.g.,
5170 # a1 = phi <a3, a0>
5171 a2 = operation (a1)
5172 a3 = operation (a2),
5174 we may end up with more than one vector result. Here we reduce them to
5175 one vector. */
5176 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5178 gimple_seq stmts = NULL;
5179 tree first_vect = PHI_RESULT (new_phis[0]);
5180 first_vect = gimple_convert (&stmts, vectype, first_vect);
5181 for (k = 1; k < new_phis.length (); k++)
5183 gimple *next_phi = new_phis[k];
5184 tree second_vect = PHI_RESULT (next_phi);
5185 second_vect = gimple_convert (&stmts, vectype, second_vect);
5186 first_vect = gimple_build (&stmts, code, vectype,
5187 first_vect, second_vect);
5189 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5191 new_phi_result = first_vect;
5192 new_phis.truncate (0);
5193 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5195 /* Likewise if we couldn't use a single defuse cycle. */
5196 else if (ncopies > 1)
5198 gimple_seq stmts = NULL;
5199 tree first_vect = PHI_RESULT (new_phis[0]);
5200 first_vect = gimple_convert (&stmts, vectype, first_vect);
5201 for (int k = 1; k < ncopies; ++k)
5203 tree second_vect = PHI_RESULT (new_phis[k]);
5204 second_vect = gimple_convert (&stmts, vectype, second_vect);
5205 first_vect = gimple_build (&stmts, code, vectype,
5206 first_vect, second_vect);
5208 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5209 new_phi_result = first_vect;
5210 new_phis.truncate (0);
5211 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5213 else
5214 new_phi_result = PHI_RESULT (new_phis[0]);
5216 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5217 && reduc_fn != IFN_LAST)
5219 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5220 various data values where the condition matched and another vector
5221 (INDUCTION_INDEX) containing all the indexes of those matches. We
5222 need to extract the last matching index (which will be the index with
5223 highest value) and use this to index into the data vector.
5224 For the case where there were no matches, the data vector will contain
5225 all default values and the index vector will be all zeros. */
5227 /* Get various versions of the type of the vector of indexes. */
5228 tree index_vec_type = TREE_TYPE (induction_index);
5229 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5230 tree index_scalar_type = TREE_TYPE (index_vec_type);
5231 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5233 /* Get an unsigned integer version of the type of the data vector. */
5234 int scalar_precision
5235 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5236 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5237 tree vectype_unsigned = build_vector_type
5238 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5240 /* First we need to create a vector (ZERO_VEC) of zeros and another
5241 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5242 can create using a MAX reduction and then expanding.
5243 In the case where the loop never made any matches, the max index will
5244 be zero. */
5246 /* Vector of {0, 0, 0,...}. */
5247 tree zero_vec = build_zero_cst (vectype);
5249 gimple_seq stmts = NULL;
5250 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5251 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5253 /* Find maximum value from the vector of found indexes. */
5254 tree max_index = make_ssa_name (index_scalar_type);
5255 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5256 1, induction_index);
5257 gimple_call_set_lhs (max_index_stmt, max_index);
5258 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5260 /* Vector of {max_index, max_index, max_index,...}. */
5261 tree max_index_vec = make_ssa_name (index_vec_type);
5262 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5263 max_index);
5264 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5265 max_index_vec_rhs);
5266 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5268 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5269 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5270 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5271 otherwise. Only one value should match, resulting in a vector
5272 (VEC_COND) with one data value and the rest zeros.
5273 In the case where the loop never made any matches, every index will
5274 match, resulting in a vector with all data values (which will all be
5275 the default value). */
5277 /* Compare the max index vector to the vector of found indexes to find
5278 the position of the max value. */
5279 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5280 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5281 induction_index,
5282 max_index_vec);
5283 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5285 /* Use the compare to choose either values from the data vector or
5286 zero. */
5287 tree vec_cond = make_ssa_name (vectype);
5288 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5289 vec_compare, new_phi_result,
5290 zero_vec);
5291 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5293 /* Finally we need to extract the data value from the vector (VEC_COND)
5294 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
5295 reduction, but because this doesn't exist, we can use a MAX reduction
5296 instead. The data value might be signed or a float so we need to cast
5297 it first.
5298 In the case where the loop never made any matches, the data values are
5299 all identical, and so will reduce down correctly. */
5301 /* Make the matched data values unsigned. */
5302 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5303 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5304 vec_cond);
5305 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5306 VIEW_CONVERT_EXPR,
5307 vec_cond_cast_rhs);
5308 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5310 /* Reduce down to a scalar value. */
5311 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5312 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5313 1, vec_cond_cast);
5314 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5315 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5317 /* Convert the reduced value back to the result type and set as the
5318 result. */
5319 stmts = NULL;
5320 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5321 data_reduc);
5322 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5323 scalar_results.safe_push (new_temp);
5325 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5326 && reduc_fn == IFN_LAST)
5328 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5329 idx = 0;
5330 idx_val = induction_index[0];
5331 val = data_reduc[0];
5332 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5333 if (induction_index[i] > idx_val)
5334 val = data_reduc[i], idx_val = induction_index[i];
5335 return val; */
5337 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5338 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5339 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5340 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5341 /* Enforced by vectorizable_reduction, which ensures we have target
5342 support before allowing a conditional reduction on variable-length
5343 vectors. */
5344 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5345 tree idx_val = NULL_TREE, val = NULL_TREE;
5346 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5348 tree old_idx_val = idx_val;
5349 tree old_val = val;
5350 idx_val = make_ssa_name (idx_eltype);
5351 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5352 build3 (BIT_FIELD_REF, idx_eltype,
5353 induction_index,
5354 bitsize_int (el_size),
5355 bitsize_int (off)));
5356 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5357 val = make_ssa_name (data_eltype);
5358 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5359 build3 (BIT_FIELD_REF,
5360 data_eltype,
5361 new_phi_result,
5362 bitsize_int (el_size),
5363 bitsize_int (off)));
5364 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365 if (off != 0)
5367 tree new_idx_val = idx_val;
5368 if (off != v_size - el_size)
5370 new_idx_val = make_ssa_name (idx_eltype);
5371 epilog_stmt = gimple_build_assign (new_idx_val,
5372 MAX_EXPR, idx_val,
5373 old_idx_val);
5374 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5376 tree new_val = make_ssa_name (data_eltype);
5377 epilog_stmt = gimple_build_assign (new_val,
5378 COND_EXPR,
5379 build2 (GT_EXPR,
5380 boolean_type_node,
5381 idx_val,
5382 old_idx_val),
5383 val, old_val);
5384 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385 idx_val = new_idx_val;
5386 val = new_val;
5389 /* Convert the reduced value back to the result type and set as the
5390 result. */
5391 gimple_seq stmts = NULL;
5392 val = gimple_convert (&stmts, scalar_type, val);
5393 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5394 scalar_results.safe_push (val);
5397 /* 2.3 Create the reduction code, using one of the three schemes described
5398 above. In SLP we simply need to extract all the elements from the
5399 vector (without reducing them), so we use scalar shifts. */
5400 else if (reduc_fn != IFN_LAST && !slp_reduc)
5402 tree tmp;
5403 tree vec_elem_type;
5405 /* Case 1: Create:
5406 v_out2 = reduc_expr <v_out1> */
5408 if (dump_enabled_p ())
5409 dump_printf_loc (MSG_NOTE, vect_location,
5410 "Reduce using direct vector reduction.\n");
5412 gimple_seq stmts = NULL;
5413 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5414 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5415 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5416 vec_elem_type, new_phi_result);
5417 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5418 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5420 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5421 && induc_val)
5423 /* Earlier we set the initial value to be a vector if induc_val
5424 values. Check the result and if it is induc_val then replace
5425 with the original initial value, unless induc_val is
5426 the same as initial_def already. */
5427 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5428 induc_val);
5430 tmp = make_ssa_name (new_scalar_dest);
5431 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5432 initial_def, new_temp);
5433 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5434 new_temp = tmp;
5437 scalar_results.safe_push (new_temp);
5439 else if (direct_slp_reduc)
5441 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5442 with the elements for other SLP statements replaced with the
5443 neutral value. We can then do a normal reduction on each vector. */
5445 /* Enforced by vectorizable_reduction. */
5446 gcc_assert (new_phis.length () == 1);
5447 gcc_assert (pow2p_hwi (group_size));
5449 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5450 vec<stmt_vec_info> orig_phis
5451 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5452 gimple_seq seq = NULL;
5454 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5455 and the same element size as VECTYPE. */
5456 tree index = build_index_vector (vectype, 0, 1);
5457 tree index_type = TREE_TYPE (index);
5458 tree index_elt_type = TREE_TYPE (index_type);
5459 tree mask_type = truth_type_for (index_type);
5461 /* Create a vector that, for each element, identifies which of
5462 the REDUC_GROUP_SIZE results should use it. */
5463 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5464 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5465 build_vector_from_val (index_type, index_mask));
5467 /* Get a neutral vector value. This is simply a splat of the neutral
5468 scalar value if we have one, otherwise the initial scalar value
5469 is itself a neutral value. */
5470 tree vector_identity = NULL_TREE;
5471 tree neutral_op = NULL_TREE;
5472 if (slp_node)
5474 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5475 neutral_op
5476 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5477 vectype, code, first != NULL);
5479 if (neutral_op)
5480 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5481 neutral_op);
5482 for (unsigned int i = 0; i < group_size; ++i)
5484 /* If there's no univeral neutral value, we can use the
5485 initial scalar value from the original PHI. This is used
5486 for MIN and MAX reduction, for example. */
5487 if (!neutral_op)
5489 tree scalar_value
5490 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5491 loop_preheader_edge (loop));
5492 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5493 scalar_value);
5494 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5495 scalar_value);
5498 /* Calculate the equivalent of:
5500 sel[j] = (index[j] == i);
5502 which selects the elements of NEW_PHI_RESULT that should
5503 be included in the result. */
5504 tree compare_val = build_int_cst (index_elt_type, i);
5505 compare_val = build_vector_from_val (index_type, compare_val);
5506 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5507 index, compare_val);
5509 /* Calculate the equivalent of:
5511 vec = seq ? new_phi_result : vector_identity;
5513 VEC is now suitable for a full vector reduction. */
5514 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5515 sel, new_phi_result, vector_identity);
5517 /* Do the reduction and convert it to the appropriate type. */
5518 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5519 TREE_TYPE (vectype), vec);
5520 scalar = gimple_convert (&seq, scalar_type, scalar);
5521 scalar_results.safe_push (scalar);
5523 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5525 else
5527 bool reduce_with_shift;
5528 tree vec_temp;
5530 gcc_assert (slp_reduc || new_phis.length () == 1);
5532 /* See if the target wants to do the final (shift) reduction
5533 in a vector mode of smaller size and first reduce upper/lower
5534 halves against each other. */
5535 enum machine_mode mode1 = mode;
5536 tree stype = TREE_TYPE (vectype);
5537 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5538 unsigned nunits1 = nunits;
5539 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5540 && new_phis.length () == 1)
5542 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5543 /* For SLP reductions we have to make sure lanes match up, but
5544 since we're doing individual element final reduction reducing
5545 vector width here is even more important.
5546 ??? We can also separate lanes with permutes, for the common
5547 case of power-of-two group-size odd/even extracts would work. */
5548 if (slp_reduc && nunits != nunits1)
5550 nunits1 = least_common_multiple (nunits1, group_size);
5551 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5554 if (!slp_reduc
5555 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5556 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5558 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5559 stype, nunits1);
5560 reduce_with_shift = have_whole_vector_shift (mode1);
5561 if (!VECTOR_MODE_P (mode1))
5562 reduce_with_shift = false;
5563 else
5565 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5566 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5567 reduce_with_shift = false;
5570 /* First reduce the vector to the desired vector size we should
5571 do shift reduction on by combining upper and lower halves. */
5572 new_temp = new_phi_result;
5573 while (nunits > nunits1)
5575 nunits /= 2;
5576 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5577 stype, nunits);
5578 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5580 /* The target has to make sure we support lowpart/highpart
5581 extraction, either via direct vector extract or through
5582 an integer mode punning. */
5583 tree dst1, dst2;
5584 if (convert_optab_handler (vec_extract_optab,
5585 TYPE_MODE (TREE_TYPE (new_temp)),
5586 TYPE_MODE (vectype1))
5587 != CODE_FOR_nothing)
5589 /* Extract sub-vectors directly once vec_extract becomes
5590 a conversion optab. */
5591 dst1 = make_ssa_name (vectype1);
5592 epilog_stmt
5593 = gimple_build_assign (dst1, BIT_FIELD_REF,
5594 build3 (BIT_FIELD_REF, vectype1,
5595 new_temp, TYPE_SIZE (vectype1),
5596 bitsize_int (0)));
5597 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5598 dst2 = make_ssa_name (vectype1);
5599 epilog_stmt
5600 = gimple_build_assign (dst2, BIT_FIELD_REF,
5601 build3 (BIT_FIELD_REF, vectype1,
5602 new_temp, TYPE_SIZE (vectype1),
5603 bitsize_int (bitsize)));
5604 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5606 else
5608 /* Extract via punning to appropriately sized integer mode
5609 vector. */
5610 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5611 tree etype = build_vector_type (eltype, 2);
5612 gcc_assert (convert_optab_handler (vec_extract_optab,
5613 TYPE_MODE (etype),
5614 TYPE_MODE (eltype))
5615 != CODE_FOR_nothing);
5616 tree tem = make_ssa_name (etype);
5617 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5618 build1 (VIEW_CONVERT_EXPR,
5619 etype, new_temp));
5620 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5621 new_temp = tem;
5622 tem = make_ssa_name (eltype);
5623 epilog_stmt
5624 = gimple_build_assign (tem, BIT_FIELD_REF,
5625 build3 (BIT_FIELD_REF, eltype,
5626 new_temp, TYPE_SIZE (eltype),
5627 bitsize_int (0)));
5628 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5629 dst1 = make_ssa_name (vectype1);
5630 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5631 build1 (VIEW_CONVERT_EXPR,
5632 vectype1, tem));
5633 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5634 tem = make_ssa_name (eltype);
5635 epilog_stmt
5636 = gimple_build_assign (tem, BIT_FIELD_REF,
5637 build3 (BIT_FIELD_REF, eltype,
5638 new_temp, TYPE_SIZE (eltype),
5639 bitsize_int (bitsize)));
5640 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5641 dst2 = make_ssa_name (vectype1);
5642 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5643 build1 (VIEW_CONVERT_EXPR,
5644 vectype1, tem));
5645 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5648 new_temp = make_ssa_name (vectype1);
5649 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5650 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5651 new_phis[0] = epilog_stmt;
5654 if (reduce_with_shift && !slp_reduc)
5656 int element_bitsize = tree_to_uhwi (bitsize);
5657 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5658 for variable-length vectors and also requires direct target support
5659 for loop reductions. */
5660 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5661 int nelements = vec_size_in_bits / element_bitsize;
5662 vec_perm_builder sel;
5663 vec_perm_indices indices;
5665 int elt_offset;
5667 tree zero_vec = build_zero_cst (vectype1);
5668 /* Case 2: Create:
5669 for (offset = nelements/2; offset >= 1; offset/=2)
5671 Create: va' = vec_shift <va, offset>
5672 Create: va = vop <va, va'>
5673 } */
5675 tree rhs;
5677 if (dump_enabled_p ())
5678 dump_printf_loc (MSG_NOTE, vect_location,
5679 "Reduce using vector shifts\n");
5681 gimple_seq stmts = NULL;
5682 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5683 for (elt_offset = nelements / 2;
5684 elt_offset >= 1;
5685 elt_offset /= 2)
5687 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5688 indices.new_vector (sel, 2, nelements);
5689 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5690 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5691 new_temp, zero_vec, mask);
5692 new_temp = gimple_build (&stmts, code,
5693 vectype1, new_name, new_temp);
5695 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5697 /* 2.4 Extract the final scalar result. Create:
5698 s_out3 = extract_field <v_out2, bitpos> */
5700 if (dump_enabled_p ())
5701 dump_printf_loc (MSG_NOTE, vect_location,
5702 "extract scalar result\n");
5704 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5705 bitsize, bitsize_zero_node);
5706 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5707 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5708 gimple_assign_set_lhs (epilog_stmt, new_temp);
5709 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5710 scalar_results.safe_push (new_temp);
5712 else
5714 /* Case 3: Create:
5715 s = extract_field <v_out2, 0>
5716 for (offset = element_size;
5717 offset < vector_size;
5718 offset += element_size;)
5720 Create: s' = extract_field <v_out2, offset>
5721 Create: s = op <s, s'> // For non SLP cases
5722 } */
5724 if (dump_enabled_p ())
5725 dump_printf_loc (MSG_NOTE, vect_location,
5726 "Reduce using scalar code.\n");
5728 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5729 int element_bitsize = tree_to_uhwi (bitsize);
5730 tree compute_type = TREE_TYPE (vectype);
5731 gimple_seq stmts = NULL;
5732 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5734 int bit_offset;
5735 if (gimple_code (new_phi) == GIMPLE_PHI)
5736 vec_temp = PHI_RESULT (new_phi);
5737 else
5738 vec_temp = gimple_assign_lhs (new_phi);
5739 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5740 vec_temp, bitsize, bitsize_zero_node);
5742 /* In SLP we don't need to apply reduction operation, so we just
5743 collect s' values in SCALAR_RESULTS. */
5744 if (slp_reduc)
5745 scalar_results.safe_push (new_temp);
5747 for (bit_offset = element_bitsize;
5748 bit_offset < vec_size_in_bits;
5749 bit_offset += element_bitsize)
5751 tree bitpos = bitsize_int (bit_offset);
5752 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5753 compute_type, vec_temp,
5754 bitsize, bitpos);
5755 if (slp_reduc)
5757 /* In SLP we don't need to apply reduction operation, so
5758 we just collect s' values in SCALAR_RESULTS. */
5759 new_temp = new_name;
5760 scalar_results.safe_push (new_name);
5762 else
5763 new_temp = gimple_build (&stmts, code, compute_type,
5764 new_name, new_temp);
5768 /* The only case where we need to reduce scalar results in SLP, is
5769 unrolling. If the size of SCALAR_RESULTS is greater than
5770 REDUC_GROUP_SIZE, we reduce them combining elements modulo
5771 REDUC_GROUP_SIZE. */
5772 if (slp_reduc)
5774 tree res, first_res, new_res;
5776 /* Reduce multiple scalar results in case of SLP unrolling. */
5777 for (j = group_size; scalar_results.iterate (j, &res);
5778 j++)
5780 first_res = scalar_results[j % group_size];
5781 new_res = gimple_build (&stmts, code, compute_type,
5782 first_res, res);
5783 scalar_results[j % group_size] = new_res;
5785 for (k = 0; k < group_size; k++)
5786 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5787 scalar_results[k]);
5789 else
5791 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5792 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5793 scalar_results.safe_push (new_temp);
5796 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5799 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5800 && induc_val)
5802 /* Earlier we set the initial value to be a vector if induc_val
5803 values. Check the result and if it is induc_val then replace
5804 with the original initial value, unless induc_val is
5805 the same as initial_def already. */
5806 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5807 induc_val);
5809 tree tmp = make_ssa_name (new_scalar_dest);
5810 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5811 initial_def, new_temp);
5812 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5813 scalar_results[0] = tmp;
5817 /* 2.5 Adjust the final result by the initial value of the reduction
5818 variable. (When such adjustment is not needed, then
5819 'adjustment_def' is zero). For example, if code is PLUS we create:
5820 new_temp = loop_exit_def + adjustment_def */
5822 if (adjustment_def)
5824 gcc_assert (!slp_reduc);
5825 gimple_seq stmts = NULL;
5826 if (nested_in_vect_loop)
5828 new_phi = new_phis[0];
5829 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5830 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5831 new_temp = gimple_build (&stmts, code, vectype,
5832 PHI_RESULT (new_phi), adjustment_def);
5834 else
5836 new_temp = scalar_results[0];
5837 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5838 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5839 new_temp = gimple_build (&stmts, code, scalar_type,
5840 new_temp, adjustment_def);
5843 epilog_stmt = gimple_seq_last_stmt (stmts);
5844 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5845 if (nested_in_vect_loop)
5847 if (!double_reduc)
5848 scalar_results.quick_push (new_temp);
5849 else
5850 scalar_results[0] = new_temp;
5852 else
5853 scalar_results[0] = new_temp;
5855 new_phis[0] = epilog_stmt;
5858 if (double_reduc)
5859 loop = loop->inner;
5861 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5862 phis with new adjusted scalar results, i.e., replace use <s_out0>
5863 with use <s_out4>.
5865 Transform:
5866 loop_exit:
5867 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5868 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5869 v_out2 = reduce <v_out1>
5870 s_out3 = extract_field <v_out2, 0>
5871 s_out4 = adjust_result <s_out3>
5872 use <s_out0>
5873 use <s_out0>
5875 into:
5877 loop_exit:
5878 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5879 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5880 v_out2 = reduce <v_out1>
5881 s_out3 = extract_field <v_out2, 0>
5882 s_out4 = adjust_result <s_out3>
5883 use <s_out4>
5884 use <s_out4> */
5887 /* In SLP reduction chain we reduce vector results into one vector if
5888 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5889 LHS of the last stmt in the reduction chain, since we are looking for
5890 the loop exit phi node. */
5891 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5893 stmt_vec_info dest_stmt_info
5894 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5895 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5896 group_size = 1;
5899 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5900 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5901 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5902 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5903 correspond to the first vector stmt, etc.
5904 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5905 if (group_size > new_phis.length ())
5906 gcc_assert (!(group_size % new_phis.length ()));
5908 for (k = 0; k < group_size; k++)
5910 if (slp_reduc)
5912 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5914 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5915 /* SLP statements can't participate in patterns. */
5916 gcc_assert (!orig_stmt_info);
5917 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5920 if (nested_in_vect_loop)
5922 if (double_reduc)
5923 loop = outer_loop;
5924 else
5925 gcc_unreachable ();
5928 phis.create (3);
5929 /* Find the loop-closed-use at the loop exit of the original scalar
5930 result. (The reduction result is expected to have two immediate uses,
5931 one at the latch block, and one at the loop exit). For double
5932 reductions we are looking for exit phis of the outer loop. */
5933 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5935 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5937 if (!is_gimple_debug (USE_STMT (use_p)))
5938 phis.safe_push (USE_STMT (use_p));
5940 else
5942 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5944 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5946 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5948 if (!flow_bb_inside_loop_p (loop,
5949 gimple_bb (USE_STMT (phi_use_p)))
5950 && !is_gimple_debug (USE_STMT (phi_use_p)))
5951 phis.safe_push (USE_STMT (phi_use_p));
5957 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5959 /* Replace the uses: */
5960 orig_name = PHI_RESULT (exit_phi);
5961 scalar_result = scalar_results[k];
5962 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5964 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5965 SET_USE (use_p, scalar_result);
5966 update_stmt (use_stmt);
5970 phis.release ();
5974 /* Return a vector of type VECTYPE that is equal to the vector select
5975 operation "MASK ? VEC : IDENTITY". Insert the select statements
5976 before GSI. */
5978 static tree
5979 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5980 tree vec, tree identity)
5982 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5983 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5984 mask, vec, identity);
5985 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5986 return cond;
5989 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5990 order, starting with LHS. Insert the extraction statements before GSI and
5991 associate the new scalar SSA names with variable SCALAR_DEST.
5992 Return the SSA name for the result. */
5994 static tree
5995 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5996 tree_code code, tree lhs, tree vector_rhs)
5998 tree vectype = TREE_TYPE (vector_rhs);
5999 tree scalar_type = TREE_TYPE (vectype);
6000 tree bitsize = TYPE_SIZE (scalar_type);
6001 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6002 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6004 for (unsigned HOST_WIDE_INT bit_offset = 0;
6005 bit_offset < vec_size_in_bits;
6006 bit_offset += element_bitsize)
6008 tree bitpos = bitsize_int (bit_offset);
6009 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6010 bitsize, bitpos);
6012 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6013 rhs = make_ssa_name (scalar_dest, stmt);
6014 gimple_assign_set_lhs (stmt, rhs);
6015 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6017 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6018 tree new_name = make_ssa_name (scalar_dest, stmt);
6019 gimple_assign_set_lhs (stmt, new_name);
6020 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6021 lhs = new_name;
6023 return lhs;
6026 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6027 type of the vector input. */
6029 static internal_fn
6030 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6032 internal_fn mask_reduc_fn;
6034 switch (reduc_fn)
6036 case IFN_FOLD_LEFT_PLUS:
6037 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6038 break;
6040 default:
6041 return IFN_LAST;
6044 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6045 OPTIMIZE_FOR_SPEED))
6046 return mask_reduc_fn;
6047 return IFN_LAST;
6050 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6051 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6052 statement. CODE is the operation performed by STMT_INFO and OPS are
6053 its scalar operands. REDUC_INDEX is the index of the operand in
6054 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6055 implements in-order reduction, or IFN_LAST if we should open-code it.
6056 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6057 that should be used to control the operation in a fully-masked loop. */
6059 static bool
6060 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6061 stmt_vec_info stmt_info,
6062 gimple_stmt_iterator *gsi,
6063 gimple **vec_stmt, slp_tree slp_node,
6064 gimple *reduc_def_stmt,
6065 tree_code code, internal_fn reduc_fn,
6066 tree ops[3], tree vectype_in,
6067 int reduc_index, vec_loop_masks *masks)
6069 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6070 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6071 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6073 int ncopies;
6074 if (slp_node)
6075 ncopies = 1;
6076 else
6077 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6079 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6080 gcc_assert (ncopies == 1);
6081 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6083 if (slp_node)
6084 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6085 TYPE_VECTOR_SUBPARTS (vectype_in)));
6087 tree op0 = ops[1 - reduc_index];
6089 int group_size = 1;
6090 stmt_vec_info scalar_dest_def_info;
6091 auto_vec<tree> vec_oprnds0;
6092 if (slp_node)
6094 auto_vec<vec<tree> > vec_defs (2);
6095 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6096 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6097 vec_defs[0].release ();
6098 vec_defs[1].release ();
6099 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6100 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6102 else
6104 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6105 op0, &vec_oprnds0);
6106 scalar_dest_def_info = stmt_info;
6109 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6110 tree scalar_type = TREE_TYPE (scalar_dest);
6111 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6113 int vec_num = vec_oprnds0.length ();
6114 gcc_assert (vec_num == 1 || slp_node);
6115 tree vec_elem_type = TREE_TYPE (vectype_out);
6116 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6118 tree vector_identity = NULL_TREE;
6119 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6120 vector_identity = build_zero_cst (vectype_out);
6122 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6123 int i;
6124 tree def0;
6125 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6127 gimple *new_stmt;
6128 tree mask = NULL_TREE;
6129 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6130 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6132 /* Handle MINUS by adding the negative. */
6133 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6135 tree negated = make_ssa_name (vectype_out);
6136 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6137 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6138 def0 = negated;
6141 if (mask && mask_reduc_fn == IFN_LAST)
6142 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6143 vector_identity);
6145 /* On the first iteration the input is simply the scalar phi
6146 result, and for subsequent iterations it is the output of
6147 the preceding operation. */
6148 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6150 if (mask && mask_reduc_fn != IFN_LAST)
6151 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6152 def0, mask);
6153 else
6154 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6155 def0);
6156 /* For chained SLP reductions the output of the previous reduction
6157 operation serves as the input of the next. For the final statement
6158 the output cannot be a temporary - we reuse the original
6159 scalar destination of the last statement. */
6160 if (i != vec_num - 1)
6162 gimple_set_lhs (new_stmt, scalar_dest_var);
6163 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6164 gimple_set_lhs (new_stmt, reduc_var);
6167 else
6169 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6170 reduc_var, def0);
6171 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6172 /* Remove the statement, so that we can use the same code paths
6173 as for statements that we've just created. */
6174 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6175 gsi_remove (&tmp_gsi, true);
6178 if (i == vec_num - 1)
6180 gimple_set_lhs (new_stmt, scalar_dest);
6181 vect_finish_replace_stmt (loop_vinfo,
6182 scalar_dest_def_info,
6183 new_stmt);
6185 else
6186 vect_finish_stmt_generation (loop_vinfo,
6187 scalar_dest_def_info,
6188 new_stmt, gsi);
6190 if (slp_node)
6191 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6192 else
6194 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6195 *vec_stmt = new_stmt;
6199 return true;
6202 /* Function is_nonwrapping_integer_induction.
6204 Check if STMT_VINO (which is part of loop LOOP) both increments and
6205 does not cause overflow. */
6207 static bool
6208 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6210 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6211 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6212 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6213 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6214 widest_int ni, max_loop_value, lhs_max;
6215 wi::overflow_type overflow = wi::OVF_NONE;
6217 /* Make sure the loop is integer based. */
6218 if (TREE_CODE (base) != INTEGER_CST
6219 || TREE_CODE (step) != INTEGER_CST)
6220 return false;
6222 /* Check that the max size of the loop will not wrap. */
6224 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6225 return true;
6227 if (! max_stmt_executions (loop, &ni))
6228 return false;
6230 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6231 &overflow);
6232 if (overflow)
6233 return false;
6235 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6236 TYPE_SIGN (lhs_type), &overflow);
6237 if (overflow)
6238 return false;
6240 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6241 <= TYPE_PRECISION (lhs_type));
6244 /* Check if masking can be supported by inserting a conditional expression.
6245 CODE is the code for the operation. COND_FN is the conditional internal
6246 function, if it exists. VECTYPE_IN is the type of the vector input. */
6247 static bool
6248 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6249 tree vectype_in)
6251 if (cond_fn != IFN_LAST
6252 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6253 OPTIMIZE_FOR_SPEED))
6254 return false;
6256 switch (code)
6258 case DOT_PROD_EXPR:
6259 case SAD_EXPR:
6260 return true;
6262 default:
6263 return false;
6267 /* Insert a conditional expression to enable masked vectorization. CODE is the
6268 code for the operation. VOP is the array of operands. MASK is the loop
6269 mask. GSI is a statement iterator used to place the new conditional
6270 expression. */
6271 static void
6272 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6273 gimple_stmt_iterator *gsi)
6275 switch (code)
6277 case DOT_PROD_EXPR:
6279 tree vectype = TREE_TYPE (vop[1]);
6280 tree zero = build_zero_cst (vectype);
6281 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6282 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6283 mask, vop[1], zero);
6284 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6285 vop[1] = masked_op1;
6286 break;
6289 case SAD_EXPR:
6291 tree vectype = TREE_TYPE (vop[1]);
6292 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6293 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6294 mask, vop[1], vop[0]);
6295 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6296 vop[1] = masked_op1;
6297 break;
6300 default:
6301 gcc_unreachable ();
6305 /* Function vectorizable_reduction.
6307 Check if STMT_INFO performs a reduction operation that can be vectorized.
6308 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6309 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6310 Return true if STMT_INFO is vectorizable in this way.
6312 This function also handles reduction idioms (patterns) that have been
6313 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6314 may be of this form:
6315 X = pattern_expr (arg0, arg1, ..., X)
6316 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6317 sequence that had been detected and replaced by the pattern-stmt
6318 (STMT_INFO).
6320 This function also handles reduction of condition expressions, for example:
6321 for (int i = 0; i < N; i++)
6322 if (a[i] < value)
6323 last = a[i];
6324 This is handled by vectorising the loop and creating an additional vector
6325 containing the loop indexes for which "a[i] < value" was true. In the
6326 function epilogue this is reduced to a single max value and then used to
6327 index into the vector of results.
6329 In some cases of reduction patterns, the type of the reduction variable X is
6330 different than the type of the other arguments of STMT_INFO.
6331 In such cases, the vectype that is used when transforming STMT_INFO into
6332 a vector stmt is different than the vectype that is used to determine the
6333 vectorization factor, because it consists of a different number of elements
6334 than the actual number of elements that are being operated upon in parallel.
6336 For example, consider an accumulation of shorts into an int accumulator.
6337 On some targets it's possible to vectorize this pattern operating on 8
6338 shorts at a time (hence, the vectype for purposes of determining the
6339 vectorization factor should be V8HI); on the other hand, the vectype that
6340 is used to create the vector form is actually V4SI (the type of the result).
6342 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6343 indicates what is the actual level of parallelism (V8HI in the example), so
6344 that the right vectorization factor would be derived. This vectype
6345 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6346 be used to create the vectorized stmt. The right vectype for the vectorized
6347 stmt is obtained from the type of the result X:
6348 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6350 This means that, contrary to "regular" reductions (or "regular" stmts in
6351 general), the following equation:
6352 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6353 does *NOT* necessarily hold for reduction patterns. */
6355 bool
6356 vectorizable_reduction (loop_vec_info loop_vinfo,
6357 stmt_vec_info stmt_info, slp_tree slp_node,
6358 slp_instance slp_node_instance,
6359 stmt_vector_for_cost *cost_vec)
6361 tree scalar_dest;
6362 tree vectype_in = NULL_TREE;
6363 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6364 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6365 stmt_vec_info cond_stmt_vinfo = NULL;
6366 tree scalar_type;
6367 int i;
6368 int ncopies;
6369 bool single_defuse_cycle = false;
6370 bool nested_cycle = false;
6371 bool double_reduc = false;
6372 int vec_num;
6373 tree tem;
6374 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6375 tree cond_reduc_val = NULL_TREE;
6377 /* Make sure it was already recognized as a reduction computation. */
6378 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6379 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6380 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6381 return false;
6383 /* The stmt we store reduction analysis meta on. */
6384 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6385 reduc_info->is_reduc_info = true;
6387 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6389 if (is_a <gphi *> (stmt_info->stmt))
6391 if (slp_node)
6393 /* We eventually need to set a vector type on invariant
6394 arguments. */
6395 unsigned j;
6396 slp_tree child;
6397 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6398 if (!vect_maybe_update_slp_op_vectype
6399 (child, SLP_TREE_VECTYPE (slp_node)))
6401 if (dump_enabled_p ())
6402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6403 "incompatible vector types for "
6404 "invariants\n");
6405 return false;
6408 /* Analysis for double-reduction is done on the outer
6409 loop PHI, nested cycles have no further restrictions. */
6410 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6412 else
6413 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6414 return true;
6417 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6418 stmt_vec_info phi_info = stmt_info;
6419 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6420 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6422 if (!is_a <gphi *> (stmt_info->stmt))
6424 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6425 return true;
6427 if (slp_node)
6429 slp_node_instance->reduc_phis = slp_node;
6430 /* ??? We're leaving slp_node to point to the PHIs, we only
6431 need it to get at the number of vector stmts which wasn't
6432 yet initialized for the instance root. */
6434 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6435 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6436 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6438 use_operand_p use_p;
6439 gimple *use_stmt;
6440 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6441 &use_p, &use_stmt);
6442 gcc_assert (res);
6443 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6444 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6448 /* PHIs should not participate in patterns. */
6449 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6450 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6452 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6453 and compute the reduction chain length. Discover the real
6454 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6455 tree reduc_def
6456 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6457 loop_latch_edge
6458 (gimple_bb (reduc_def_phi)->loop_father));
6459 unsigned reduc_chain_length = 0;
6460 bool only_slp_reduc_chain = true;
6461 stmt_info = NULL;
6462 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6463 while (reduc_def != PHI_RESULT (reduc_def_phi))
6465 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6466 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6467 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6469 if (dump_enabled_p ())
6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471 "reduction chain broken by patterns.\n");
6472 return false;
6474 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6475 only_slp_reduc_chain = false;
6476 /* ??? For epilogue generation live members of the chain need
6477 to point back to the PHI via their original stmt for
6478 info_for_reduction to work. */
6479 if (STMT_VINFO_LIVE_P (vdef))
6480 STMT_VINFO_REDUC_DEF (def) = phi_info;
6481 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6482 if (!assign)
6484 if (dump_enabled_p ())
6485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6486 "reduction chain includes calls.\n");
6487 return false;
6489 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6491 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6492 TREE_TYPE (gimple_assign_rhs1 (assign))))
6494 if (dump_enabled_p ())
6495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6496 "conversion in the reduction chain.\n");
6497 return false;
6500 else if (!stmt_info)
6501 /* First non-conversion stmt. */
6502 stmt_info = vdef;
6503 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6504 reduc_chain_length++;
6505 if (!stmt_info && slp_node)
6506 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6508 /* PHIs should not participate in patterns. */
6509 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6511 if (nested_in_vect_loop_p (loop, stmt_info))
6513 loop = loop->inner;
6514 nested_cycle = true;
6517 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6518 element. */
6519 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6521 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6522 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6524 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6525 gcc_assert (slp_node
6526 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6528 /* 1. Is vectorizable reduction? */
6529 /* Not supportable if the reduction variable is used in the loop, unless
6530 it's a reduction chain. */
6531 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6532 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6533 return false;
6535 /* Reductions that are not used even in an enclosing outer-loop,
6536 are expected to be "live" (used out of the loop). */
6537 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6538 && !STMT_VINFO_LIVE_P (stmt_info))
6539 return false;
6541 /* 2. Has this been recognized as a reduction pattern?
6543 Check if STMT represents a pattern that has been recognized
6544 in earlier analysis stages. For stmts that represent a pattern,
6545 the STMT_VINFO_RELATED_STMT field records the last stmt in
6546 the original sequence that constitutes the pattern. */
6548 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6549 if (orig_stmt_info)
6551 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6552 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6555 /* 3. Check the operands of the operation. The first operands are defined
6556 inside the loop body. The last operand is the reduction variable,
6557 which is defined by the loop-header-phi. */
6559 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6560 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6561 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6562 enum tree_code code = gimple_assign_rhs_code (stmt);
6563 bool lane_reduc_code_p
6564 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6565 int op_type = TREE_CODE_LENGTH (code);
6567 scalar_dest = gimple_assign_lhs (stmt);
6568 scalar_type = TREE_TYPE (scalar_dest);
6569 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6570 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6571 return false;
6573 /* Do not try to vectorize bit-precision reductions. */
6574 if (!type_has_mode_precision_p (scalar_type))
6575 return false;
6577 /* For lane-reducing ops we're reducing the number of reduction PHIs
6578 which means the only use of that may be in the lane-reducing operation. */
6579 if (lane_reduc_code_p
6580 && reduc_chain_length != 1
6581 && !only_slp_reduc_chain)
6583 if (dump_enabled_p ())
6584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6585 "lane-reducing reduction with extra stmts.\n");
6586 return false;
6589 /* All uses but the last are expected to be defined in the loop.
6590 The last use is the reduction variable. In case of nested cycle this
6591 assumption is not true: we use reduc_index to record the index of the
6592 reduction variable. */
6593 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6594 /* We need to skip an extra operand for COND_EXPRs with embedded
6595 comparison. */
6596 unsigned opno_adjust = 0;
6597 if (code == COND_EXPR
6598 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6599 opno_adjust = 1;
6600 for (i = 0; i < op_type; i++)
6602 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6603 if (i == 0 && code == COND_EXPR)
6604 continue;
6606 stmt_vec_info def_stmt_info;
6607 enum vect_def_type dt;
6608 tree op;
6609 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6610 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6611 &def_stmt_info))
6613 if (dump_enabled_p ())
6614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6615 "use not simple.\n");
6616 return false;
6618 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6619 continue;
6621 /* There should be only one cycle def in the stmt, the one
6622 leading to reduc_def. */
6623 if (VECTORIZABLE_CYCLE_DEF (dt))
6624 return false;
6626 /* To properly compute ncopies we are interested in the widest
6627 non-reduction input type in case we're looking at a widening
6628 accumulation that we later handle in vect_transform_reduction. */
6629 if (lane_reduc_code_p
6630 && tem
6631 && (!vectype_in
6632 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6633 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6634 vectype_in = tem;
6636 if (code == COND_EXPR)
6638 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6639 if (dt == vect_constant_def)
6641 cond_reduc_dt = dt;
6642 cond_reduc_val = op;
6644 if (dt == vect_induction_def
6645 && def_stmt_info
6646 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6648 cond_reduc_dt = dt;
6649 cond_stmt_vinfo = def_stmt_info;
6653 if (!vectype_in)
6654 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6655 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6657 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6658 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6659 /* If we have a condition reduction, see if we can simplify it further. */
6660 if (v_reduc_type == COND_REDUCTION)
6662 if (slp_node)
6663 return false;
6665 /* When the condition uses the reduction value in the condition, fail. */
6666 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6668 if (dump_enabled_p ())
6669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6670 "condition depends on previous iteration\n");
6671 return false;
6674 if (reduc_chain_length == 1
6675 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6676 vectype_in, OPTIMIZE_FOR_SPEED))
6678 if (dump_enabled_p ())
6679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680 "optimizing condition reduction with"
6681 " FOLD_EXTRACT_LAST.\n");
6682 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6684 else if (cond_reduc_dt == vect_induction_def)
6686 tree base
6687 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6688 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6690 gcc_assert (TREE_CODE (base) == INTEGER_CST
6691 && TREE_CODE (step) == INTEGER_CST);
6692 cond_reduc_val = NULL_TREE;
6693 enum tree_code cond_reduc_op_code = ERROR_MARK;
6694 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6695 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6697 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6698 above base; punt if base is the minimum value of the type for
6699 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6700 else if (tree_int_cst_sgn (step) == -1)
6702 cond_reduc_op_code = MIN_EXPR;
6703 if (tree_int_cst_sgn (base) == -1)
6704 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6705 else if (tree_int_cst_lt (base,
6706 TYPE_MAX_VALUE (TREE_TYPE (base))))
6707 cond_reduc_val
6708 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6710 else
6712 cond_reduc_op_code = MAX_EXPR;
6713 if (tree_int_cst_sgn (base) == 1)
6714 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6715 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6716 base))
6717 cond_reduc_val
6718 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6720 if (cond_reduc_val)
6722 if (dump_enabled_p ())
6723 dump_printf_loc (MSG_NOTE, vect_location,
6724 "condition expression based on "
6725 "integer induction.\n");
6726 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6727 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6728 = cond_reduc_val;
6729 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6732 else if (cond_reduc_dt == vect_constant_def)
6734 enum vect_def_type cond_initial_dt;
6735 tree cond_initial_val
6736 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6738 gcc_assert (cond_reduc_val != NULL_TREE);
6739 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6740 if (cond_initial_dt == vect_constant_def
6741 && types_compatible_p (TREE_TYPE (cond_initial_val),
6742 TREE_TYPE (cond_reduc_val)))
6744 tree e = fold_binary (LE_EXPR, boolean_type_node,
6745 cond_initial_val, cond_reduc_val);
6746 if (e && (integer_onep (e) || integer_zerop (e)))
6748 if (dump_enabled_p ())
6749 dump_printf_loc (MSG_NOTE, vect_location,
6750 "condition expression based on "
6751 "compile time constant.\n");
6752 /* Record reduction code at analysis stage. */
6753 STMT_VINFO_REDUC_CODE (reduc_info)
6754 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6755 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6761 if (STMT_VINFO_LIVE_P (phi_info))
6762 return false;
6764 if (slp_node)
6765 ncopies = 1;
6766 else
6767 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6769 gcc_assert (ncopies >= 1);
6771 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6773 if (nested_cycle)
6775 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6776 == vect_double_reduction_def);
6777 double_reduc = true;
6780 /* 4.2. Check support for the epilog operation.
6782 If STMT represents a reduction pattern, then the type of the
6783 reduction variable may be different than the type of the rest
6784 of the arguments. For example, consider the case of accumulation
6785 of shorts into an int accumulator; The original code:
6786 S1: int_a = (int) short_a;
6787 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6789 was replaced with:
6790 STMT: int_acc = widen_sum <short_a, int_acc>
6792 This means that:
6793 1. The tree-code that is used to create the vector operation in the
6794 epilog code (that reduces the partial results) is not the
6795 tree-code of STMT, but is rather the tree-code of the original
6796 stmt from the pattern that STMT is replacing. I.e, in the example
6797 above we want to use 'widen_sum' in the loop, but 'plus' in the
6798 epilog.
6799 2. The type (mode) we use to check available target support
6800 for the vector operation to be created in the *epilog*, is
6801 determined by the type of the reduction variable (in the example
6802 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6803 However the type (mode) we use to check available target support
6804 for the vector operation to be created *inside the loop*, is
6805 determined by the type of the other arguments to STMT (in the
6806 example we'd check this: optab_handler (widen_sum_optab,
6807 vect_short_mode)).
6809 This is contrary to "regular" reductions, in which the types of all
6810 the arguments are the same as the type of the reduction variable.
6811 For "regular" reductions we can therefore use the same vector type
6812 (and also the same tree-code) when generating the epilog code and
6813 when generating the code inside the loop. */
6815 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6816 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6818 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6819 if (reduction_type == TREE_CODE_REDUCTION)
6821 /* Check whether it's ok to change the order of the computation.
6822 Generally, when vectorizing a reduction we change the order of the
6823 computation. This may change the behavior of the program in some
6824 cases, so we need to check that this is ok. One exception is when
6825 vectorizing an outer-loop: the inner-loop is executed sequentially,
6826 and therefore vectorizing reductions in the inner-loop during
6827 outer-loop vectorization is safe. */
6828 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6830 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6831 is not directy used in stmt. */
6832 if (!only_slp_reduc_chain
6833 && reduc_chain_length != 1)
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "in-order reduction chain without SLP.\n");
6838 return false;
6840 STMT_VINFO_REDUC_TYPE (reduc_info)
6841 = reduction_type = FOLD_LEFT_REDUCTION;
6843 else if (!commutative_tree_code (orig_code)
6844 || !associative_tree_code (orig_code))
6846 if (dump_enabled_p ())
6847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6848 "reduction: not commutative/associative");
6849 return false;
6853 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6854 && ncopies > 1)
6856 if (dump_enabled_p ())
6857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6858 "multiple types in double reduction or condition "
6859 "reduction or fold-left reduction.\n");
6860 return false;
6863 internal_fn reduc_fn = IFN_LAST;
6864 if (reduction_type == TREE_CODE_REDUCTION
6865 || reduction_type == FOLD_LEFT_REDUCTION
6866 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6867 || reduction_type == CONST_COND_REDUCTION)
6869 if (reduction_type == FOLD_LEFT_REDUCTION
6870 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6871 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6873 if (reduc_fn != IFN_LAST
6874 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6875 OPTIMIZE_FOR_SPEED))
6877 if (dump_enabled_p ())
6878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6879 "reduc op not supported by target.\n");
6881 reduc_fn = IFN_LAST;
6884 else
6886 if (!nested_cycle || double_reduc)
6888 if (dump_enabled_p ())
6889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6890 "no reduc code for scalar code.\n");
6892 return false;
6896 else if (reduction_type == COND_REDUCTION)
6898 int scalar_precision
6899 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6900 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6901 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6902 nunits_out);
6904 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6905 OPTIMIZE_FOR_SPEED))
6906 reduc_fn = IFN_REDUC_MAX;
6908 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6910 if (reduction_type != EXTRACT_LAST_REDUCTION
6911 && (!nested_cycle || double_reduc)
6912 && reduc_fn == IFN_LAST
6913 && !nunits_out.is_constant ())
6915 if (dump_enabled_p ())
6916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6917 "missing target support for reduction on"
6918 " variable-length vectors.\n");
6919 return false;
6922 /* For SLP reductions, see if there is a neutral value we can use. */
6923 tree neutral_op = NULL_TREE;
6924 if (slp_node)
6925 neutral_op = neutral_op_for_slp_reduction
6926 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6927 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6929 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6931 /* We can't support in-order reductions of code such as this:
6933 for (int i = 0; i < n1; ++i)
6934 for (int j = 0; j < n2; ++j)
6935 l += a[j];
6937 since GCC effectively transforms the loop when vectorizing:
6939 for (int i = 0; i < n1 / VF; ++i)
6940 for (int j = 0; j < n2; ++j)
6941 for (int k = 0; k < VF; ++k)
6942 l += a[j];
6944 which is a reassociation of the original operation. */
6945 if (dump_enabled_p ())
6946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6947 "in-order double reduction not supported.\n");
6949 return false;
6952 if (reduction_type == FOLD_LEFT_REDUCTION
6953 && slp_node
6954 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6956 /* We cannot use in-order reductions in this case because there is
6957 an implicit reassociation of the operations involved. */
6958 if (dump_enabled_p ())
6959 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6960 "in-order unchained SLP reductions not supported.\n");
6961 return false;
6964 /* For double reductions, and for SLP reductions with a neutral value,
6965 we construct a variable-length initial vector by loading a vector
6966 full of the neutral value and then shift-and-inserting the start
6967 values into the low-numbered elements. */
6968 if ((double_reduc || neutral_op)
6969 && !nunits_out.is_constant ()
6970 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6971 vectype_out, OPTIMIZE_FOR_SPEED))
6973 if (dump_enabled_p ())
6974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6975 "reduction on variable-length vectors requires"
6976 " target support for a vector-shift-and-insert"
6977 " operation.\n");
6978 return false;
6981 /* Check extra constraints for variable-length unchained SLP reductions. */
6982 if (STMT_SLP_TYPE (stmt_info)
6983 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6984 && !nunits_out.is_constant ())
6986 /* We checked above that we could build the initial vector when
6987 there's a neutral element value. Check here for the case in
6988 which each SLP statement has its own initial value and in which
6989 that value needs to be repeated for every instance of the
6990 statement within the initial vector. */
6991 unsigned int group_size = SLP_TREE_LANES (slp_node);
6992 if (!neutral_op
6993 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6994 TREE_TYPE (vectype_out)))
6996 if (dump_enabled_p ())
6997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6998 "unsupported form of SLP reduction for"
6999 " variable-length vectors: cannot build"
7000 " initial vector.\n");
7001 return false;
7003 /* The epilogue code relies on the number of elements being a multiple
7004 of the group size. The duplicate-and-interleave approach to setting
7005 up the initial vector does too. */
7006 if (!multiple_p (nunits_out, group_size))
7008 if (dump_enabled_p ())
7009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7010 "unsupported form of SLP reduction for"
7011 " variable-length vectors: the vector size"
7012 " is not a multiple of the number of results.\n");
7013 return false;
7017 if (reduction_type == COND_REDUCTION)
7019 widest_int ni;
7021 if (! max_loop_iterations (loop, &ni))
7023 if (dump_enabled_p ())
7024 dump_printf_loc (MSG_NOTE, vect_location,
7025 "loop count not known, cannot create cond "
7026 "reduction.\n");
7027 return false;
7029 /* Convert backedges to iterations. */
7030 ni += 1;
7032 /* The additional index will be the same type as the condition. Check
7033 that the loop can fit into this less one (because we'll use up the
7034 zero slot for when there are no matches). */
7035 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7036 if (wi::geu_p (ni, wi::to_widest (max_index)))
7038 if (dump_enabled_p ())
7039 dump_printf_loc (MSG_NOTE, vect_location,
7040 "loop size is greater than data size.\n");
7041 return false;
7045 /* In case the vectorization factor (VF) is bigger than the number
7046 of elements that we can fit in a vectype (nunits), we have to generate
7047 more than one vector stmt - i.e - we need to "unroll" the
7048 vector stmt by a factor VF/nunits. For more details see documentation
7049 in vectorizable_operation. */
7051 /* If the reduction is used in an outer loop we need to generate
7052 VF intermediate results, like so (e.g. for ncopies=2):
7053 r0 = phi (init, r0)
7054 r1 = phi (init, r1)
7055 r0 = x0 + r0;
7056 r1 = x1 + r1;
7057 (i.e. we generate VF results in 2 registers).
7058 In this case we have a separate def-use cycle for each copy, and therefore
7059 for each copy we get the vector def for the reduction variable from the
7060 respective phi node created for this copy.
7062 Otherwise (the reduction is unused in the loop nest), we can combine
7063 together intermediate results, like so (e.g. for ncopies=2):
7064 r = phi (init, r)
7065 r = x0 + r;
7066 r = x1 + r;
7067 (i.e. we generate VF/2 results in a single register).
7068 In this case for each copy we get the vector def for the reduction variable
7069 from the vectorized reduction operation generated in the previous iteration.
7071 This only works when we see both the reduction PHI and its only consumer
7072 in vectorizable_reduction and there are no intermediate stmts
7073 participating. */
7074 if (ncopies > 1
7075 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7076 && reduc_chain_length == 1)
7077 single_defuse_cycle = true;
7079 if (single_defuse_cycle || lane_reduc_code_p)
7081 gcc_assert (code != COND_EXPR);
7083 /* 4. Supportable by target? */
7084 bool ok = true;
7086 /* 4.1. check support for the operation in the loop */
7087 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7088 if (!optab)
7090 if (dump_enabled_p ())
7091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7092 "no optab.\n");
7093 ok = false;
7096 machine_mode vec_mode = TYPE_MODE (vectype_in);
7097 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7099 if (dump_enabled_p ())
7100 dump_printf (MSG_NOTE, "op not supported by target.\n");
7101 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7102 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7103 ok = false;
7104 else
7105 if (dump_enabled_p ())
7106 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7109 /* Worthwhile without SIMD support? */
7110 if (ok
7111 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7112 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7114 if (dump_enabled_p ())
7115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7116 "not worthwhile without SIMD support.\n");
7117 ok = false;
7120 /* lane-reducing operations have to go through vect_transform_reduction.
7121 For the other cases try without the single cycle optimization. */
7122 if (!ok)
7124 if (lane_reduc_code_p)
7125 return false;
7126 else
7127 single_defuse_cycle = false;
7130 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7132 /* If the reduction stmt is one of the patterns that have lane
7133 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7134 if ((ncopies > 1 && ! single_defuse_cycle)
7135 && lane_reduc_code_p)
7137 if (dump_enabled_p ())
7138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7139 "multi def-use cycle not possible for lane-reducing "
7140 "reduction operation\n");
7141 return false;
7144 if (slp_node
7145 && !(!single_defuse_cycle
7146 && code != DOT_PROD_EXPR
7147 && code != WIDEN_SUM_EXPR
7148 && code != SAD_EXPR
7149 && reduction_type != FOLD_LEFT_REDUCTION))
7150 for (i = 0; i < op_type; i++)
7151 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7153 if (dump_enabled_p ())
7154 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7155 "incompatible vector types for invariants\n");
7156 return false;
7159 if (slp_node)
7160 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7161 else
7162 vec_num = 1;
7164 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7165 reduction_type, ncopies, cost_vec);
7166 if (dump_enabled_p ()
7167 && reduction_type == FOLD_LEFT_REDUCTION)
7168 dump_printf_loc (MSG_NOTE, vect_location,
7169 "using an in-order (fold-left) reduction.\n");
7170 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7171 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7172 reductions go through their own vectorizable_* routines. */
7173 if (!single_defuse_cycle
7174 && code != DOT_PROD_EXPR
7175 && code != WIDEN_SUM_EXPR
7176 && code != SAD_EXPR
7177 && reduction_type != FOLD_LEFT_REDUCTION)
7179 stmt_vec_info tem
7180 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7181 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7183 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7184 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7186 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7187 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7189 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7191 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7192 internal_fn cond_fn = get_conditional_internal_fn (code);
7194 if (reduction_type != FOLD_LEFT_REDUCTION
7195 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7196 && (cond_fn == IFN_LAST
7197 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7198 OPTIMIZE_FOR_SPEED)))
7200 if (dump_enabled_p ())
7201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7202 "can't operate on partial vectors because"
7203 " no conditional operation is available.\n");
7204 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7206 else if (reduction_type == FOLD_LEFT_REDUCTION
7207 && reduc_fn == IFN_LAST
7208 && !expand_vec_cond_expr_p (vectype_in,
7209 truth_type_for (vectype_in),
7210 SSA_NAME))
7212 if (dump_enabled_p ())
7213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7214 "can't operate on partial vectors because"
7215 " no conditional operation is available.\n");
7216 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7218 else
7219 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7220 vectype_in, NULL);
7222 return true;
7225 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7226 value. */
7228 bool
7229 vect_transform_reduction (loop_vec_info loop_vinfo,
7230 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7231 gimple **vec_stmt, slp_tree slp_node)
7233 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7234 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7235 int i;
7236 int ncopies;
7237 int vec_num;
7239 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7240 gcc_assert (reduc_info->is_reduc_info);
7242 if (nested_in_vect_loop_p (loop, stmt_info))
7244 loop = loop->inner;
7245 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7248 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7249 enum tree_code code = gimple_assign_rhs_code (stmt);
7250 int op_type = TREE_CODE_LENGTH (code);
7252 /* Flatten RHS. */
7253 tree ops[3];
7254 switch (get_gimple_rhs_class (code))
7256 case GIMPLE_TERNARY_RHS:
7257 ops[2] = gimple_assign_rhs3 (stmt);
7258 /* Fall thru. */
7259 case GIMPLE_BINARY_RHS:
7260 ops[0] = gimple_assign_rhs1 (stmt);
7261 ops[1] = gimple_assign_rhs2 (stmt);
7262 break;
7263 default:
7264 gcc_unreachable ();
7267 /* All uses but the last are expected to be defined in the loop.
7268 The last use is the reduction variable. In case of nested cycle this
7269 assumption is not true: we use reduc_index to record the index of the
7270 reduction variable. */
7271 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7272 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7273 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7274 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7276 if (slp_node)
7278 ncopies = 1;
7279 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7281 else
7283 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7284 vec_num = 1;
7287 internal_fn cond_fn = get_conditional_internal_fn (code);
7288 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7289 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7291 /* Transform. */
7292 tree new_temp = NULL_TREE;
7293 auto_vec<tree> vec_oprnds0;
7294 auto_vec<tree> vec_oprnds1;
7295 auto_vec<tree> vec_oprnds2;
7296 tree def0;
7298 if (dump_enabled_p ())
7299 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7301 /* FORNOW: Multiple types are not supported for condition. */
7302 if (code == COND_EXPR)
7303 gcc_assert (ncopies == 1);
7305 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7307 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7308 if (reduction_type == FOLD_LEFT_REDUCTION)
7310 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7311 return vectorize_fold_left_reduction
7312 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7313 reduc_fn, ops, vectype_in, reduc_index, masks);
7316 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7317 gcc_assert (single_defuse_cycle
7318 || code == DOT_PROD_EXPR
7319 || code == WIDEN_SUM_EXPR
7320 || code == SAD_EXPR);
7322 /* Create the destination vector */
7323 tree scalar_dest = gimple_assign_lhs (stmt);
7324 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7326 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7327 single_defuse_cycle && reduc_index == 0
7328 ? NULL_TREE : ops[0], &vec_oprnds0,
7329 single_defuse_cycle && reduc_index == 1
7330 ? NULL_TREE : ops[1], &vec_oprnds1,
7331 op_type == ternary_op
7332 && !(single_defuse_cycle && reduc_index == 2)
7333 ? ops[2] : NULL_TREE, &vec_oprnds2);
7334 if (single_defuse_cycle)
7336 gcc_assert (!slp_node);
7337 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7338 ops[reduc_index],
7339 reduc_index == 0 ? &vec_oprnds0
7340 : (reduc_index == 1 ? &vec_oprnds1
7341 : &vec_oprnds2));
7344 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7346 gimple *new_stmt;
7347 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7348 if (masked_loop_p && !mask_by_cond_expr)
7350 /* Make sure that the reduction accumulator is vop[0]. */
7351 if (reduc_index == 1)
7353 gcc_assert (commutative_tree_code (code));
7354 std::swap (vop[0], vop[1]);
7356 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7357 vectype_in, i);
7358 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7359 vop[0], vop[1], vop[0]);
7360 new_temp = make_ssa_name (vec_dest, call);
7361 gimple_call_set_lhs (call, new_temp);
7362 gimple_call_set_nothrow (call, true);
7363 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7364 new_stmt = call;
7366 else
7368 if (op_type == ternary_op)
7369 vop[2] = vec_oprnds2[i];
7371 if (masked_loop_p && mask_by_cond_expr)
7373 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7374 vectype_in, i);
7375 build_vect_cond_expr (code, vop, mask, gsi);
7378 new_stmt = gimple_build_assign (vec_dest, code,
7379 vop[0], vop[1], vop[2]);
7380 new_temp = make_ssa_name (vec_dest, new_stmt);
7381 gimple_assign_set_lhs (new_stmt, new_temp);
7382 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7385 if (slp_node)
7386 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7387 else if (single_defuse_cycle
7388 && i < ncopies - 1)
7390 if (reduc_index == 0)
7391 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7392 else if (reduc_index == 1)
7393 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7394 else if (reduc_index == 2)
7395 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7397 else
7398 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7401 if (!slp_node)
7402 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7404 return true;
7407 /* Transform phase of a cycle PHI. */
7409 bool
7410 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7411 stmt_vec_info stmt_info, gimple **vec_stmt,
7412 slp_tree slp_node, slp_instance slp_node_instance)
7414 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7415 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7416 int i;
7417 int ncopies;
7418 int j;
7419 bool nested_cycle = false;
7420 int vec_num;
7422 if (nested_in_vect_loop_p (loop, stmt_info))
7424 loop = loop->inner;
7425 nested_cycle = true;
7428 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7429 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7430 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7431 gcc_assert (reduc_info->is_reduc_info);
7433 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7434 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7435 /* Leave the scalar phi in place. */
7436 return true;
7438 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7439 /* For a nested cycle we do not fill the above. */
7440 if (!vectype_in)
7441 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7442 gcc_assert (vectype_in);
7444 if (slp_node)
7446 /* The size vect_schedule_slp_instance computes is off for us. */
7447 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7448 * SLP_TREE_LANES (slp_node), vectype_in);
7449 ncopies = 1;
7451 else
7453 vec_num = 1;
7454 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7457 /* Check whether we should use a single PHI node and accumulate
7458 vectors to one before the backedge. */
7459 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7460 ncopies = 1;
7462 /* Create the destination vector */
7463 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7464 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7465 vectype_out);
7467 /* Get the loop-entry arguments. */
7468 tree vec_initial_def;
7469 auto_vec<tree> vec_initial_defs;
7470 if (slp_node)
7472 vec_initial_defs.reserve (vec_num);
7473 if (nested_cycle)
7475 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7476 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7477 &vec_initial_defs);
7479 else
7481 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7482 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7483 tree neutral_op
7484 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7485 STMT_VINFO_REDUC_CODE (reduc_info),
7486 first != NULL);
7487 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7488 &vec_initial_defs, vec_num,
7489 first != NULL, neutral_op);
7492 else
7494 /* Get at the scalar def before the loop, that defines the initial
7495 value of the reduction variable. */
7496 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7497 loop_preheader_edge (loop));
7498 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7499 and we can't use zero for induc_val, use initial_def. Similarly
7500 for REDUC_MIN and initial_def larger than the base. */
7501 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7503 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7504 if (TREE_CODE (initial_def) == INTEGER_CST
7505 && !integer_zerop (induc_val)
7506 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7507 && tree_int_cst_lt (initial_def, induc_val))
7508 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7509 && tree_int_cst_lt (induc_val, initial_def))))
7511 induc_val = initial_def;
7512 /* Communicate we used the initial_def to epilouge
7513 generation. */
7514 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7516 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7517 vec_initial_defs.create (ncopies);
7518 for (i = 0; i < ncopies; ++i)
7519 vec_initial_defs.quick_push (vec_initial_def);
7521 else if (nested_cycle)
7523 /* Do not use an adjustment def as that case is not supported
7524 correctly if ncopies is not one. */
7525 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7526 ncopies, initial_def,
7527 &vec_initial_defs);
7529 else
7531 tree adjustment_def = NULL_TREE;
7532 tree *adjustment_defp = &adjustment_def;
7533 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7534 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7535 adjustment_defp = NULL;
7536 vec_initial_def
7537 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7538 initial_def, adjustment_defp);
7539 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7540 vec_initial_defs.create (ncopies);
7541 for (i = 0; i < ncopies; ++i)
7542 vec_initial_defs.quick_push (vec_initial_def);
7546 /* Generate the reduction PHIs upfront. */
7547 for (i = 0; i < vec_num; i++)
7549 tree vec_init_def = vec_initial_defs[i];
7550 for (j = 0; j < ncopies; j++)
7552 /* Create the reduction-phi that defines the reduction
7553 operand. */
7554 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7556 /* Set the loop-entry arg of the reduction-phi. */
7557 if (j != 0 && nested_cycle)
7558 vec_init_def = vec_initial_defs[j];
7559 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7560 UNKNOWN_LOCATION);
7562 /* The loop-latch arg is set in epilogue processing. */
7564 if (slp_node)
7565 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7566 else
7568 if (j == 0)
7569 *vec_stmt = new_phi;
7570 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7575 return true;
7578 /* Vectorizes LC PHIs. */
7580 bool
7581 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7582 stmt_vec_info stmt_info, gimple **vec_stmt,
7583 slp_tree slp_node)
7585 if (!loop_vinfo
7586 || !is_a <gphi *> (stmt_info->stmt)
7587 || gimple_phi_num_args (stmt_info->stmt) != 1)
7588 return false;
7590 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7591 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7592 return false;
7594 if (!vec_stmt) /* transformation not required. */
7596 /* Deal with copies from externs or constants that disguise as
7597 loop-closed PHI nodes (PR97886). */
7598 if (slp_node
7599 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7600 SLP_TREE_VECTYPE (slp_node)))
7602 if (dump_enabled_p ())
7603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7604 "incompatible vector types for invariants\n");
7605 return false;
7607 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7608 return true;
7611 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7612 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7613 basic_block bb = gimple_bb (stmt_info->stmt);
7614 edge e = single_pred_edge (bb);
7615 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7616 auto_vec<tree> vec_oprnds;
7617 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7618 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7619 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7620 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7622 /* Create the vectorized LC PHI node. */
7623 gphi *new_phi = create_phi_node (vec_dest, bb);
7624 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7625 if (slp_node)
7626 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7627 else
7628 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7630 if (!slp_node)
7631 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7633 return true;
7636 /* Vectorizes PHIs. */
7638 bool
7639 vectorizable_phi (vec_info *,
7640 stmt_vec_info stmt_info, gimple **vec_stmt,
7641 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7643 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7644 return false;
7646 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7647 return false;
7649 tree vectype = SLP_TREE_VECTYPE (slp_node);
7651 if (!vec_stmt) /* transformation not required. */
7653 slp_tree child;
7654 unsigned i;
7655 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7656 if (!child)
7658 if (dump_enabled_p ())
7659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7660 "PHI node with unvectorized backedge def\n");
7661 return false;
7663 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7665 if (dump_enabled_p ())
7666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7667 "incompatible vector types for invariants\n");
7668 return false;
7670 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7671 vector_stmt, stmt_info, vectype, 0, vect_body);
7672 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7673 return true;
7676 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7677 basic_block bb = gimple_bb (stmt_info->stmt);
7678 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7679 auto_vec<gphi *> new_phis;
7680 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7682 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7684 /* Skip not yet vectorized defs. */
7685 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7686 && SLP_TREE_VEC_STMTS (child).is_empty ())
7687 continue;
7689 auto_vec<tree> vec_oprnds;
7690 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7691 if (!new_phis.exists ())
7693 new_phis.create (vec_oprnds.length ());
7694 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7696 /* Create the vectorized LC PHI node. */
7697 new_phis.quick_push (create_phi_node (vec_dest, bb));
7698 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7701 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7702 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7703 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7705 /* We should have at least one already vectorized child. */
7706 gcc_assert (new_phis.exists ());
7708 return true;
7712 /* Function vect_min_worthwhile_factor.
7714 For a loop where we could vectorize the operation indicated by CODE,
7715 return the minimum vectorization factor that makes it worthwhile
7716 to use generic vectors. */
7717 static unsigned int
7718 vect_min_worthwhile_factor (enum tree_code code)
7720 switch (code)
7722 case PLUS_EXPR:
7723 case MINUS_EXPR:
7724 case NEGATE_EXPR:
7725 return 4;
7727 case BIT_AND_EXPR:
7728 case BIT_IOR_EXPR:
7729 case BIT_XOR_EXPR:
7730 case BIT_NOT_EXPR:
7731 return 2;
7733 default:
7734 return INT_MAX;
7738 /* Return true if VINFO indicates we are doing loop vectorization and if
7739 it is worth decomposing CODE operations into scalar operations for
7740 that loop's vectorization factor. */
7742 bool
7743 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7745 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7746 unsigned HOST_WIDE_INT value;
7747 return (loop_vinfo
7748 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7749 && value >= vect_min_worthwhile_factor (code));
7752 /* Function vectorizable_induction
7754 Check if STMT_INFO performs an induction computation that can be vectorized.
7755 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7756 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7757 Return true if STMT_INFO is vectorizable in this way. */
7759 bool
7760 vectorizable_induction (loop_vec_info loop_vinfo,
7761 stmt_vec_info stmt_info,
7762 gimple **vec_stmt, slp_tree slp_node,
7763 stmt_vector_for_cost *cost_vec)
7765 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7766 unsigned ncopies;
7767 bool nested_in_vect_loop = false;
7768 class loop *iv_loop;
7769 tree vec_def;
7770 edge pe = loop_preheader_edge (loop);
7771 basic_block new_bb;
7772 tree new_vec, vec_init, vec_step, t;
7773 tree new_name;
7774 gimple *new_stmt;
7775 gphi *induction_phi;
7776 tree induc_def, vec_dest;
7777 tree init_expr, step_expr;
7778 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7779 unsigned i;
7780 tree expr;
7781 gimple_stmt_iterator si;
7783 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7784 if (!phi)
7785 return false;
7787 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7788 return false;
7790 /* Make sure it was recognized as induction computation. */
7791 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7792 return false;
7794 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7795 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7797 if (slp_node)
7798 ncopies = 1;
7799 else
7800 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7801 gcc_assert (ncopies >= 1);
7803 /* FORNOW. These restrictions should be relaxed. */
7804 if (nested_in_vect_loop_p (loop, stmt_info))
7806 imm_use_iterator imm_iter;
7807 use_operand_p use_p;
7808 gimple *exit_phi;
7809 edge latch_e;
7810 tree loop_arg;
7812 if (ncopies > 1)
7814 if (dump_enabled_p ())
7815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7816 "multiple types in nested loop.\n");
7817 return false;
7820 exit_phi = NULL;
7821 latch_e = loop_latch_edge (loop->inner);
7822 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7823 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7825 gimple *use_stmt = USE_STMT (use_p);
7826 if (is_gimple_debug (use_stmt))
7827 continue;
7829 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7831 exit_phi = use_stmt;
7832 break;
7835 if (exit_phi)
7837 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7838 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7839 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7841 if (dump_enabled_p ())
7842 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7843 "inner-loop induction only used outside "
7844 "of the outer vectorized loop.\n");
7845 return false;
7849 nested_in_vect_loop = true;
7850 iv_loop = loop->inner;
7852 else
7853 iv_loop = loop;
7854 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7856 if (slp_node && !nunits.is_constant ())
7858 /* The current SLP code creates the step value element-by-element. */
7859 if (dump_enabled_p ())
7860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7861 "SLP induction not supported for variable-length"
7862 " vectors.\n");
7863 return false;
7866 if (!vec_stmt) /* transformation not required. */
7868 unsigned inside_cost = 0, prologue_cost = 0;
7869 if (slp_node)
7871 /* We eventually need to set a vector type on invariant
7872 arguments. */
7873 unsigned j;
7874 slp_tree child;
7875 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7876 if (!vect_maybe_update_slp_op_vectype
7877 (child, SLP_TREE_VECTYPE (slp_node)))
7879 if (dump_enabled_p ())
7880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7881 "incompatible vector types for "
7882 "invariants\n");
7883 return false;
7885 /* loop cost for vec_loop. */
7886 inside_cost
7887 = record_stmt_cost (cost_vec,
7888 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7889 vector_stmt, stmt_info, 0, vect_body);
7890 /* prologue cost for vec_init (if not nested) and step. */
7891 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
7892 scalar_to_vec,
7893 stmt_info, 0, vect_prologue);
7895 else /* if (!slp_node) */
7897 /* loop cost for vec_loop. */
7898 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
7899 stmt_info, 0, vect_body);
7900 /* prologue cost for vec_init and vec_step. */
7901 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
7902 stmt_info, 0, vect_prologue);
7904 if (dump_enabled_p ())
7905 dump_printf_loc (MSG_NOTE, vect_location,
7906 "vect_model_induction_cost: inside_cost = %d, "
7907 "prologue_cost = %d .\n", inside_cost,
7908 prologue_cost);
7910 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7911 DUMP_VECT_SCOPE ("vectorizable_induction");
7912 return true;
7915 /* Transform. */
7917 /* Compute a vector variable, initialized with the first VF values of
7918 the induction variable. E.g., for an iv with IV_PHI='X' and
7919 evolution S, for a vector of 4 units, we want to compute:
7920 [X, X + S, X + 2*S, X + 3*S]. */
7922 if (dump_enabled_p ())
7923 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7925 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7926 gcc_assert (step_expr != NULL_TREE);
7927 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7929 pe = loop_preheader_edge (iv_loop);
7930 /* Find the first insertion point in the BB. */
7931 basic_block bb = gimple_bb (phi);
7932 si = gsi_after_labels (bb);
7934 /* For SLP induction we have to generate several IVs as for example
7935 with group size 3 we need
7936 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
7937 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
7938 if (slp_node)
7940 /* Enforced above. */
7941 unsigned int const_nunits = nunits.to_constant ();
7943 /* The initial values are vectorized, but any lanes > group_size
7944 need adjustment. */
7945 slp_tree init_node
7946 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
7948 /* Gather steps. Since we do not vectorize inductions as
7949 cycles we have to reconstruct the step from SCEV data. */
7950 unsigned group_size = SLP_TREE_LANES (slp_node);
7951 tree *steps = XALLOCAVEC (tree, group_size);
7952 tree *inits = XALLOCAVEC (tree, group_size);
7953 stmt_vec_info phi_info;
7954 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
7956 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
7957 if (!init_node)
7958 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
7959 pe->dest_idx);
7962 /* Now generate the IVs. */
7963 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7964 gcc_assert ((const_nunits * nvects) % group_size == 0);
7965 unsigned nivs;
7966 if (nested_in_vect_loop)
7967 nivs = nvects;
7968 else
7970 /* Compute the number of distinct IVs we need. First reduce
7971 group_size if it is a multiple of const_nunits so we get
7972 one IV for a group_size of 4 but const_nunits 2. */
7973 unsigned group_sizep = group_size;
7974 if (group_sizep % const_nunits == 0)
7975 group_sizep = group_sizep / const_nunits;
7976 nivs = least_common_multiple (group_sizep,
7977 const_nunits) / const_nunits;
7979 tree stept = TREE_TYPE (step_vectype);
7980 tree lupdate_mul = NULL_TREE;
7981 if (!nested_in_vect_loop)
7983 /* The number of iterations covered in one vector iteration. */
7984 unsigned lup_mul = (nvects * const_nunits) / group_size;
7985 lupdate_mul
7986 = build_vector_from_val (step_vectype,
7987 SCALAR_FLOAT_TYPE_P (stept)
7988 ? build_real_from_wide (stept, lup_mul,
7989 UNSIGNED)
7990 : build_int_cstu (stept, lup_mul));
7992 tree peel_mul = NULL_TREE;
7993 gimple_seq init_stmts = NULL;
7994 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
7996 if (SCALAR_FLOAT_TYPE_P (stept))
7997 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
7998 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7999 else
8000 peel_mul = gimple_convert (&init_stmts, stept,
8001 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8002 peel_mul = gimple_build_vector_from_val (&init_stmts,
8003 step_vectype, peel_mul);
8005 unsigned ivn;
8006 auto_vec<tree> vec_steps;
8007 for (ivn = 0; ivn < nivs; ++ivn)
8009 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8010 tree_vector_builder init_elts (vectype, const_nunits, 1);
8011 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8012 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8014 /* The scalar steps of the IVs. */
8015 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8016 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8017 step_elts.quick_push (elt);
8018 if (!init_node)
8020 /* The scalar inits of the IVs if not vectorized. */
8021 elt = inits[(ivn*const_nunits + eltn) % group_size];
8022 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8023 TREE_TYPE (elt)))
8024 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8025 TREE_TYPE (vectype), elt);
8026 init_elts.quick_push (elt);
8028 /* The number of steps to add to the initial values. */
8029 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8030 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8031 ? build_real_from_wide (stept,
8032 mul_elt, UNSIGNED)
8033 : build_int_cstu (stept, mul_elt));
8035 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8036 vec_steps.safe_push (vec_step);
8037 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8038 if (peel_mul)
8039 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8040 step_mul, peel_mul);
8041 if (!init_node)
8042 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8044 /* Create the induction-phi that defines the induction-operand. */
8045 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8046 "vec_iv_");
8047 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8048 induc_def = PHI_RESULT (induction_phi);
8050 /* Create the iv update inside the loop */
8051 tree up = vec_step;
8052 if (lupdate_mul)
8053 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8054 vec_step, lupdate_mul);
8055 gimple_seq stmts = NULL;
8056 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8057 vec_def = gimple_build (&stmts,
8058 PLUS_EXPR, step_vectype, vec_def, up);
8059 vec_def = gimple_convert (&stmts, vectype, vec_def);
8060 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8061 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8062 UNKNOWN_LOCATION);
8064 if (init_node)
8065 vec_init = vect_get_slp_vect_def (init_node, ivn);
8066 if (!nested_in_vect_loop
8067 && !integer_zerop (step_mul))
8069 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8070 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8071 vec_step, step_mul);
8072 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8073 vec_def, up);
8074 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8077 /* Set the arguments of the phi node: */
8078 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8080 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8082 if (!nested_in_vect_loop)
8084 /* Fill up to the number of vectors we need for the whole group. */
8085 nivs = least_common_multiple (group_size,
8086 const_nunits) / const_nunits;
8087 for (; ivn < nivs; ++ivn)
8089 SLP_TREE_VEC_STMTS (slp_node)
8090 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8091 vec_steps.safe_push (vec_steps[0]);
8095 /* Re-use IVs when we can. We are generating further vector
8096 stmts by adding VF' * stride to the IVs generated above. */
8097 if (ivn < nvects)
8099 unsigned vfp
8100 = least_common_multiple (group_size, const_nunits) / group_size;
8101 tree lupdate_mul
8102 = build_vector_from_val (step_vectype,
8103 SCALAR_FLOAT_TYPE_P (stept)
8104 ? build_real_from_wide (stept,
8105 vfp, UNSIGNED)
8106 : build_int_cstu (stept, vfp));
8107 for (; ivn < nvects; ++ivn)
8109 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8110 tree def = gimple_get_lhs (iv);
8111 if (ivn < 2*nivs)
8112 vec_steps[ivn - nivs]
8113 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8114 vec_steps[ivn - nivs], lupdate_mul);
8115 gimple_seq stmts = NULL;
8116 def = gimple_convert (&stmts, step_vectype, def);
8117 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8118 def, vec_steps[ivn % nivs]);
8119 def = gimple_convert (&stmts, vectype, def);
8120 if (gimple_code (iv) == GIMPLE_PHI)
8121 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8122 else
8124 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8125 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8127 SLP_TREE_VEC_STMTS (slp_node)
8128 .quick_push (SSA_NAME_DEF_STMT (def));
8132 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8133 gcc_assert (!new_bb);
8135 return true;
8138 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8139 loop_preheader_edge (iv_loop));
8141 gimple_seq stmts = NULL;
8142 if (!nested_in_vect_loop)
8144 /* Convert the initial value to the IV update type. */
8145 tree new_type = TREE_TYPE (step_expr);
8146 init_expr = gimple_convert (&stmts, new_type, init_expr);
8148 /* If we are using the loop mask to "peel" for alignment then we need
8149 to adjust the start value here. */
8150 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8151 if (skip_niters != NULL_TREE)
8153 if (FLOAT_TYPE_P (vectype))
8154 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8155 skip_niters);
8156 else
8157 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8158 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8159 skip_niters, step_expr);
8160 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8161 init_expr, skip_step);
8165 if (stmts)
8167 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8168 gcc_assert (!new_bb);
8171 /* Create the vector that holds the initial_value of the induction. */
8172 if (nested_in_vect_loop)
8174 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8175 been created during vectorization of previous stmts. We obtain it
8176 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8177 auto_vec<tree> vec_inits;
8178 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8179 init_expr, &vec_inits);
8180 vec_init = vec_inits[0];
8181 /* If the initial value is not of proper type, convert it. */
8182 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8184 new_stmt
8185 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8186 vect_simple_var,
8187 "vec_iv_"),
8188 VIEW_CONVERT_EXPR,
8189 build1 (VIEW_CONVERT_EXPR, vectype,
8190 vec_init));
8191 vec_init = gimple_assign_lhs (new_stmt);
8192 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8193 new_stmt);
8194 gcc_assert (!new_bb);
8197 else
8199 /* iv_loop is the loop to be vectorized. Create:
8200 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8201 stmts = NULL;
8202 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8204 unsigned HOST_WIDE_INT const_nunits;
8205 if (nunits.is_constant (&const_nunits))
8207 tree_vector_builder elts (step_vectype, const_nunits, 1);
8208 elts.quick_push (new_name);
8209 for (i = 1; i < const_nunits; i++)
8211 /* Create: new_name_i = new_name + step_expr */
8212 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8213 new_name, step_expr);
8214 elts.quick_push (new_name);
8216 /* Create a vector from [new_name_0, new_name_1, ...,
8217 new_name_nunits-1] */
8218 vec_init = gimple_build_vector (&stmts, &elts);
8220 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8221 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8222 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8223 new_name, step_expr);
8224 else
8226 /* Build:
8227 [base, base, base, ...]
8228 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8229 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8230 gcc_assert (flag_associative_math);
8231 tree index = build_index_vector (step_vectype, 0, 1);
8232 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8233 new_name);
8234 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8235 step_expr);
8236 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8237 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8238 vec_init, step_vec);
8239 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8240 vec_init, base_vec);
8242 vec_init = gimple_convert (&stmts, vectype, vec_init);
8244 if (stmts)
8246 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8247 gcc_assert (!new_bb);
8252 /* Create the vector that holds the step of the induction. */
8253 if (nested_in_vect_loop)
8254 /* iv_loop is nested in the loop to be vectorized. Generate:
8255 vec_step = [S, S, S, S] */
8256 new_name = step_expr;
8257 else
8259 /* iv_loop is the loop to be vectorized. Generate:
8260 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8261 gimple_seq seq = NULL;
8262 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8264 expr = build_int_cst (integer_type_node, vf);
8265 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8267 else
8268 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8269 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8270 expr, step_expr);
8271 if (seq)
8273 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8274 gcc_assert (!new_bb);
8278 t = unshare_expr (new_name);
8279 gcc_assert (CONSTANT_CLASS_P (new_name)
8280 || TREE_CODE (new_name) == SSA_NAME);
8281 new_vec = build_vector_from_val (step_vectype, t);
8282 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8283 new_vec, step_vectype, NULL);
8286 /* Create the following def-use cycle:
8287 loop prolog:
8288 vec_init = ...
8289 vec_step = ...
8290 loop:
8291 vec_iv = PHI <vec_init, vec_loop>
8293 STMT
8295 vec_loop = vec_iv + vec_step; */
8297 /* Create the induction-phi that defines the induction-operand. */
8298 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8299 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8300 induc_def = PHI_RESULT (induction_phi);
8302 /* Create the iv update inside the loop */
8303 stmts = NULL;
8304 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8305 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8306 vec_def = gimple_convert (&stmts, vectype, vec_def);
8307 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8308 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8310 /* Set the arguments of the phi node: */
8311 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8312 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8313 UNKNOWN_LOCATION);
8315 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8316 *vec_stmt = induction_phi;
8318 /* In case that vectorization factor (VF) is bigger than the number
8319 of elements that we can fit in a vectype (nunits), we have to generate
8320 more than one vector stmt - i.e - we need to "unroll" the
8321 vector stmt by a factor VF/nunits. For more details see documentation
8322 in vectorizable_operation. */
8324 if (ncopies > 1)
8326 gimple_seq seq = NULL;
8327 /* FORNOW. This restriction should be relaxed. */
8328 gcc_assert (!nested_in_vect_loop);
8330 /* Create the vector that holds the step of the induction. */
8331 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8333 expr = build_int_cst (integer_type_node, nunits);
8334 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8336 else
8337 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8338 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8339 expr, step_expr);
8340 if (seq)
8342 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8343 gcc_assert (!new_bb);
8346 t = unshare_expr (new_name);
8347 gcc_assert (CONSTANT_CLASS_P (new_name)
8348 || TREE_CODE (new_name) == SSA_NAME);
8349 new_vec = build_vector_from_val (step_vectype, t);
8350 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8351 new_vec, step_vectype, NULL);
8353 vec_def = induc_def;
8354 for (i = 1; i < ncopies; i++)
8356 /* vec_i = vec_prev + vec_step */
8357 gimple_seq stmts = NULL;
8358 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8359 vec_def = gimple_build (&stmts,
8360 PLUS_EXPR, step_vectype, vec_def, vec_step);
8361 vec_def = gimple_convert (&stmts, vectype, vec_def);
8363 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8364 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8365 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8369 if (dump_enabled_p ())
8370 dump_printf_loc (MSG_NOTE, vect_location,
8371 "transform induction: created def-use cycle: %G%G",
8372 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8374 return true;
8377 /* Function vectorizable_live_operation.
8379 STMT_INFO computes a value that is used outside the loop. Check if
8380 it can be supported. */
8382 bool
8383 vectorizable_live_operation (vec_info *vinfo,
8384 stmt_vec_info stmt_info,
8385 gimple_stmt_iterator *gsi,
8386 slp_tree slp_node, slp_instance slp_node_instance,
8387 int slp_index, bool vec_stmt_p,
8388 stmt_vector_for_cost *cost_vec)
8390 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8391 imm_use_iterator imm_iter;
8392 tree lhs, lhs_type, bitsize, vec_bitsize;
8393 tree vectype = (slp_node
8394 ? SLP_TREE_VECTYPE (slp_node)
8395 : STMT_VINFO_VECTYPE (stmt_info));
8396 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8397 int ncopies;
8398 gimple *use_stmt;
8399 auto_vec<tree> vec_oprnds;
8400 int vec_entry = 0;
8401 poly_uint64 vec_index = 0;
8403 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8405 /* If a stmt of a reduction is live, vectorize it via
8406 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8407 validity so just trigger the transform here. */
8408 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8410 if (!vec_stmt_p)
8411 return true;
8412 if (slp_node)
8414 /* For reduction chains the meta-info is attached to
8415 the group leader. */
8416 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8417 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8418 /* For SLP reductions we vectorize the epilogue for
8419 all involved stmts together. */
8420 else if (slp_index != 0)
8421 return true;
8422 else
8423 /* For SLP reductions the meta-info is attached to
8424 the representative. */
8425 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8427 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8428 gcc_assert (reduc_info->is_reduc_info);
8429 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8430 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8431 return true;
8432 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8433 slp_node_instance);
8434 return true;
8437 /* If STMT is not relevant and it is a simple assignment and its inputs are
8438 invariant then it can remain in place, unvectorized. The original last
8439 scalar value that it computes will be used. */
8440 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8442 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8443 if (dump_enabled_p ())
8444 dump_printf_loc (MSG_NOTE, vect_location,
8445 "statement is simple and uses invariant. Leaving in "
8446 "place.\n");
8447 return true;
8450 if (slp_node)
8451 ncopies = 1;
8452 else
8453 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8455 if (slp_node)
8457 gcc_assert (slp_index >= 0);
8459 /* Get the last occurrence of the scalar index from the concatenation of
8460 all the slp vectors. Calculate which slp vector it is and the index
8461 within. */
8462 int num_scalar = SLP_TREE_LANES (slp_node);
8463 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8464 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8466 /* Calculate which vector contains the result, and which lane of
8467 that vector we need. */
8468 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8470 if (dump_enabled_p ())
8471 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8472 "Cannot determine which vector holds the"
8473 " final result.\n");
8474 return false;
8478 if (!vec_stmt_p)
8480 /* No transformation required. */
8481 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8483 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8484 OPTIMIZE_FOR_SPEED))
8486 if (dump_enabled_p ())
8487 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8488 "can't operate on partial vectors "
8489 "because the target doesn't support extract "
8490 "last reduction.\n");
8491 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8493 else if (slp_node)
8495 if (dump_enabled_p ())
8496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8497 "can't operate on partial vectors "
8498 "because an SLP statement is live after "
8499 "the loop.\n");
8500 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8502 else if (ncopies > 1)
8504 if (dump_enabled_p ())
8505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8506 "can't operate on partial vectors "
8507 "because ncopies is greater than 1.\n");
8508 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8510 else
8512 gcc_assert (ncopies == 1 && !slp_node);
8513 vect_record_loop_mask (loop_vinfo,
8514 &LOOP_VINFO_MASKS (loop_vinfo),
8515 1, vectype, NULL);
8518 /* ??? Enable for loop costing as well. */
8519 if (!loop_vinfo)
8520 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8521 0, vect_epilogue);
8522 return true;
8525 /* Use the lhs of the original scalar statement. */
8526 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8527 if (dump_enabled_p ())
8528 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8529 "stmt %G", stmt);
8531 lhs = gimple_get_lhs (stmt);
8532 lhs_type = TREE_TYPE (lhs);
8534 bitsize = vector_element_bits_tree (vectype);
8535 vec_bitsize = TYPE_SIZE (vectype);
8537 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8538 tree vec_lhs, bitstart;
8539 gimple *vec_stmt;
8540 if (slp_node)
8542 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8544 /* Get the correct slp vectorized stmt. */
8545 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8546 vec_lhs = gimple_get_lhs (vec_stmt);
8548 /* Get entry to use. */
8549 bitstart = bitsize_int (vec_index);
8550 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8552 else
8554 /* For multiple copies, get the last copy. */
8555 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8556 vec_lhs = gimple_get_lhs (vec_stmt);
8558 /* Get the last lane in the vector. */
8559 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8562 if (loop_vinfo)
8564 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
8565 requirement, insert one phi node for it. It looks like:
8566 loop;
8568 # lhs' = PHI <lhs>
8570 loop;
8572 # vec_lhs' = PHI <vec_lhs>
8573 new_tree = lane_extract <vec_lhs', ...>;
8574 lhs' = new_tree; */
8576 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8577 basic_block exit_bb = single_exit (loop)->dest;
8578 gcc_assert (single_pred_p (exit_bb));
8580 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8581 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8582 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8584 gimple_seq stmts = NULL;
8585 tree new_tree;
8586 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8588 /* Emit:
8590 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8592 where VEC_LHS is the vectorized live-out result and MASK is
8593 the loop mask for the final iteration. */
8594 gcc_assert (ncopies == 1 && !slp_node);
8595 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8596 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8597 1, vectype, 0);
8598 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8599 mask, vec_lhs_phi);
8601 /* Convert the extracted vector element to the scalar type. */
8602 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8604 else
8606 tree bftype = TREE_TYPE (vectype);
8607 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8608 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8609 new_tree = build3 (BIT_FIELD_REF, bftype,
8610 vec_lhs_phi, bitsize, bitstart);
8611 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8612 &stmts, true, NULL_TREE);
8615 if (stmts)
8617 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8618 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8620 /* Remove existing phi from lhs and create one copy from new_tree. */
8621 tree lhs_phi = NULL_TREE;
8622 gimple_stmt_iterator gsi;
8623 for (gsi = gsi_start_phis (exit_bb);
8624 !gsi_end_p (gsi); gsi_next (&gsi))
8626 gimple *phi = gsi_stmt (gsi);
8627 if ((gimple_phi_arg_def (phi, 0) == lhs))
8629 remove_phi_node (&gsi, false);
8630 lhs_phi = gimple_phi_result (phi);
8631 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8632 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8633 break;
8638 /* Replace use of lhs with newly computed result. If the use stmt is a
8639 single arg PHI, just replace all uses of PHI result. It's necessary
8640 because lcssa PHI defining lhs may be before newly inserted stmt. */
8641 use_operand_p use_p;
8642 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8643 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8644 && !is_gimple_debug (use_stmt))
8646 if (gimple_code (use_stmt) == GIMPLE_PHI
8647 && gimple_phi_num_args (use_stmt) == 1)
8649 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8651 else
8653 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8654 SET_USE (use_p, new_tree);
8656 update_stmt (use_stmt);
8659 else
8661 /* For basic-block vectorization simply insert the lane-extraction. */
8662 tree bftype = TREE_TYPE (vectype);
8663 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8664 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8665 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8666 vec_lhs, bitsize, bitstart);
8667 gimple_seq stmts = NULL;
8668 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8669 &stmts, true, NULL_TREE);
8670 if (TREE_CODE (new_tree) == SSA_NAME
8671 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8672 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8673 if (is_a <gphi *> (vec_stmt))
8675 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8676 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8678 else
8680 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8681 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8684 /* Replace use of lhs with newly computed result. If the use stmt is a
8685 single arg PHI, just replace all uses of PHI result. It's necessary
8686 because lcssa PHI defining lhs may be before newly inserted stmt. */
8687 use_operand_p use_p;
8688 stmt_vec_info use_stmt_info;
8689 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8690 if (!is_gimple_debug (use_stmt)
8691 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8692 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8694 /* ??? This can happen when the live lane ends up being
8695 used in a vector construction code-generated by an
8696 external SLP node (and code-generation for that already
8697 happened). See gcc.dg/vect/bb-slp-47.c.
8698 Doing this is what would happen if that vector CTOR
8699 were not code-generated yet so it is not too bad.
8700 ??? In fact we'd likely want to avoid this situation
8701 in the first place. */
8702 if (TREE_CODE (new_tree) == SSA_NAME
8703 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8704 && gimple_code (use_stmt) != GIMPLE_PHI
8705 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8706 use_stmt))
8708 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8709 gcc_assert (code == CONSTRUCTOR
8710 || code == VIEW_CONVERT_EXPR
8711 || CONVERT_EXPR_CODE_P (code));
8712 if (dump_enabled_p ())
8713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8714 "Using original scalar computation for "
8715 "live lane because use preceeds vector "
8716 "def\n");
8717 continue;
8719 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8720 SET_USE (use_p, new_tree);
8721 update_stmt (use_stmt);
8725 return true;
8728 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8730 static void
8731 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8733 ssa_op_iter op_iter;
8734 imm_use_iterator imm_iter;
8735 def_operand_p def_p;
8736 gimple *ustmt;
8738 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8740 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8742 basic_block bb;
8744 if (!is_gimple_debug (ustmt))
8745 continue;
8747 bb = gimple_bb (ustmt);
8749 if (!flow_bb_inside_loop_p (loop, bb))
8751 if (gimple_debug_bind_p (ustmt))
8753 if (dump_enabled_p ())
8754 dump_printf_loc (MSG_NOTE, vect_location,
8755 "killing debug use\n");
8757 gimple_debug_bind_reset_value (ustmt);
8758 update_stmt (ustmt);
8760 else
8761 gcc_unreachable ();
8767 /* Given loop represented by LOOP_VINFO, return true if computation of
8768 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8769 otherwise. */
8771 static bool
8772 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8774 /* Constant case. */
8775 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8777 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8778 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8780 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8781 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8782 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8783 return true;
8786 widest_int max;
8787 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8788 /* Check the upper bound of loop niters. */
8789 if (get_max_loop_iterations (loop, &max))
8791 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8792 signop sgn = TYPE_SIGN (type);
8793 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8794 if (max < type_max)
8795 return true;
8797 return false;
8800 /* Return a mask type with half the number of elements as OLD_TYPE,
8801 given that it should have mode NEW_MODE. */
8803 tree
8804 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8806 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8807 return build_truth_vector_type_for_mode (nunits, new_mode);
8810 /* Return a mask type with twice as many elements as OLD_TYPE,
8811 given that it should have mode NEW_MODE. */
8813 tree
8814 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8816 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8817 return build_truth_vector_type_for_mode (nunits, new_mode);
8820 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8821 contain a sequence of NVECTORS masks that each control a vector of type
8822 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8823 these vector masks with the vector version of SCALAR_MASK. */
8825 void
8826 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8827 unsigned int nvectors, tree vectype, tree scalar_mask)
8829 gcc_assert (nvectors != 0);
8830 if (masks->length () < nvectors)
8831 masks->safe_grow_cleared (nvectors, true);
8832 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8833 /* The number of scalars per iteration and the number of vectors are
8834 both compile-time constants. */
8835 unsigned int nscalars_per_iter
8836 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8837 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8839 if (scalar_mask)
8841 scalar_cond_masked_key cond (scalar_mask, nvectors);
8842 loop_vinfo->scalar_cond_masked_set.add (cond);
8845 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8847 rgm->max_nscalars_per_iter = nscalars_per_iter;
8848 rgm->type = truth_type_for (vectype);
8849 rgm->factor = 1;
8853 /* Given a complete set of masks MASKS, extract mask number INDEX
8854 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8855 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8857 See the comment above vec_loop_masks for more details about the mask
8858 arrangement. */
8860 tree
8861 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8862 unsigned int nvectors, tree vectype, unsigned int index)
8864 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8865 tree mask_type = rgm->type;
8867 /* Populate the rgroup's mask array, if this is the first time we've
8868 used it. */
8869 if (rgm->controls.is_empty ())
8871 rgm->controls.safe_grow_cleared (nvectors, true);
8872 for (unsigned int i = 0; i < nvectors; ++i)
8874 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8875 /* Provide a dummy definition until the real one is available. */
8876 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8877 rgm->controls[i] = mask;
8881 tree mask = rgm->controls[index];
8882 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8883 TYPE_VECTOR_SUBPARTS (vectype)))
8885 /* A loop mask for data type X can be reused for data type Y
8886 if X has N times more elements than Y and if Y's elements
8887 are N times bigger than X's. In this case each sequence
8888 of N elements in the loop mask will be all-zero or all-one.
8889 We can then view-convert the mask so that each sequence of
8890 N elements is replaced by a single element. */
8891 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8892 TYPE_VECTOR_SUBPARTS (vectype)));
8893 gimple_seq seq = NULL;
8894 mask_type = truth_type_for (vectype);
8895 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8896 if (seq)
8897 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8899 return mask;
8902 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8903 lengths for controlling an operation on VECTYPE. The operation splits
8904 each element of VECTYPE into FACTOR separate subelements, measuring the
8905 length as a number of these subelements. */
8907 void
8908 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8909 unsigned int nvectors, tree vectype, unsigned int factor)
8911 gcc_assert (nvectors != 0);
8912 if (lens->length () < nvectors)
8913 lens->safe_grow_cleared (nvectors, true);
8914 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8916 /* The number of scalars per iteration, scalar occupied bytes and
8917 the number of vectors are both compile-time constants. */
8918 unsigned int nscalars_per_iter
8919 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8920 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8922 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8924 /* For now, we only support cases in which all loads and stores fall back
8925 to VnQI or none do. */
8926 gcc_assert (!rgl->max_nscalars_per_iter
8927 || (rgl->factor == 1 && factor == 1)
8928 || (rgl->max_nscalars_per_iter * rgl->factor
8929 == nscalars_per_iter * factor));
8930 rgl->max_nscalars_per_iter = nscalars_per_iter;
8931 rgl->type = vectype;
8932 rgl->factor = factor;
8936 /* Given a complete set of length LENS, extract length number INDEX for an
8937 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8939 tree
8940 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8941 unsigned int nvectors, unsigned int index)
8943 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8945 /* Populate the rgroup's len array, if this is the first time we've
8946 used it. */
8947 if (rgl->controls.is_empty ())
8949 rgl->controls.safe_grow_cleared (nvectors, true);
8950 for (unsigned int i = 0; i < nvectors; ++i)
8952 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8953 gcc_assert (len_type != NULL_TREE);
8954 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8956 /* Provide a dummy definition until the real one is available. */
8957 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8958 rgl->controls[i] = len;
8962 return rgl->controls[index];
8965 /* Scale profiling counters by estimation for LOOP which is vectorized
8966 by factor VF. */
8968 static void
8969 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8971 edge preheader = loop_preheader_edge (loop);
8972 /* Reduce loop iterations by the vectorization factor. */
8973 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8974 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8976 if (freq_h.nonzero_p ())
8978 profile_probability p;
8980 /* Avoid dropping loop body profile counter to 0 because of zero count
8981 in loop's preheader. */
8982 if (!(freq_e == profile_count::zero ()))
8983 freq_e = freq_e.force_nonzero ();
8984 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8985 scale_loop_frequencies (loop, p);
8988 edge exit_e = single_exit (loop);
8989 exit_e->probability = profile_probability::always ()
8990 .apply_scale (1, new_est_niter + 1);
8992 edge exit_l = single_pred_edge (loop->latch);
8993 profile_probability prob = exit_l->probability;
8994 exit_l->probability = exit_e->probability.invert ();
8995 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8996 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8999 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9000 latch edge values originally defined by it. */
9002 static void
9003 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9004 stmt_vec_info def_stmt_info)
9006 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9007 if (!def || TREE_CODE (def) != SSA_NAME)
9008 return;
9009 stmt_vec_info phi_info;
9010 imm_use_iterator iter;
9011 use_operand_p use_p;
9012 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9013 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9014 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9015 && (phi_info = loop_vinfo->lookup_stmt (phi))
9016 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9017 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9018 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9020 loop_p loop = gimple_bb (phi)->loop_father;
9021 edge e = loop_latch_edge (loop);
9022 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9024 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9025 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9026 gcc_assert (phi_defs.length () == latch_defs.length ());
9027 for (unsigned i = 0; i < phi_defs.length (); ++i)
9028 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9029 gimple_get_lhs (latch_defs[i]), e,
9030 gimple_phi_arg_location (phi, e->dest_idx));
9035 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9036 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9037 stmt_vec_info. */
9039 static void
9040 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9041 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9043 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9044 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9046 if (dump_enabled_p ())
9047 dump_printf_loc (MSG_NOTE, vect_location,
9048 "------>vectorizing statement: %G", stmt_info->stmt);
9050 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9051 vect_loop_kill_debug_uses (loop, stmt_info);
9053 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9054 && !STMT_VINFO_LIVE_P (stmt_info))
9055 return;
9057 if (STMT_VINFO_VECTYPE (stmt_info))
9059 poly_uint64 nunits
9060 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9061 if (!STMT_SLP_TYPE (stmt_info)
9062 && maybe_ne (nunits, vf)
9063 && dump_enabled_p ())
9064 /* For SLP VF is set according to unrolling factor, and not
9065 to vector size, hence for SLP this print is not valid. */
9066 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9069 /* Pure SLP statements have already been vectorized. We still need
9070 to apply loop vectorization to hybrid SLP statements. */
9071 if (PURE_SLP_STMT (stmt_info))
9072 return;
9074 if (dump_enabled_p ())
9075 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9077 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9078 *seen_store = stmt_info;
9081 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
9082 in the hash_map with its corresponding values. */
9084 static tree
9085 find_in_mapping (tree t, void *context)
9087 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9089 tree *value = mapping->get (t);
9090 return value ? *value : t;
9093 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9094 original loop that has now been vectorized.
9096 The inits of the data_references need to be advanced with the number of
9097 iterations of the main loop. This has been computed in vect_do_peeling and
9098 is stored in parameter ADVANCE. We first restore the data_references
9099 initial offset with the values recored in ORIG_DRS_INIT.
9101 Since the loop_vec_info of this EPILOGUE was constructed for the original
9102 loop, its stmt_vec_infos all point to the original statements. These need
9103 to be updated to point to their corresponding copies as well as the SSA_NAMES
9104 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9106 The data_reference's connections also need to be updated. Their
9107 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9108 stmt_vec_infos, their statements need to point to their corresponding copy,
9109 if they are gather loads or scatter stores then their reference needs to be
9110 updated to point to its corresponding copy and finally we set
9111 'base_misaligned' to false as we have already peeled for alignment in the
9112 prologue of the main loop. */
9114 static void
9115 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9117 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9118 auto_vec<gimple *> stmt_worklist;
9119 hash_map<tree,tree> mapping;
9120 gimple *orig_stmt, *new_stmt;
9121 gimple_stmt_iterator epilogue_gsi;
9122 gphi_iterator epilogue_phi_gsi;
9123 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9124 basic_block *epilogue_bbs = get_loop_body (epilogue);
9125 unsigned i;
9127 free (LOOP_VINFO_BBS (epilogue_vinfo));
9128 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9130 /* Advance data_reference's with the number of iterations of the previous
9131 loop and its prologue. */
9132 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9135 /* The EPILOGUE loop is a copy of the original loop so they share the same
9136 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9137 point to the copied statements. We also create a mapping of all LHS' in
9138 the original loop and all the LHS' in the EPILOGUE and create worklists to
9139 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9140 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9142 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9143 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9145 new_stmt = epilogue_phi_gsi.phi ();
9147 gcc_assert (gimple_uid (new_stmt) > 0);
9148 stmt_vinfo
9149 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9151 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9152 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9154 mapping.put (gimple_phi_result (orig_stmt),
9155 gimple_phi_result (new_stmt));
9156 /* PHI nodes can not have patterns or related statements. */
9157 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9158 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9161 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9162 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9164 new_stmt = gsi_stmt (epilogue_gsi);
9165 if (is_gimple_debug (new_stmt))
9166 continue;
9168 gcc_assert (gimple_uid (new_stmt) > 0);
9169 stmt_vinfo
9170 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9172 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9173 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9175 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9176 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9178 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9180 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9181 for (gimple_stmt_iterator gsi = gsi_start (seq);
9182 !gsi_end_p (gsi); gsi_next (&gsi))
9183 stmt_worklist.safe_push (gsi_stmt (gsi));
9186 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9187 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9189 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9190 stmt_worklist.safe_push (stmt);
9191 /* Set BB such that the assert in
9192 'get_initial_def_for_reduction' is able to determine that
9193 the BB of the related stmt is inside this loop. */
9194 gimple_set_bb (stmt,
9195 gimple_bb (new_stmt));
9196 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9197 gcc_assert (related_vinfo == NULL
9198 || related_vinfo == stmt_vinfo);
9203 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9204 using the original main loop and thus need to be updated to refer to the
9205 cloned variables used in the epilogue. */
9206 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9208 gimple *stmt = stmt_worklist[i];
9209 tree *new_op;
9211 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9213 tree op = gimple_op (stmt, j);
9214 if ((new_op = mapping.get(op)))
9215 gimple_set_op (stmt, j, *new_op);
9216 else
9218 /* PR92429: The last argument of simplify_replace_tree disables
9219 folding when replacing arguments. This is required as
9220 otherwise you might end up with different statements than the
9221 ones analyzed in vect_loop_analyze, leading to different
9222 vectorization. */
9223 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9224 &find_in_mapping, &mapping, false);
9225 gimple_set_op (stmt, j, op);
9230 struct data_reference *dr;
9231 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9232 FOR_EACH_VEC_ELT (datarefs, i, dr)
9234 orig_stmt = DR_STMT (dr);
9235 gcc_assert (gimple_uid (orig_stmt) > 0);
9236 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9237 /* Data references for gather loads and scatter stores do not use the
9238 updated offset we set using ADVANCE. Instead we have to make sure the
9239 reference in the data references point to the corresponding copy of
9240 the original in the epilogue. */
9241 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9242 == VMAT_GATHER_SCATTER)
9244 DR_REF (dr)
9245 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9246 &find_in_mapping, &mapping);
9247 DR_BASE_ADDRESS (dr)
9248 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9249 &find_in_mapping, &mapping);
9251 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9252 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9253 /* The vector size of the epilogue is smaller than that of the main loop
9254 so the alignment is either the same or lower. This means the dr will
9255 thus by definition be aligned. */
9256 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9259 epilogue_vinfo->shared->datarefs_copy.release ();
9260 epilogue_vinfo->shared->save_datarefs ();
9263 /* Function vect_transform_loop.
9265 The analysis phase has determined that the loop is vectorizable.
9266 Vectorize the loop - created vectorized stmts to replace the scalar
9267 stmts in the loop, and update the loop exit condition.
9268 Returns scalar epilogue loop if any. */
9270 class loop *
9271 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9273 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9274 class loop *epilogue = NULL;
9275 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9276 int nbbs = loop->num_nodes;
9277 int i;
9278 tree niters_vector = NULL_TREE;
9279 tree step_vector = NULL_TREE;
9280 tree niters_vector_mult_vf = NULL_TREE;
9281 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9282 unsigned int lowest_vf = constant_lower_bound (vf);
9283 gimple *stmt;
9284 bool check_profitability = false;
9285 unsigned int th;
9287 DUMP_VECT_SCOPE ("vec_transform_loop");
9289 loop_vinfo->shared->check_datarefs ();
9291 /* Use the more conservative vectorization threshold. If the number
9292 of iterations is constant assume the cost check has been performed
9293 by our caller. If the threshold makes all loops profitable that
9294 run at least the (estimated) vectorization factor number of times
9295 checking is pointless, too. */
9296 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9297 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9299 if (dump_enabled_p ())
9300 dump_printf_loc (MSG_NOTE, vect_location,
9301 "Profitability threshold is %d loop iterations.\n",
9302 th);
9303 check_profitability = true;
9306 /* Make sure there exists a single-predecessor exit bb. Do this before
9307 versioning. */
9308 edge e = single_exit (loop);
9309 if (! single_pred_p (e->dest))
9311 split_loop_exit_edge (e, true);
9312 if (dump_enabled_p ())
9313 dump_printf (MSG_NOTE, "split exit edge\n");
9316 /* Version the loop first, if required, so the profitability check
9317 comes first. */
9319 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9321 class loop *sloop
9322 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9323 sloop->force_vectorize = false;
9324 check_profitability = false;
9327 /* Make sure there exists a single-predecessor exit bb also on the
9328 scalar loop copy. Do this after versioning but before peeling
9329 so CFG structure is fine for both scalar and if-converted loop
9330 to make slpeel_duplicate_current_defs_from_edges face matched
9331 loop closed PHI nodes on the exit. */
9332 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9334 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9335 if (! single_pred_p (e->dest))
9337 split_loop_exit_edge (e, true);
9338 if (dump_enabled_p ())
9339 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9343 tree niters = vect_build_loop_niters (loop_vinfo);
9344 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9345 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9346 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9347 tree advance;
9348 drs_init_vec orig_drs_init;
9350 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9351 &step_vector, &niters_vector_mult_vf, th,
9352 check_profitability, niters_no_overflow,
9353 &advance);
9355 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9356 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9357 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9358 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9360 if (niters_vector == NULL_TREE)
9362 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9363 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9364 && known_eq (lowest_vf, vf))
9366 niters_vector
9367 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9368 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9369 step_vector = build_one_cst (TREE_TYPE (niters));
9371 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9372 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9373 &step_vector, niters_no_overflow);
9374 else
9375 /* vect_do_peeling subtracted the number of peeled prologue
9376 iterations from LOOP_VINFO_NITERS. */
9377 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9378 &niters_vector, &step_vector,
9379 niters_no_overflow);
9382 /* 1) Make sure the loop header has exactly two entries
9383 2) Make sure we have a preheader basic block. */
9385 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9387 split_edge (loop_preheader_edge (loop));
9389 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9390 /* This will deal with any possible peeling. */
9391 vect_prepare_for_masked_peels (loop_vinfo);
9393 /* Schedule the SLP instances first, then handle loop vectorization
9394 below. */
9395 if (!loop_vinfo->slp_instances.is_empty ())
9397 DUMP_VECT_SCOPE ("scheduling SLP instances");
9398 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9401 /* FORNOW: the vectorizer supports only loops which body consist
9402 of one basic block (header + empty latch). When the vectorizer will
9403 support more involved loop forms, the order by which the BBs are
9404 traversed need to be reconsidered. */
9406 for (i = 0; i < nbbs; i++)
9408 basic_block bb = bbs[i];
9409 stmt_vec_info stmt_info;
9411 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9412 gsi_next (&si))
9414 gphi *phi = si.phi ();
9415 if (dump_enabled_p ())
9416 dump_printf_loc (MSG_NOTE, vect_location,
9417 "------>vectorizing phi: %G", phi);
9418 stmt_info = loop_vinfo->lookup_stmt (phi);
9419 if (!stmt_info)
9420 continue;
9422 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9423 vect_loop_kill_debug_uses (loop, stmt_info);
9425 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9426 && !STMT_VINFO_LIVE_P (stmt_info))
9427 continue;
9429 if (STMT_VINFO_VECTYPE (stmt_info)
9430 && (maybe_ne
9431 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9432 && dump_enabled_p ())
9433 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9435 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9436 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9437 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9438 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9439 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9440 && ! PURE_SLP_STMT (stmt_info))
9442 if (dump_enabled_p ())
9443 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9444 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9448 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9449 gsi_next (&si))
9451 gphi *phi = si.phi ();
9452 stmt_info = loop_vinfo->lookup_stmt (phi);
9453 if (!stmt_info)
9454 continue;
9456 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9457 && !STMT_VINFO_LIVE_P (stmt_info))
9458 continue;
9460 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9461 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9462 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9463 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9464 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9465 && ! PURE_SLP_STMT (stmt_info))
9466 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9469 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9470 !gsi_end_p (si);)
9472 stmt = gsi_stmt (si);
9473 /* During vectorization remove existing clobber stmts. */
9474 if (gimple_clobber_p (stmt))
9476 unlink_stmt_vdef (stmt);
9477 gsi_remove (&si, true);
9478 release_defs (stmt);
9480 else
9482 /* Ignore vector stmts created in the outer loop. */
9483 stmt_info = loop_vinfo->lookup_stmt (stmt);
9485 /* vector stmts created in the outer-loop during vectorization of
9486 stmts in an inner-loop may not have a stmt_info, and do not
9487 need to be vectorized. */
9488 stmt_vec_info seen_store = NULL;
9489 if (stmt_info)
9491 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9493 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9494 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9495 !gsi_end_p (subsi); gsi_next (&subsi))
9497 stmt_vec_info pat_stmt_info
9498 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9499 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9500 &si, &seen_store);
9502 stmt_vec_info pat_stmt_info
9503 = STMT_VINFO_RELATED_STMT (stmt_info);
9504 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9505 &seen_store);
9506 maybe_set_vectorized_backedge_value (loop_vinfo,
9507 pat_stmt_info);
9509 else
9511 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9512 &seen_store);
9513 maybe_set_vectorized_backedge_value (loop_vinfo,
9514 stmt_info);
9517 gsi_next (&si);
9518 if (seen_store)
9520 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9521 /* Interleaving. If IS_STORE is TRUE, the
9522 vectorization of the interleaving chain was
9523 completed - free all the stores in the chain. */
9524 vect_remove_stores (loop_vinfo,
9525 DR_GROUP_FIRST_ELEMENT (seen_store));
9526 else
9527 /* Free the attached stmt_vec_info and remove the stmt. */
9528 loop_vinfo->remove_stmt (stmt_info);
9533 /* Stub out scalar statements that must not survive vectorization.
9534 Doing this here helps with grouped statements, or statements that
9535 are involved in patterns. */
9536 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9537 !gsi_end_p (gsi); gsi_next (&gsi))
9539 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9540 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9542 tree lhs = gimple_get_lhs (call);
9543 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9545 tree zero = build_zero_cst (TREE_TYPE (lhs));
9546 gimple *new_stmt = gimple_build_assign (lhs, zero);
9547 gsi_replace (&gsi, new_stmt, true);
9551 } /* BBs in loop */
9553 /* The vectorization factor is always > 1, so if we use an IV increment of 1.
9554 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9555 if (integer_onep (step_vector))
9556 niters_no_overflow = true;
9557 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9558 niters_vector_mult_vf, !niters_no_overflow);
9560 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9561 scale_profile_for_vect_loop (loop, assumed_vf);
9563 /* True if the final iteration might not handle a full vector's
9564 worth of scalar iterations. */
9565 bool final_iter_may_be_partial
9566 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9567 /* The minimum number of iterations performed by the epilogue. This
9568 is 1 when peeling for gaps because we always need a final scalar
9569 iteration. */
9570 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9571 /* +1 to convert latch counts to loop iteration counts,
9572 -min_epilogue_iters to remove iterations that cannot be performed
9573 by the vector code. */
9574 int bias_for_lowest = 1 - min_epilogue_iters;
9575 int bias_for_assumed = bias_for_lowest;
9576 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9577 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9579 /* When the amount of peeling is known at compile time, the first
9580 iteration will have exactly alignment_npeels active elements.
9581 In the worst case it will have at least one. */
9582 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9583 bias_for_lowest += lowest_vf - min_first_active;
9584 bias_for_assumed += assumed_vf - min_first_active;
9586 /* In these calculations the "- 1" converts loop iteration counts
9587 back to latch counts. */
9588 if (loop->any_upper_bound)
9589 loop->nb_iterations_upper_bound
9590 = (final_iter_may_be_partial
9591 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9592 lowest_vf) - 1
9593 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9594 lowest_vf) - 1);
9595 if (loop->any_likely_upper_bound)
9596 loop->nb_iterations_likely_upper_bound
9597 = (final_iter_may_be_partial
9598 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9599 + bias_for_lowest, lowest_vf) - 1
9600 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9601 + bias_for_lowest, lowest_vf) - 1);
9602 if (loop->any_estimate)
9603 loop->nb_iterations_estimate
9604 = (final_iter_may_be_partial
9605 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9606 assumed_vf) - 1
9607 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9608 assumed_vf) - 1);
9610 if (dump_enabled_p ())
9612 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9614 dump_printf_loc (MSG_NOTE, vect_location,
9615 "LOOP VECTORIZED\n");
9616 if (loop->inner)
9617 dump_printf_loc (MSG_NOTE, vect_location,
9618 "OUTER LOOP VECTORIZED\n");
9619 dump_printf (MSG_NOTE, "\n");
9621 else
9622 dump_printf_loc (MSG_NOTE, vect_location,
9623 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9624 GET_MODE_NAME (loop_vinfo->vector_mode));
9627 /* Loops vectorized with a variable factor won't benefit from
9628 unrolling/peeling. */
9629 if (!vf.is_constant ())
9631 loop->unroll = 1;
9632 if (dump_enabled_p ())
9633 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9634 " variable-length vectorization factor\n");
9636 /* Free SLP instances here because otherwise stmt reference counting
9637 won't work. */
9638 slp_instance instance;
9639 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9640 vect_free_slp_instance (instance);
9641 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9642 /* Clear-up safelen field since its value is invalid after vectorization
9643 since vectorized loop can have loop-carried dependencies. */
9644 loop->safelen = 0;
9646 if (epilogue)
9648 update_epilogue_loop_vinfo (epilogue, advance);
9650 epilogue->simduid = loop->simduid;
9651 epilogue->force_vectorize = loop->force_vectorize;
9652 epilogue->dont_vectorize = false;
9655 return epilogue;
9658 /* The code below is trying to perform simple optimization - revert
9659 if-conversion for masked stores, i.e. if the mask of a store is zero
9660 do not perform it and all stored value producers also if possible.
9661 For example,
9662 for (i=0; i<n; i++)
9663 if (c[i])
9665 p1[i] += 1;
9666 p2[i] = p3[i] +2;
9668 this transformation will produce the following semi-hammock:
9670 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9672 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9673 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9674 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9675 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9676 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9677 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9681 void
9682 optimize_mask_stores (class loop *loop)
9684 basic_block *bbs = get_loop_body (loop);
9685 unsigned nbbs = loop->num_nodes;
9686 unsigned i;
9687 basic_block bb;
9688 class loop *bb_loop;
9689 gimple_stmt_iterator gsi;
9690 gimple *stmt;
9691 auto_vec<gimple *> worklist;
9692 auto_purge_vect_location sentinel;
9694 vect_location = find_loop_location (loop);
9695 /* Pick up all masked stores in loop if any. */
9696 for (i = 0; i < nbbs; i++)
9698 bb = bbs[i];
9699 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9700 gsi_next (&gsi))
9702 stmt = gsi_stmt (gsi);
9703 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9704 worklist.safe_push (stmt);
9708 free (bbs);
9709 if (worklist.is_empty ())
9710 return;
9712 /* Loop has masked stores. */
9713 while (!worklist.is_empty ())
9715 gimple *last, *last_store;
9716 edge e, efalse;
9717 tree mask;
9718 basic_block store_bb, join_bb;
9719 gimple_stmt_iterator gsi_to;
9720 tree vdef, new_vdef;
9721 gphi *phi;
9722 tree vectype;
9723 tree zero;
9725 last = worklist.pop ();
9726 mask = gimple_call_arg (last, 2);
9727 bb = gimple_bb (last);
9728 /* Create then_bb and if-then structure in CFG, then_bb belongs to
9729 the same loop as if_bb. It could be different to LOOP when two
9730 level loop-nest is vectorized and mask_store belongs to the inner
9731 one. */
9732 e = split_block (bb, last);
9733 bb_loop = bb->loop_father;
9734 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9735 join_bb = e->dest;
9736 store_bb = create_empty_bb (bb);
9737 add_bb_to_loop (store_bb, bb_loop);
9738 e->flags = EDGE_TRUE_VALUE;
9739 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9740 /* Put STORE_BB to likely part. */
9741 efalse->probability = profile_probability::unlikely ();
9742 store_bb->count = efalse->count ();
9743 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9744 if (dom_info_available_p (CDI_DOMINATORS))
9745 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9746 if (dump_enabled_p ())
9747 dump_printf_loc (MSG_NOTE, vect_location,
9748 "Create new block %d to sink mask stores.",
9749 store_bb->index);
9750 /* Create vector comparison with boolean result. */
9751 vectype = TREE_TYPE (mask);
9752 zero = build_zero_cst (vectype);
9753 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9754 gsi = gsi_last_bb (bb);
9755 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9756 /* Create new PHI node for vdef of the last masked store:
9757 .MEM_2 = VDEF <.MEM_1>
9758 will be converted to
9759 .MEM.3 = VDEF <.MEM_1>
9760 and new PHI node will be created in join bb
9761 .MEM_2 = PHI <.MEM_1, .MEM_3>
9763 vdef = gimple_vdef (last);
9764 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9765 gimple_set_vdef (last, new_vdef);
9766 phi = create_phi_node (vdef, join_bb);
9767 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9769 /* Put all masked stores with the same mask to STORE_BB if possible. */
9770 while (true)
9772 gimple_stmt_iterator gsi_from;
9773 gimple *stmt1 = NULL;
9775 /* Move masked store to STORE_BB. */
9776 last_store = last;
9777 gsi = gsi_for_stmt (last);
9778 gsi_from = gsi;
9779 /* Shift GSI to the previous stmt for further traversal. */
9780 gsi_prev (&gsi);
9781 gsi_to = gsi_start_bb (store_bb);
9782 gsi_move_before (&gsi_from, &gsi_to);
9783 /* Setup GSI_TO to the non-empty block start. */
9784 gsi_to = gsi_start_bb (store_bb);
9785 if (dump_enabled_p ())
9786 dump_printf_loc (MSG_NOTE, vect_location,
9787 "Move stmt to created bb\n%G", last);
9788 /* Move all stored value producers if possible. */
9789 while (!gsi_end_p (gsi))
9791 tree lhs;
9792 imm_use_iterator imm_iter;
9793 use_operand_p use_p;
9794 bool res;
9796 /* Skip debug statements. */
9797 if (is_gimple_debug (gsi_stmt (gsi)))
9799 gsi_prev (&gsi);
9800 continue;
9802 stmt1 = gsi_stmt (gsi);
9803 /* Do not consider statements writing to memory or having
9804 volatile operand. */
9805 if (gimple_vdef (stmt1)
9806 || gimple_has_volatile_ops (stmt1))
9807 break;
9808 gsi_from = gsi;
9809 gsi_prev (&gsi);
9810 lhs = gimple_get_lhs (stmt1);
9811 if (!lhs)
9812 break;
9814 /* LHS of vectorized stmt must be SSA_NAME. */
9815 if (TREE_CODE (lhs) != SSA_NAME)
9816 break;
9818 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9820 /* Remove dead scalar statement. */
9821 if (has_zero_uses (lhs))
9823 gsi_remove (&gsi_from, true);
9824 continue;
9828 /* Check that LHS does not have uses outside of STORE_BB. */
9829 res = true;
9830 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9832 gimple *use_stmt;
9833 use_stmt = USE_STMT (use_p);
9834 if (is_gimple_debug (use_stmt))
9835 continue;
9836 if (gimple_bb (use_stmt) != store_bb)
9838 res = false;
9839 break;
9842 if (!res)
9843 break;
9845 if (gimple_vuse (stmt1)
9846 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9847 break;
9849 /* Can move STMT1 to STORE_BB. */
9850 if (dump_enabled_p ())
9851 dump_printf_loc (MSG_NOTE, vect_location,
9852 "Move stmt to created bb\n%G", stmt1);
9853 gsi_move_before (&gsi_from, &gsi_to);
9854 /* Shift GSI_TO for further insertion. */
9855 gsi_prev (&gsi_to);
9857 /* Put other masked stores with the same mask to STORE_BB. */
9858 if (worklist.is_empty ()
9859 || gimple_call_arg (worklist.last (), 2) != mask
9860 || worklist.last () != stmt1)
9861 break;
9862 last = worklist.pop ();
9864 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9868 /* Decide whether it is possible to use a zero-based induction variable
9869 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9870 the value that the induction variable must be able to hold in order
9871 to ensure that the rgroups eventually have no active vector elements.
9872 Return -1 otherwise. */
9874 widest_int
9875 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9877 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9878 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9879 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9881 /* Calculate the value that the induction variable must be able
9882 to hit in order to ensure that we end the loop with an all-false mask.
9883 This involves adding the maximum number of inactive trailing scalar
9884 iterations. */
9885 widest_int iv_limit = -1;
9886 if (max_loop_iterations (loop, &iv_limit))
9888 if (niters_skip)
9890 /* Add the maximum number of skipped iterations to the
9891 maximum iteration count. */
9892 if (TREE_CODE (niters_skip) == INTEGER_CST)
9893 iv_limit += wi::to_widest (niters_skip);
9894 else
9895 iv_limit += max_vf - 1;
9897 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9898 /* Make a conservatively-correct assumption. */
9899 iv_limit += max_vf - 1;
9901 /* IV_LIMIT is the maximum number of latch iterations, which is also
9902 the maximum in-range IV value. Round this value down to the previous
9903 vector alignment boundary and then add an extra full iteration. */
9904 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9905 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9907 return iv_limit;
9910 /* For the given rgroup_controls RGC, check whether an induction variable
9911 would ever hit a value that produces a set of all-false masks or zero
9912 lengths before wrapping around. Return true if it's possible to wrap
9913 around before hitting the desirable value, otherwise return false. */
9915 bool
9916 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9918 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9920 if (iv_limit == -1)
9921 return true;
9923 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9924 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9925 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9927 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9928 return true;
9930 return false;