gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
57 /* Loop Vectorization Pass.
59 This pass tries to vectorize loops.
61 For example, the vectorizer transforms the following simple loop:
63 short a[N]; short b[N]; short c[N]; int i;
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
   }
69 as if it had been manually vectorized by rewriting the source code into:
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
   }
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
117 For example, say stmt S1 was vectorized into stmt VS1:
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
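/* Illustrative sketch only (not part of the pass), showing the style of
   optab query described above.  V8HImode is simply the mode from the
   example; a real check uses the mode of the vectype chosen for the
   statement being analyzed.  */

static inline bool
example_v8hi_add_supported_p (void)
{
  return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
}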
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
168 gimple *stmt = stmt_info->stmt;
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
186 if (stmt_vectype)
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case in which a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
202 return opt_result::success ();
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
250 return opt_result::success ();
253 /* Function vect_determine_vectorization_factor
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
    }
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
    }  */
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293 for (i = 0; i < nbbs; i++)
295 basic_block bb = bbs[i];
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
306 gcc_assert (stmt_info);
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
331 if (dump_enabled_p ())
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
338 vect_update_max_nunits (&vectorization_factor, vectype);
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
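/* Conceptual sketch of how the vectorization factor is accumulated above,
   using plain integers instead of poly_uint64 (an assumption for clarity):
   vect_update_max_nunits keeps a common multiple of all the lane counts,
   which for the usual power-of-two counts is simply the largest one.  */

static unsigned
example_accumulate_vf (const unsigned *nunits, unsigned n)
{
  unsigned vf = 1;
  for (unsigned i = 0; i < n; i++)
    {
      unsigned a = vf, b = nunits[i];
      while (b)			/* Euclid: a becomes gcd (vf, nunits[i]).  */
	{
	  unsigned t = a % b;
	  a = b;
	  b = t;
	}
      vf = vf / a * nunits[i];	/* lcm (vf, nunits[i]).  */
    }
  return vf;
}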
372 /* Function vect_is_simple_iv_evolution.
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
403 *init = init_expr;
404 *step = step_expr;
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
422 return true;
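/* Illustrative source-level example (not used by the pass) of an IV that
   the test above accepts: the scalar evolution of I is the affine chrec
   {INIT, +, 4}, i.e. the step is a loop-invariant INTEGER_CST.  */

static int
example_simple_iv (int init, int n)
{
  int sum = 0;
  for (int i = init; i < n; i += 4)
    sum += i;
  return sum;
}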
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
436 x_3 = ...;
439 outer2:
440 x_4 = PHI <x_3(inner)>;
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
458 /* Function vect_analyze_scalar_cycles_1.
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified, therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
516 worklist.safe_push (stmt_vinfo);
517 continue;
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 else
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 else
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as a reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
594 /* Function vect_analyze_scalar_cycles.
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also to its
600 inner-loop, if it exists.
601 Examples for scalar cycles:
603 Example1: reduction:
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
609 Example2: induction:
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
657 while (stmt_info);
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 stmt_vec_info first;
666 unsigned i;
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 /* If not all stmts in the chain are patterns, or if we failed
680 to update STMT_VINFO_REDUC_IDX, try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
692 /* Function vect_get_loop_niters.
694 Determine how many iterations the loop executes and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
699 Return the loop exit condition. */
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
716 if (!exit)
717 return cond;
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
731 if (may_be_zero)
733 if (COMPARISON_CLASS_P (may_be_zero))
735 /* Try to combine may_be_zero with assumptions, this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
748 may_be_zero = NULL_TREE;
750 else if (integer_nonzerop (may_be_zero))
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
756 else
757 return cond;
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
772 return cond;
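/* Illustrative only: the overflow caveat noted above.  With UINT_MAX
   latch executions, "latch executions + 1" wraps to zero in a 32-bit
   counter type, so the header-execution count cannot be represented.  */

static unsigned
example_niter_wraparound (void)
{
  unsigned latch_executions = UINT_MAX;
  return latch_executions + 1;	/* Yields 0.  */
}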
775 /* Function bb_in_loop_p
777 Used as predicate for dfs order traversal of the loop bbs. */
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 rgroup_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_use_partial_vectors_p (true),
818 using_partial_vectors_p (false),
819 epil_using_partial_vectors_p (false),
820 peeling_for_gaps (false),
821 peeling_for_niter (false),
822 no_data_dependencies (false),
823 has_mask_store (false),
824 scalar_loop_scaling (profile_probability::uninitialized ()),
825 scalar_loop (NULL),
826 orig_loop_info (NULL)
828 /* CHECKME: We want to visit all BBs before their successors (except for
829 latch blocks, for which this assertion wouldn't hold). In the simple
830 case of the loop forms we allow, a dfs order of the BBs would be the same
831 as reversed postorder traversal, so we are safe. */
833 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
834 bbs, loop->num_nodes, loop);
835 gcc_assert (nbbs == loop->num_nodes);
837 for (unsigned int i = 0; i < nbbs; i++)
839 basic_block bb = bbs[i];
840 gimple_stmt_iterator si;
842 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
844 gimple *phi = gsi_stmt (si);
845 gimple_set_uid (phi, 0);
846 add_stmt (phi);
849 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
851 gimple *stmt = gsi_stmt (si);
852 gimple_set_uid (stmt, 0);
853 if (is_gimple_debug (stmt))
854 continue;
855 add_stmt (stmt);
856 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
857 third argument is the #pragma omp simd if (x) condition: when it is 0,
858 the loop shouldn't be vectorized; when it is a non-zero constant, it
859 should be vectorized normally; otherwise the loop is versioned, with the
860 vectorized copy taken if the condition is non-zero at runtime. */
861 if (loop_in->simduid
862 && is_gimple_call (stmt)
863 && gimple_call_internal_p (stmt)
864 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
865 && gimple_call_num_args (stmt) >= 3
866 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
867 && (loop_in->simduid
868 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
870 tree arg = gimple_call_arg (stmt, 2);
871 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
872 simd_if_cond = arg;
873 else
874 gcc_assert (integer_nonzerop (arg));
879 epilogue_vinfos.create (6);
882 /* Free all levels of rgroup CONTROLS. */
884 void
885 release_vec_loop_controls (vec<rgroup_controls> *controls)
887 rgroup_controls *rgc;
888 unsigned int i;
889 FOR_EACH_VEC_ELT (*controls, i, rgc)
890 rgc->controls.release ();
891 controls->release ();
894 /* Free all memory used by the _loop_vec_info, as well as all the
895 stmt_vec_info structs of all the stmts in the loop. */
897 _loop_vec_info::~_loop_vec_info ()
899 free (bbs);
901 release_vec_loop_controls (&masks);
902 release_vec_loop_controls (&lens);
903 delete ivexpr_map;
904 delete scan_map;
905 epilogue_vinfos.release ();
907 loop->aux = NULL;
910 /* Return an invariant or register for EXPR and emit necessary
911 computations in the LOOP_VINFO loop preheader. */
913 tree
914 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
916 if (is_gimple_reg (expr)
917 || is_gimple_min_invariant (expr))
918 return expr;
920 if (! loop_vinfo->ivexpr_map)
921 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
922 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
923 if (! cached)
925 gimple_seq stmts = NULL;
926 cached = force_gimple_operand (unshare_expr (expr),
927 &stmts, true, NULL_TREE);
928 if (stmts)
930 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
931 gsi_insert_seq_on_edge_immediate (e, stmts);
934 return cached;
937 /* Return true if we can use CMP_TYPE as the comparison type to produce
938 all masks required to mask LOOP_VINFO. */
940 static bool
941 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
943 rgroup_controls *rgm;
944 unsigned int i;
945 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
946 if (rgm->type != NULL_TREE
947 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
948 cmp_type, rgm->type,
949 OPTIMIZE_FOR_SPEED))
950 return false;
951 return true;
954 /* Calculate the maximum number of scalars per iteration for every
955 rgroup in LOOP_VINFO. */
957 static unsigned int
958 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
960 unsigned int res = 1;
961 unsigned int i;
962 rgroup_controls *rgm;
963 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
964 res = MAX (res, rgm->max_nscalars_per_iter);
965 return res;
968 /* Calculate the minimum precision necessary to represent:
970 MAX_NITERS * FACTOR
972 as an unsigned integer, where MAX_NITERS is the maximum number of
973 loop header iterations for the original scalar form of LOOP_VINFO. */
975 static unsigned
976 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
978 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
990 /* Work out how many bits we need to represent the limit. */
991 return wi::min_precision (max_ni * factor, UNSIGNED);
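/* Conceptual sketch with 64-bit arithmetic (the real code uses widest_int):
   the minimum number of bits needed to represent LIMIT as an unsigned
   integer, i.e. what the wi::min_precision call above computes.  */

static unsigned
example_min_unsigned_precision (unsigned long long limit)
{
  unsigned prec = 0;
  while (limit)
    {
      prec++;
      limit >>= 1;
    }
  return prec;
}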
994 /* Each statement in LOOP_VINFO can be masked where necessary. Check
995 whether we can actually generate the masks required. Return true if so,
996 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
998 static bool
999 vect_verify_full_masking (loop_vec_info loop_vinfo)
1001 unsigned int min_ni_width;
1002 unsigned int max_nscalars_per_iter
1003 = vect_get_max_nscalars_per_iter (loop_vinfo);
1005 /* Use a normal loop if there are no statements that need masking.
1006 This only happens in rare degenerate cases: it means that the loop
1007 has no loads, no stores, and no live-out values. */
1008 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1009 return false;
1011 /* Work out how many bits we need to represent the limit. */
1012 min_ni_width
1013 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1015 /* Find a scalar mode for which WHILE_ULT is supported. */
1016 opt_scalar_int_mode cmp_mode_iter;
1017 tree cmp_type = NULL_TREE;
1018 tree iv_type = NULL_TREE;
1019 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1020 unsigned int iv_precision = UINT_MAX;
1022 if (iv_limit != -1)
1023 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1024 UNSIGNED);
1026 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1028 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1029 if (cmp_bits >= min_ni_width
1030 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1032 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1033 if (this_type
1034 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1036 /* Although we could stop as soon as we find a valid mode,
1037 there are at least two reasons why that's not always the
1038 best choice:
1040 - An IV that's Pmode or wider is more likely to be reusable
1041 in address calculations than an IV that's narrower than
1042 Pmode.
1044 - Doing the comparison in IV_PRECISION or wider allows
1045 a natural 0-based IV, whereas using a narrower comparison
1046 type requires mitigations against wrap-around.
1048 Conversely, if the IV limit is variable, doing the comparison
1049 in a wider type than the original type can introduce
1050 unnecessary extensions, so picking the widest valid mode
1051 is not always a good choice either.
1053 Here we prefer the first IV type that's Pmode or wider,
1054 and the first comparison type that's IV_PRECISION or wider.
1055 (The comparison type must be no wider than the IV type,
1056 to avoid extensions in the vector loop.)
1058 ??? We might want to try continuing beyond Pmode for ILP32
1059 targets if CMP_BITS < IV_PRECISION. */
1060 iv_type = this_type;
1061 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1069 if (!cmp_type)
1070 return false;
1072 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1074 return true;
1077 /* Check whether we can use vector accesses with length, based on a precision
1078 comparison. So far, to keep it simple, we only allow the case in which the
1079 precision of the target-supported length is larger than the precision
1080 required by the loop niters. */
1082 static bool
1083 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1085 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1086 return false;
1088 unsigned int max_nitems_per_iter = 1;
1089 unsigned int i;
1090 rgroup_controls *rgl;
1091 /* Find the maximum number of items per iteration for every rgroup. */
1092 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1094 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1095 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1098 /* Work out how many bits we need to represent the length limit. */
1099 unsigned int min_ni_prec
1100 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1102 /* Now use the maximum of the precisions below for one suitable IV type:
1103 - the IV's natural precision
1104 - the precision needed to hold: the maximum number of scalar
1105 iterations multiplied by the scale factor (min_ni_prec above)
1106 - the Pmode precision
1108 If min_ni_prec is less than the precision of the current niters,
1109 we prefer to still use the niters type. Prefer a Pmode or
1110 wider IV to avoid narrow conversions. */
1112 unsigned int ni_prec
1113 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1114 min_ni_prec = MAX (min_ni_prec, ni_prec);
1115 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1117 tree iv_type = NULL_TREE;
1118 opt_scalar_int_mode tmode_iter;
1119 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1121 scalar_mode tmode = tmode_iter.require ();
1122 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1124 /* ??? Do we really want to construct one IV whose precision exceeds
1125 BITS_PER_WORD? */
1126 if (tbits > BITS_PER_WORD)
1127 break;
1129 /* Find the first available standard integral type. */
1130 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1132 iv_type = build_nonstandard_integer_type (tbits, true);
1133 break;
1137 if (!iv_type)
1139 if (dump_enabled_p ())
1140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1141 "can't vectorize with length-based partial vectors"
1142 " because there is no suitable iv type.\n");
1143 return false;
1146 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1147 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1149 return true;
1152 /* Calculate the cost of one scalar iteration of the loop. */
1153 static void
1154 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1156 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1157 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1158 int nbbs = loop->num_nodes, factor;
1159 int innerloop_iters, i;
1161 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1163 /* Gather costs for statements in the scalar loop. */
1165 /* FORNOW. */
1166 innerloop_iters = 1;
1167 if (loop->inner)
1168 innerloop_iters = 50; /* FIXME */
1170 for (i = 0; i < nbbs; i++)
1172 gimple_stmt_iterator si;
1173 basic_block bb = bbs[i];
1175 if (bb->loop_father == loop->inner)
1176 factor = innerloop_iters;
1177 else
1178 factor = 1;
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1182 gimple *stmt = gsi_stmt (si);
1183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1185 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1186 continue;
1188 /* Skip stmts that are not vectorized inside the loop. */
1189 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1190 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1191 && (!STMT_VINFO_LIVE_P (vstmt_info)
1192 || !VECTORIZABLE_CYCLE_DEF
1193 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1194 continue;
1196 vect_cost_for_stmt kind;
1197 if (STMT_VINFO_DATA_REF (stmt_info))
1199 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1200 kind = scalar_load;
1201 else
1202 kind = scalar_store;
1204 else if (vect_nop_conversion_p (stmt_info))
1205 continue;
1206 else
1207 kind = scalar_stmt;
1209 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1210 factor, kind, stmt_info, 0, vect_prologue);
1214 /* Now accumulate cost. */
1215 void *target_cost_data = init_cost (loop);
1216 stmt_info_for_cost *si;
1217 int j;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1219 j, si)
1220 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1221 si->kind, si->stmt_info, si->vectype,
1222 si->misalign, vect_body);
1223 unsigned dummy, body_cost = 0;
1224 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1225 destroy_cost_data (target_cost_data);
1226 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
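/* Conceptual sketch (illustrative names, not the target cost hooks): the
   single scalar iteration cost gathered above is a weighted sum in which
   each statement's cost is scaled by FACTOR, i.e. by the assumed trip
   count of the inner loop for statements inside it, and by 1 otherwise.  */

static unsigned
example_scalar_iteration_cost (const unsigned *stmt_cost,
			       const unsigned *factor, unsigned nstmts)
{
  unsigned cost = 0;
  for (unsigned i = 0; i < nstmts; i++)
    cost += factor[i] * stmt_cost[i];
  return cost;
}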
1230 /* Function vect_analyze_loop_form_1.
1232 Verify that certain CFG restrictions hold, including:
1233 - the loop has a pre-header
1234 - the loop has a single entry and exit
1235 - the loop exit condition is simple enough
1236 - the number of iterations can be analyzed, i.e., a countable loop. The
1237 niter could be analyzed under some assumptions. */
1239 opt_result
1240 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1241 tree *assumptions, tree *number_of_iterationsm1,
1242 tree *number_of_iterations, gcond **inner_loop_cond)
1244 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1246 /* Different restrictions apply when we are considering an inner-most loop,
1247 vs. an outer (nested) loop.
1248 (FORNOW. May want to relax some of these restrictions in the future). */
1250 if (!loop->inner)
1252 /* Inner-most loop. We currently require that the number of BBs is
1253 exactly 2 (the header and latch). Vectorizable inner-most loops
1254 look like this:
1256 (pre-header)
1258 header <--------+
1259 | | |
1260 | +--> latch --+
1262 (exit-bb) */
1264 if (loop->num_nodes != 2)
1265 return opt_result::failure_at (vect_location,
1266 "not vectorized:"
1267 " control flow in loop.\n");
1269 if (empty_block_p (loop->header))
1270 return opt_result::failure_at (vect_location,
1271 "not vectorized: empty loop.\n");
1273 else
1275 class loop *innerloop = loop->inner;
1276 edge entryedge;
1278 /* Nested loop. We currently require that the loop is doubly-nested,
1279 contains a single inner loop, and the number of BBs is exactly 5.
1280 Vectorizable outer-loops look like this:
1282 (pre-header)
1284 header <---+
1286 inner-loop |
1288 tail ------+
1290 (exit-bb)
1292 The inner-loop has the properties expected of inner-most loops
1293 as described above. */
1295 if ((loop->inner)->inner || (loop->inner)->next)
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized:"
1298 " multiple nested loops.\n");
1300 if (loop->num_nodes != 5)
1301 return opt_result::failure_at (vect_location,
1302 "not vectorized:"
1303 " control flow in loop.\n");
1305 entryedge = loop_preheader_edge (innerloop);
1306 if (entryedge->src != loop->header
1307 || !single_exit (innerloop)
1308 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1309 return opt_result::failure_at (vect_location,
1310 "not vectorized:"
1311 " unsupported outerloop form.\n");
1313 /* Analyze the inner-loop. */
1314 tree inner_niterm1, inner_niter, inner_assumptions;
1315 opt_result res
1316 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1317 &inner_assumptions, &inner_niterm1,
1318 &inner_niter, NULL);
1319 if (!res)
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: Bad inner loop.\n");
1324 return res;
1327 /* Don't support analyzing niter under assumptions for inner
1328 loop. */
1329 if (!integer_onep (inner_assumptions))
1330 return opt_result::failure_at (vect_location,
1331 "not vectorized: Bad inner loop.\n");
1333 if (!expr_invariant_in_loop_p (loop, inner_niter))
1334 return opt_result::failure_at (vect_location,
1335 "not vectorized: inner-loop count not"
1336 " invariant.\n");
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_NOTE, vect_location,
1340 "Considering outer-loop vectorization.\n");
1343 if (!single_exit (loop))
1344 return opt_result::failure_at (vect_location,
1345 "not vectorized: multiple exits.\n");
1346 if (EDGE_COUNT (loop->header->preds) != 2)
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized:"
1349 " too many incoming edges.\n");
1351 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1352 that the loop is represented as a do-while (with a proper if-guard
1353 before the loop if needed), where the loop header contains all the
1354 executable statements, and the latch is empty. */
1355 if (!empty_block_p (loop->latch)
1356 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1357 return opt_result::failure_at (vect_location,
1358 "not vectorized: latch block not empty.\n");
1360 /* Make sure the exit is not abnormal. */
1361 edge e = single_exit (loop);
1362 if (e->flags & EDGE_ABNORMAL)
1363 return opt_result::failure_at (vect_location,
1364 "not vectorized:"
1365 " abnormal loop exit edge.\n");
1367 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1368 number_of_iterationsm1);
1369 if (!*loop_cond)
1370 return opt_result::failure_at
1371 (vect_location,
1372 "not vectorized: complicated exit condition.\n");
1374 if (integer_zerop (*assumptions)
1375 || !*number_of_iterations
1376 || chrec_contains_undetermined (*number_of_iterations))
1377 return opt_result::failure_at
1378 (*loop_cond,
1379 "not vectorized: number of iterations cannot be computed.\n");
1381 if (integer_zerop (*number_of_iterations))
1382 return opt_result::failure_at
1383 (*loop_cond,
1384 "not vectorized: number of iterations = 0.\n");
1386 return opt_result::success ();
1389 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1391 opt_loop_vec_info
1392 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1394 tree assumptions, number_of_iterations, number_of_iterationsm1;
1395 gcond *loop_cond, *inner_loop_cond = NULL;
1397 opt_result res
1398 = vect_analyze_loop_form_1 (loop, &loop_cond,
1399 &assumptions, &number_of_iterationsm1,
1400 &number_of_iterations, &inner_loop_cond);
1401 if (!res)
1402 return opt_loop_vec_info::propagate_failure (res);
1404 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1405 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1406 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1407 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1408 if (!integer_onep (assumptions))
1410 /* We consider vectorizing this loop by versioning it under
1411 some assumptions. In order to do this, we need to clear
1412 existing information computed by scev and niter analyzer. */
1413 scev_reset_htab ();
1414 free_numbers_of_iterations_estimates (loop);
1415 /* Also set flag for this loop so that following scev and niter
1416 analysis are done under the assumptions. */
1417 loop_constraint_set (loop, LOOP_C_FINITE);
1418 /* Also record the assumptions for versioning. */
1419 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1422 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1424 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Symbolic number of iterations is ");
1428 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1429 dump_printf (MSG_NOTE, "\n");
1433 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1434 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1435 if (inner_loop_cond)
1437 stmt_vec_info inner_loop_cond_info
1438 = loop_vinfo->lookup_stmt (inner_loop_cond);
1439 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1442 gcc_assert (!loop->aux);
1443 loop->aux = loop_vinfo;
1444 return opt_loop_vec_info::success (loop_vinfo);
1449 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1450 statements, update the vectorization factor. */
1452 static void
1453 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1455 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1456 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1457 int nbbs = loop->num_nodes;
1458 poly_uint64 vectorization_factor;
1459 int i;
1461 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1463 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1464 gcc_assert (known_ne (vectorization_factor, 0U));
1466 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1467 the vectorization factor of the loop is the unrolling factor required by
1468 the SLP instances. If that unrolling factor is 1, we say that we
1469 perform pure SLP on the loop - cross-iteration parallelism is not
1470 exploited. */
1471 bool only_slp_in_loop = true;
1472 for (i = 0; i < nbbs; i++)
1474 basic_block bb = bbs[i];
1475 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1476 gsi_next (&si))
1478 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1479 if (!stmt_info)
1480 continue;
1481 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1482 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1483 && !PURE_SLP_STMT (stmt_info))
1484 /* STMT needs both SLP and loop-based vectorization. */
1485 only_slp_in_loop = false;
1487 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1488 gsi_next (&si))
1490 if (is_gimple_debug (gsi_stmt (si)))
1491 continue;
1492 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1493 stmt_info = vect_stmt_to_vectorize (stmt_info);
1494 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1495 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1496 && !PURE_SLP_STMT (stmt_info))
1497 /* STMT needs both SLP and loop-based vectorization. */
1498 only_slp_in_loop = false;
1502 if (only_slp_in_loop)
1504 if (dump_enabled_p ())
1505 dump_printf_loc (MSG_NOTE, vect_location,
1506 "Loop contains only SLP stmts\n");
1507 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1509 else
1511 if (dump_enabled_p ())
1512 dump_printf_loc (MSG_NOTE, vect_location,
1513 "Loop contains SLP and non-SLP stmts\n");
1514 /* Both the vectorization factor and unroll factor have the form
1515 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1516 so they must have a common multiple. */
1517 vectorization_factor
1518 = force_common_multiple (vectorization_factor,
1519 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1522 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1523 if (dump_enabled_p ())
1525 dump_printf_loc (MSG_NOTE, vect_location,
1526 "Updating vectorization factor to ");
1527 dump_dec (MSG_NOTE, vectorization_factor);
1528 dump_printf (MSG_NOTE, ".\n");
1532 /* Return true if STMT_INFO describes a double reduction phi and if
1533 the other phi in the reduction is also relevant for vectorization.
1534 This rejects cases such as:
1536 outer1:
1537 x_1 = PHI <x_3(outer2), ...>;
1540 inner:
1541 x_2 = ...;
1544 outer2:
1545 x_3 = PHI <x_2(inner)>;
1547 if nothing in x_2 or elsewhere makes x_1 relevant. */
1549 static bool
1550 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1552 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1553 return false;
1555 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1558 /* Function vect_analyze_loop_operations.
1560 Scan the loop stmts and make sure they are all vectorizable. */
1562 static opt_result
1563 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1565 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567 int nbbs = loop->num_nodes;
1568 int i;
1569 stmt_vec_info stmt_info;
1570 bool need_to_vectorize = false;
1571 bool ok;
1573 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1575 auto_vec<stmt_info_for_cost> cost_vec;
1577 for (i = 0; i < nbbs; i++)
1579 basic_block bb = bbs[i];
1581 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1582 gsi_next (&si))
1584 gphi *phi = si.phi ();
1585 ok = true;
1587 stmt_info = loop_vinfo->lookup_stmt (phi);
1588 if (dump_enabled_p ())
1589 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1590 if (virtual_operand_p (gimple_phi_result (phi)))
1591 continue;
1593 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1594 (i.e., a phi in the tail of the outer-loop). */
1595 if (! is_loop_header_bb_p (bb))
1597 /* FORNOW: we currently don't support the case that these phis
1598 are not used in the outerloop (unless it is double reduction,
1599 i.e., this phi is vect_reduction_def), because this case
1600 requires us to actually do something here. */
1601 if (STMT_VINFO_LIVE_P (stmt_info)
1602 && !vect_active_double_reduction_p (stmt_info))
1603 return opt_result::failure_at (phi,
1604 "Unsupported loop-closed phi"
1605 " in outer-loop.\n");
1607 /* If PHI is used in the outer loop, we check that its operand
1608 is defined in the inner loop. */
1609 if (STMT_VINFO_RELEVANT_P (stmt_info))
1611 tree phi_op;
1613 if (gimple_phi_num_args (phi) != 1)
1614 return opt_result::failure_at (phi, "unsupported phi");
1616 phi_op = PHI_ARG_DEF (phi, 0);
1617 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1618 if (!op_def_info)
1619 return opt_result::failure_at (phi, "unsupported phi\n");
1621 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1622 && (STMT_VINFO_RELEVANT (op_def_info)
1623 != vect_used_in_outer_by_reduction))
1624 return opt_result::failure_at (phi, "unsupported phi\n");
1626 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1627 || (STMT_VINFO_DEF_TYPE (stmt_info)
1628 == vect_double_reduction_def))
1629 && !vectorizable_lc_phi (loop_vinfo,
1630 stmt_info, NULL, NULL))
1631 return opt_result::failure_at (phi, "unsupported phi\n");
1634 continue;
1637 gcc_assert (stmt_info);
1639 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1640 || STMT_VINFO_LIVE_P (stmt_info))
1641 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1642 /* A scalar-dependence cycle that we don't support. */
1643 return opt_result::failure_at (phi,
1644 "not vectorized:"
1645 " scalar dependence cycle.\n");
1647 if (STMT_VINFO_RELEVANT_P (stmt_info))
1649 need_to_vectorize = true;
1650 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1651 && ! PURE_SLP_STMT (stmt_info))
1652 ok = vectorizable_induction (loop_vinfo,
1653 stmt_info, NULL, NULL,
1654 &cost_vec);
1655 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1656 || (STMT_VINFO_DEF_TYPE (stmt_info)
1657 == vect_double_reduction_def)
1658 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1659 && ! PURE_SLP_STMT (stmt_info))
1660 ok = vectorizable_reduction (loop_vinfo,
1661 stmt_info, NULL, NULL, &cost_vec);
1664 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1665 if (ok
1666 && STMT_VINFO_LIVE_P (stmt_info)
1667 && !PURE_SLP_STMT (stmt_info))
1668 ok = vectorizable_live_operation (loop_vinfo,
1669 stmt_info, NULL, NULL, NULL,
1670 -1, false, &cost_vec);
1672 if (!ok)
1673 return opt_result::failure_at (phi,
1674 "not vectorized: relevant phi not "
1675 "supported: %G",
1676 static_cast <gimple *> (phi));
1679 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1680 gsi_next (&si))
1682 gimple *stmt = gsi_stmt (si);
1683 if (!gimple_clobber_p (stmt)
1684 && !is_gimple_debug (stmt))
1686 opt_result res
1687 = vect_analyze_stmt (loop_vinfo,
1688 loop_vinfo->lookup_stmt (stmt),
1689 &need_to_vectorize,
1690 NULL, NULL, &cost_vec);
1691 if (!res)
1692 return res;
1695 } /* bbs */
1697 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1699 /* All operations in the loop are either irrelevant (they deal with loop
1700 control, or are dead), or are only used outside the loop and can be moved
1701 out of the loop (e.g. invariants, inductions). The loop can be
1702 optimized away by scalar optimizations. We're better off not
1703 touching this loop. */
1704 if (!need_to_vectorize)
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "All the computation can be taken out of the loop.\n");
1709 return opt_result::failure_at
1710 (vect_location,
1711 "not vectorized: redundant loop. no profit to vectorize.\n");
1714 return opt_result::success ();
1717 /* Return true if we know that the iteration count is smaller than the
1718 vectorization factor. Return false if it isn't, or if we can't be sure
1719 either way. */
1721 static bool
1722 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1724 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1726 HOST_WIDE_INT max_niter;
1727 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1728 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1729 else
1730 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1732 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1733 return true;
1735 return false;
1738 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1739 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1740 definitely no, or -1 if it's worth retrying. */
1742 static int
1743 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1745 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1746 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1748 /* Only loops that can handle partially-populated vectors can have iteration
1749 counts less than the vectorization factor. */
1750 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1752 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1754 if (dump_enabled_p ())
1755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1756 "not vectorized: iteration count smaller than "
1757 "vectorization factor.\n");
1758 return 0;
1762 int min_profitable_iters, min_profitable_estimate;
1763 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1764 &min_profitable_estimate);
1766 if (min_profitable_iters < 0)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1770 "not vectorized: vectorization not profitable.\n");
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "not vectorized: vector version will never be "
1774 "profitable.\n");
1775 return -1;
1778 int min_scalar_loop_bound = (param_min_vect_loop_bound
1779 * assumed_vf);
1781 /* Use the cost model only if it is more conservative than the user-specified
1782 threshold. */
1783 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1784 min_profitable_iters);
1786 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1788 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1789 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1791 if (dump_enabled_p ())
1792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1793 "not vectorized: vectorization not profitable.\n");
1794 if (dump_enabled_p ())
1795 dump_printf_loc (MSG_NOTE, vect_location,
1796 "not vectorized: iteration count smaller than user "
1797 "specified loop bound parameter or minimum profitable "
1798 "iterations (whichever is more conservative).\n");
1799 return 0;
1802 /* The static profitability threshold min_profitable_estimate includes
1803 the cost of having to check at runtime whether the scalar loop
1804 should be used instead. If it turns out that we don't need or want
1805 such a check, the threshold we should use for the static estimate
1806 is simply the point at which the vector loop becomes more profitable
1807 than the scalar loop. */
1808 if (min_profitable_estimate > min_profitable_iters
1809 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1810 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1811 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1812 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1814 if (dump_enabled_p ())
1815 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1816 " choice between the scalar and vector loops\n");
1817 min_profitable_estimate = min_profitable_iters;
1820 HOST_WIDE_INT estimated_niter;
1822 /* If we are vectorizing an epilogue then we know the maximum number of
1823 scalar iterations it will cover is at least one lower than the
1824 vectorization factor of the main loop. */
1825 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1826 estimated_niter
1827 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1828 else
1830 estimated_niter = estimated_stmt_executions_int (loop);
1831 if (estimated_niter == -1)
1832 estimated_niter = likely_max_stmt_executions_int (loop);
1834 if (estimated_niter != -1
1835 && ((unsigned HOST_WIDE_INT) estimated_niter
1836 < MAX (th, (unsigned) min_profitable_estimate)))
1838 if (dump_enabled_p ())
1839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1840 "not vectorized: estimated iteration count too "
1841 "small.\n");
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_NOTE, vect_location,
1844 "not vectorized: estimated iteration count smaller "
1845 "than specified loop bound parameter or minimum "
1846 "profitable iterations (whichever is more "
1847 "conservative).\n");
1848 return -1;
1851 return 1;
1854 static opt_result
1855 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1856 vec<data_reference_p> *datarefs,
1857 unsigned int *n_stmts)
1859 *n_stmts = 0;
1860 for (unsigned i = 0; i < loop->num_nodes; i++)
1861 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1862 !gsi_end_p (gsi); gsi_next (&gsi))
1864 gimple *stmt = gsi_stmt (gsi);
1865 if (is_gimple_debug (stmt))
1866 continue;
1867 ++(*n_stmts);
1868 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1869 NULL, 0);
1870 if (!res)
1872 if (is_gimple_call (stmt) && loop->safelen)
1874 tree fndecl = gimple_call_fndecl (stmt), op;
1875 if (fndecl != NULL_TREE)
1877 cgraph_node *node = cgraph_node::get (fndecl);
1878 if (node != NULL && node->simd_clones != NULL)
1880 unsigned int j, n = gimple_call_num_args (stmt);
1881 for (j = 0; j < n; j++)
1883 op = gimple_call_arg (stmt, j);
1884 if (DECL_P (op)
1885 || (REFERENCE_CLASS_P (op)
1886 && get_base_address (op)))
1887 break;
1889 op = gimple_call_lhs (stmt);
1890 /* Ignore #pragma omp declare simd functions
1891 if they don't have data references in the
1892 call stmt itself. */
1893 if (j == n
1894 && !(op
1895 && (DECL_P (op)
1896 || (REFERENCE_CLASS_P (op)
1897 && get_base_address (op)))))
1898 continue;
1902 return res;
1904 /* If dependence analysis will give up due to the limit on the
1905 number of datarefs stop here and fail fatally. */
1906 if (datarefs->length ()
1907 > (unsigned)param_loop_max_datarefs_for_datadeps)
1908 return opt_result::failure_at (stmt, "exceeded param "
1909 "loop-max-datarefs-for-datadeps\n");
1911 return opt_result::success ();
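/* The simd-clone special case above can be illustrated as follows
   (sketch; foo and the arrays are hypothetical):

     #pragma omp declare simd
     int foo (int x);

     #pragma omp simd
     for (int i = 0; i < n; i++)
       a[i] = foo (b[i]);

   In GIMPLE the call is fed by and feeds plain SSA names
   (_1 = b[i]; _2 = foo (_1); a[i] = _2;), so the call statement itself
   contains no data reference and is skipped when gathering datarefs,
   relying on loop->safelen set by the simd pragma.  */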
1914 /* Look for SLP-only access groups and turn each individual access into its own
1915 group. */
1916 static void
1917 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1919 unsigned int i;
1920 struct data_reference *dr;
1922 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1924 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1925 FOR_EACH_VEC_ELT (datarefs, i, dr)
1927 gcc_assert (DR_REF (dr));
1928 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1930 /* Check if the access is part of an interleaving chain. */
1931 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1933 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1934 unsigned int group_size = DR_GROUP_SIZE (first_element);
1936 /* Check whether this is an SLP-only group. */
1937 if (!STMT_SLP_TYPE (stmt_info)
1938 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1940 /* Dissolve the group. */
1941 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1943 stmt_vec_info vinfo = first_element;
1944 while (vinfo)
1946 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1947 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1948 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1949 DR_GROUP_SIZE (vinfo) = 1;
1950 if (STMT_VINFO_STRIDED_P (first_element))
1951 DR_GROUP_GAP (vinfo) = 0;
1952 else
1953 DR_GROUP_GAP (vinfo) = group_size - 1;
1954 vinfo = next;
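/* For example (hypothetical access pattern), a two-element load group
   over a[2*i] and a[2*i+1] that was created only for SLP is dissolved
   here when its statements end up not being SLP-vectorized: each access
   becomes its own group with DR_GROUP_SIZE 1 and, in the non-strided
   case, DR_GROUP_GAP 1, so the remaining element of the original group
   is stepped over.  */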
1962 /* Decides whether we need to create an epilogue loop to handle
1963 remaining scalar iterations and sets PEELING_FOR_NITER accordingly. */
1965 void
1966 determine_peel_for_niter (loop_vec_info loop_vinfo)
1968 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1970 unsigned HOST_WIDE_INT const_vf;
1971 HOST_WIDE_INT max_niter
1972 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1974 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1975 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1976 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1977 (loop_vinfo));
1979 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1980 /* The main loop handles all iterations. */
1981 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1982 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1983 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1985 /* Work out the (constant) number of iterations that need to be
1986 peeled for reasons other than niters. */
1987 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1988 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1989 peel_niter += 1;
1990 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1991 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1992 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1994 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1995 /* ??? When peeling for gaps but not alignment, we could
1996 try to check whether the (variable) niters is known to be
1997 VF * N + 1. That's something of a niche case though. */
1998 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1999 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2000 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2001 < (unsigned) exact_log2 (const_vf))
2002 /* In case of versioning, check if the maximum number of
2003 iterations is greater than th. If they are identical,
2004 the epilogue is unnecessary. */
2005 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2006 || ((unsigned HOST_WIDE_INT) max_niter
2007 > (th / const_vf) * const_vf))))
2008 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
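/* A worked example for the constant-niters case above (the numbers are
   hypothetical): with LOOP_VINFO_INT_NITERS 100, a vectorization factor
   of 8, 3 iterations peeled for alignment and no peeling for gaps,
   100 - 3 = 97 is not a multiple of 8, so PEELING_FOR_NITER is set and
   an epilogue loop handles the remaining 97 % 8 = 1 iteration.  */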
2012 /* Function vect_analyze_loop_2.
2014 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2015 for it. The different analyses will record information in the
2016 loop_vec_info struct. */
2017 static opt_result
2018 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2020 opt_result ok = opt_result::success ();
2021 int res;
2022 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2023 poly_uint64 min_vf = 2;
2024 loop_vec_info orig_loop_vinfo = NULL;
2026 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2027 loop_vec_info of the first vectorized loop. */
2028 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2029 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2030 else
2031 orig_loop_vinfo = loop_vinfo;
2032 gcc_assert (orig_loop_vinfo);
2034 /* The first group of checks is independent of the vector size. */
2035 fatal = true;
2037 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2038 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2039 return opt_result::failure_at (vect_location,
2040 "not vectorized: simd if(0)\n");
2042 /* Find all data references in the loop (which correspond to vdefs/vuses)
2043 and analyze their evolution in the loop. */
2045 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2047 /* Gather the data references and count stmts in the loop. */
2048 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2050 opt_result res
2051 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2052 &LOOP_VINFO_DATAREFS (loop_vinfo),
2053 n_stmts);
2054 if (!res)
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2058 "not vectorized: loop contains function "
2059 "calls or data references that cannot "
2060 "be analyzed\n");
2061 return res;
2063 loop_vinfo->shared->save_datarefs ();
2065 else
2066 loop_vinfo->shared->check_datarefs ();
2068 /* Analyze the data references and also adjust the minimal
2069 vectorization factor according to the loads and stores. */
2071 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2072 if (!ok)
2074 if (dump_enabled_p ())
2075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076 "bad data references.\n");
2077 return ok;
2080 /* Classify all cross-iteration scalar data-flow cycles.
2081 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2082 vect_analyze_scalar_cycles (loop_vinfo);
2084 vect_pattern_recog (loop_vinfo);
2086 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2088 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2089 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2091 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2092 if (!ok)
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 "bad data access.\n");
2097 return ok;
2100 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2102 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2103 if (!ok)
2105 if (dump_enabled_p ())
2106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107 "unexpected pattern.\n");
2108 return ok;
2111 /* The rest of the analysis below depends on the vector size in some way. */
2112 fatal = false;
2114 /* Analyze data dependences between the data-refs in the loop
2115 and adjust the maximum vectorization factor according to
2116 the dependences.
2117 FORNOW: fail at the first data dependence that we encounter. */
2119 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2120 if (!ok)
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124 "bad data dependence.\n");
2125 return ok;
2127 if (max_vf != MAX_VECTORIZATION_FACTOR
2128 && maybe_lt (max_vf, min_vf))
2129 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2130 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2132 ok = vect_determine_vectorization_factor (loop_vinfo);
2133 if (!ok)
2135 if (dump_enabled_p ())
2136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137 "can't determine vectorization factor.\n");
2138 return ok;
2140 if (max_vf != MAX_VECTORIZATION_FACTOR
2141 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2142 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2144 /* Compute the scalar iteration cost. */
2145 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2147 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2149 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2150 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2151 if (!ok)
2152 return ok;
2154 /* If there are any SLP instances mark them as pure_slp. */
2155 bool slp = vect_make_slp_decision (loop_vinfo);
2156 if (slp)
2158 /* Find stmts that need to be both vectorized and SLPed. */
2159 vect_detect_hybrid_slp (loop_vinfo);
2161 /* Update the vectorization factor based on the SLP decision. */
2162 vect_update_vf_for_slp (loop_vinfo);
2164 /* Optimize the SLP graph with the vectorization factor fixed. */
2165 vect_optimize_slp (loop_vinfo);
2168 bool saved_can_use_partial_vectors_p
2169 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2171 /* We don't expect to have to roll back to anything other than an empty
2172 set of rgroups. */
2173 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2175 /* This is the point where we can re-start analysis with SLP forced off. */
2176 start_over:
2178 /* Now the vectorization factor is final. */
2179 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2180 gcc_assert (known_ne (vectorization_factor, 0U));
2182 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2184 dump_printf_loc (MSG_NOTE, vect_location,
2185 "vectorization_factor = ");
2186 dump_dec (MSG_NOTE, vectorization_factor);
2187 dump_printf (MSG_NOTE, ", niters = %wd\n",
2188 LOOP_VINFO_INT_NITERS (loop_vinfo));
2191 /* Analyze the alignment of the data-refs in the loop.
2192 Fail if a data reference is found that cannot be vectorized. */
2194 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2195 if (!ok)
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2199 "bad data alignment.\n");
2200 return ok;
2203 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2204 It is important to call pruning after vect_analyze_data_ref_accesses,
2205 since we use grouping information gathered by interleaving analysis. */
2206 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2207 if (!ok)
2208 return ok;
2210 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2211 vectorization, since we do not want to add extra peeling or
2212 add versioning for alignment. */
2213 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2214 /* This pass will decide on using loop versioning and/or loop peeling in
2215 order to enhance the alignment of data references in the loop. */
2216 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2217 if (!ok)
2218 return ok;
2220 if (slp)
2222 /* Analyze operations in the SLP instances. Note this may
2223 remove unsupported SLP instances which makes the above
2224 SLP kind detection invalid. */
2225 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2226 vect_slp_analyze_operations (loop_vinfo);
2227 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2229 ok = opt_result::failure_at (vect_location,
2230 "unsupported SLP instances\n");
2231 goto again;
2235 /* Dissolve SLP-only groups. */
2236 vect_dissolve_slp_only_groups (loop_vinfo);
2238 /* Scan all the remaining operations in the loop that are not subject
2239 to SLP and make sure they are vectorizable. */
2240 ok = vect_analyze_loop_operations (loop_vinfo);
2241 if (!ok)
2243 if (dump_enabled_p ())
2244 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2245 "bad operation or unsupported loop bound.\n");
2246 return ok;
2249 /* For now we don't expect to mix both the masking and the length approach
2250 for one loop; disable partial vectors if both are recorded. */
2251 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2252 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2253 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "can't vectorize a loop with partial vectors"
2258 " because we don't expect to mix different"
2259 " approaches with partial vectors for the"
2260 " same loop.\n");
2261 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2264 /* Decide whether to vectorize a loop with partial vectors for
2265 this vectorization factor. */
2266 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2268 if (param_vect_partial_vector_usage == 0)
2269 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2270 else if (vect_verify_full_masking (loop_vinfo)
2271 || vect_verify_loop_lens (loop_vinfo))
2273 /* The epilogue, and other cases where the iteration count is known to
2274 be less than VF, can still fully use vector accesses with length. */
2275 if (param_vect_partial_vector_usage == 1
2276 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2277 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2279 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2280 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2282 else
2283 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2285 else
2286 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2288 else
2289 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2291 if (dump_enabled_p ())
2293 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "operating on partial vectors.\n");
2296 else
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "operating only on full vectors.\n");
2301 /* If an epilogue loop is required because of data accesses with gaps,
2302 one additional iteration needs to be peeled. Check if there are
2303 enough iterations for vectorization. */
2304 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2305 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2306 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2308 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2309 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2311 if (known_lt (wi::to_widest (scalar_niters), vf))
2312 return opt_result::failure_at (vect_location,
2313 "loop has no enough iterations to"
2314 " support peeling for gaps.\n");
2317 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2318 to be able to handle fewer than VF scalars, or needs to have a lower VF
2319 than the main loop. */
2320 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2321 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2322 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2323 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2324 return opt_result::failure_at (vect_location,
2325 "Vectorization factor too high for"
2326 " epilogue loop.\n");
2328 /* Check the costings of the loop make vectorizing worthwhile. */
2329 res = vect_analyze_loop_costing (loop_vinfo);
2330 if (res < 0)
2332 ok = opt_result::failure_at (vect_location,
2333 "Loop costings may not be worthwhile.\n");
2334 goto again;
2336 if (!res)
2337 return opt_result::failure_at (vect_location,
2338 "Loop costings not worthwhile.\n");
2340 determine_peel_for_niter (loop_vinfo);
2341 /* If an epilogue loop is required make sure we can create one. */
2342 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2343 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2347 if (!vect_can_advance_ivs_p (loop_vinfo)
2348 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2349 single_exit (LOOP_VINFO_LOOP
2350 (loop_vinfo))))
2352 ok = opt_result::failure_at (vect_location,
2353 "not vectorized: can't create required "
2354 "epilog loop\n");
2355 goto again;
2359 /* During peeling, we need to check if number of loop iterations is
2360 enough for both peeled prolog loop and vector loop. This check
2361 can be merged along with threshold check of loop versioning, so
2362 increase threshold for this case if necessary.
2364 If we are analyzing an epilogue we still want to check what its
2365 versioning threshold would be. If we decide to vectorize the epilogues we
2366 will want to use the lowest versioning threshold of all epilogues and main
2367 loop. This will enable us to enter a vectorized epilogue even when
2368 versioning the loop. We can't simply check whether the epilogue requires
2369 versioning though since we may have skipped some versioning checks when
2370 analyzing the epilogue. For instance, checks for alias versioning will be
2371 skipped when dealing with epilogues as we assume we already checked them
2372 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2373 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2375 poly_uint64 niters_th = 0;
2376 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2378 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2380 /* Niters for peeled prolog loop. */
2381 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2383 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2384 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2385 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2387 else
2388 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2391 /* Niters for at least one iteration of vectorized loop. */
2392 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2393 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2394 /* One additional iteration because of peeling for gap. */
2395 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2396 niters_th += 1;
2398 /* Use the same condition as vect_transform_loop to decide when to use
2399 the cost to determine a versioning threshold. */
2400 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2401 && ordered_p (th, niters_th))
2402 niters_th = ordered_max (poly_uint64 (th), niters_th);
2404 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2407 gcc_assert (known_eq (vectorization_factor,
2408 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2410 /* Ok to vectorize! */
2411 return opt_result::success ();
2413 again:
2414 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2415 gcc_assert (!ok);
2417 /* Try again with SLP forced off but if we didn't do any SLP there is
2418 no point in re-trying. */
2419 if (!slp)
2420 return ok;
2422 /* If there are reduction chains re-trying will fail anyway. */
2423 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2424 return ok;
2426 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2427 via interleaving or lane instructions. */
2428 slp_instance instance;
2429 slp_tree node;
2430 unsigned i, j;
2431 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2433 stmt_vec_info vinfo;
2434 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2435 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2436 continue;
2437 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2438 unsigned int size = DR_GROUP_SIZE (vinfo);
2439 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2440 if (! vect_store_lanes_supported (vectype, size, false)
2441 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2442 && ! vect_grouped_store_supported (vectype, size))
2443 return opt_result::failure_at (vinfo->stmt,
2444 "unsupported grouped store\n");
2445 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2447 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2448 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2449 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2450 size = DR_GROUP_SIZE (vinfo);
2451 vectype = STMT_VINFO_VECTYPE (vinfo);
2452 if (! vect_load_lanes_supported (vectype, size, false)
2453 && ! vect_grouped_load_supported (vectype, single_element_p,
2454 size))
2455 return opt_result::failure_at (vinfo->stmt,
2456 "unsupported grouped load\n");
2460 if (dump_enabled_p ())
2461 dump_printf_loc (MSG_NOTE, vect_location,
2462 "re-trying with SLP disabled\n");
2464 /* Roll back state appropriately. No SLP this time. */
2465 slp = false;
2466 /* Restore vectorization factor as it were without SLP. */
2467 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2468 /* Free the SLP instances. */
2469 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2470 vect_free_slp_instance (instance, false);
2471 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2472 /* Reset SLP type to loop_vect on all stmts. */
2473 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2475 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2476 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2477 !gsi_end_p (si); gsi_next (&si))
2479 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2480 STMT_SLP_TYPE (stmt_info) = loop_vect;
2481 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2482 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2484 /* vectorizable_reduction adjusts reduction stmt def-types,
2485 restore them to that of the PHI. */
2486 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2487 = STMT_VINFO_DEF_TYPE (stmt_info);
2488 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2489 (STMT_VINFO_REDUC_DEF (stmt_info)))
2490 = STMT_VINFO_DEF_TYPE (stmt_info);
2493 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2494 !gsi_end_p (si); gsi_next (&si))
2496 if (is_gimple_debug (gsi_stmt (si)))
2497 continue;
2498 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2499 STMT_SLP_TYPE (stmt_info) = loop_vect;
2500 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2502 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2503 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2504 STMT_SLP_TYPE (stmt_info) = loop_vect;
2505 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2506 !gsi_end_p (pi); gsi_next (&pi))
2507 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2508 = loop_vect;
2512 /* Free optimized alias test DDRS. */
2513 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2514 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2515 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2516 /* Reset target cost data. */
2517 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2518 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2519 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2520 /* Reset accumulated rgroup information. */
2521 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2522 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2523 /* Reset assorted flags. */
2524 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2525 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2526 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2527 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2528 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2529 = saved_can_use_partial_vectors_p;
2531 goto start_over;
2534 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2535 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2536 OLD_LOOP_VINFO is better unless something specifically indicates
2537 otherwise.
2539 Note that this deliberately isn't a partial order. */
2541 static bool
2542 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2543 loop_vec_info old_loop_vinfo)
2545 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2546 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2548 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2549 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2551 /* Always prefer a VF of loop->simdlen over any other VF. */
2552 if (loop->simdlen)
2554 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2555 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2556 if (new_simdlen_p != old_simdlen_p)
2557 return new_simdlen_p;
2560 /* Limit the VFs to what is likely to be the maximum number of iterations,
2561 to handle cases in which at least one loop_vinfo is fully-masked. */
2562 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2563 if (estimated_max_niter != -1)
2565 if (known_le (estimated_max_niter, new_vf))
2566 new_vf = estimated_max_niter;
2567 if (known_le (estimated_max_niter, old_vf))
2568 old_vf = estimated_max_niter;
2571 /* Check whether the (fractional) cost per scalar iteration is lower
2572 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2573 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2574 * poly_widest_int (old_vf));
2575 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2576 * poly_widest_int (new_vf));
2577 if (maybe_lt (rel_old, rel_new))
2579 /* When old_loop_vinfo uses a variable vectorization factor,
2580 we know that it has a lower cost for at least one runtime VF.
2581 However, we don't know how likely that VF is.
2583 One option would be to compare the costs for the estimated VFs.
2584 The problem is that that can put too much pressure on the cost
2585 model. E.g. if the estimated VF is also the lowest possible VF,
2586 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2587 for the estimated VF, we'd then choose new_loop_vinfo even
2588 though (a) new_loop_vinfo might not actually be better than
2589 old_loop_vinfo for that VF and (b) it would be significantly
2590 worse at larger VFs.
2592 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2593 no more expensive than old_loop_vinfo even after doubling the
2594 estimated old_loop_vinfo VF. For all but trivial loops, this
2595 ensures that we only pick new_loop_vinfo if it is significantly
2596 better than old_loop_vinfo at the estimated VF. */
2597 if (rel_new.is_constant ())
2598 return false;
2600 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2601 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2602 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2603 * widest_int (old_estimated_vf));
2604 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2605 * widest_int (new_estimated_vf));
2606 return estimated_rel_new * 2 <= estimated_rel_old;
2608 if (known_lt (rel_new, rel_old))
2609 return true;
2611 /* If there's nothing to choose between the loop bodies, see whether
2612 there's a difference in the prologue and epilogue costs. */
2613 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2614 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2616 return false;
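/* A numeric sketch of the cross-multiplied cost comparison above
   (hypothetical costs): with new_inside_cost 20 at new_vf 4 and
   old_inside_cost 36 at old_vf 8, rel_new = 20 * 8 = 160 and
   rel_old = 36 * 4 = 144, so old_loop_vinfo has the lower cost per
   scalar iteration (36/8 = 4.5 vs. 20/4 = 5) and is kept.  */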
2619 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2620 true if we should. */
2622 static bool
2623 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2624 loop_vec_info old_loop_vinfo)
2626 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2627 return false;
2629 if (dump_enabled_p ())
2630 dump_printf_loc (MSG_NOTE, vect_location,
2631 "***** Preferring vector mode %s to vector mode %s\n",
2632 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2633 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2634 return true;
2637 /* Function vect_analyze_loop.
2639 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2640 for it. The different analyses will record information in the
2641 loop_vec_info struct. */
2642 opt_loop_vec_info
2643 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2645 auto_vector_modes vector_modes;
2647 /* Autodetect first vector size we try. */
2648 unsigned int autovec_flags
2649 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2650 loop->simdlen != 0);
2651 unsigned int mode_i = 0;
2653 DUMP_VECT_SCOPE ("analyze_loop_nest");
2655 if (loop_outer (loop)
2656 && loop_vec_info_for_loop (loop_outer (loop))
2657 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2658 return opt_loop_vec_info::failure_at (vect_location,
2659 "outer-loop already vectorized.\n");
2661 if (!find_loop_nest (loop, &shared->loop_nest))
2662 return opt_loop_vec_info::failure_at
2663 (vect_location,
2664 "not vectorized: loop nest containing two or more consecutive inner"
2665 " loops cannot be vectorized\n");
2667 unsigned n_stmts = 0;
2668 machine_mode autodetected_vector_mode = VOIDmode;
2669 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2670 machine_mode next_vector_mode = VOIDmode;
2671 poly_uint64 lowest_th = 0;
2672 unsigned vectorized_loops = 0;
2673 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2674 && !unlimited_cost_model (loop));
2676 bool vect_epilogues = false;
2677 opt_result res = opt_result::success ();
2678 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2679 while (1)
2681 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2682 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2683 if (!loop_vinfo)
2685 if (dump_enabled_p ())
2686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2687 "bad loop form.\n");
2688 gcc_checking_assert (first_loop_vinfo == NULL);
2689 return loop_vinfo;
2691 loop_vinfo->vector_mode = next_vector_mode;
2693 bool fatal = false;
2695 /* When pick_lowest_cost_p is true, we should in principle iterate
2696 over all the loop_vec_infos that LOOP_VINFO could replace and
2697 try to vectorize LOOP_VINFO under the same conditions.
2698 E.g. when trying to replace an epilogue loop, we should vectorize
2699 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2700 to replace the main loop, we should vectorize LOOP_VINFO as a main
2701 loop too.
2703 However, autovectorize_vector_modes is usually sorted as follows:
2705 - Modes that naturally produce lower VFs usually follow modes that
2706 naturally produce higher VFs.
2708 - When modes naturally produce the same VF, maskable modes
2709 usually follow unmaskable ones, so that the maskable mode
2710 can be used to vectorize the epilogue of the unmaskable mode.
2712 This order is preferred because it leads to the maximum
2713 epilogue vectorization opportunities. Targets should only use
2714 a different order if they want to make wide modes available while
2715 disparaging them relative to earlier, smaller modes. The assumption
2716 in that case is that the wider modes are more expensive in some
2717 way that isn't reflected directly in the costs.
2719 There should therefore be few interesting cases in which
2720 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2721 treated as a standalone loop, and ends up being genuinely cheaper
2722 than FIRST_LOOP_VINFO. */
2723 if (vect_epilogues)
2724 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2726 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2727 if (mode_i == 0)
2728 autodetected_vector_mode = loop_vinfo->vector_mode;
2729 if (dump_enabled_p ())
2731 if (res)
2732 dump_printf_loc (MSG_NOTE, vect_location,
2733 "***** Analysis succeeded with vector mode %s\n",
2734 GET_MODE_NAME (loop_vinfo->vector_mode));
2735 else
2736 dump_printf_loc (MSG_NOTE, vect_location,
2737 "***** Analysis failed with vector mode %s\n",
2738 GET_MODE_NAME (loop_vinfo->vector_mode));
2741 loop->aux = NULL;
2743 if (!fatal)
2744 while (mode_i < vector_modes.length ()
2745 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2747 if (dump_enabled_p ())
2748 dump_printf_loc (MSG_NOTE, vect_location,
2749 "***** The result for vector mode %s would"
2750 " be the same\n",
2751 GET_MODE_NAME (vector_modes[mode_i]));
2752 mode_i += 1;
2755 if (res)
2757 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2758 vectorized_loops++;
2760 /* Once we hit the desired simdlen for the first time,
2761 discard any previous attempts. */
2762 if (simdlen
2763 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2765 delete first_loop_vinfo;
2766 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2767 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2768 simdlen = 0;
2770 else if (pick_lowest_cost_p && first_loop_vinfo)
2772 /* Keep trying to roll back vectorization attempts while the
2773 loop_vec_infos they produced were worse than this one. */
2774 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2775 while (!vinfos.is_empty ()
2776 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2778 gcc_assert (vect_epilogues);
2779 delete vinfos.pop ();
2781 if (vinfos.is_empty ()
2782 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2784 delete first_loop_vinfo;
2785 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2786 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2790 if (first_loop_vinfo == NULL)
2792 first_loop_vinfo = loop_vinfo;
2793 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2795 else if (vect_epilogues
2796 /* For now only allow one epilogue loop. */
2797 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2799 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2800 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2801 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2802 || maybe_ne (lowest_th, 0U));
2803 /* Keep track of the known smallest versioning
2804 threshold. */
2805 if (ordered_p (lowest_th, th))
2806 lowest_th = ordered_min (lowest_th, th);
2808 else
2810 delete loop_vinfo;
2811 loop_vinfo = opt_loop_vec_info::success (NULL);
2814 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2815 enabled, SIMDUID is not set, it is the innermost loop and we have
2816 either already found the loop's SIMDLEN or there was no SIMDLEN to
2817 begin with.
2818 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2819 vect_epilogues = (!simdlen
2820 && loop->inner == NULL
2821 && param_vect_epilogues_nomask
2822 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2823 && !loop->simduid
2824 /* For now only allow one epilogue loop, but allow
2825 pick_lowest_cost_p to replace it. */
2826 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2827 || pick_lowest_cost_p));
2829 /* Commit to first_loop_vinfo if we have no reason to try
2830 alternatives. */
2831 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2832 break;
2834 else
2836 delete loop_vinfo;
2837 loop_vinfo = opt_loop_vec_info::success (NULL);
2838 if (fatal)
2840 gcc_checking_assert (first_loop_vinfo == NULL);
2841 break;
2845 /* Handle the case where the original loop can use partial
2846 vectorization, but we only want to adopt it for the epilogue.
2847 The retry should be in the same mode as the original. */
2848 if (vect_epilogues
2849 && loop_vinfo
2850 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2852 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2853 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2854 if (dump_enabled_p ())
2855 dump_printf_loc (MSG_NOTE, vect_location,
2856 "***** Re-trying analysis with same vector mode"
2857 " %s for epilogue with partial vectors.\n",
2858 GET_MODE_NAME (loop_vinfo->vector_mode));
2859 continue;
2862 if (mode_i < vector_modes.length ()
2863 && VECTOR_MODE_P (autodetected_vector_mode)
2864 && (related_vector_mode (vector_modes[mode_i],
2865 GET_MODE_INNER (autodetected_vector_mode))
2866 == autodetected_vector_mode)
2867 && (related_vector_mode (autodetected_vector_mode,
2868 GET_MODE_INNER (vector_modes[mode_i]))
2869 == vector_modes[mode_i]))
2871 if (dump_enabled_p ())
2872 dump_printf_loc (MSG_NOTE, vect_location,
2873 "***** Skipping vector mode %s, which would"
2874 " repeat the analysis for %s\n",
2875 GET_MODE_NAME (vector_modes[mode_i]),
2876 GET_MODE_NAME (autodetected_vector_mode));
2877 mode_i += 1;
2880 if (mode_i == vector_modes.length ()
2881 || autodetected_vector_mode == VOIDmode)
2882 break;
2884 /* Try the next biggest vector size. */
2885 next_vector_mode = vector_modes[mode_i++];
2886 if (dump_enabled_p ())
2887 dump_printf_loc (MSG_NOTE, vect_location,
2888 "***** Re-trying analysis with vector mode %s\n",
2889 GET_MODE_NAME (next_vector_mode));
2892 if (first_loop_vinfo)
2894 loop->aux = (loop_vec_info) first_loop_vinfo;
2895 if (dump_enabled_p ())
2896 dump_printf_loc (MSG_NOTE, vect_location,
2897 "***** Choosing vector mode %s\n",
2898 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2899 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2900 return first_loop_vinfo;
2903 return opt_loop_vec_info::propagate_failure (res);
2906 /* Return true if there is an in-order reduction function for CODE, storing
2907 it in *REDUC_FN if so. */
2909 static bool
2910 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2912 switch (code)
2914 case PLUS_EXPR:
2915 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2916 return true;
2918 default:
2919 return false;
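/* For example (sketch; s and a are hypothetical), a floating-point
   accumulation

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must preserve the original
   summation order, so it can only be vectorized as an in-order
   (fold-left) reduction, using IFN_FOLD_LEFT_PLUS where the target
   provides it.  */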
2923 /* Function reduction_fn_for_scalar_code
2925 Input:
2926 CODE - tree_code of a reduction operation.
2928 Output:
2929 REDUC_FN - the corresponding internal function to be used to reduce the
2930 vector of partial results into a single scalar result, or IFN_LAST
2931 if the operation is a supported reduction operation, but does not have
2932 such an internal function.
2934 Return FALSE if CODE currently cannot be vectorized as reduction. */
2936 static bool
2937 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2939 switch (code)
2941 case MAX_EXPR:
2942 *reduc_fn = IFN_REDUC_MAX;
2943 return true;
2945 case MIN_EXPR:
2946 *reduc_fn = IFN_REDUC_MIN;
2947 return true;
2949 case PLUS_EXPR:
2950 *reduc_fn = IFN_REDUC_PLUS;
2951 return true;
2953 case BIT_AND_EXPR:
2954 *reduc_fn = IFN_REDUC_AND;
2955 return true;
2957 case BIT_IOR_EXPR:
2958 *reduc_fn = IFN_REDUC_IOR;
2959 return true;
2961 case BIT_XOR_EXPR:
2962 *reduc_fn = IFN_REDUC_XOR;
2963 return true;
2965 case MULT_EXPR:
2966 case MINUS_EXPR:
2967 *reduc_fn = IFN_LAST;
2968 return true;
2970 default:
2971 return false;
2975 /* If there is a neutral value X such that SLP reduction NODE would not
2976 be affected by the introduction of additional X elements, return that X,
2977 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2978 is the vector type that would hold element X. REDUC_CHAIN is true if
2979 the SLP statements perform a single reduction, false if each statement
2980 performs an independent reduction. */
2982 static tree
2983 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2984 tree_code code, bool reduc_chain)
2986 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2987 stmt_vec_info stmt_vinfo = stmts[0];
2988 tree scalar_type = TREE_TYPE (vector_type);
2989 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2990 gcc_assert (loop);
2992 switch (code)
2994 case WIDEN_SUM_EXPR:
2995 case DOT_PROD_EXPR:
2996 case SAD_EXPR:
2997 case PLUS_EXPR:
2998 case MINUS_EXPR:
2999 case BIT_IOR_EXPR:
3000 case BIT_XOR_EXPR:
3001 return build_zero_cst (scalar_type);
3003 case MULT_EXPR:
3004 return build_one_cst (scalar_type);
3006 case BIT_AND_EXPR:
3007 return build_all_ones_cst (scalar_type);
3009 case MAX_EXPR:
3010 case MIN_EXPR:
3011 /* For MIN/MAX the initial values are neutral. A reduction chain
3012 has only a single initial value, so that value is neutral for
3013 all statements. */
3014 if (reduc_chain)
3015 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3016 loop_preheader_edge (loop));
3017 return NULL_TREE;
3019 default:
3020 return NULL_TREE;
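/* For instance, padding a vector of partial results with extra 0
   elements does not change a PLUS_EXPR reduction, padding with 1 does
   not change a MULT_EXPR reduction, and padding with all-ones bits does
   not change a BIT_AND_EXPR reduction; MIN/MAX have no such universal
   neutral value, which is why they fall back to the initial value of a
   reduction chain or to NULL_TREE.  */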
3024 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3025 STMT is printed with a message MSG. */
3027 static void
3028 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3030 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3033 /* Return true if we need an in-order reduction for operation CODE
3034 on type TYPE, i.e. whether the reduction must preserve the order of
3035 the scalar computation. */
3037 bool
3038 needs_fold_left_reduction_p (tree type, tree_code code)
3040 /* CHECKME: check for !flag_finite_math_only too? */
3041 if (SCALAR_FLOAT_TYPE_P (type))
3042 switch (code)
3044 case MIN_EXPR:
3045 case MAX_EXPR:
3046 return false;
3048 default:
3049 return !flag_associative_math;
3052 if (INTEGRAL_TYPE_P (type))
3054 if (!operation_no_trapping_overflow (type, code))
3055 return true;
3056 return false;
3059 if (SAT_FIXED_POINT_TYPE_P (type))
3060 return true;
3062 return false;
3065 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3066 has a handled computation expression. Store the main reduction
3067 operation in *CODE. */
3069 static bool
3070 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3071 tree loop_arg, enum tree_code *code,
3072 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3074 auto_bitmap visited;
3075 tree lookfor = PHI_RESULT (phi);
3076 ssa_op_iter curri;
3077 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3078 while (USE_FROM_PTR (curr) != loop_arg)
3079 curr = op_iter_next_use (&curri);
3080 curri.i = curri.numops;
3083 path.safe_push (std::make_pair (curri, curr));
3084 tree use = USE_FROM_PTR (curr);
3085 if (use == lookfor)
3086 break;
3087 gimple *def = SSA_NAME_DEF_STMT (use);
3088 if (gimple_nop_p (def)
3089 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3091 pop:
3094 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3095 curri = x.first;
3096 curr = x.second;
3098 curr = op_iter_next_use (&curri);
3099 /* Skip already visited or non-SSA operands (from iterating
3100 over PHI args). */
3101 while (curr != NULL_USE_OPERAND_P
3102 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3103 || ! bitmap_set_bit (visited,
3104 SSA_NAME_VERSION
3105 (USE_FROM_PTR (curr)))));
3107 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3108 if (curr == NULL_USE_OPERAND_P)
3109 break;
3111 else
3113 if (gimple_code (def) == GIMPLE_PHI)
3114 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3115 else
3116 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3117 while (curr != NULL_USE_OPERAND_P
3118 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3119 || ! bitmap_set_bit (visited,
3120 SSA_NAME_VERSION
3121 (USE_FROM_PTR (curr)))))
3122 curr = op_iter_next_use (&curri);
3123 if (curr == NULL_USE_OPERAND_P)
3124 goto pop;
3127 while (1);
3128 if (dump_file && (dump_flags & TDF_DETAILS))
3130 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3131 unsigned i;
3132 std::pair<ssa_op_iter, use_operand_p> *x;
3133 FOR_EACH_VEC_ELT (path, i, x)
3134 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3135 dump_printf (MSG_NOTE, "\n");
3138 /* Check whether the reduction path detected is valid. */
3139 bool fail = path.length () == 0;
3140 bool neg = false;
3141 int sign = -1;
3142 *code = ERROR_MARK;
3143 for (unsigned i = 1; i < path.length (); ++i)
3145 gimple *use_stmt = USE_STMT (path[i].second);
3146 tree op = USE_FROM_PTR (path[i].second);
3147 if (! is_gimple_assign (use_stmt)
3148 /* The following makes sure we can compute the operand index
3149 easily, and it mostly disallows chaining via COND_EXPR condition
3150 operands. */
3151 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3152 && (gimple_num_ops (use_stmt) <= 2
3153 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3154 && (gimple_num_ops (use_stmt) <= 3
3155 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3157 fail = true;
3158 break;
3160 /* Check there's only a single stmt the op is used on inside
3161 of the loop. */
3162 imm_use_iterator imm_iter;
3163 gimple *op_use_stmt;
3164 unsigned cnt = 0;
3165 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3166 if (!is_gimple_debug (op_use_stmt)
3167 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3169 /* We want to allow x + x but not x < 1 ? x : 2. */
3170 if (is_gimple_assign (op_use_stmt)
3171 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3173 use_operand_p use_p;
3174 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3175 cnt++;
3177 else
3178 cnt++;
3180 if (cnt != 1)
3182 fail = true;
3183 break;
3185 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3186 if (use_code == MINUS_EXPR)
3188 use_code = PLUS_EXPR;
3189 /* Track whether we negate the reduction value each iteration. */
3190 if (gimple_assign_rhs2 (use_stmt) == op)
3191 neg = ! neg;
3193 if (CONVERT_EXPR_CODE_P (use_code)
3194 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3195 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3197 else if (*code == ERROR_MARK)
3199 *code = use_code;
3200 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3202 else if (use_code != *code)
3204 fail = true;
3205 break;
3207 else if ((use_code == MIN_EXPR
3208 || use_code == MAX_EXPR)
3209 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3211 fail = true;
3212 break;
3215 return ! fail && ! neg && *code != ERROR_MARK;
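/* For example (hypothetical GIMPLE), the cycle

     sum_1 = PHI <sum_0(preheader), sum_2(latch)>
     sum_2 = sum_1 - a[i];

   yields the single-statement path sum_1 -> sum_2; the MINUS_EXPR is
   treated as PLUS_EXPR because the reduction value is not the negated
   operand, so *code is set to PLUS_EXPR and the path is accepted.
   Writing sum_2 = a[i] - sum_1 instead negates the running value every
   iteration, which is tracked in neg above and rejected.  */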
3218 bool
3219 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3220 tree loop_arg, enum tree_code code)
3222 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3223 enum tree_code code_;
3224 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3225 && code_ == code);
3230 /* Function vect_is_simple_reduction
3232 (1) Detect a cross-iteration def-use cycle that represents a simple
3233 reduction computation. We look for the following pattern:
3235 loop_header:
3236 a1 = phi < a0, a2 >
3237 a3 = ...
3238 a2 = operation (a3, a1)
3242 a3 = ...
3243 loop_header:
3244 a1 = phi < a0, a2 >
3245 a2 = operation (a3, a1)
3247 such that:
3248 1. operation is commutative and associative and it is safe to
3249 change the order of the computation
3250 2. no uses for a2 in the loop (a2 is used out of the loop)
3251 3. no uses of a1 in the loop besides the reduction operation
3252 4. no uses of a1 outside the loop.
3254 Conditions 1,4 are tested here.
3255 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3257 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3258 nested cycles.
3260 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3261 reductions:
3263 a1 = phi < a0, a2 >
3264 inner loop (def of a3)
3265 a2 = phi < a3 >
3267 (4) Detect condition expressions, i.e.:
3268 for (int i = 0; i < N; i++)
3269 if (a[i] < val)
3270 ret_val = a[i];
3274 static stmt_vec_info
3275 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3276 bool *double_reduc, bool *reduc_chain_p)
3278 gphi *phi = as_a <gphi *> (phi_info->stmt);
3279 gimple *phi_use_stmt = NULL;
3280 imm_use_iterator imm_iter;
3281 use_operand_p use_p;
3283 *double_reduc = false;
3284 *reduc_chain_p = false;
3285 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3287 tree phi_name = PHI_RESULT (phi);
3288 /* ??? If there are no uses of the PHI result the inner loop reduction
3289 won't be detected as possibly double-reduction by vectorizable_reduction
3290 because that tries to walk the PHI arg from the preheader edge which
3291 can be constant. See PR60382. */
3292 if (has_zero_uses (phi_name))
3293 return NULL;
3294 class loop *loop = (gimple_bb (phi))->loop_father;
3295 unsigned nphi_def_loop_uses = 0;
3296 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3298 gimple *use_stmt = USE_STMT (use_p);
3299 if (is_gimple_debug (use_stmt))
3300 continue;
3302 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3304 if (dump_enabled_p ())
3305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3306 "intermediate value used outside loop.\n");
3308 return NULL;
3311 nphi_def_loop_uses++;
3312 phi_use_stmt = use_stmt;
3315 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3316 if (TREE_CODE (latch_def) != SSA_NAME)
3318 if (dump_enabled_p ())
3319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3320 "reduction: not ssa_name: %T\n", latch_def);
3321 return NULL;
3324 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3325 if (!def_stmt_info
3326 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3327 return NULL;
3329 bool nested_in_vect_loop
3330 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3331 unsigned nlatch_def_loop_uses = 0;
3332 auto_vec<gphi *, 3> lcphis;
3333 bool inner_loop_of_double_reduc = false;
3334 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3336 gimple *use_stmt = USE_STMT (use_p);
3337 if (is_gimple_debug (use_stmt))
3338 continue;
3339 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3340 nlatch_def_loop_uses++;
3341 else
3343 /* We can have more than one loop-closed PHI. */
3344 lcphis.safe_push (as_a <gphi *> (use_stmt));
3345 if (nested_in_vect_loop
3346 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3347 == vect_double_reduction_def))
3348 inner_loop_of_double_reduc = true;
3352 /* If we are vectorizing an inner reduction, we execute it in the
3353 original order only when we are not dealing with a double
3354 reduction. */
3355 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3357 if (dump_enabled_p ())
3358 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3359 "detected nested cycle: ");
3360 return def_stmt_info;
3363 /* If this isn't a nested cycle or if the nested cycle reduction value
3364 is used outside of the inner loop we cannot handle uses of the reduction
3365 value. */
3366 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3368 if (dump_enabled_p ())
3369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3370 "reduction used in loop.\n");
3371 return NULL;
3374 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3375 defined in the inner loop. */
3376 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3378 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3379 if (gimple_phi_num_args (def_stmt) != 1
3380 || TREE_CODE (op1) != SSA_NAME)
3382 if (dump_enabled_p ())
3383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3384 "unsupported phi node definition.\n");
3386 return NULL;
3389 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3390 if (gimple_bb (def1)
3391 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3392 && loop->inner
3393 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3394 && is_gimple_assign (def1)
3395 && is_a <gphi *> (phi_use_stmt)
3396 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3398 if (dump_enabled_p ())
3399 report_vect_op (MSG_NOTE, def_stmt,
3400 "detected double reduction: ");
3402 *double_reduc = true;
3403 return def_stmt_info;
3406 return NULL;
3409 /* Look for the expression computing latch_def from the loop PHI result. */
3410 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3411 enum tree_code code;
3412 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3413 path))
3415 STMT_VINFO_REDUC_CODE (phi_info) = code;
3416 if (code == COND_EXPR && !nested_in_vect_loop)
3417 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3419 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3420 reduction chain for which the additional restriction is that
3421 all operations in the chain are the same. */
3422 auto_vec<stmt_vec_info, 8> reduc_chain;
3423 unsigned i;
3424 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3425 for (i = path.length () - 1; i >= 1; --i)
3427 gimple *stmt = USE_STMT (path[i].second);
3428 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3429 STMT_VINFO_REDUC_IDX (stmt_info)
3430 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3431 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3432 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3433 && (i == 1 || i == path.length () - 1));
3434 if ((stmt_code != code && !leading_conversion)
3435 /* We can only handle the final value in epilogue
3436 generation for reduction chains. */
3437 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3438 is_slp_reduc = false;
3439 /* For reduction chains we support trailing/leading
3440 conversions. We do not store those in the actual chain. */
3441 if (leading_conversion)
3442 continue;
3443 reduc_chain.safe_push (stmt_info);
3445 if (is_slp_reduc && reduc_chain.length () > 1)
3447 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3449 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3450 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3452 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3453 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3455 /* Save the chain for further analysis in SLP detection. */
3456 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3457 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3459 *reduc_chain_p = true;
3460 if (dump_enabled_p ())
3461 dump_printf_loc (MSG_NOTE, vect_location,
3462 "reduction: detected reduction chain\n");
3464 else if (dump_enabled_p ())
3465 dump_printf_loc (MSG_NOTE, vect_location,
3466 "reduction: detected reduction\n");
3468 return def_stmt_info;
3471 if (dump_enabled_p ())
3472 dump_printf_loc (MSG_NOTE, vect_location,
3473 "reduction: unknown pattern\n");
3475 return NULL;
3478 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3479 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3480 or -1 if not known. */
3482 static int
3483 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3485 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3486 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3488 if (dump_enabled_p ())
3489 dump_printf_loc (MSG_NOTE, vect_location,
3490 "cost model: epilogue peel iters set to vf/2 "
3491 "because loop iterations are unknown .\n");
3492 return assumed_vf / 2;
3494 else
3496 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3497 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3498 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3499 /* If we need to peel for gaps but no epilogue peeling would otherwise
3500 be required, we have to peel VF iterations. */
3501 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3502 peel_iters_epilogue = assumed_vf;
3503 return peel_iters_epilogue;
3507 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3508 int
3509 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3510 int *peel_iters_epilogue,
3511 stmt_vector_for_cost *scalar_cost_vec,
3512 stmt_vector_for_cost *prologue_cost_vec,
3513 stmt_vector_for_cost *epilogue_cost_vec)
3515 int retval = 0;
3517 *peel_iters_epilogue
3518 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3520 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3522 /* If peeled iterations are known but the number of scalar loop
3523 iterations is unknown, count a taken branch per peeled loop. */
3524 if (peel_iters_prologue > 0)
3525 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3526 NULL, NULL_TREE, 0, vect_prologue);
3527 if (*peel_iters_epilogue > 0)
3528 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3529 NULL, NULL_TREE, 0, vect_epilogue);
3532 stmt_info_for_cost *si;
3533 int j;
3534 if (peel_iters_prologue)
3535 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3536 retval += record_stmt_cost (prologue_cost_vec,
3537 si->count * peel_iters_prologue,
3538 si->kind, si->stmt_info, si->misalign,
3539 vect_prologue);
3540 if (*peel_iters_epilogue)
3541 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3542 retval += record_stmt_cost (epilogue_cost_vec,
3543 si->count * *peel_iters_epilogue,
3544 si->kind, si->stmt_info, si->misalign,
3545 vect_epilogue);
3547 return retval;
3550 /* Function vect_estimate_min_profitable_iters
3552 Return the number of iterations required for the vector version of the
3553 loop to be profitable relative to the cost of the scalar version of the
3554 loop.
3556 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3557 of iterations for vectorization. A value of -1 means loop vectorization
3558 is not profitable. The returned value may be used for a dynamic
3559 profitability check.
3561 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3562 for static check against estimated number of iterations. */
3564 static void
3565 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3566 int *ret_min_profitable_niters,
3567 int *ret_min_profitable_estimate)
3569 int min_profitable_iters;
3570 int min_profitable_estimate;
3571 int peel_iters_prologue;
3572 int peel_iters_epilogue;
3573 unsigned vec_inside_cost = 0;
3574 int vec_outside_cost = 0;
3575 unsigned vec_prologue_cost = 0;
3576 unsigned vec_epilogue_cost = 0;
3577 int scalar_single_iter_cost = 0;
3578 int scalar_outside_cost = 0;
3579 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3580 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3581 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3583 /* Cost model disabled. */
3584 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3588 *ret_min_profitable_niters = 0;
3589 *ret_min_profitable_estimate = 0;
3590 return;
3593 /* Requires loop versioning tests to handle misalignment. */
3594 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3596 /* FIXME: Make cost depend on complexity of individual check. */
3597 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3598 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3599 NULL, NULL_TREE, 0, vect_prologue);
3600 if (dump_enabled_p ())
3601 dump_printf (MSG_NOTE,
3602 "cost model: Adding cost of checks for loop "
3603 "versioning to treat misalignment.\n");
3606 /* Requires loop versioning with alias checks. */
3607 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3609 /* FIXME: Make cost depend on complexity of individual check. */
3610 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3611 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3612 NULL, NULL_TREE, 0, vect_prologue);
3613 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3614 if (len)
3615 /* Count LEN - 1 ANDs and LEN comparisons. */
3616 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3617 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3618 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3619 if (len)
3621 /* Count LEN - 1 ANDs and LEN comparisons. */
3622 unsigned int nstmts = len * 2 - 1;
3623 /* +1 for each bias that needs adding. */
3624 for (unsigned int i = 0; i < len; ++i)
3625 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3626 nstmts += 1;
3627 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3628 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
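/* Illustrative count: three lower-bound checks, one of them on a signed
   quantity, cost 3 * 2 - 1 = 5 comparisons/ANDs plus one bias addition,
   i.e. nstmts = 6 scalar statements in the prologue. */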
3630 if (dump_enabled_p ())
3631 dump_printf (MSG_NOTE,
3632 "cost model: Adding cost of checks for loop "
3633 "versioning aliasing.\n");
3636 /* Requires loop versioning with niter checks. */
3637 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3639 /* FIXME: Make cost depend on complexity of individual check. */
3640 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3641 NULL, NULL_TREE, 0, vect_prologue);
3642 if (dump_enabled_p ())
3643 dump_printf (MSG_NOTE,
3644 "cost model: Adding cost of checks for loop "
3645 "versioning niters.\n");
3648 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3649 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3650 NULL, NULL_TREE, 0, vect_prologue);
3652 /* Count statements in scalar loop. Using this as scalar cost for a single
3653 iteration for now.
3655 TODO: Add outer loop support.
3657 TODO: Consider assigning different costs to different scalar
3658 statements. */
3660 scalar_single_iter_cost
3661 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3663 /* Add the cost of the peeled instructions in the prologue and epilogue
3664 loops. (For fully-masked loops there will be no peeling.)
3666 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3667 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3669 TODO: Build an expression that represents peel_iters for prologue and
3670 epilogue to be used in a run-time test. */
3672 bool prologue_need_br_taken_cost = false;
3673 bool prologue_need_br_not_taken_cost = false;
3675 /* Calculate peel_iters_prologue. */
3676 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3677 peel_iters_prologue = 0;
3678 else if (npeel < 0)
3680 peel_iters_prologue = assumed_vf / 2;
3681 if (dump_enabled_p ())
3682 dump_printf (MSG_NOTE, "cost model: "
3683 "prologue peel iters set to vf/2.\n");
3685 /* If peeled iterations are unknown, count a taken branch and a not taken
3686 branch per peeled loop. Even if scalar loop iterations are known,
3687 vector iterations are not known since peeled prologue iterations are
3688 not known. Hence guards remain the same. */
3689 prologue_need_br_taken_cost = true;
3690 prologue_need_br_not_taken_cost = true;
3692 else
3694 peel_iters_prologue = npeel;
3695 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3696 /* If peeled iterations are known but the number of scalar loop
3697 iterations is unknown, count a taken branch per peeled loop. */
3698 prologue_need_br_taken_cost = true;
3701 bool epilogue_need_br_taken_cost = false;
3702 bool epilogue_need_br_not_taken_cost = false;
3704 /* Calculate peel_iters_epilogue. */
3705 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3706 /* We need to peel exactly one iteration for gaps. */
3707 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3708 else if (npeel < 0)
3710 /* If the peeling for alignment is unknown, the loop bound of the
3711 main loop becomes unknown. */
3712 peel_iters_epilogue = assumed_vf / 2;
3713 if (dump_enabled_p ())
3714 dump_printf (MSG_NOTE, "cost model: "
3715 "epilogue peel iters set to vf/2 because "
3716 "peeling for alignment is unknown.\n");
3718 /* See the same reason above in peel_iters_prologue calculation. */
3719 epilogue_need_br_taken_cost = true;
3720 epilogue_need_br_not_taken_cost = true;
3722 else
3724 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3725 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3726 /* If peeled iterations are known but the number of scalar loop
3727 iterations is unknown, count a taken branch per peeled loop. */
3728 epilogue_need_br_taken_cost = true;
3731 stmt_info_for_cost *si;
3732 int j;
3733 /* Add costs associated with peel_iters_prologue. */
3734 if (peel_iters_prologue)
3735 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3737 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3738 si->count * peel_iters_prologue, si->kind,
3739 si->stmt_info, si->vectype, si->misalign,
3740 vect_prologue);
3743 /* Add costs associated with peel_iters_epilogue. */
3744 if (peel_iters_epilogue)
3745 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3747 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3748 si->count * peel_iters_epilogue, si->kind,
3749 si->stmt_info, si->vectype, si->misalign,
3750 vect_epilogue);
3753 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3755 if (prologue_need_br_taken_cost)
3756 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3757 NULL, NULL_TREE, 0, vect_prologue);
3759 if (prologue_need_br_not_taken_cost)
3760 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3761 cond_branch_not_taken, NULL, NULL_TREE, 0,
3762 vect_prologue);
3764 if (epilogue_need_br_taken_cost)
3765 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3766 NULL, NULL_TREE, 0, vect_epilogue);
3768 if (epilogue_need_br_not_taken_cost)
3769 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3770 cond_branch_not_taken, NULL, NULL_TREE, 0,
3771 vect_epilogue);
3773 /* Take care of special costs for rgroup controls of partial vectors. */
3774 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3776 /* Calculate how many masks we need to generate. */
3777 unsigned int num_masks = 0;
3778 rgroup_controls *rgm;
3779 unsigned int num_vectors_m1;
3780 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3781 if (rgm->type)
3782 num_masks += num_vectors_m1 + 1;
3783 gcc_assert (num_masks > 0);
3785 /* In the worst case, we need to generate each mask in the prologue
3786 and in the loop body. One of the loop body mask instructions
3787 replaces the comparison in the scalar loop, and since we don't
3788 count the scalar comparison against the scalar body, we shouldn't
3789 count that vector instruction against the vector body either.
3791 Sometimes we can use unpacks instead of generating prologue
3792 masks and sometimes the prologue mask will fold to a constant,
3793 so the actual prologue cost might be smaller. However, it's
3794 simpler and safer to use the worst-case cost; if this ends up
3795 being the tie-breaker between vectorizing or not, then it's
3796 probably better not to vectorize. */
3797 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3798 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3799 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3800 vector_stmt, NULL, NULL_TREE, 0, vect_body);
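/* Illustrative count: two mask rgroups needing one and two vectors
   respectively give num_masks = 1 + 2 = 3, i.e. three mask computations
   in the prologue and two in the loop body (one body mask replaces the
   scalar loop's comparison and is therefore not counted). */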
3802 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3804 /* Referring to the functions vect_set_loop_condition_partial_vectors
3805 and vect_set_loop_controls_directly, we need to generate each
3806 length in the prologue and in the loop body if required. Although
3807 there are some possible optimizations, we consider the worst case
3808 here. */
3810 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3811 bool need_iterate_p
3812 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3813 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3815 /* Calculate how many statements to be added. */
3816 unsigned int prologue_stmts = 0;
3817 unsigned int body_stmts = 0;
3819 rgroup_controls *rgc;
3820 unsigned int num_vectors_m1;
3821 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3822 if (rgc->type)
3824 /* May need one SHIFT for nitems_total computation. */
3825 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3826 if (nitems != 1 && !niters_known_p)
3827 prologue_stmts += 1;
3829 /* May need one MAX and one MINUS for wrap around. */
3830 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3831 prologue_stmts += 2;
3833 /* Need one MAX and one MINUS for each batch limit except for
3834 the first one. */
3835 prologue_stmts += num_vectors_m1 * 2;
3837 unsigned int num_vectors = num_vectors_m1 + 1;
3839 /* Need to set up lengths in prologue, only one MIN required
3840 for each since start index is zero. */
3841 prologue_stmts += num_vectors;
3843 /* Each may need two MINs and one MINUS to update lengths in body
3844 for next iteration. */
3845 if (need_iterate_p)
3846 body_stmts += 3 * num_vectors;
3849 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3850 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3851 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3852 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
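/* Illustrative count: a single length rgroup with two vectors
   (num_vectors_m1 == 1), nitems != 1, unknown niters and a possibly
   wrapping IV gives 1 (SHIFT) + 2 (MAX/MINUS for wrap around)
   + 2 (batch limits) + 2 (initial MINs) = 7 prologue statements and
   3 * 2 = 6 body statements when the loop needs to iterate. */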
3855 /* FORNOW: The scalar outside cost is incremented in one of the
3856 following ways:
3858 1. The vectorizer checks for alignment and aliasing and generates
3859 a condition that allows dynamic vectorization. A cost model
3860 check is ANDED with the versioning condition. Hence scalar code
3861 path now has the added cost of the versioning check.
3863 if (cost > th & versioning_check)
3864 jmp to vector code
3866 Hence run-time scalar is incremented by not-taken branch cost.
3868 2. The vectorizer then checks if a prologue is required. If the
3869 cost model check was not done before during versioning, it has to
3870 be done before the prologue check.
3872 if (cost <= th)
3873 prologue = scalar_iters
3874 if (prologue == 0)
3875 jmp to vector code
3876 else
3877 execute prologue
3878 if (prologue == num_iters)
3879 go to exit
3881 Hence the run-time scalar cost is incremented by a taken branch,
3882 plus a not-taken branch, plus a taken branch cost.
3884 3. The vectorizer then checks if an epilogue is required. If the
3885 cost model check was not done before during prologue check, it
3886 has to be done with the epilogue check.
3888 if (prologue == 0)
3889 jmp to vector code
3890 else
3891 execute prologue
3892 if (prologue == num_iters)
3893 go to exit
3894 vector code:
3895 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3896 jmp to epilogue
3898 Hence the run-time scalar cost should be incremented by 2 taken
3899 branches.
3901 TODO: The back end may reorder the BBs differently and reverse
3902 conditions/branch directions. Change the estimates below to
3903 something more reasonable. */
3905 /* If the number of iterations is known and we do not do versioning, we can
3906 decide whether to vectorize at compile time. Hence the scalar version
3907 does not carry cost model guard costs. */
3908 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3909 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3911 /* Cost model check occurs at versioning. */
3912 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3913 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3914 else
3916 /* Cost model check occurs at prologue generation. */
3917 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3918 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3919 + vect_get_stmt_cost (cond_branch_not_taken);
3920 /* Cost model check occurs at epilogue generation. */
3921 else
3922 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3926 /* Complete the target-specific cost calculations. */
3927 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3928 &vec_inside_cost, &vec_epilogue_cost);
3930 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3932 /* Stash the costs so that we can compare two loop_vec_infos. */
3933 loop_vinfo->vec_inside_cost = vec_inside_cost;
3934 loop_vinfo->vec_outside_cost = vec_outside_cost;
3936 if (dump_enabled_p ())
3938 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3939 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3940 vec_inside_cost);
3941 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3942 vec_prologue_cost);
3943 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3944 vec_epilogue_cost);
3945 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3946 scalar_single_iter_cost);
3947 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3948 scalar_outside_cost);
3949 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3950 vec_outside_cost);
3951 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3952 peel_iters_prologue);
3953 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3954 peel_iters_epilogue);
3957 /* Calculate number of iterations required to make the vector version
3958 profitable, relative to the loop bodies only. The following condition
3959 must hold true:
3960 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3961 where
3962 SIC = scalar iteration cost, VIC = vector iteration cost,
3963 VOC = vector outside cost, VF = vectorization factor,
3964 NPEEL = prologue iterations + epilogue iterations,
3965 SOC = scalar outside cost for run time cost model check. */
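/* Worked example with purely illustrative costs: SIC = 4, VIC = 6,
   VF = 4, NPEEL = 2 (one prologue and one epilogue iteration),
   VOC = 20 and SOC = 0. Each vector iteration then saves
   SIC * VF - VIC = 10 units over the scalar code, and the
   straightforward (non-partial-vector) calculation below yields a
   break-even point of 7 scalar iterations. */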
3967 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3968 - vec_inside_cost);
3969 if (saving_per_viter <= 0)
3971 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3972 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3973 "vectorization did not happen for a simd loop");
3975 if (dump_enabled_p ())
3976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3977 "cost model: the vector iteration cost = %d "
3978 "divided by the scalar iteration cost = %d "
3979 "is greater or equal to the vectorization factor = %d"
3980 ".\n",
3981 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3982 *ret_min_profitable_niters = -1;
3983 *ret_min_profitable_estimate = -1;
3984 return;
3987 /* ??? The "if" arm is written to handle all cases; see below for what
3988 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
3989 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3991 /* Rewriting the condition above in terms of the number of
3992 vector iterations (vniters) rather than the number of
3993 scalar iterations (niters) gives:
3995 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3997 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3999 For integer N, X and Y when X > 0:
4001 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4002 int outside_overhead = (vec_outside_cost
4003 - scalar_single_iter_cost * peel_iters_prologue
4004 - scalar_single_iter_cost * peel_iters_epilogue
4005 - scalar_outside_cost);
4006 /* We're only interested in cases that require at least one
4007 vector iteration. */
4008 int min_vec_niters = 1;
4009 if (outside_overhead > 0)
4010 min_vec_niters = outside_overhead / saving_per_viter + 1;
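/* E.g. with the illustrative numbers above, outside_overhead
   = 20 - 4 - 4 - 0 = 12 and saving_per_viter = 10, so
   min_vec_niters = 12 / 10 + 1 = 2. */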
4012 if (dump_enabled_p ())
4013 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4014 min_vec_niters);
4016 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4018 /* Now that we know the minimum number of vector iterations,
4019 find the minimum niters for which the scalar cost is larger:
4021 SIC * niters > VIC * vniters + VOC - SOC
4023 We know that the minimum niters is no more than
4024 vniters * VF + NPEEL, but it might be (and often is) less
4025 than that if a partial vector iteration is cheaper than the
4026 equivalent scalar code. */
4027 int threshold = (vec_inside_cost * min_vec_niters
4028 + vec_outside_cost
4029 - scalar_outside_cost);
4030 if (threshold <= 0)
4031 min_profitable_iters = 1;
4032 else
4033 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4035 else
4036 /* Convert the number of vector iterations into a number of
4037 scalar iterations. */
4038 min_profitable_iters = (min_vec_niters * assumed_vf
4039 + peel_iters_prologue
4040 + peel_iters_epilogue);
4042 else
4044 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4045 * assumed_vf
4046 - vec_inside_cost * peel_iters_prologue
4047 - vec_inside_cost * peel_iters_epilogue);
4048 if (min_profitable_iters <= 0)
4049 min_profitable_iters = 0;
4050 else
4052 min_profitable_iters /= saving_per_viter;
4054 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4055 <= (((int) vec_inside_cost * min_profitable_iters)
4056 + (((int) vec_outside_cost - scalar_outside_cost)
4057 * assumed_vf)))
4058 min_profitable_iters++;
4062 if (dump_enabled_p ())
4063 dump_printf (MSG_NOTE,
4064 " Calculated minimum iters for profitability: %d\n",
4065 min_profitable_iters);
4067 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4068 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4069 /* We want the vectorized loop to execute at least once. */
4070 min_profitable_iters = assumed_vf + peel_iters_prologue;
4071 else if (min_profitable_iters < peel_iters_prologue)
4072 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4073 vectorized loop executes at least once. */
4074 min_profitable_iters = peel_iters_prologue;
4076 if (dump_enabled_p ())
4077 dump_printf_loc (MSG_NOTE, vect_location,
4078 " Runtime profitability threshold = %d\n",
4079 min_profitable_iters);
4081 *ret_min_profitable_niters = min_profitable_iters;
4083 /* Calculate number of iterations required to make the vector version
4084 profitable, relative to the loop bodies only.
4086 The non-vectorized variant costs SIC * niters and it must win over the
4087 vector variant for the expected loop trip count. The following condition must hold true:
4088 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4090 if (vec_outside_cost <= 0)
4091 min_profitable_estimate = 0;
4092 /* ??? This "else if" arm is written to handle all cases; see below for
4093 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4094 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4096 /* This is a repeat of the code above, but with + SOC rather
4097 than - SOC. */
4098 int outside_overhead = (vec_outside_cost
4099 - scalar_single_iter_cost * peel_iters_prologue
4100 - scalar_single_iter_cost * peel_iters_epilogue
4101 + scalar_outside_cost);
4102 int min_vec_niters = 1;
4103 if (outside_overhead > 0)
4104 min_vec_niters = outside_overhead / saving_per_viter + 1;
4106 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4108 int threshold = (vec_inside_cost * min_vec_niters
4109 + vec_outside_cost
4110 + scalar_outside_cost);
4111 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4113 else
4114 min_profitable_estimate = (min_vec_niters * assumed_vf
4115 + peel_iters_prologue
4116 + peel_iters_epilogue);
4118 else
4120 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4121 * assumed_vf
4122 - vec_inside_cost * peel_iters_prologue
4123 - vec_inside_cost * peel_iters_epilogue)
4124 / ((scalar_single_iter_cost * assumed_vf)
4125 - vec_inside_cost);
4127 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4128 if (dump_enabled_p ())
4129 dump_printf_loc (MSG_NOTE, vect_location,
4130 " Static estimate profitability threshold = %d\n",
4131 min_profitable_estimate);
4133 *ret_min_profitable_estimate = min_profitable_estimate;
4136 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4137 vector elements (not bits) for a vector with NELT elements. */
4138 static void
4139 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4140 vec_perm_builder *sel)
4142 /* The encoding is a single stepped pattern. Any wrap-around is handled
4143 by vec_perm_indices. */
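/* Illustrative selector (hypothetical values): for OFFSET = 2 and
   NELT = 8 the stepped encoding {2, 3, 4} expands to
   {2, 3, 4, 5, 6, 7, 8, 9}, i.e. the last six elements of the first
   input followed by the first two elements of the second input. */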
4144 sel->new_vector (nelt, 1, 3);
4145 for (unsigned int i = 0; i < 3; i++)
4146 sel->quick_push (i + offset);
4149 /* Checks whether the target supports whole-vector shifts for vectors of mode
4150 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4151 it supports vec_perm_const with masks for all necessary shift amounts. */
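/* For example, for a constant 8-element vector mode without vec_shr
   support, the loop below checks permutations for shifts by 4, 2 and
   1 elements. */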
4152 static bool
4153 have_whole_vector_shift (machine_mode mode)
4155 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4156 return true;
4158 /* Variable-length vectors should be handled via the optab. */
4159 unsigned int nelt;
4160 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4161 return false;
4163 vec_perm_builder sel;
4164 vec_perm_indices indices;
4165 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4167 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4168 indices.new_vector (sel, 2, nelt);
4169 if (!can_vec_perm_const_p (mode, indices, false))
4170 return false;
4172 return true;
4175 /* TODO: There is a close dependency between the vect_model_*_cost and
4176 vectorizable_* functions. Design this better to avoid maintenance issues. */
4178 /* Function vect_model_reduction_cost.
4180 Models cost for a reduction operation, including the vector ops
4181 generated within the strip-mine loop, the initial definition before
4182 the loop, and the epilogue code that must be generated. */
4184 static void
4185 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4186 stmt_vec_info stmt_info, internal_fn reduc_fn,
4187 vect_reduction_type reduction_type,
4188 int ncopies, stmt_vector_for_cost *cost_vec)
4190 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4191 enum tree_code code;
4192 optab optab;
4193 tree vectype;
4194 machine_mode mode;
4195 class loop *loop = NULL;
4197 if (loop_vinfo)
4198 loop = LOOP_VINFO_LOOP (loop_vinfo);
4200 /* Condition reductions generate two reductions in the loop. */
4201 if (reduction_type == COND_REDUCTION)
4202 ncopies *= 2;
4204 vectype = STMT_VINFO_VECTYPE (stmt_info);
4205 mode = TYPE_MODE (vectype);
4206 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4208 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4210 if (reduction_type == EXTRACT_LAST_REDUCTION)
4211 /* No extra instructions are needed in the prologue. The loop body
4212 operations are costed in vectorizable_condition. */
4213 inside_cost = 0;
4214 else if (reduction_type == FOLD_LEFT_REDUCTION)
4216 /* No extra instructions needed in the prologue. */
4217 prologue_cost = 0;
4219 if (reduc_fn != IFN_LAST)
4220 /* Count one reduction-like operation per vector. */
4221 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4222 stmt_info, 0, vect_body);
4223 else
4225 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4226 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4227 inside_cost = record_stmt_cost (cost_vec, nelements,
4228 vec_to_scalar, stmt_info, 0,
4229 vect_body);
4230 inside_cost += record_stmt_cost (cost_vec, nelements,
4231 scalar_stmt, stmt_info, 0,
4232 vect_body);
4235 else
4237 /* Add in cost for initial definition.
4238 For cond reduction we have four vectors: initial index, step,
4239 initial result of the data reduction, initial value of the index
4240 reduction. */
4241 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4242 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4243 scalar_to_vec, stmt_info, 0,
4244 vect_prologue);
4246 /* Cost of reduction op inside loop. */
4247 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4248 stmt_info, 0, vect_body);
4251 /* Determine cost of epilogue code.
4253 We have a reduction operator that will reduce the vector in one statement.
4254 Also requires scalar extract. */
4256 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4258 if (reduc_fn != IFN_LAST)
4260 if (reduction_type == COND_REDUCTION)
4262 /* An EQ stmt and a COND_EXPR stmt. */
4263 epilogue_cost += record_stmt_cost (cost_vec, 2,
4264 vector_stmt, stmt_info, 0,
4265 vect_epilogue);
4266 /* Reduction of the max index and a reduction of the found
4267 values. */
4268 epilogue_cost += record_stmt_cost (cost_vec, 2,
4269 vec_to_scalar, stmt_info, 0,
4270 vect_epilogue);
4271 /* A broadcast of the max value. */
4272 epilogue_cost += record_stmt_cost (cost_vec, 1,
4273 scalar_to_vec, stmt_info, 0,
4274 vect_epilogue);
4276 else
4278 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4279 stmt_info, 0, vect_epilogue);
4280 epilogue_cost += record_stmt_cost (cost_vec, 1,
4281 vec_to_scalar, stmt_info, 0,
4282 vect_epilogue);
4285 else if (reduction_type == COND_REDUCTION)
4287 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4288 /* Extraction of scalar elements. */
4289 epilogue_cost += record_stmt_cost (cost_vec,
4290 2 * estimated_nunits,
4291 vec_to_scalar, stmt_info, 0,
4292 vect_epilogue);
4293 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4294 epilogue_cost += record_stmt_cost (cost_vec,
4295 2 * estimated_nunits - 3,
4296 scalar_stmt, stmt_info, 0,
4297 vect_epilogue);
4299 else if (reduction_type == EXTRACT_LAST_REDUCTION
4300 || reduction_type == FOLD_LEFT_REDUCTION)
4301 /* No extra instructions are needed in the epilogue. */
4303 else
4305 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4306 tree bitsize =
4307 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4308 int element_bitsize = tree_to_uhwi (bitsize);
4309 int nelements = vec_size_in_bits / element_bitsize;
4311 if (code == COND_EXPR)
4312 code = MAX_EXPR;
4314 optab = optab_for_tree_code (code, vectype, optab_default);
4316 /* We have a whole vector shift available. */
4317 if (optab != unknown_optab
4318 && VECTOR_MODE_P (mode)
4319 && optab_handler (optab, mode) != CODE_FOR_nothing
4320 && have_whole_vector_shift (mode))
4322 /* Final reduction via vector shifts and the reduction operator.
4323 Also requires scalar extract. */
4324 epilogue_cost += record_stmt_cost (cost_vec,
4325 exact_log2 (nelements) * 2,
4326 vector_stmt, stmt_info, 0,
4327 vect_epilogue);
4328 epilogue_cost += record_stmt_cost (cost_vec, 1,
4329 vec_to_scalar, stmt_info, 0,
4330 vect_epilogue);
4332 else
4333 /* Use extracts and reduction op for final reduction. For N
4334 elements, we have N extracts and N-1 reduction ops. */
4335 epilogue_cost += record_stmt_cost (cost_vec,
4336 nelements + nelements - 1,
4337 vector_stmt, stmt_info, 0,
4338 vect_epilogue);
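/* Illustrative comparison for a hypothetical 4-element vector: the
   shift-based scheme above costs exact_log2 (4) * 2 = 4 vector
   statements plus one extract, whereas this fallback costs
   4 + 3 = 7 statements. */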
4342 if (dump_enabled_p ())
4343 dump_printf (MSG_NOTE,
4344 "vect_model_reduction_cost: inside_cost = %d, "
4345 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4346 prologue_cost, epilogue_cost);
4350 /* Function vect_model_induction_cost.
4352 Models cost for induction operations. */
4354 static void
4355 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4356 stmt_vector_for_cost *cost_vec)
4358 unsigned inside_cost, prologue_cost;
4360 if (PURE_SLP_STMT (stmt_info))
4361 return;
4363 /* loop cost for vec_loop. */
4364 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4365 stmt_info, 0, vect_body);
4367 /* prologue cost for vec_init and vec_step. */
4368 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4369 stmt_info, 0, vect_prologue);
4371 if (dump_enabled_p ())
4372 dump_printf_loc (MSG_NOTE, vect_location,
4373 "vect_model_induction_cost: inside_cost = %d, "
4374 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4379 /* Function get_initial_def_for_reduction
4381 Input:
4382 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4383 INIT_VAL - the initial value of the reduction variable
4385 Output:
4386 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4387 of the reduction (used for adjusting the epilog - see below).
4388 Return a vector variable, initialized according to the operation that
4389 STMT_VINFO performs. This vector will be used as the initial value
4390 of the vector of partial results.
4392 Option1 (adjust in epilog): Initialize the vector as follows:
4393 add/bit or/xor: [0,0,...,0,0]
4394 mult/bit and: [1,1,...,1,1]
4395 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4396 and when necessary (e.g. add/mult case) let the caller know
4397 that it needs to adjust the result by init_val.
4399 Option2: Initialize the vector as follows:
4400 add/bit or/xor: [init_val,0,0,...,0]
4401 mult/bit and: [init_val,1,1,...,1]
4402 min/max/cond_expr: [init_val,init_val,...,init_val]
4403 and no adjustments are needed.
4405 For example, for the following code:
4407 s = init_val;
4408 for (i=0;i<n;i++)
4409 s = s + a[i];
4411 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4412 For a vector of 4 units, we want to return either [0,0,0,init_val],
4413 or [0,0,0,0] and let the caller know that it needs to adjust
4414 the result at the end by 'init_val'.
4416 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4417 is not NULL, because this way the initialization vector is simpler (the
4418 same element in all entries), and Option2 otherwise.
4420 A cost model should help decide between these two schemes. */
4422 static tree
4423 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4424 stmt_vec_info stmt_vinfo,
4425 enum tree_code code, tree init_val,
4426 tree *adjustment_def)
4428 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4429 tree scalar_type = TREE_TYPE (init_val);
4430 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4431 tree def_for_init;
4432 tree init_def;
4433 REAL_VALUE_TYPE real_init_val = dconst0;
4434 int int_init_val = 0;
4435 gimple_seq stmts = NULL;
4437 gcc_assert (vectype);
4439 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4440 || SCALAR_FLOAT_TYPE_P (scalar_type));
4442 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4443 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4445 /* ADJUSTMENT_DEF is NULL when called from
4446 vect_create_epilog_for_reduction to vectorize double reduction. */
4447 if (adjustment_def)
4448 *adjustment_def = NULL;
4450 switch (code)
4452 case WIDEN_SUM_EXPR:
4453 case DOT_PROD_EXPR:
4454 case SAD_EXPR:
4455 case PLUS_EXPR:
4456 case MINUS_EXPR:
4457 case BIT_IOR_EXPR:
4458 case BIT_XOR_EXPR:
4459 case MULT_EXPR:
4460 case BIT_AND_EXPR:
4462 if (code == MULT_EXPR)
4464 real_init_val = dconst1;
4465 int_init_val = 1;
4468 if (code == BIT_AND_EXPR)
4469 int_init_val = -1;
4471 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4472 def_for_init = build_real (scalar_type, real_init_val);
4473 else
4474 def_for_init = build_int_cst (scalar_type, int_init_val);
4476 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4478 /* Option1: the first element is '0' or '1' as well. */
4479 if (!operand_equal_p (def_for_init, init_val, 0))
4480 *adjustment_def = init_val;
4481 init_def = gimple_build_vector_from_val (&stmts, vectype,
4482 def_for_init);
4484 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4486 /* Option2 (variable length): the first element is INIT_VAL. */
4487 init_def = gimple_build_vector_from_val (&stmts, vectype,
4488 def_for_init);
4489 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4490 vectype, init_def, init_val);
4492 else
4494 /* Option2: the first element is INIT_VAL. */
4495 tree_vector_builder elts (vectype, 1, 2);
4496 elts.quick_push (init_val);
4497 elts.quick_push (def_for_init);
4498 init_def = gimple_build_vector (&stmts, &elts);
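/* E.g. (illustrative) for a PLUS_EXPR reduction of four ints with
   initial value s, DEF_FOR_INIT is 0 and the two-element encoding
   {init_val, def_for_init} extends by repeating its last element,
   giving the Option2 vector { s, 0, 0, 0 }. */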
4501 break;
4503 case MIN_EXPR:
4504 case MAX_EXPR:
4505 case COND_EXPR:
4507 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4508 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4510 break;
4512 default:
4513 gcc_unreachable ();
4516 if (stmts)
4517 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4518 return init_def;
4521 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4522 NUMBER_OF_VECTORS is the number of vector defs to create.
4523 If NEUTRAL_OP is nonnull, introducing extra elements of that
4524 value will not change the result. */
4526 static void
4527 get_initial_defs_for_reduction (vec_info *vinfo,
4528 slp_tree slp_node,
4529 vec<tree> *vec_oprnds,
4530 unsigned int number_of_vectors,
4531 bool reduc_chain, tree neutral_op)
4533 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4534 stmt_vec_info stmt_vinfo = stmts[0];
4535 unsigned HOST_WIDE_INT nunits;
4536 unsigned j, number_of_places_left_in_vector;
4537 tree vector_type;
4538 unsigned int group_size = stmts.length ();
4539 unsigned int i;
4540 class loop *loop;
4542 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4544 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4546 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4547 gcc_assert (loop);
4548 edge pe = loop_preheader_edge (loop);
4550 gcc_assert (!reduc_chain || neutral_op);
4552 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4553 created vectors. It is greater than 1 if unrolling is performed.
4555 For example, we have two scalar operands, s1 and s2 (e.g., group of
4556 strided accesses of size two), while NUNITS is four (i.e., four scalars
4557 of this type can be packed in a vector). The output vector will contain
4558 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4559 will be 2).
4561 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4562 vectors containing the operands.
4564 For example, NUNITS is four as before, and the group size is 8
4565 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4566 {s5, s6, s7, s8}. */
4568 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4569 nunits = group_size;
4571 number_of_places_left_in_vector = nunits;
4572 bool constant_p = true;
4573 tree_vector_builder elts (vector_type, nunits, 1);
4574 elts.quick_grow (nunits);
4575 gimple_seq ctor_seq = NULL;
4576 for (j = 0; j < nunits * number_of_vectors; ++j)
4578 tree op;
4579 i = j % group_size;
4580 stmt_vinfo = stmts[i];
4582 /* Get the def before the loop. In a reduction chain we have only
4583 one initial value; otherwise we have as many as there are PHIs in the group. */
4584 if (reduc_chain)
4585 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4586 else if (((vec_oprnds->length () + 1) * nunits
4587 - number_of_places_left_in_vector >= group_size)
4588 && neutral_op)
4589 op = neutral_op;
4590 else
4591 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4593 /* Create 'vect_ = {op0,op1,...,opn}'. */
4594 number_of_places_left_in_vector--;
4595 elts[nunits - number_of_places_left_in_vector - 1] = op;
4596 if (!CONSTANT_CLASS_P (op))
4597 constant_p = false;
4599 if (number_of_places_left_in_vector == 0)
4601 tree init;
4602 if (constant_p && !neutral_op
4603 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4604 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4605 /* Build the vector directly from ELTS. */
4606 init = gimple_build_vector (&ctor_seq, &elts);
4607 else if (neutral_op)
4609 /* Build a vector of the neutral value and shift the
4610 other elements into place. */
4611 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4612 neutral_op);
4613 int k = nunits;
4614 while (k > 0 && elts[k - 1] == neutral_op)
4615 k -= 1;
4616 while (k > 0)
4618 k -= 1;
4619 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4620 vector_type, init, elts[k]);
4623 else
4625 /* First time round, duplicate ELTS to fill the
4626 required number of vectors. */
4627 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4628 number_of_vectors, *vec_oprnds);
4629 break;
4631 vec_oprnds->quick_push (init);
4633 number_of_places_left_in_vector = nunits;
4634 elts.new_vector (vector_type, nunits, 1);
4635 elts.quick_grow (nunits);
4636 constant_p = true;
4639 if (ctor_seq != NULL)
4640 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4643 /* For a statement STMT_INFO taking part in a reduction operation return
4644 the stmt_vec_info the meta information is stored on. */
4646 stmt_vec_info
4647 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4649 stmt_info = vect_orig_stmt (stmt_info);
4650 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4651 if (!is_a <gphi *> (stmt_info->stmt)
4652 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4653 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4654 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4655 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4657 if (gimple_phi_num_args (phi) == 1)
4658 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4660 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4662 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4663 stmt_vec_info info
4664 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4665 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4666 stmt_info = info;
4668 return stmt_info;
4671 /* Function vect_create_epilog_for_reduction
4673 Create code at the loop-epilog to finalize the result of a reduction
4674 computation.
4676 STMT_INFO is the scalar reduction stmt that is being vectorized.
4677 SLP_NODE is an SLP node containing a group of reduction statements. The
4678 first one in this group is STMT_INFO.
4679 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4680 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4681 (counting from 0)
4683 This function:
4684 1. Completes the reduction def-use cycles.
4685 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4686 by calling the function specified by REDUC_FN if available, or by
4687 other means (whole-vector shifts or a scalar loop).
4688 The function also creates a new phi node at the loop exit to preserve
4689 loop-closed form, as illustrated below.
4691 The flow at the entry to this function:
4693 loop:
4694 vec_def = phi <vec_init, null> # REDUCTION_PHI
4695 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4696 s_loop = scalar_stmt # (scalar) STMT_INFO
4697 loop_exit:
4698 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4699 use <s_out0>
4700 use <s_out0>
4702 The above is transformed by this function into:
4704 loop:
4705 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4706 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4707 s_loop = scalar_stmt # (scalar) STMT_INFO
4708 loop_exit:
4709 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4710 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4711 v_out2 = reduce <v_out1>
4712 s_out3 = extract_field <v_out2, 0>
4713 s_out4 = adjust_result <s_out3>
4714 use <s_out4>
4715 use <s_out4>
4718 static void
4719 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4720 stmt_vec_info stmt_info,
4721 slp_tree slp_node,
4722 slp_instance slp_node_instance)
4724 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4725 gcc_assert (reduc_info->is_reduc_info);
4726 /* For double reductions we need to get at the inner loop reduction
4727 stmt which has the meta info attached. Our stmt_info is that of the
4728 loop-closed PHI of the inner loop which we remember as
4729 def for the reduction PHI generation. */
4730 bool double_reduc = false;
4731 stmt_vec_info rdef_info = stmt_info;
4732 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4734 gcc_assert (!slp_node);
4735 double_reduc = true;
4736 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4737 (stmt_info->stmt, 0));
4738 stmt_info = vect_stmt_to_vectorize (stmt_info);
4740 gphi *reduc_def_stmt
4741 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4742 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4743 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4744 tree vectype;
4745 machine_mode mode;
4746 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4747 basic_block exit_bb;
4748 tree scalar_dest;
4749 tree scalar_type;
4750 gimple *new_phi = NULL, *phi;
4751 gimple_stmt_iterator exit_gsi;
4752 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4753 gimple *epilog_stmt = NULL;
4754 gimple *exit_phi;
4755 tree bitsize;
4756 tree def;
4757 tree orig_name, scalar_result;
4758 imm_use_iterator imm_iter, phi_imm_iter;
4759 use_operand_p use_p, phi_use_p;
4760 gimple *use_stmt;
4761 bool nested_in_vect_loop = false;
4762 auto_vec<gimple *> new_phis;
4763 int j, i;
4764 auto_vec<tree> scalar_results;
4765 unsigned int group_size = 1, k;
4766 auto_vec<gimple *> phis;
4767 bool slp_reduc = false;
4768 bool direct_slp_reduc;
4769 tree new_phi_result;
4770 tree induction_index = NULL_TREE;
4772 if (slp_node)
4773 group_size = SLP_TREE_LANES (slp_node);
4775 if (nested_in_vect_loop_p (loop, stmt_info))
4777 outer_loop = loop;
4778 loop = loop->inner;
4779 nested_in_vect_loop = true;
4780 gcc_assert (!slp_node);
4782 gcc_assert (!nested_in_vect_loop || double_reduc);
4784 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4785 gcc_assert (vectype);
4786 mode = TYPE_MODE (vectype);
4788 tree initial_def = NULL;
4789 tree induc_val = NULL_TREE;
4790 tree adjustment_def = NULL;
4791 if (slp_node)
4793 else
4795 /* Get at the scalar def before the loop, that defines the initial value
4796 of the reduction variable. */
4797 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4798 loop_preheader_edge (loop));
4799 /* Optimize: for induction condition reduction, if we can't use zero
4800 for induc_val, use initial_def. */
4801 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4802 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4803 else if (double_reduc)
4805 else if (nested_in_vect_loop)
4807 else
4808 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4811 unsigned vec_num;
4812 int ncopies;
4813 if (slp_node)
4815 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4816 ncopies = 1;
4818 else
4820 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4821 vec_num = 1;
4822 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4825 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4826 which is updated with the current index of the loop for every match of
4827 the original loop's cond_expr (VEC_STMT). This results in a vector
4828 containing the last time the condition passed for that vector lane.
4829 The first match will be a 1 to allow 0 to be used for non-matching
4830 indexes. If there are no matches at all then the vector will be all
4831 zeroes.
4833 PR92772: This algorithm is broken for architectures that support
4834 masked vectors, but do not provide fold_extract_last. */
4835 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4837 auto_vec<std::pair<tree, bool>, 2> ccompares;
4838 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4839 cond_info = vect_stmt_to_vectorize (cond_info);
4840 while (cond_info != reduc_info)
4842 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4844 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4845 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4846 ccompares.safe_push
4847 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4848 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4850 cond_info
4851 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4852 1 + STMT_VINFO_REDUC_IDX
4853 (cond_info)));
4854 cond_info = vect_stmt_to_vectorize (cond_info);
4856 gcc_assert (ccompares.length () != 0);
4858 tree indx_before_incr, indx_after_incr;
4859 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4860 int scalar_precision
4861 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4862 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4863 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4864 (TYPE_MODE (vectype), cr_index_scalar_type,
4865 TYPE_VECTOR_SUBPARTS (vectype));
4867 /* First we create a simple vector induction variable which starts
4868 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4869 vector size (STEP). */
4871 /* Create a {1,2,3,...} vector. */
4872 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4874 /* Create a vector of the step value. */
4875 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4876 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4878 /* Create an induction variable. */
4879 gimple_stmt_iterator incr_gsi;
4880 bool insert_after;
4881 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4882 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4883 insert_after, &indx_before_incr, &indx_after_incr);
4885 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4886 filled with zeros (VEC_ZERO). */
4888 /* Create a vector of 0s. */
4889 tree zero = build_zero_cst (cr_index_scalar_type);
4890 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4892 /* Create a vector phi node. */
4893 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4894 new_phi = create_phi_node (new_phi_tree, loop->header);
4895 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4896 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4898 /* Now take the condition from the loop's original cond_exprs
4899 and produce a new cond_exprs (INDEX_COND_EXPR) which for
4900 every match uses values from the induction variable
4901 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4902 (NEW_PHI_TREE).
4903 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4904 the new cond_expr (INDEX_COND_EXPR). */
4905 gimple_seq stmts = NULL;
4906 for (int i = ccompares.length () - 1; i != -1; --i)
4908 tree ccompare = ccompares[i].first;
4909 if (ccompares[i].second)
4910 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4911 cr_index_vector_type,
4912 ccompare,
4913 indx_before_incr, new_phi_tree);
4914 else
4915 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4916 cr_index_vector_type,
4917 ccompare,
4918 new_phi_tree, indx_before_incr);
4920 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4922 /* Update the phi with the vec cond. */
4923 induction_index = new_phi_tree;
4924 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4925 loop_latch_edge (loop), UNKNOWN_LOCATION);
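/* Lane-by-lane illustration (hypothetical 4-lane vector): the IV
   starts at {1, 2, 3, 4} and steps by {4, 4, 4, 4}. If lane 2
   matches in the first iteration and lane 0 in the second, the phi
   evolves {0, 0, 0, 0} -> {0, 0, 3, 0} -> {5, 0, 3, 0}, so each lane
   ends up holding the index of its last match, or 0 if it never
   matched. */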
4928 /* 2. Create epilog code.
4929 The reduction epilog code operates across the elements of the vector
4930 of partial results computed by the vectorized loop.
4931 The reduction epilog code consists of:
4933 step 1: compute the scalar result in a vector (v_out2)
4934 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4935 step 3: adjust the scalar result (s_out3) if needed.
4937 Step 1 can be accomplished using one of the following three schemes:
4938 (scheme 1) using reduc_fn, if available.
4939 (scheme 2) using whole-vector shifts, if available.
4940 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4941 combined.
4943 The overall epilog code looks like this:
4945 s_out0 = phi <s_loop> # original EXIT_PHI
4946 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4947 v_out2 = reduce <v_out1> # step 1
4948 s_out3 = extract_field <v_out2, 0> # step 2
4949 s_out4 = adjust_result <s_out3> # step 3
4951 (step 3 is optional, and steps 1 and 2 may be combined).
4952 Lastly, the uses of s_out0 are replaced by s_out4. */
4955 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4956 v_out1 = phi <VECT_DEF>
4957 Store them in NEW_PHIS. */
4958 if (double_reduc)
4959 loop = outer_loop;
4960 exit_bb = single_exit (loop)->dest;
4961 new_phis.create (slp_node ? vec_num : ncopies);
4962 for (unsigned i = 0; i < vec_num; i++)
4964 if (slp_node)
4965 def = vect_get_slp_vect_def (slp_node, i);
4966 else
4967 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4968 for (j = 0; j < ncopies; j++)
4970 tree new_def = copy_ssa_name (def);
4971 phi = create_phi_node (new_def, exit_bb);
4972 if (j == 0)
4973 new_phis.quick_push (phi);
4974 else
4976 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4977 new_phis.quick_push (phi);
4980 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4984 exit_gsi = gsi_after_labels (exit_bb);
4986 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4987 (i.e. when reduc_fn is not available) and in the final adjustment
4988 code (if needed). Also get the original scalar reduction variable as
4989 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4990 represents a reduction pattern), the tree-code and scalar-def are
4991 taken from the original stmt that the pattern-stmt (STMT) replaces.
4992 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4993 are taken from STMT. */
4995 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4996 if (orig_stmt_info != stmt_info)
4998 /* Reduction pattern */
4999 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5000 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5003 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5004 scalar_type = TREE_TYPE (scalar_dest);
5005 scalar_results.create (group_size);
5006 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5007 bitsize = TYPE_SIZE (scalar_type);
5009 /* SLP reduction without reduction chain, e.g.,
5010 # a1 = phi <a2, a0>
5011 # b1 = phi <b2, b0>
5012 a2 = operation (a1)
5013 b2 = operation (b1) */
5014 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5016 /* True if we should implement SLP_REDUC using native reduction operations
5017 instead of scalar operations. */
5018 direct_slp_reduc = (reduc_fn != IFN_LAST
5019 && slp_reduc
5020 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5022 /* In case of reduction chain, e.g.,
5023 # a1 = phi <a3, a0>
5024 a2 = operation (a1)
5025 a3 = operation (a2),
5027 we may end up with more than one vector result. Here we reduce them to
5028 one vector. */
5029 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5031 gimple_seq stmts = NULL;
5032 tree first_vect = PHI_RESULT (new_phis[0]);
5033 first_vect = gimple_convert (&stmts, vectype, first_vect);
5034 for (k = 1; k < new_phis.length (); k++)
5036 gimple *next_phi = new_phis[k];
5037 tree second_vect = PHI_RESULT (next_phi);
5038 second_vect = gimple_convert (&stmts, vectype, second_vect);
5039 first_vect = gimple_build (&stmts, code, vectype,
5040 first_vect, second_vect);
5042 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5044 new_phi_result = first_vect;
5045 new_phis.truncate (0);
5046 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5048 /* Likewise if we couldn't use a single defuse cycle. */
5049 else if (ncopies > 1)
5051 gimple_seq stmts = NULL;
5052 tree first_vect = PHI_RESULT (new_phis[0]);
5053 first_vect = gimple_convert (&stmts, vectype, first_vect);
5054 for (int k = 1; k < ncopies; ++k)
5056 tree second_vect = PHI_RESULT (new_phis[k]);
5057 second_vect = gimple_convert (&stmts, vectype, second_vect);
5058 first_vect = gimple_build (&stmts, code, vectype,
5059 first_vect, second_vect);
5061 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5062 new_phi_result = first_vect;
5063 new_phis.truncate (0);
5064 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5066 else
5067 new_phi_result = PHI_RESULT (new_phis[0]);
5069 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5070 && reduc_fn != IFN_LAST)
5072 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5073 various data values where the condition matched and another vector
5074 (INDUCTION_INDEX) containing all the indexes of those matches. We
5075 need to extract the last matching index (which will be the index with
5076 highest value) and use this to index into the data vector.
5077 For the case where there were no matches, the data vector will contain
5078 all default values and the index vector will be all zeros. */
5080 /* Get various versions of the type of the vector of indexes. */
5081 tree index_vec_type = TREE_TYPE (induction_index);
5082 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5083 tree index_scalar_type = TREE_TYPE (index_vec_type);
5084 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5086 /* Get an unsigned integer version of the type of the data vector. */
5087 int scalar_precision
5088 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5089 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5090 tree vectype_unsigned = build_vector_type
5091 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5093 /* First we need to create a vector (ZERO_VEC) of zeros and another
5094 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5095 can create using a MAX reduction and then expanding.
5096 In the case where the loop never made any matches, the max index will
5097 be zero. */
5099 /* Vector of {0, 0, 0,...}. */
5100 tree zero_vec = build_zero_cst (vectype);
5102 gimple_seq stmts = NULL;
5103 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5104 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5106 /* Find maximum value from the vector of found indexes. */
5107 tree max_index = make_ssa_name (index_scalar_type);
5108 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5109 1, induction_index);
5110 gimple_call_set_lhs (max_index_stmt, max_index);
5111 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5113 /* Vector of {max_index, max_index, max_index,...}. */
5114 tree max_index_vec = make_ssa_name (index_vec_type);
5115 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5116 max_index);
5117 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5118 max_index_vec_rhs);
5119 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5121 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5122 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5123 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5124 otherwise. Only one value should match, resulting in a vector
5125 (VEC_COND) with one data value and the rest zeros.
5126 In the case where the loop never made any matches, every index will
5127 match, resulting in a vector with all data values (which will all be
5128 the default value). */
5130 /* Compare the max index vector to the vector of found indexes to find
5131 the position of the max value. */
5132 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5133 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5134 induction_index,
5135 max_index_vec);
5136 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5138 /* Use the compare to choose either values from the data vector or
5139 zero. */
5140 tree vec_cond = make_ssa_name (vectype);
5141 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5142 vec_compare, new_phi_result,
5143 zero_vec);
5144 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5146 /* Finally we need to extract the data value from the vector (VEC_COND)
5147 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5148 reduction, but because this doesn't exist, we can use a MAX reduction
5149 instead. The data value might be signed or a float so we need to cast
5150 it first.
5151 In the case where the loop never made any matches, the data values are
5152 all identical, and so will reduce down correctly. */
5154 /* Make the matched data values unsigned. */
5155 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5156 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5157 vec_cond);
5158 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5159 VIEW_CONVERT_EXPR,
5160 vec_cond_cast_rhs);
5161 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5163 /* Reduce down to a scalar value. */
5164 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5165 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5166 1, vec_cond_cast);
5167 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5168 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5170 /* Convert the reduced value back to the result type and set as the
5171 result. */
5172 stmts = NULL;
5173 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5174 data_reduc);
5175 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5176 scalar_results.safe_push (new_temp);
5178 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5179 && reduc_fn == IFN_LAST)
5181 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5182 idx = 0;
5183 idx_val = induction_index[0];
5184 val = data_reduc[0];
5185 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5186 if (induction_index[i] > idx_val)
5187 val = data_reduc[i], idx_val = induction_index[i];
5188 return val; */
5190 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5191 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5192 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5193 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5194 /* Enforced by vectorizable_reduction, which ensures we have target
5195 support before allowing a conditional reduction on variable-length
5196 vectors. */
5197 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5198 tree idx_val = NULL_TREE, val = NULL_TREE;
5199 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5201 tree old_idx_val = idx_val;
5202 tree old_val = val;
5203 idx_val = make_ssa_name (idx_eltype);
5204 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5205 build3 (BIT_FIELD_REF, idx_eltype,
5206 induction_index,
5207 bitsize_int (el_size),
5208 bitsize_int (off)));
5209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5210 val = make_ssa_name (data_eltype);
5211 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5212 build3 (BIT_FIELD_REF,
5213 data_eltype,
5214 new_phi_result,
5215 bitsize_int (el_size),
5216 bitsize_int (off)));
5217 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5218 if (off != 0)
5220 tree new_idx_val = idx_val;
5221 if (off != v_size - el_size)
5223 new_idx_val = make_ssa_name (idx_eltype);
5224 epilog_stmt = gimple_build_assign (new_idx_val,
5225 MAX_EXPR, idx_val,
5226 old_idx_val);
5227 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5229 tree new_val = make_ssa_name (data_eltype);
5230 epilog_stmt = gimple_build_assign (new_val,
5231 COND_EXPR,
5232 build2 (GT_EXPR,
5233 boolean_type_node,
5234 idx_val,
5235 old_idx_val),
5236 val, old_val);
5237 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5238 idx_val = new_idx_val;
5239 val = new_val;
5242 /* Convert the reduced value back to the result type and set as the
5243 result. */
5244 gimple_seq stmts = NULL;
5245 val = gimple_convert (&stmts, scalar_type, val);
5246 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5247 scalar_results.safe_push (val);
5250 /* 2.3 Create the reduction code, using one of the three schemes described
5251 above. In SLP we simply need to extract all the elements from the
5252 vector (without reducing them), so we use scalar shifts. */
5253 else if (reduc_fn != IFN_LAST && !slp_reduc)
5255 tree tmp;
5256 tree vec_elem_type;
5258 /* Case 1: Create:
5259 v_out2 = reduc_expr <v_out1> */
5261 if (dump_enabled_p ())
5262 dump_printf_loc (MSG_NOTE, vect_location,
5263 "Reduce using direct vector reduction.\n");
5265 gimple_seq stmts = NULL;
5266 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5267 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5268 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5269 vec_elem_type, new_phi_result);
5270 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5271 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5273 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5274 && induc_val)
5276 /* Earlier we set the initial value to be a vector of induc_val
5277 values. Check the result and if it is induc_val then replace
5278 with the original initial value, unless induc_val is
5279 the same as initial_def already. */
5280 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5281 induc_val);
5283 tmp = make_ssa_name (new_scalar_dest);
5284 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5285 initial_def, new_temp);
5286 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5287 new_temp = tmp;
5290 scalar_results.safe_push (new_temp);
5292 else if (direct_slp_reduc)
5294 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5295 with the elements for other SLP statements replaced with the
5296 neutral value. We can then do a normal reduction on each vector. */
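      /* For instance, with REDUC_GROUP_SIZE == 2 and a single vector
         {a0, b0, a1, b1}, the code below builds {a0, N, a1, N} and
         {N, b0, N, b1}, N being the neutral value (or the initial scalar
         value when no neutral value exists), and reduces each of them with
         a full-vector reduction.  */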
5298 /* Enforced by vectorizable_reduction. */
5299 gcc_assert (new_phis.length () == 1);
5300 gcc_assert (pow2p_hwi (group_size));
5302 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5303 vec<stmt_vec_info> orig_phis
5304 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5305 gimple_seq seq = NULL;
5307 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5308 and the same element size as VECTYPE. */
5309 tree index = build_index_vector (vectype, 0, 1);
5310 tree index_type = TREE_TYPE (index);
5311 tree index_elt_type = TREE_TYPE (index_type);
5312 tree mask_type = truth_type_for (index_type);
5314 /* Create a vector that, for each element, identifies which of
5315 the REDUC_GROUP_SIZE results should use it. */
5316 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5317 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5318 build_vector_from_val (index_type, index_mask));
5320 /* Get a neutral vector value. This is simply a splat of the neutral
5321 scalar value if we have one, otherwise the initial scalar value
5322 is itself a neutral value. */
5323 tree vector_identity = NULL_TREE;
5324 tree neutral_op = NULL_TREE;
5325 if (slp_node)
5327 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5328 neutral_op
5329 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5330 vectype, code, first != NULL);
5332 if (neutral_op)
5333 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5334 neutral_op);
5335 for (unsigned int i = 0; i < group_size; ++i)
5337 /* If there's no universal neutral value, we can use the
5338 initial scalar value from the original PHI. This is used
5339 for MIN and MAX reduction, for example. */
5340 if (!neutral_op)
5342 tree scalar_value
5343 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5344 loop_preheader_edge (loop));
5345 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5346 scalar_value);
5347 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5348 scalar_value);
5351 /* Calculate the equivalent of:
5353 sel[j] = (index[j] == i);
5355 which selects the elements of NEW_PHI_RESULT that should
5356 be included in the result. */
5357 tree compare_val = build_int_cst (index_elt_type, i);
5358 compare_val = build_vector_from_val (index_type, compare_val);
5359 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5360 index, compare_val);
5362 /* Calculate the equivalent of:
5364 vec = sel ? new_phi_result : vector_identity;
5366 VEC is now suitable for a full vector reduction. */
5367 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5368 sel, new_phi_result, vector_identity);
5370 /* Do the reduction and convert it to the appropriate type. */
5371 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5372 TREE_TYPE (vectype), vec);
5373 scalar = gimple_convert (&seq, scalar_type, scalar);
5374 scalar_results.safe_push (scalar);
5376 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5378 else
5380 bool reduce_with_shift;
5381 tree vec_temp;
5383 gcc_assert (slp_reduc || new_phis.length () == 1);
5385 /* See if the target wants to do the final (shift) reduction
5386 in a vector mode of smaller size and first reduce upper/lower
5387 halves against each other. */
5388 enum machine_mode mode1 = mode;
5389 tree stype = TREE_TYPE (vectype);
5390 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5391 unsigned nunits1 = nunits;
5392 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5393 && new_phis.length () == 1)
5395 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5396 /* For SLP reductions we have to make sure lanes match up, but
5397 since we're doing an individual-element final reduction,
5398 reducing the vector width here is even more important.
5399 ??? We could also separate lanes with permutes; for the common
5400 case of a power-of-two group size, odd/even extracts would work. */
5401 if (slp_reduc && nunits != nunits1)
5403 nunits1 = least_common_multiple (nunits1, group_size);
5404 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5407 if (!slp_reduc
5408 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5409 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5411 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5412 stype, nunits1);
5413 reduce_with_shift = have_whole_vector_shift (mode1);
5414 if (!VECTOR_MODE_P (mode1))
5415 reduce_with_shift = false;
5416 else
5418 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5419 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5420 reduce_with_shift = false;
5423 /* First reduce the vector to the vector size we should do the shift
5424 reduction on, by combining upper and lower halves. */
5425 new_temp = new_phi_result;
5426 while (nunits > nunits1)
5428 nunits /= 2;
5429 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5430 stype, nunits);
5431 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5433 /* The target has to make sure we support lowpart/highpart
5434 extraction, either via direct vector extract or through
5435 integer mode punning. */
5436 tree dst1, dst2;
5437 if (convert_optab_handler (vec_extract_optab,
5438 TYPE_MODE (TREE_TYPE (new_temp)),
5439 TYPE_MODE (vectype1))
5440 != CODE_FOR_nothing)
5442 /* Extract sub-vectors directly once vec_extract becomes
5443 a conversion optab. */
5444 dst1 = make_ssa_name (vectype1);
5445 epilog_stmt
5446 = gimple_build_assign (dst1, BIT_FIELD_REF,
5447 build3 (BIT_FIELD_REF, vectype1,
5448 new_temp, TYPE_SIZE (vectype1),
5449 bitsize_int (0)));
5450 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5451 dst2 = make_ssa_name (vectype1);
5452 epilog_stmt
5453 = gimple_build_assign (dst2, BIT_FIELD_REF,
5454 build3 (BIT_FIELD_REF, vectype1,
5455 new_temp, TYPE_SIZE (vectype1),
5456 bitsize_int (bitsize)));
5457 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5459 else
5461 /* Extract via punning to an appropriately sized integer mode
5462 vector. */
5463 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5464 tree etype = build_vector_type (eltype, 2);
5465 gcc_assert (convert_optab_handler (vec_extract_optab,
5466 TYPE_MODE (etype),
5467 TYPE_MODE (eltype))
5468 != CODE_FOR_nothing);
5469 tree tem = make_ssa_name (etype);
5470 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5471 build1 (VIEW_CONVERT_EXPR,
5472 etype, new_temp));
5473 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5474 new_temp = tem;
5475 tem = make_ssa_name (eltype);
5476 epilog_stmt
5477 = gimple_build_assign (tem, BIT_FIELD_REF,
5478 build3 (BIT_FIELD_REF, eltype,
5479 new_temp, TYPE_SIZE (eltype),
5480 bitsize_int (0)));
5481 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5482 dst1 = make_ssa_name (vectype1);
5483 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5484 build1 (VIEW_CONVERT_EXPR,
5485 vectype1, tem));
5486 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5487 tem = make_ssa_name (eltype);
5488 epilog_stmt
5489 = gimple_build_assign (tem, BIT_FIELD_REF,
5490 build3 (BIT_FIELD_REF, eltype,
5491 new_temp, TYPE_SIZE (eltype),
5492 bitsize_int (bitsize)));
5493 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5494 dst2 = make_ssa_name (vectype1);
5495 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5496 build1 (VIEW_CONVERT_EXPR,
5497 vectype1, tem));
5498 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5501 new_temp = make_ssa_name (vectype1);
5502 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5503 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5504 new_phis[0] = epilog_stmt;
5507 if (reduce_with_shift && !slp_reduc)
5509 int element_bitsize = tree_to_uhwi (bitsize);
5510 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5511 for variable-length vectors and also requires direct target support
5512 for loop reductions. */
5513 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5514 int nelements = vec_size_in_bits / element_bitsize;
5515 vec_perm_builder sel;
5516 vec_perm_indices indices;
5518 int elt_offset;
5520 tree zero_vec = build_zero_cst (vectype1);
5521 /* Case 2: Create:
5522 for (offset = nelements/2; offset >= 1; offset/=2)
5524 Create: va' = vec_shift <va, offset>
5525 Create: va = vop <va, va'>
5526 } */
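      /* E.g. for a plus-reduction of a 4-element vector {a, b, c, d},
         assuming the permute shifts elements towards lane 0 and fills the
         vacated lanes from ZERO_VEC: shifting by 2 gives {c, d, 0, 0} and
         adding yields {a+c, b+d, ...}; shifting that by 1 and adding again
         leaves the full sum in lane 0, which is extracted below.  */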
5528 tree rhs;
5530 if (dump_enabled_p ())
5531 dump_printf_loc (MSG_NOTE, vect_location,
5532 "Reduce using vector shifts\n");
5534 gimple_seq stmts = NULL;
5535 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5536 for (elt_offset = nelements / 2;
5537 elt_offset >= 1;
5538 elt_offset /= 2)
5540 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5541 indices.new_vector (sel, 2, nelements);
5542 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5543 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5544 new_temp, zero_vec, mask);
5545 new_temp = gimple_build (&stmts, code,
5546 vectype1, new_name, new_temp);
5548 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5550 /* 2.4 Extract the final scalar result. Create:
5551 s_out3 = extract_field <v_out2, bitpos> */
5553 if (dump_enabled_p ())
5554 dump_printf_loc (MSG_NOTE, vect_location,
5555 "extract scalar result\n");
5557 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5558 bitsize, bitsize_zero_node);
5559 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5560 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5561 gimple_assign_set_lhs (epilog_stmt, new_temp);
5562 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5563 scalar_results.safe_push (new_temp);
5565 else
5567 /* Case 3: Create:
5568 s = extract_field <v_out2, 0>
5569 for (offset = element_size;
5570 offset < vector_size;
5571 offset += element_size;)
5573 Create: s' = extract_field <v_out2, offset>
5574 Create: s = op <s, s'> // For non SLP cases
5575 } */
5577 if (dump_enabled_p ())
5578 dump_printf_loc (MSG_NOTE, vect_location,
5579 "Reduce using scalar code.\n");
5581 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5582 int element_bitsize = tree_to_uhwi (bitsize);
5583 tree compute_type = TREE_TYPE (vectype);
5584 gimple_seq stmts = NULL;
5585 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5587 int bit_offset;
5588 if (gimple_code (new_phi) == GIMPLE_PHI)
5589 vec_temp = PHI_RESULT (new_phi);
5590 else
5591 vec_temp = gimple_assign_lhs (new_phi);
5592 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5593 vec_temp, bitsize, bitsize_zero_node);
5595 /* In SLP we don't need to apply the reduction operation, so we just
5596 collect s' values in SCALAR_RESULTS. */
5597 if (slp_reduc)
5598 scalar_results.safe_push (new_temp);
5600 for (bit_offset = element_bitsize;
5601 bit_offset < vec_size_in_bits;
5602 bit_offset += element_bitsize)
5604 tree bitpos = bitsize_int (bit_offset);
5605 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5606 compute_type, vec_temp,
5607 bitsize, bitpos);
5608 if (slp_reduc)
5610 /* In SLP we don't need to apply the reduction operation, so
5611 we just collect s' values in SCALAR_RESULTS. */
5612 new_temp = new_name;
5613 scalar_results.safe_push (new_name);
5615 else
5616 new_temp = gimple_build (&stmts, code, compute_type,
5617 new_name, new_temp);
5621 /* The only case where we need to reduce scalar results in SLP is
5622 unrolling. If the size of SCALAR_RESULTS is greater than
5623 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5624 REDUC_GROUP_SIZE. */
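      /* E.g. with REDUC_GROUP_SIZE == 2 and four collected scalars
         s0, s1, s2, s3 (two unrolled copies), the loop below computes
         scalar_results[0] = s0 op s2 and scalar_results[1] = s1 op s3.  */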
5625 if (slp_reduc)
5627 tree res, first_res, new_res;
5629 /* Reduce multiple scalar results in case of SLP unrolling. */
5630 for (j = group_size; scalar_results.iterate (j, &res);
5631 j++)
5633 first_res = scalar_results[j % group_size];
5634 new_res = gimple_build (&stmts, code, compute_type,
5635 first_res, res);
5636 scalar_results[j % group_size] = new_res;
5638 for (k = 0; k < group_size; k++)
5639 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5640 scalar_results[k]);
5642 else
5644 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5645 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5646 scalar_results.safe_push (new_temp);
5649 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5652 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5653 && induc_val)
5655 /* Earlier we set the initial value to be a vector of induc_val
5656 values. Check the result and if it is induc_val then replace
5657 with the original initial value, unless induc_val is
5658 the same as initial_def already. */
5659 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5660 induc_val);
5662 tree tmp = make_ssa_name (new_scalar_dest);
5663 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5664 initial_def, new_temp);
5665 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5666 scalar_results[0] = tmp;
5670 /* 2.5 Adjust the final result by the initial value of the reduction
5671 variable. (When such adjustment is not needed, then
5672 'adjustment_def' is zero). For example, if code is PLUS we create:
5673 new_temp = loop_exit_def + adjustment_def */
5675 if (adjustment_def)
5677 gcc_assert (!slp_reduc);
5678 gimple_seq stmts = NULL;
5679 if (nested_in_vect_loop)
5681 new_phi = new_phis[0];
5682 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5683 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5684 new_temp = gimple_build (&stmts, code, vectype,
5685 PHI_RESULT (new_phi), adjustment_def);
5687 else
5689 new_temp = scalar_results[0];
5690 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5691 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5692 new_temp = gimple_build (&stmts, code, scalar_type,
5693 new_temp, adjustment_def);
5696 epilog_stmt = gimple_seq_last_stmt (stmts);
5697 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5698 if (nested_in_vect_loop)
5700 if (!double_reduc)
5701 scalar_results.quick_push (new_temp);
5702 else
5703 scalar_results[0] = new_temp;
5705 else
5706 scalar_results[0] = new_temp;
5708 new_phis[0] = epilog_stmt;
5711 if (double_reduc)
5712 loop = loop->inner;
5714 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5715 phis with new adjusted scalar results, i.e., replace use <s_out0>
5716 with use <s_out4>.
5718 Transform:
5719 loop_exit:
5720 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5721 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5722 v_out2 = reduce <v_out1>
5723 s_out3 = extract_field <v_out2, 0>
5724 s_out4 = adjust_result <s_out3>
5725 use <s_out0>
5726 use <s_out0>
5728 into:
5730 loop_exit:
5731 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5732 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5733 v_out2 = reduce <v_out1>
5734 s_out3 = extract_field <v_out2, 0>
5735 s_out4 = adjust_result <s_out3>
5736 use <s_out4>
5737 use <s_out4> */
5740 /* In an SLP reduction chain we reduce the vector results into one vector
5741 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5742 the LHS of the last stmt in the reduction chain, since we are looking
5743 for the loop exit phi node. */
5744 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5746 stmt_vec_info dest_stmt_info
5747 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5748 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5749 group_size = 1;
5752 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS
5753 (in case REDUC_GROUP_SIZE is greater than the vectorization factor).
5754 Therefore, we need to match SCALAR_RESULTS with the corresponding
5755 statements. The first (REDUC_GROUP_SIZE / number of new vector stmts)
5756 scalar results correspond to the first vector stmt, etc.
5757 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
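   /* E.g. with REDUC_GROUP_SIZE == 4 and two new vector stmts, RATIO is 2:
      scalar results 0 and 1 belong to the first vector stmt and scalar
      results 2 and 3 to the second.  */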
5758 if (group_size > new_phis.length ())
5759 gcc_assert (!(group_size % new_phis.length ()));
5761 for (k = 0; k < group_size; k++)
5763 if (slp_reduc)
5765 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5767 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5768 /* SLP statements can't participate in patterns. */
5769 gcc_assert (!orig_stmt_info);
5770 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5773 if (nested_in_vect_loop)
5775 if (double_reduc)
5776 loop = outer_loop;
5777 else
5778 gcc_unreachable ();
5781 phis.create (3);
5782 /* Find the loop-closed-use at the loop exit of the original scalar
5783 result. (The reduction result is expected to have two immediate uses,
5784 one at the latch block, and one at the loop exit). For double
5785 reductions we are looking for exit phis of the outer loop. */
5786 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5788 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5790 if (!is_gimple_debug (USE_STMT (use_p)))
5791 phis.safe_push (USE_STMT (use_p));
5793 else
5795 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5797 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5799 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5801 if (!flow_bb_inside_loop_p (loop,
5802 gimple_bb (USE_STMT (phi_use_p)))
5803 && !is_gimple_debug (USE_STMT (phi_use_p)))
5804 phis.safe_push (USE_STMT (phi_use_p));
5810 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5812 /* Replace the uses: */
5813 orig_name = PHI_RESULT (exit_phi);
5814 scalar_result = scalar_results[k];
5815 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5817 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5818 SET_USE (use_p, scalar_result);
5819 update_stmt (use_stmt);
5823 phis.release ();
5827 /* Return a vector of type VECTYPE that is equal to the vector select
5828 operation "MASK ? VEC : IDENTITY". Insert the select statements
5829 before GSI. */
5831 static tree
5832 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5833 tree vec, tree identity)
5835 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5836 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5837 mask, vec, identity);
5838 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5839 return cond;
5842 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5843 order, starting with LHS. Insert the extraction statements before GSI and
5844 associate the new scalar SSA names with variable SCALAR_DEST.
5845 Return the SSA name for the result. */
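/* For example, for a 4-element VECTOR_RHS v and PLUS_EXPR this open-codes
   (((LHS + v[0]) + v[1]) + v[2]) + v[3], preserving the strict
   left-to-right association that an in-order reduction requires.  */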
5847 static tree
5848 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5849 tree_code code, tree lhs, tree vector_rhs)
5851 tree vectype = TREE_TYPE (vector_rhs);
5852 tree scalar_type = TREE_TYPE (vectype);
5853 tree bitsize = TYPE_SIZE (scalar_type);
5854 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5855 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5857 for (unsigned HOST_WIDE_INT bit_offset = 0;
5858 bit_offset < vec_size_in_bits;
5859 bit_offset += element_bitsize)
5861 tree bitpos = bitsize_int (bit_offset);
5862 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5863 bitsize, bitpos);
5865 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5866 rhs = make_ssa_name (scalar_dest, stmt);
5867 gimple_assign_set_lhs (stmt, rhs);
5868 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5870 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5871 tree new_name = make_ssa_name (scalar_dest, stmt);
5872 gimple_assign_set_lhs (stmt, new_name);
5873 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5874 lhs = new_name;
5876 return lhs;
5879 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5880 type of the vector input. */
5882 static internal_fn
5883 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5885 internal_fn mask_reduc_fn;
5887 switch (reduc_fn)
5889 case IFN_FOLD_LEFT_PLUS:
5890 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5891 break;
5893 default:
5894 return IFN_LAST;
5897 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5898 OPTIMIZE_FOR_SPEED))
5899 return mask_reduc_fn;
5900 return IFN_LAST;
5903 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5904 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5905 statement. CODE is the operation performed by STMT_INFO and OPS are
5906 its scalar operands. REDUC_INDEX is the index of the operand in
5907 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5908 implements in-order reduction, or IFN_LAST if we should open-code it.
5909 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5910 that should be used to control the operation in a fully-masked loop. */
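/* For example, for an in-order "sum += a[i]" each vector of A is folded
   into the scalar SUM in element order, either via a single call to the
   target's IFN_FOLD_LEFT_PLUS (or its masked variant) or, when no such
   function is available, open-coded element by element through
   vect_expand_fold_left, so the result matches the scalar loop's
   association exactly.  */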
5912 static bool
5913 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5914 stmt_vec_info stmt_info,
5915 gimple_stmt_iterator *gsi,
5916 gimple **vec_stmt, slp_tree slp_node,
5917 gimple *reduc_def_stmt,
5918 tree_code code, internal_fn reduc_fn,
5919 tree ops[3], tree vectype_in,
5920 int reduc_index, vec_loop_masks *masks)
5922 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5923 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5924 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5926 int ncopies;
5927 if (slp_node)
5928 ncopies = 1;
5929 else
5930 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5932 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5933 gcc_assert (ncopies == 1);
5934 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5936 if (slp_node)
5937 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5938 TYPE_VECTOR_SUBPARTS (vectype_in)));
5940 tree op0 = ops[1 - reduc_index];
5942 int group_size = 1;
5943 stmt_vec_info scalar_dest_def_info;
5944 auto_vec<tree> vec_oprnds0;
5945 if (slp_node)
5947 auto_vec<vec<tree> > vec_defs (2);
5948 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5949 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5950 vec_defs[0].release ();
5951 vec_defs[1].release ();
5952 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5953 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5955 else
5957 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5958 op0, &vec_oprnds0);
5959 scalar_dest_def_info = stmt_info;
5962 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5963 tree scalar_type = TREE_TYPE (scalar_dest);
5964 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5966 int vec_num = vec_oprnds0.length ();
5967 gcc_assert (vec_num == 1 || slp_node);
5968 tree vec_elem_type = TREE_TYPE (vectype_out);
5969 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5971 tree vector_identity = NULL_TREE;
5972 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5973 vector_identity = build_zero_cst (vectype_out);
5975 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5976 int i;
5977 tree def0;
5978 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5980 gimple *new_stmt;
5981 tree mask = NULL_TREE;
5982 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5983 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5985 /* Handle MINUS by adding the negative. */
5986 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5988 tree negated = make_ssa_name (vectype_out);
5989 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5990 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5991 def0 = negated;
5994 if (mask && mask_reduc_fn == IFN_LAST)
5995 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5996 vector_identity);
5998 /* On the first iteration the input is simply the scalar phi
5999 result, and for subsequent iterations it is the output of
6000 the preceding operation. */
6001 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6003 if (mask && mask_reduc_fn != IFN_LAST)
6004 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6005 def0, mask);
6006 else
6007 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6008 def0);
6009 /* For chained SLP reductions the output of the previous reduction
6010 operation serves as the input of the next. For the final statement
6011 the output cannot be a temporary - we reuse the original
6012 scalar destination of the last statement. */
6013 if (i != vec_num - 1)
6015 gimple_set_lhs (new_stmt, scalar_dest_var);
6016 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6017 gimple_set_lhs (new_stmt, reduc_var);
6020 else
6022 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6023 reduc_var, def0);
6024 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6025 /* Remove the statement, so that we can use the same code paths
6026 as for statements that we've just created. */
6027 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6028 gsi_remove (&tmp_gsi, true);
6031 if (i == vec_num - 1)
6033 gimple_set_lhs (new_stmt, scalar_dest);
6034 vect_finish_replace_stmt (loop_vinfo,
6035 scalar_dest_def_info,
6036 new_stmt);
6038 else
6039 vect_finish_stmt_generation (loop_vinfo,
6040 scalar_dest_def_info,
6041 new_stmt, gsi);
6043 if (slp_node)
6044 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6045 else
6047 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6048 *vec_stmt = new_stmt;
6052 return true;
6055 /* Function is_nonwrapping_integer_induction.
6057 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6058 does not cause overflow. */
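/* Roughly: with constant BASE and STEP, the largest value the induction
   reaches is BASE + STEP * niters; the code below evaluates that in
   widest_int arithmetic and checks that it still fits in the precision
   of the PHI result type (or that overflow is undefined anyway).  */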
6060 static bool
6061 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6063 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6064 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6065 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6066 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6067 widest_int ni, max_loop_value, lhs_max;
6068 wi::overflow_type overflow = wi::OVF_NONE;
6070 /* Make sure the loop is integer based. */
6071 if (TREE_CODE (base) != INTEGER_CST
6072 || TREE_CODE (step) != INTEGER_CST)
6073 return false;
6075 /* Check that the max size of the loop will not wrap. */
6077 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6078 return true;
6080 if (! max_stmt_executions (loop, &ni))
6081 return false;
6083 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6084 &overflow);
6085 if (overflow)
6086 return false;
6088 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6089 TYPE_SIGN (lhs_type), &overflow);
6090 if (overflow)
6091 return false;
6093 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6094 <= TYPE_PRECISION (lhs_type));
6097 /* Check if masking can be supported by inserting a conditional expression.
6098 CODE is the code for the operation. COND_FN is the conditional internal
6099 function, if it exists. VECTYPE_IN is the type of the vector input. */
6100 static bool
6101 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6102 tree vectype_in)
6104 if (cond_fn != IFN_LAST
6105 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6106 OPTIMIZE_FOR_SPEED))
6107 return false;
6109 switch (code)
6111 case DOT_PROD_EXPR:
6112 case SAD_EXPR:
6113 return true;
6115 default:
6116 return false;
6120 /* Insert a conditional expression to enable masked vectorization. CODE is the
6121 code for the operation. VOP is the array of operands. MASK is the loop
6122 mask. GSI is a statement iterator used to place the new conditional
6123 expression. */
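/* For example, for DOT_PROD_EXPR the second operand becomes
   "mask ? vop[1] : 0", so inactive lanes add nothing to the accumulator;
   for SAD_EXPR it becomes "mask ? vop[1] : vop[0]", making the absolute
   difference zero in inactive lanes.  */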
6124 static void
6125 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6126 gimple_stmt_iterator *gsi)
6128 switch (code)
6130 case DOT_PROD_EXPR:
6132 tree vectype = TREE_TYPE (vop[1]);
6133 tree zero = build_zero_cst (vectype);
6134 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6135 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6136 mask, vop[1], zero);
6137 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6138 vop[1] = masked_op1;
6139 break;
6142 case SAD_EXPR:
6144 tree vectype = TREE_TYPE (vop[1]);
6145 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6146 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6147 mask, vop[1], vop[0]);
6148 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6149 vop[1] = masked_op1;
6150 break;
6153 default:
6154 gcc_unreachable ();
6158 /* Function vectorizable_reduction.
6160 Check if STMT_INFO performs a reduction operation that can be vectorized.
6161 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6162 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6163 Return true if STMT_INFO is vectorizable in this way.
6165 This function also handles reduction idioms (patterns) that have been
6166 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6167 may be of this form:
6168 X = pattern_expr (arg0, arg1, ..., X)
6169 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6170 sequence that had been detected and replaced by the pattern-stmt
6171 (STMT_INFO).
6173 This function also handles reduction of condition expressions, for example:
6174 for (int i = 0; i < N; i++)
6175 if (a[i] < value)
6176 last = a[i];
6177 This is handled by vectorizing the loop and creating an additional vector
6178 containing the loop indexes for which "a[i] < value" was true. In the
6179 function epilogue this is reduced to a single max value and then used to
6180 index into the vector of results.
6182 In some cases of reduction patterns, the type of the reduction variable X is
6183 different than the type of the other arguments of STMT_INFO.
6184 In such cases, the vectype that is used when transforming STMT_INFO into
6185 a vector stmt is different than the vectype that is used to determine the
6186 vectorization factor, because it consists of a different number of elements
6187 than the actual number of elements that are being operated upon in parallel.
6189 For example, consider an accumulation of shorts into an int accumulator.
6190 On some targets it's possible to vectorize this pattern operating on 8
6191 shorts at a time (hence, the vectype for purposes of determining the
6192 vectorization factor should be V8HI); on the other hand, the vectype that
6193 is used to create the vector form is actually V4SI (the type of the result).
6195 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6196 indicates what is the actual level of parallelism (V8HI in the example), so
6197 that the right vectorization factor would be derived. This vectype
6198 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6199 be used to create the vectorized stmt. The right vectype for the vectorized
6200 stmt is obtained from the type of the result X:
6201 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6203 This means that, contrary to "regular" reductions (or "regular" stmts in
6204 general), the following equation:
6205 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6206 does *NOT* necessarily hold for reduction patterns. */
6208 bool
6209 vectorizable_reduction (loop_vec_info loop_vinfo,
6210 stmt_vec_info stmt_info, slp_tree slp_node,
6211 slp_instance slp_node_instance,
6212 stmt_vector_for_cost *cost_vec)
6214 tree scalar_dest;
6215 tree vectype_in = NULL_TREE;
6216 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6217 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6218 stmt_vec_info cond_stmt_vinfo = NULL;
6219 tree scalar_type;
6220 int i;
6221 int ncopies;
6222 bool single_defuse_cycle = false;
6223 bool nested_cycle = false;
6224 bool double_reduc = false;
6225 int vec_num;
6226 tree tem;
6227 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6228 tree cond_reduc_val = NULL_TREE;
6230 /* Make sure it was already recognized as a reduction computation. */
6231 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6232 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6233 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6234 return false;
6236 /* The stmt we store reduction analysis meta on. */
6237 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6238 reduc_info->is_reduc_info = true;
6240 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6242 if (is_a <gphi *> (stmt_info->stmt))
6243 /* Analysis for double-reduction is done on the outer
6244 loop PHI, nested cycles have no further restrictions. */
6245 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6246 else
6247 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6248 return true;
6251 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6252 stmt_vec_info phi_info = stmt_info;
6253 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6254 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6256 if (!is_a <gphi *> (stmt_info->stmt))
6258 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6259 return true;
6261 if (slp_node)
6263 slp_node_instance->reduc_phis = slp_node;
6264 /* ??? We're leaving slp_node to point to the PHIs, we only
6265 need it to get at the number of vector stmts which wasn't
6266 yet initialized for the instance root. */
6268 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6269 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6270 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6272 use_operand_p use_p;
6273 gimple *use_stmt;
6274 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6275 &use_p, &use_stmt);
6276 gcc_assert (res);
6277 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6278 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6282 /* PHIs should not participate in patterns. */
6283 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6284 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6286 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6287 and compute the reduction chain length. */
6288 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6289 loop_latch_edge (loop));
6290 unsigned reduc_chain_length = 0;
6291 bool only_slp_reduc_chain = true;
6292 stmt_info = NULL;
6293 while (reduc_def != PHI_RESULT (reduc_def_phi))
6295 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6296 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6297 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6299 if (dump_enabled_p ())
6300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6301 "reduction chain broken by patterns.\n");
6302 return false;
6304 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6305 only_slp_reduc_chain = false;
6306 /* ??? For epilogue generation live members of the chain need
6307 to point back to the PHI via their original stmt for
6308 info_for_reduction to work. */
6309 if (STMT_VINFO_LIVE_P (vdef))
6310 STMT_VINFO_REDUC_DEF (def) = phi_info;
6311 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6312 if (!assign)
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6316 "reduction chain includes calls.\n");
6317 return false;
6319 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6321 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6322 TREE_TYPE (gimple_assign_rhs1 (assign))))
6324 if (dump_enabled_p ())
6325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6326 "conversion in the reduction chain.\n");
6327 return false;
6330 else if (!stmt_info)
6331 /* First non-conversion stmt. */
6332 stmt_info = vdef;
6333 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6334 reduc_chain_length++;
6336 /* PHIs should not participate in patterns. */
6337 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6339 if (nested_in_vect_loop_p (loop, stmt_info))
6341 loop = loop->inner;
6342 nested_cycle = true;
6345 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6346 element. */
6347 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6349 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6350 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6352 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6353 gcc_assert (slp_node
6354 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6356 /* 1. Is vectorizable reduction? */
6357 /* Not supportable if the reduction variable is used in the loop, unless
6358 it's a reduction chain. */
6359 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6360 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6361 return false;
6363 /* Reductions that are not used even in an enclosing outer-loop,
6364 are expected to be "live" (used out of the loop). */
6365 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6366 && !STMT_VINFO_LIVE_P (stmt_info))
6367 return false;
6369 /* 2. Has this been recognized as a reduction pattern?
6371 Check if STMT represents a pattern that has been recognized
6372 in earlier analysis stages. For stmts that represent a pattern,
6373 the STMT_VINFO_RELATED_STMT field records the last stmt in
6374 the original sequence that constitutes the pattern. */
6376 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6377 if (orig_stmt_info)
6379 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6380 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6383 /* 3. Check the operands of the operation. The first operands are defined
6384 inside the loop body. The last operand is the reduction variable,
6385 which is defined by the loop-header-phi. */
6387 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6388 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6389 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6390 enum tree_code code = gimple_assign_rhs_code (stmt);
6391 bool lane_reduc_code_p
6392 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6393 int op_type = TREE_CODE_LENGTH (code);
6395 scalar_dest = gimple_assign_lhs (stmt);
6396 scalar_type = TREE_TYPE (scalar_dest);
6397 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6398 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6399 return false;
6401 /* Do not try to vectorize bit-precision reductions. */
6402 if (!type_has_mode_precision_p (scalar_type))
6403 return false;
6405 /* For lane-reducing ops we're reducing the number of reduction PHIs
6406 which means the only use of the reduction PHI may be in the lane-reducing operation. */
6407 if (lane_reduc_code_p
6408 && reduc_chain_length != 1
6409 && !only_slp_reduc_chain)
6411 if (dump_enabled_p ())
6412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6413 "lane-reducing reduction with extra stmts.\n");
6414 return false;
6417 /* All uses but the last are expected to be defined in the loop.
6418 The last use is the reduction variable. In case of nested cycle this
6419 assumption is not true: we use reduc_index to record the index of the
6420 reduction variable. */
6421 /* ??? To get at invariant/constant uses on the SLP node we have to
6422 get to it here, slp_node is still the reduction PHI. */
6423 slp_tree slp_for_stmt_info = NULL;
6424 if (slp_node)
6426 slp_for_stmt_info = slp_node_instance->root;
6427 /* And then there's the reduction chain with a conversion ... */
6428 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6429 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6430 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6432 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6433 /* We need to skip an extra operand for COND_EXPRs with embedded
6434 comparison. */
6435 unsigned opno_adjust = 0;
6436 if (code == COND_EXPR
6437 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6438 opno_adjust = 1;
6439 for (i = 0; i < op_type; i++)
6441 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6442 if (i == 0 && code == COND_EXPR)
6443 continue;
6445 stmt_vec_info def_stmt_info;
6446 enum vect_def_type dt;
6447 tree op;
6448 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6449 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6450 &def_stmt_info))
6452 if (dump_enabled_p ())
6453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6454 "use not simple.\n");
6455 return false;
6457 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6458 continue;
6460 /* There should be only one cycle def in the stmt, the one
6461 leading to reduc_def. */
6462 if (VECTORIZABLE_CYCLE_DEF (dt))
6463 return false;
6465 /* To properly compute ncopies we are interested in the widest
6466 non-reduction input type in case we're looking at a widening
6467 accumulation that we later handle in vect_transform_reduction. */
6468 if (lane_reduc_code_p
6469 && tem
6470 && (!vectype_in
6471 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6472 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6473 vectype_in = tem;
6475 if (code == COND_EXPR)
6477 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6478 if (dt == vect_constant_def)
6480 cond_reduc_dt = dt;
6481 cond_reduc_val = op;
6483 if (dt == vect_induction_def
6484 && def_stmt_info
6485 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6487 cond_reduc_dt = dt;
6488 cond_stmt_vinfo = def_stmt_info;
6492 if (!vectype_in)
6493 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6494 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6496 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6497 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6498 /* If we have a condition reduction, see if we can simplify it further. */
6499 if (v_reduc_type == COND_REDUCTION)
6501 if (slp_node)
6502 return false;
6504 /* When the condition itself uses the reduction value, fail. */
6505 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6507 if (dump_enabled_p ())
6508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6509 "condition depends on previous iteration\n");
6510 return false;
6513 if (reduc_chain_length == 1
6514 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6515 vectype_in, OPTIMIZE_FOR_SPEED))
6517 if (dump_enabled_p ())
6518 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6519 "optimizing condition reduction with"
6520 " FOLD_EXTRACT_LAST.\n");
6521 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6523 else if (cond_reduc_dt == vect_induction_def)
6525 tree base
6526 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6527 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6529 gcc_assert (TREE_CODE (base) == INTEGER_CST
6530 && TREE_CODE (step) == INTEGER_CST);
6531 cond_reduc_val = NULL_TREE;
6532 enum tree_code cond_reduc_op_code = ERROR_MARK;
6533 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6534 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6536 /* Find a suitable value: below base for MAX_EXPR, above base for
6537 MIN_EXPR; for now punt if base is the minimum value of the type for
6538 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6539 else if (tree_int_cst_sgn (step) == -1)
6541 cond_reduc_op_code = MIN_EXPR;
6542 if (tree_int_cst_sgn (base) == -1)
6543 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6544 else if (tree_int_cst_lt (base,
6545 TYPE_MAX_VALUE (TREE_TYPE (base))))
6546 cond_reduc_val
6547 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6549 else
6551 cond_reduc_op_code = MAX_EXPR;
6552 if (tree_int_cst_sgn (base) == 1)
6553 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6554 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6555 base))
6556 cond_reduc_val
6557 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6559 if (cond_reduc_val)
6561 if (dump_enabled_p ())
6562 dump_printf_loc (MSG_NOTE, vect_location,
6563 "condition expression based on "
6564 "integer induction.\n");
6565 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6566 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6567 = cond_reduc_val;
6568 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6571 else if (cond_reduc_dt == vect_constant_def)
6573 enum vect_def_type cond_initial_dt;
6574 tree cond_initial_val
6575 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6577 gcc_assert (cond_reduc_val != NULL_TREE);
6578 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6579 if (cond_initial_dt == vect_constant_def
6580 && types_compatible_p (TREE_TYPE (cond_initial_val),
6581 TREE_TYPE (cond_reduc_val)))
6583 tree e = fold_binary (LE_EXPR, boolean_type_node,
6584 cond_initial_val, cond_reduc_val);
6585 if (e && (integer_onep (e) || integer_zerop (e)))
6587 if (dump_enabled_p ())
6588 dump_printf_loc (MSG_NOTE, vect_location,
6589 "condition expression based on "
6590 "compile time constant.\n");
6591 /* Record reduction code at analysis stage. */
6592 STMT_VINFO_REDUC_CODE (reduc_info)
6593 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6594 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6600 if (STMT_VINFO_LIVE_P (phi_info))
6601 return false;
6603 if (slp_node)
6604 ncopies = 1;
6605 else
6606 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6608 gcc_assert (ncopies >= 1);
6610 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6612 if (nested_cycle)
6614 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6615 == vect_double_reduction_def);
6616 double_reduc = true;
6619 /* 4.2. Check support for the epilog operation.
6621 If STMT represents a reduction pattern, then the type of the
6622 reduction variable may be different than the type of the rest
6623 of the arguments. For example, consider the case of accumulation
6624 of shorts into an int accumulator; The original code:
6625 S1: int_a = (int) short_a;
6626 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6628 was replaced with:
6629 STMT: int_acc = widen_sum <short_a, int_acc>
6631 This means that:
6632 1. The tree-code that is used to create the vector operation in the
6633 epilog code (that reduces the partial results) is not the
6634 tree-code of STMT, but is rather the tree-code of the original
6635 stmt from the pattern that STMT is replacing. I.e, in the example
6636 above we want to use 'widen_sum' in the loop, but 'plus' in the
6637 epilog.
6638 2. The type (mode) we use to check available target support
6639 for the vector operation to be created in the *epilog*, is
6640 determined by the type of the reduction variable (in the example
6641 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6642 However the type (mode) we use to check available target support
6643 for the vector operation to be created *inside the loop*, is
6644 determined by the type of the other arguments to STMT (in the
6645 example we'd check this: optab_handler (widen_sum_optab,
6646 vect_short_mode)).
6648 This is contrary to "regular" reductions, in which the types of all
6649 the arguments are the same as the type of the reduction variable.
6650 For "regular" reductions we can therefore use the same vector type
6651 (and also the same tree-code) when generating the epilog code and
6652 when generating the code inside the loop. */
6654 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6655 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6657 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6658 if (reduction_type == TREE_CODE_REDUCTION)
6660 /* Check whether it's ok to change the order of the computation.
6661 Generally, when vectorizing a reduction we change the order of the
6662 computation. This may change the behavior of the program in some
6663 cases, so we need to check that this is ok. One exception is when
6664 vectorizing an outer-loop: the inner-loop is executed sequentially,
6665 and therefore vectorizing reductions in the inner-loop during
6666 outer-loop vectorization is safe. */
6667 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6669 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6670 is not directly used in stmt. */
6671 if (!only_slp_reduc_chain
6672 && reduc_chain_length != 1)
6674 if (dump_enabled_p ())
6675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6676 "in-order reduction chain without SLP.\n");
6677 return false;
6679 STMT_VINFO_REDUC_TYPE (reduc_info)
6680 = reduction_type = FOLD_LEFT_REDUCTION;
6682 else if (!commutative_tree_code (orig_code)
6683 || !associative_tree_code (orig_code))
6685 if (dump_enabled_p ())
6686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6687 "reduction: not commutative/associative");
6688 return false;
6692 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6693 && ncopies > 1)
6695 if (dump_enabled_p ())
6696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6697 "multiple types in double reduction or condition "
6698 "reduction or fold-left reduction.\n");
6699 return false;
6702 internal_fn reduc_fn = IFN_LAST;
6703 if (reduction_type == TREE_CODE_REDUCTION
6704 || reduction_type == FOLD_LEFT_REDUCTION
6705 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6706 || reduction_type == CONST_COND_REDUCTION)
6708 if (reduction_type == FOLD_LEFT_REDUCTION
6709 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6710 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6712 if (reduc_fn != IFN_LAST
6713 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6714 OPTIMIZE_FOR_SPEED))
6716 if (dump_enabled_p ())
6717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6718 "reduc op not supported by target.\n");
6720 reduc_fn = IFN_LAST;
6723 else
6725 if (!nested_cycle || double_reduc)
6727 if (dump_enabled_p ())
6728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6729 "no reduc code for scalar code.\n");
6731 return false;
6735 else if (reduction_type == COND_REDUCTION)
6737 int scalar_precision
6738 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6739 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6740 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6741 nunits_out);
6743 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6744 OPTIMIZE_FOR_SPEED))
6745 reduc_fn = IFN_REDUC_MAX;
6747 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6749 if (reduction_type != EXTRACT_LAST_REDUCTION
6750 && (!nested_cycle || double_reduc)
6751 && reduc_fn == IFN_LAST
6752 && !nunits_out.is_constant ())
6754 if (dump_enabled_p ())
6755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756 "missing target support for reduction on"
6757 " variable-length vectors.\n");
6758 return false;
6761 /* For SLP reductions, see if there is a neutral value we can use. */
6762 tree neutral_op = NULL_TREE;
6763 if (slp_node)
6764 neutral_op = neutral_op_for_slp_reduction
6765 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6766 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6768 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6770 /* We can't support in-order reductions of code such as this:
6772 for (int i = 0; i < n1; ++i)
6773 for (int j = 0; j < n2; ++j)
6774 l += a[j];
6776 since GCC effectively transforms the loop when vectorizing:
6778 for (int i = 0; i < n1 / VF; ++i)
6779 for (int j = 0; j < n2; ++j)
6780 for (int k = 0; k < VF; ++k)
6781 l += a[j];
6783 which is a reassociation of the original operation. */
6784 if (dump_enabled_p ())
6785 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6786 "in-order double reduction not supported.\n");
6788 return false;
6791 if (reduction_type == FOLD_LEFT_REDUCTION
6792 && slp_node
6793 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6795 /* We cannot use in-order reductions in this case because there is
6796 an implicit reassociation of the operations involved. */
6797 if (dump_enabled_p ())
6798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799 "in-order unchained SLP reductions not supported.\n");
6800 return false;
6803 /* For double reductions, and for SLP reductions with a neutral value,
6804 we construct a variable-length initial vector by loading a vector
6805 full of the neutral value and then shift-and-inserting the start
6806 values into the low-numbered elements. */
6807 if ((double_reduc || neutral_op)
6808 && !nunits_out.is_constant ()
6809 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6810 vectype_out, OPTIMIZE_FOR_SPEED))
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6814 "reduction on variable-length vectors requires"
6815 " target support for a vector-shift-and-insert"
6816 " operation.\n");
6817 return false;
6820 /* Check extra constraints for variable-length unchained SLP reductions. */
6821 if (STMT_SLP_TYPE (stmt_info)
6822 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6823 && !nunits_out.is_constant ())
6825 /* We checked above that we could build the initial vector when
6826 there's a neutral element value. Check here for the case in
6827 which each SLP statement has its own initial value and in which
6828 that value needs to be repeated for every instance of the
6829 statement within the initial vector. */
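/* For instance (illustrative values), a two-lane SLP MAX reduction with
   start values a and b wants the initial vector { a, b, a, b, ... },
   which is precisely what duplicate-and-interleave provides when the
   target supports it.  */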
6830 unsigned int group_size = SLP_TREE_LANES (slp_node);
6831 if (!neutral_op
6832 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6833 TREE_TYPE (vectype_out)))
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "unsupported form of SLP reduction for"
6838 " variable-length vectors: cannot build"
6839 " initial vector.\n");
6840 return false;
6842 /* The epilogue code relies on the number of elements being a multiple
6843 of the group size. The duplicate-and-interleave approach to setting
6844 up the initial vector does too. */
6845 if (!multiple_p (nunits_out, group_size))
6847 if (dump_enabled_p ())
6848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6849 "unsupported form of SLP reduction for"
6850 " variable-length vectors: the vector size"
6851 " is not a multiple of the number of results.\n");
6852 return false;
6856 if (reduction_type == COND_REDUCTION)
6858 widest_int ni;
6860 if (! max_loop_iterations (loop, &ni))
6862 if (dump_enabled_p ())
6863 dump_printf_loc (MSG_NOTE, vect_location,
6864 "loop count not known, cannot create cond "
6865 "reduction.\n");
6866 return false;
6868 /* Convert backedges to iterations. */
6869 ni += 1;
6871 /* The additional index will be the same type as the condition. Check
6872 that the loop count fits into this type less one (because we'll use up
6873 the zero slot for when there are no matches). */
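/* E.g. for an 8-bit scalar type the index type is also 8 bits wide, so
   only loops whose iteration count is known to be at most 254 (255 minus
   the reserved zero slot) can use this scheme.  */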
6874 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6875 if (wi::geu_p (ni, wi::to_widest (max_index)))
6877 if (dump_enabled_p ())
6878 dump_printf_loc (MSG_NOTE, vect_location,
6879 "loop size is greater than data size.\n");
6880 return false;
6884 /* In case the vectorization factor (VF) is bigger than the number
6885 of elements that we can fit in a vectype (nunits), we have to generate
6886 more than one vector stmt - i.e - we need to "unroll" the
6887 vector stmt by a factor VF/nunits. For more details see documentation
6888 in vectorizable_operation. */
6890 /* If the reduction is used in an outer loop we need to generate
6891 VF intermediate results, like so (e.g. for ncopies=2):
6892 r0 = phi (init, r0)
6893 r1 = phi (init, r1)
6894 r0 = x0 + r0;
6895 r1 = x1 + r1;
6896 (i.e. we generate VF results in 2 registers).
6897 In this case we have a separate def-use cycle for each copy, and therefore
6898 for each copy we get the vector def for the reduction variable from the
6899 respective phi node created for this copy.
6901 Otherwise (the reduction is unused in the loop nest), we can combine
6902 together intermediate results, like so (e.g. for ncopies=2):
6903 r = phi (init, r)
6904 r = x0 + r;
6905 r = x1 + r;
6906 (i.e. we generate VF/2 results in a single register).
6907 In this case for each copy we get the vector def for the reduction variable
6908 from the vectorized reduction operation generated in the previous iteration.
6910 This only works when we see both the reduction PHI and its only consumer
6911 in vectorizable_reduction and there are no intermediate stmts
6912 participating. */
6913 if (ncopies > 1
6914 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6915 && reduc_chain_length == 1)
6916 single_defuse_cycle = true;
6918 if (single_defuse_cycle || lane_reduc_code_p)
6920 gcc_assert (code != COND_EXPR);
6922 /* 4. Supportable by target? */
6923 bool ok = true;
6925 /* 4.1. check support for the operation in the loop */
6926 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6927 if (!optab)
6929 if (dump_enabled_p ())
6930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6931 "no optab.\n");
6932 ok = false;
6935 machine_mode vec_mode = TYPE_MODE (vectype_in);
6936 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6938 if (dump_enabled_p ())
6939 dump_printf (MSG_NOTE, "op not supported by target.\n");
6940 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6941 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6942 ok = false;
6943 else
6944 if (dump_enabled_p ())
6945 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6948 /* Worthwhile without SIMD support? */
6949 if (ok
6950 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6951 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6953 if (dump_enabled_p ())
6954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6955 "not worthwhile without SIMD support.\n");
6956 ok = false;
6959 /* lane-reducing operations have to go through vect_transform_reduction.
6960 For the other cases try without the single cycle optimization. */
6961 if (!ok)
6963 if (lane_reduc_code_p)
6964 return false;
6965 else
6966 single_defuse_cycle = false;
6969 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6971 /* If the reduction stmt is one of the patterns that has a lane-reducing
6972 operation embedded, we cannot handle the !single_defuse_cycle case. */
6973 if ((ncopies > 1 && ! single_defuse_cycle)
6974 && lane_reduc_code_p)
6976 if (dump_enabled_p ())
6977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6978 "multi def-use cycle not possible for lane-reducing "
6979 "reduction operation\n");
6980 return false;
6983 if (slp_node
6984 && !(!single_defuse_cycle
6985 && code != DOT_PROD_EXPR
6986 && code != WIDEN_SUM_EXPR
6987 && code != SAD_EXPR
6988 && reduction_type != FOLD_LEFT_REDUCTION))
6989 for (i = 0; i < op_type; i++)
6990 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6992 if (dump_enabled_p ())
6993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6994 "incompatible vector types for invariants\n");
6995 return false;
6998 if (slp_node)
6999 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7000 else
7001 vec_num = 1;
7003 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7004 reduction_type, ncopies, cost_vec);
7005 if (dump_enabled_p ()
7006 && reduction_type == FOLD_LEFT_REDUCTION)
7007 dump_printf_loc (MSG_NOTE, vect_location,
7008 "using an in-order (fold-left) reduction.\n");
7009 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7010 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7011 reductions go through their own vectorizable_* routines. */
7012 if (!single_defuse_cycle
7013 && code != DOT_PROD_EXPR
7014 && code != WIDEN_SUM_EXPR
7015 && code != SAD_EXPR
7016 && reduction_type != FOLD_LEFT_REDUCTION)
7018 stmt_vec_info tem
7019 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7020 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7022 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7023 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7025 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7026 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7028 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7030 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7031 internal_fn cond_fn = get_conditional_internal_fn (code);
7033 if (reduction_type != FOLD_LEFT_REDUCTION
7034 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7035 && (cond_fn == IFN_LAST
7036 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7037 OPTIMIZE_FOR_SPEED)))
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041 "can't operate on partial vectors because"
7042 " no conditional operation is available.\n");
7043 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7045 else if (reduction_type == FOLD_LEFT_REDUCTION
7046 && reduc_fn == IFN_LAST
7047 && !expand_vec_cond_expr_p (vectype_in,
7048 truth_type_for (vectype_in),
7049 SSA_NAME))
7051 if (dump_enabled_p ())
7052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7053 "can't operate on partial vectors because"
7054 " no conditional operation is available.\n");
7055 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7057 else
7058 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7059 vectype_in, NULL);
7061 return true;
7064 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7065 value. */
7067 bool
7068 vect_transform_reduction (loop_vec_info loop_vinfo,
7069 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7070 gimple **vec_stmt, slp_tree slp_node)
7072 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7073 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7074 int i;
7075 int ncopies;
7076 int vec_num;
7078 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7079 gcc_assert (reduc_info->is_reduc_info);
7081 if (nested_in_vect_loop_p (loop, stmt_info))
7083 loop = loop->inner;
7084 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7087 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7088 enum tree_code code = gimple_assign_rhs_code (stmt);
7089 int op_type = TREE_CODE_LENGTH (code);
7091 /* Flatten RHS. */
7092 tree ops[3];
7093 switch (get_gimple_rhs_class (code))
7095 case GIMPLE_TERNARY_RHS:
7096 ops[2] = gimple_assign_rhs3 (stmt);
7097 /* Fall thru. */
7098 case GIMPLE_BINARY_RHS:
7099 ops[0] = gimple_assign_rhs1 (stmt);
7100 ops[1] = gimple_assign_rhs2 (stmt);
7101 break;
7102 default:
7103 gcc_unreachable ();
7106 /* All uses but the last are expected to be defined in the loop.
7107 The last use is the reduction variable. In case of nested cycle this
7108 assumption is not true: we use reduc_index to record the index of the
7109 reduction variable. */
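/* For instance, for the common form

     sum_2 = _x + sum_1;

   where sum_1 is defined by the reduction PHI, the PHI value is the
   second operand of the statement and reduc_index is 1.  */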
7110 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7111 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7112 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7113 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7115 if (slp_node)
7117 ncopies = 1;
7118 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7120 else
7122 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7123 vec_num = 1;
7126 internal_fn cond_fn = get_conditional_internal_fn (code);
7127 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7128 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7130 /* Transform. */
7131 tree new_temp = NULL_TREE;
7132 auto_vec<tree> vec_oprnds0;
7133 auto_vec<tree> vec_oprnds1;
7134 auto_vec<tree> vec_oprnds2;
7135 tree def0;
7137 if (dump_enabled_p ())
7138 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7140 /* FORNOW: Multiple types are not supported for condition. */
7141 if (code == COND_EXPR)
7142 gcc_assert (ncopies == 1);
7144 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7146 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7147 if (reduction_type == FOLD_LEFT_REDUCTION)
7149 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7150 return vectorize_fold_left_reduction
7151 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7152 reduc_fn, ops, vectype_in, reduc_index, masks);
7155 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7156 gcc_assert (single_defuse_cycle
7157 || code == DOT_PROD_EXPR
7158 || code == WIDEN_SUM_EXPR
7159 || code == SAD_EXPR);
7161 /* Create the destination vector */
7162 tree scalar_dest = gimple_assign_lhs (stmt);
7163 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7165 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7166 single_defuse_cycle && reduc_index == 0
7167 ? NULL_TREE : ops[0], &vec_oprnds0,
7168 single_defuse_cycle && reduc_index == 1
7169 ? NULL_TREE : ops[1], &vec_oprnds1,
7170 op_type == ternary_op
7171 && !(single_defuse_cycle && reduc_index == 2)
7172 ? ops[2] : NULL_TREE, &vec_oprnds2);
7173 if (single_defuse_cycle)
7175 gcc_assert (!slp_node);
7176 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7177 ops[reduc_index],
7178 reduc_index == 0 ? &vec_oprnds0
7179 : (reduc_index == 1 ? &vec_oprnds1
7180 : &vec_oprnds2));
7183 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7185 gimple *new_stmt;
7186 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7187 if (masked_loop_p && !mask_by_cond_expr)
7189 /* Make sure that the reduction accumulator is vop[0]. */
7190 if (reduc_index == 1)
7192 gcc_assert (commutative_tree_code (code));
7193 std::swap (vop[0], vop[1]);
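/* The conditional internal function call built below computes
   MASK ? VOP[0] CODE VOP[1] : VOP[0], so lanes that are inactive in
   this iteration simply keep the accumulator value.  */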
7195 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7196 vectype_in, i);
7197 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7198 vop[0], vop[1], vop[0]);
7199 new_temp = make_ssa_name (vec_dest, call);
7200 gimple_call_set_lhs (call, new_temp);
7201 gimple_call_set_nothrow (call, true);
7202 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7203 new_stmt = call;
7205 else
7207 if (op_type == ternary_op)
7208 vop[2] = vec_oprnds2[i];
7210 if (masked_loop_p && mask_by_cond_expr)
7212 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7213 vectype_in, i);
7214 build_vect_cond_expr (code, vop, mask, gsi);
7217 new_stmt = gimple_build_assign (vec_dest, code,
7218 vop[0], vop[1], vop[2]);
7219 new_temp = make_ssa_name (vec_dest, new_stmt);
7220 gimple_assign_set_lhs (new_stmt, new_temp);
7221 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7224 if (slp_node)
7225 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7226 else if (single_defuse_cycle
7227 && i < ncopies - 1)
7229 if (reduc_index == 0)
7230 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7231 else if (reduc_index == 1)
7232 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7233 else if (reduc_index == 2)
7234 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7236 else
7237 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7240 if (!slp_node)
7241 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7243 return true;
7246 /* Transform phase of a cycle PHI. */
7248 bool
7249 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7250 stmt_vec_info stmt_info, gimple **vec_stmt,
7251 slp_tree slp_node, slp_instance slp_node_instance)
7253 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7254 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7255 int i;
7256 int ncopies;
7257 int j;
7258 bool nested_cycle = false;
7259 int vec_num;
7261 if (nested_in_vect_loop_p (loop, stmt_info))
7263 loop = loop->inner;
7264 nested_cycle = true;
7267 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7268 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7269 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7270 gcc_assert (reduc_info->is_reduc_info);
7272 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7273 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7274 /* Leave the scalar phi in place. */
7275 return true;
7277 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7278 /* For a nested cycle we do not fill the above. */
7279 if (!vectype_in)
7280 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7281 gcc_assert (vectype_in);
7283 if (slp_node)
7285 /* The size vect_schedule_slp_instance computes is off for us. */
7286 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7287 * SLP_TREE_LANES (slp_node), vectype_in);
7288 ncopies = 1;
7290 else
7292 vec_num = 1;
7293 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7296 /* Check whether we should use a single PHI node and accumulate
7297 vectors to one before the backedge. */
7298 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7299 ncopies = 1;
7301 /* Create the destination vector */
7302 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7303 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7304 vectype_out);
7306 /* Get the loop-entry arguments. */
7307 tree vec_initial_def;
7308 auto_vec<tree> vec_initial_defs;
7309 if (slp_node)
7311 vec_initial_defs.reserve (vec_num);
7312 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7313 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7314 tree neutral_op
7315 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7316 STMT_VINFO_REDUC_CODE (reduc_info),
7317 first != NULL);
7318 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7319 &vec_initial_defs, vec_num,
7320 first != NULL, neutral_op);
7322 else
7324 /* Get at the scalar def before the loop, that defines the initial
7325 value of the reduction variable. */
7326 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7327 loop_preheader_edge (loop));
7328 /* Optimize: if for a REDUC_MAX the initial_def is smaller than the base
7329 value and we can't use zero for induc_val, use initial_def instead.
7330 Similarly for REDUC_MIN and an initial_def larger than the base. */
7331 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7333 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7334 if (TREE_CODE (initial_def) == INTEGER_CST
7335 && !integer_zerop (induc_val)
7336 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7337 && tree_int_cst_lt (initial_def, induc_val))
7338 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7339 && tree_int_cst_lt (induc_val, initial_def))))
7341 induc_val = initial_def;
7342 /* Communicate that we used the initial_def to epilogue
7343 generation. */
7344 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7346 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7347 vec_initial_defs.create (ncopies);
7348 for (i = 0; i < ncopies; ++i)
7349 vec_initial_defs.quick_push (vec_initial_def);
7351 else if (nested_cycle)
7353 /* Do not use an adjustment def as that case is not supported
7354 correctly if ncopies is not one. */
7355 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7356 ncopies, initial_def,
7357 &vec_initial_defs);
7359 else
7361 tree adjustment_def = NULL_TREE;
7362 tree *adjustment_defp = &adjustment_def;
7363 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7364 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7365 adjustment_defp = NULL;
7366 vec_initial_def
7367 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7368 initial_def, adjustment_defp);
7369 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7370 vec_initial_defs.create (ncopies);
7371 for (i = 0; i < ncopies; ++i)
7372 vec_initial_defs.quick_push (vec_initial_def);
7376 /* Generate the reduction PHIs upfront. */
7377 for (i = 0; i < vec_num; i++)
7379 tree vec_init_def = vec_initial_defs[i];
7380 for (j = 0; j < ncopies; j++)
7382 /* Create the reduction-phi that defines the reduction
7383 operand. */
7384 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7386 /* Set the loop-entry arg of the reduction-phi. */
7387 if (j != 0 && nested_cycle)
7388 vec_init_def = vec_initial_defs[j];
7389 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7390 UNKNOWN_LOCATION);
7392 /* The loop-latch arg is set in epilogue processing. */
7394 if (slp_node)
7395 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7396 else
7398 if (j == 0)
7399 *vec_stmt = new_phi;
7400 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7405 return true;
7408 /* Vectorizes LC PHIs. */
7410 bool
7411 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7412 stmt_vec_info stmt_info, gimple **vec_stmt,
7413 slp_tree slp_node)
7415 if (!loop_vinfo
7416 || !is_a <gphi *> (stmt_info->stmt)
7417 || gimple_phi_num_args (stmt_info->stmt) != 1)
7418 return false;
7420 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7421 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7422 return false;
7424 if (!vec_stmt) /* transformation not required. */
7426 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7427 return true;
7430 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7431 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7432 basic_block bb = gimple_bb (stmt_info->stmt);
7433 edge e = single_pred_edge (bb);
7434 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7435 auto_vec<tree> vec_oprnds;
7436 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7437 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7438 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7439 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7441 /* Create the vectorized LC PHI node. */
7442 gphi *new_phi = create_phi_node (vec_dest, bb);
7443 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7444 if (slp_node)
7445 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7446 else
7447 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7449 if (!slp_node)
7450 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7452 return true;
7456 /* Function vect_min_worthwhile_factor.
7458 For a loop where we could vectorize the operation indicated by CODE,
7459 return the minimum vectorization factor that makes it worthwhile
7460 to use generic vectors. */
7461 static unsigned int
7462 vect_min_worthwhile_factor (enum tree_code code)
7464 switch (code)
7466 case PLUS_EXPR:
7467 case MINUS_EXPR:
7468 case NEGATE_EXPR:
7469 return 4;
7471 case BIT_AND_EXPR:
7472 case BIT_IOR_EXPR:
7473 case BIT_XOR_EXPR:
7474 case BIT_NOT_EXPR:
7475 return 2;
7477 default:
7478 return INT_MAX;
7482 /* Return true if VINFO indicates we are doing loop vectorization and if
7483 it is worth decomposing CODE operations into scalar operations for
7484 that loop's vectorization factor. */
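/* For example, with a constant vectorization factor of 4, decomposing a
   PLUS_EXPR into four scalar operations is deemed worthwhile (the minimum
   factor for PLUS_EXPR above is 4), while a factor of 2 would only be
   worthwhile for the bitwise codes.  */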
7486 bool
7487 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7489 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7490 unsigned HOST_WIDE_INT value;
7491 return (loop_vinfo
7492 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7493 && value >= vect_min_worthwhile_factor (code));
7496 /* Function vectorizable_induction
7498 Check if STMT_INFO performs an induction computation that can be vectorized.
7499 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7500 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7501 Return true if STMT_INFO is vectorizable in this way. */
7503 bool
7504 vectorizable_induction (loop_vec_info loop_vinfo,
7505 stmt_vec_info stmt_info,
7506 gimple **vec_stmt, slp_tree slp_node,
7507 stmt_vector_for_cost *cost_vec)
7509 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7510 unsigned ncopies;
7511 bool nested_in_vect_loop = false;
7512 class loop *iv_loop;
7513 tree vec_def;
7514 edge pe = loop_preheader_edge (loop);
7515 basic_block new_bb;
7516 tree new_vec, vec_init, vec_step, t;
7517 tree new_name;
7518 gimple *new_stmt;
7519 gphi *induction_phi;
7520 tree induc_def, vec_dest;
7521 tree init_expr, step_expr;
7522 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7523 unsigned i;
7524 tree expr;
7525 gimple_seq stmts;
7526 gimple_stmt_iterator si;
7528 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7529 if (!phi)
7530 return false;
7532 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7533 return false;
7535 /* Make sure it was recognized as induction computation. */
7536 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7537 return false;
7539 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7540 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7542 if (slp_node)
7543 ncopies = 1;
7544 else
7545 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7546 gcc_assert (ncopies >= 1);
7548 /* FORNOW. These restrictions should be relaxed. */
7549 if (nested_in_vect_loop_p (loop, stmt_info))
7551 imm_use_iterator imm_iter;
7552 use_operand_p use_p;
7553 gimple *exit_phi;
7554 edge latch_e;
7555 tree loop_arg;
7557 if (ncopies > 1)
7559 if (dump_enabled_p ())
7560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7561 "multiple types in nested loop.\n");
7562 return false;
7565 /* FORNOW: outer loop induction with SLP not supported. */
7566 if (STMT_SLP_TYPE (stmt_info))
7567 return false;
7569 exit_phi = NULL;
7570 latch_e = loop_latch_edge (loop->inner);
7571 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7572 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7574 gimple *use_stmt = USE_STMT (use_p);
7575 if (is_gimple_debug (use_stmt))
7576 continue;
7578 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7580 exit_phi = use_stmt;
7581 break;
7584 if (exit_phi)
7586 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7587 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7588 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7590 if (dump_enabled_p ())
7591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7592 "inner-loop induction only used outside "
7593 "of the outer vectorized loop.\n");
7594 return false;
7598 nested_in_vect_loop = true;
7599 iv_loop = loop->inner;
7601 else
7602 iv_loop = loop;
7603 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7605 if (slp_node && !nunits.is_constant ())
7607 /* The current SLP code creates the initial value element-by-element. */
7608 if (dump_enabled_p ())
7609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7610 "SLP induction not supported for variable-length"
7611 " vectors.\n");
7612 return false;
7615 if (!vec_stmt) /* transformation not required. */
7617 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7618 DUMP_VECT_SCOPE ("vectorizable_induction");
7619 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7620 return true;
7623 /* Transform. */
7625 /* Compute a vector variable, initialized with the first VF values of
7626 the induction variable. E.g., for an iv with IV_PHI='X' and
7627 evolution S, for a vector of 4 units, we want to compute:
7628 [X, X + S, X + 2*S, X + 3*S]. */
7630 if (dump_enabled_p ())
7631 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7633 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7634 gcc_assert (step_expr != NULL_TREE);
7635 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7637 pe = loop_preheader_edge (iv_loop);
7638 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7639 loop_preheader_edge (iv_loop));
7641 stmts = NULL;
7642 if (!nested_in_vect_loop)
7644 /* Convert the initial value to the IV update type. */
7645 tree new_type = TREE_TYPE (step_expr);
7646 init_expr = gimple_convert (&stmts, new_type, init_expr);
7648 /* If we are using the loop mask to "peel" for alignment then we need
7649 to adjust the start value here. */
7650 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7651 if (skip_niters != NULL_TREE)
7653 if (FLOAT_TYPE_P (vectype))
7654 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7655 skip_niters);
7656 else
7657 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7658 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7659 skip_niters, step_expr);
7660 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7661 init_expr, skip_step);
7665 if (stmts)
7667 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7668 gcc_assert (!new_bb);
7671 /* Find the first insertion point in the BB. */
7672 basic_block bb = gimple_bb (phi);
7673 si = gsi_after_labels (bb);
7675 /* For SLP induction we have to generate several IVs: for example, with
7676 group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7677 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7678 [VF*S, VF*S, VF*S, VF*S] for all. */
7679 if (slp_node)
7681 /* Enforced above. */
7682 unsigned int const_nunits = nunits.to_constant ();
7684 /* Generate [VF*S, VF*S, ... ]. */
7685 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7687 expr = build_int_cst (integer_type_node, vf);
7688 expr = fold_convert (TREE_TYPE (step_expr), expr);
7690 else
7691 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7692 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7693 expr, step_expr);
7694 if (! CONSTANT_CLASS_P (new_name))
7695 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7696 TREE_TYPE (step_expr), NULL);
7697 new_vec = build_vector_from_val (step_vectype, new_name);
7698 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7699 new_vec, step_vectype, NULL);
7701 /* Now generate the IVs. */
7702 unsigned group_size = SLP_TREE_LANES (slp_node);
7703 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7704 unsigned elts = const_nunits * nvects;
7705 /* Compute the number of distinct IVs we need. First reduce
7706 group_size if it is a multiple of const_nunits so we get
7707 one IV for a group_size of 4 but const_nunits 2. */
7708 unsigned group_sizep = group_size;
7709 if (group_sizep % const_nunits == 0)
7710 group_sizep = group_sizep / const_nunits;
7711 unsigned nivs = least_common_multiple (group_sizep,
7712 const_nunits) / const_nunits;
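/* E.g. for the group_size 3, const_nunits 4 case shown above this yields
   least_common_multiple (3, 4) / 4 == 3 distinct IVs.  */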
7713 gcc_assert (elts % group_size == 0);
7714 tree elt = init_expr;
7715 unsigned ivn;
7716 for (ivn = 0; ivn < nivs; ++ivn)
7718 tree_vector_builder elts (step_vectype, const_nunits, 1);
7719 stmts = NULL;
7720 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7722 if (ivn*const_nunits + eltn >= group_size
7723 && (ivn * const_nunits + eltn) % group_size == 0)
7724 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7725 elt, step_expr);
7726 elts.quick_push (elt);
7728 vec_init = gimple_build_vector (&stmts, &elts);
7729 vec_init = gimple_convert (&stmts, vectype, vec_init);
7730 if (stmts)
7732 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7733 gcc_assert (!new_bb);
7736 /* Create the induction-phi that defines the induction-operand. */
7737 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7738 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7739 induc_def = PHI_RESULT (induction_phi);
7741 /* Create the iv update inside the loop */
7742 gimple_seq stmts = NULL;
7743 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7744 vec_def = gimple_build (&stmts,
7745 PLUS_EXPR, step_vectype, vec_def, vec_step);
7746 vec_def = gimple_convert (&stmts, vectype, vec_def);
7747 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7749 /* Set the arguments of the phi node: */
7750 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7751 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7752 UNKNOWN_LOCATION);
7754 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7756 /* Fill up to the number of vectors we need for the whole group. */
7757 nivs = least_common_multiple (group_size,
7758 const_nunits) / const_nunits;
7759 for (; ivn < nivs; ++ivn)
7760 SLP_TREE_VEC_STMTS (slp_node)
7761 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7763 /* Re-use IVs when we can. */
7764 if (ivn < nvects)
7766 unsigned vfp
7767 = least_common_multiple (group_size, const_nunits) / group_size;
7768 /* Generate [VF'*S, VF'*S, ... ]. */
7769 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7771 expr = build_int_cst (integer_type_node, vfp);
7772 expr = fold_convert (TREE_TYPE (step_expr), expr);
7774 else
7775 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7776 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7777 expr, step_expr);
7778 if (! CONSTANT_CLASS_P (new_name))
7779 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7780 TREE_TYPE (step_expr), NULL);
7781 new_vec = build_vector_from_val (step_vectype, new_name);
7782 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7783 step_vectype, NULL);
7784 for (; ivn < nvects; ++ivn)
7786 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7787 tree def;
7788 if (gimple_code (iv) == GIMPLE_PHI)
7789 def = gimple_phi_result (iv);
7790 else
7791 def = gimple_assign_lhs (iv);
7792 gimple_seq stmts = NULL;
7793 def = gimple_convert (&stmts, step_vectype, def);
7794 def = gimple_build (&stmts,
7795 PLUS_EXPR, step_vectype, def, vec_step);
7796 def = gimple_convert (&stmts, vectype, def);
7797 if (gimple_code (iv) == GIMPLE_PHI)
7798 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7799 else
7801 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7802 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7804 SLP_TREE_VEC_STMTS (slp_node)
7805 .quick_push (SSA_NAME_DEF_STMT (def));
7809 return true;
7812 /* Create the vector that holds the initial_value of the induction. */
7813 if (nested_in_vect_loop)
7815 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7816 been created during vectorization of previous stmts. We obtain it
7817 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7818 auto_vec<tree> vec_inits;
7819 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7820 init_expr, &vec_inits);
7821 vec_init = vec_inits[0];
7822 /* If the initial value is not of proper type, convert it. */
7823 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7825 new_stmt
7826 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7827 vect_simple_var,
7828 "vec_iv_"),
7829 VIEW_CONVERT_EXPR,
7830 build1 (VIEW_CONVERT_EXPR, vectype,
7831 vec_init));
7832 vec_init = gimple_assign_lhs (new_stmt);
7833 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7834 new_stmt);
7835 gcc_assert (!new_bb);
7838 else
7840 /* iv_loop is the loop to be vectorized. Create:
7841 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7842 stmts = NULL;
7843 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7845 unsigned HOST_WIDE_INT const_nunits;
7846 if (nunits.is_constant (&const_nunits))
7848 tree_vector_builder elts (step_vectype, const_nunits, 1);
7849 elts.quick_push (new_name);
7850 for (i = 1; i < const_nunits; i++)
7852 /* Create: new_name_i = new_name + step_expr */
7853 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7854 new_name, step_expr);
7855 elts.quick_push (new_name);
7857 /* Create a vector from [new_name_0, new_name_1, ...,
7858 new_name_nunits-1] */
7859 vec_init = gimple_build_vector (&stmts, &elts);
7861 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7862 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7863 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7864 new_name, step_expr);
7865 else
7867 /* Build:
7868 [base, base, base, ...]
7869 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7870 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7871 gcc_assert (flag_associative_math);
7872 tree index = build_index_vector (step_vectype, 0, 1);
7873 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7874 new_name);
7875 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7876 step_expr);
7877 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7878 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7879 vec_init, step_vec);
7880 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7881 vec_init, base_vec);
7883 vec_init = gimple_convert (&stmts, vectype, vec_init);
7885 if (stmts)
7887 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7888 gcc_assert (!new_bb);
7893 /* Create the vector that holds the step of the induction. */
7894 if (nested_in_vect_loop)
7895 /* iv_loop is nested in the loop to be vectorized. Generate:
7896 vec_step = [S, S, S, S] */
7897 new_name = step_expr;
7898 else
7900 /* iv_loop is the loop to be vectorized. Generate:
7901 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7902 gimple_seq seq = NULL;
7903 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7905 expr = build_int_cst (integer_type_node, vf);
7906 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7908 else
7909 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7910 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7911 expr, step_expr);
7912 if (seq)
7914 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7915 gcc_assert (!new_bb);
7919 t = unshare_expr (new_name);
7920 gcc_assert (CONSTANT_CLASS_P (new_name)
7921 || TREE_CODE (new_name) == SSA_NAME);
7922 new_vec = build_vector_from_val (step_vectype, t);
7923 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7924 new_vec, step_vectype, NULL);
7927 /* Create the following def-use cycle:
7928 loop prolog:
7929 vec_init = ...
7930 vec_step = ...
7931 loop:
7932 vec_iv = PHI <vec_init, vec_loop>
7934 STMT
7936 vec_loop = vec_iv + vec_step; */
7938 /* Create the induction-phi that defines the induction-operand. */
7939 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7940 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7941 induc_def = PHI_RESULT (induction_phi);
7943 /* Create the iv update inside the loop */
7944 stmts = NULL;
7945 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7946 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7947 vec_def = gimple_convert (&stmts, vectype, vec_def);
7948 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7949 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7951 /* Set the arguments of the phi node: */
7952 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7953 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7954 UNKNOWN_LOCATION);
7956 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7957 *vec_stmt = induction_phi;
7959 /* In case the vectorization factor (VF) is bigger than the number
7960 of elements that we can fit in a vectype (nunits), we have to generate
7961 more than one vector stmt - i.e - we need to "unroll" the
7962 vector stmt by a factor VF/nunits. For more details see documentation
7963 in vectorizable_operation. */
7965 if (ncopies > 1)
7967 gimple_seq seq = NULL;
7968 /* FORNOW. This restriction should be relaxed. */
7969 gcc_assert (!nested_in_vect_loop);
7971 /* Create the vector that holds the step of the induction. */
7972 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7974 expr = build_int_cst (integer_type_node, nunits);
7975 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7977 else
7978 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7979 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7980 expr, step_expr);
7981 if (seq)
7983 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7984 gcc_assert (!new_bb);
7987 t = unshare_expr (new_name);
7988 gcc_assert (CONSTANT_CLASS_P (new_name)
7989 || TREE_CODE (new_name) == SSA_NAME);
7990 new_vec = build_vector_from_val (step_vectype, t);
7991 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7992 new_vec, step_vectype, NULL);
7994 vec_def = induc_def;
7995 for (i = 1; i < ncopies; i++)
7997 /* vec_i = vec_prev + vec_step */
7998 gimple_seq stmts = NULL;
7999 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8000 vec_def = gimple_build (&stmts,
8001 PLUS_EXPR, step_vectype, vec_def, vec_step);
8002 vec_def = gimple_convert (&stmts, vectype, vec_def);
8004 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8005 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8006 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8010 if (dump_enabled_p ())
8011 dump_printf_loc (MSG_NOTE, vect_location,
8012 "transform induction: created def-use cycle: %G%G",
8013 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8015 return true;
8018 /* Function vectorizable_live_operation.
8020 STMT_INFO computes a value that is used outside the loop. Check if
8021 it can be supported. */
8023 bool
8024 vectorizable_live_operation (vec_info *vinfo,
8025 stmt_vec_info stmt_info,
8026 gimple_stmt_iterator *gsi,
8027 slp_tree slp_node, slp_instance slp_node_instance,
8028 int slp_index, bool vec_stmt_p,
8029 stmt_vector_for_cost *cost_vec)
8031 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8032 imm_use_iterator imm_iter;
8033 tree lhs, lhs_type, bitsize, vec_bitsize;
8034 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8035 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8036 int ncopies;
8037 gimple *use_stmt;
8038 auto_vec<tree> vec_oprnds;
8039 int vec_entry = 0;
8040 poly_uint64 vec_index = 0;
8042 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8044 /* If a stmt of a reduction is live, vectorize it via
8045 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8046 validity so just trigger the transform here. */
8047 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8049 if (!vec_stmt_p)
8050 return true;
8051 if (slp_node)
8053 /* For reduction chains the meta-info is attached to
8054 the group leader. */
8055 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8056 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8057 /* For SLP reductions we vectorize the epilogue for
8058 all involved stmts together. */
8059 else if (slp_index != 0)
8060 return true;
8061 else
8062 /* For SLP reductions the meta-info is attached to
8063 the representative. */
8064 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8066 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8067 gcc_assert (reduc_info->is_reduc_info);
8068 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8069 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8070 return true;
8071 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8072 slp_node_instance);
8073 return true;
8076 /* If STMT is not relevant and it is a simple assignment and its inputs are
8077 invariant then it can remain in place, unvectorized. The original last
8078 scalar value that it computes will be used. */
8079 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8081 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8082 if (dump_enabled_p ())
8083 dump_printf_loc (MSG_NOTE, vect_location,
8084 "statement is simple and uses invariant. Leaving in "
8085 "place.\n");
8086 return true;
8089 if (slp_node)
8090 ncopies = 1;
8091 else
8092 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8094 if (slp_node)
8096 gcc_assert (slp_index >= 0);
8098 /* Get the last occurrence of the scalar index from the concatenation of
8099 all the slp vectors. Calculate which slp vector it is and the index
8100 within. */
8101 int num_scalar = SLP_TREE_LANES (slp_node);
8102 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8103 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
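/* For instance, with two SLP lanes, V4SI vectors and two vector stmts
   the concatenation has 2 * 4 == 8 slots; the last copy of lane 0 sits
   at position 8 - 2 + 0 == 6, i.e. element 2 of the second vector
   (counting from zero).  */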
8105 /* Calculate which vector contains the result, and which lane of
8106 that vector we need. */
8107 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8109 if (dump_enabled_p ())
8110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8111 "Cannot determine which vector holds the"
8112 " final result.\n");
8113 return false;
8117 if (!vec_stmt_p)
8119 /* No transformation required. */
8120 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8122 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8123 OPTIMIZE_FOR_SPEED))
8125 if (dump_enabled_p ())
8126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8127 "can't operate on partial vectors "
8128 "because the target doesn't support extract "
8129 "last reduction.\n");
8130 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8132 else if (slp_node)
8134 if (dump_enabled_p ())
8135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8136 "can't operate on partial vectors "
8137 "because an SLP statement is live after "
8138 "the loop.\n");
8139 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8141 else if (ncopies > 1)
8143 if (dump_enabled_p ())
8144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8145 "can't operate on partial vectors "
8146 "because ncopies is greater than 1.\n");
8147 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8149 else
8151 gcc_assert (ncopies == 1 && !slp_node);
8152 vect_record_loop_mask (loop_vinfo,
8153 &LOOP_VINFO_MASKS (loop_vinfo),
8154 1, vectype, NULL);
8157 /* ??? Enable for loop costing as well. */
8158 if (!loop_vinfo)
8159 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8160 0, vect_epilogue);
8161 return true;
8164 /* Use the lhs of the original scalar statement. */
8165 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8166 if (dump_enabled_p ())
8167 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8168 "stmt %G", stmt);
8170 lhs = gimple_get_lhs (stmt);
8171 lhs_type = TREE_TYPE (lhs);
8173 bitsize = vector_element_bits_tree (vectype);
8174 vec_bitsize = TYPE_SIZE (vectype);
8176 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8177 tree vec_lhs, bitstart;
8178 gimple *vec_stmt;
8179 if (slp_node)
8181 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8183 /* Get the correct slp vectorized stmt. */
8184 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8185 vec_lhs = gimple_get_lhs (vec_stmt);
8187 /* Get entry to use. */
8188 bitstart = bitsize_int (vec_index);
8189 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8191 else
8193 /* For multiple copies, get the last copy. */
8194 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8195 vec_lhs = gimple_get_lhs (vec_stmt);
8197 /* Get the last lane in the vector. */
8198 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8201 if (loop_vinfo)
8203 /* Ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
8204 PHI requirement: insert one phi node for it. It looks like:
8205 loop;
8207 # lhs' = PHI <lhs>
==>
8209 loop;
8211 # vec_lhs' = PHI <vec_lhs>
8212 new_tree = lane_extract <vec_lhs', ...>;
8213 lhs' = new_tree; */
8215 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8216 basic_block exit_bb = single_exit (loop)->dest;
8217 gcc_assert (single_pred_p (exit_bb));
8219 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8220 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8221 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8223 gimple_seq stmts = NULL;
8224 tree new_tree;
8225 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8227 /* Emit:
8229 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8231 where VEC_LHS is the vectorized live-out result and MASK is
8232 the loop mask for the final iteration. */
8233 gcc_assert (ncopies == 1 && !slp_node);
8234 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8235 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8236 1, vectype, 0);
8237 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8238 mask, vec_lhs_phi);
8240 /* Convert the extracted vector element to the scalar type. */
8241 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8243 else
8245 tree bftype = TREE_TYPE (vectype);
8246 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8247 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8248 new_tree = build3 (BIT_FIELD_REF, bftype,
8249 vec_lhs_phi, bitsize, bitstart);
8250 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8251 &stmts, true, NULL_TREE);
8254 if (stmts)
8256 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8257 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8259 /* Remove existing phi from lhs and create one copy from new_tree. */
8260 tree lhs_phi = NULL_TREE;
8261 gimple_stmt_iterator gsi;
8262 for (gsi = gsi_start_phis (exit_bb);
8263 !gsi_end_p (gsi); gsi_next (&gsi))
8265 gimple *phi = gsi_stmt (gsi);
8266 if ((gimple_phi_arg_def (phi, 0) == lhs))
8268 remove_phi_node (&gsi, false);
8269 lhs_phi = gimple_phi_result (phi);
8270 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8271 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8272 break;
8277 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8278 single-argument PHI, just replace all uses of the PHI result. This is
8279 necessary because the lcssa PHI defining lhs may be before the newly inserted stmt. */
8280 use_operand_p use_p;
8281 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8282 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8283 && !is_gimple_debug (use_stmt))
8285 if (gimple_code (use_stmt) == GIMPLE_PHI
8286 && gimple_phi_num_args (use_stmt) == 1)
8288 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8290 else
8292 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8293 SET_USE (use_p, new_tree);
8295 update_stmt (use_stmt);
8298 else
8300 /* For basic-block vectorization simply insert the lane-extraction. */
8301 tree bftype = TREE_TYPE (vectype);
8302 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8303 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8304 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8305 vec_lhs, bitsize, bitstart);
8306 gimple_seq stmts = NULL;
8307 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8308 &stmts, true, NULL_TREE);
8310 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
8312 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8313 single-argument PHI, just replace all uses of the PHI result. This is
8314 necessary because the lcssa PHI defining lhs may be before the newly inserted stmt. */
8315 use_operand_p use_p;
8316 stmt_vec_info use_stmt_info;
8317 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8318 if (!is_gimple_debug (use_stmt)
8319 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8320 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8322 /* ??? This can happen when the live lane ends up being
8323 used in a vector construction code-generated by an
8324 external SLP node (and code-generation for that already
8325 happened). See gcc.dg/vect/bb-slp-47.c.
8326 Doing this is what would happen if that vector CTOR
8327 were not code-generated yet so it is not too bad.
8328 ??? In fact we'd likely want to avoid this situation
8329 in the first place. */
8330 if (gimple_code (use_stmt) != GIMPLE_PHI
8331 && !vect_stmt_dominates_stmt_p (gsi_stmt (*gsi), use_stmt))
8333 gcc_assert (is_gimple_assign (use_stmt)
8334 && gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR);
8335 if (dump_enabled_p ())
8336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8337 "Using original scalar computation for "
8338 "live lane because use preceeds vector "
8339 "def\n");
8340 continue;
8342 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8343 SET_USE (use_p, new_tree);
8344 update_stmt (use_stmt);
8348 return true;
8351 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8353 static void
8354 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8356 ssa_op_iter op_iter;
8357 imm_use_iterator imm_iter;
8358 def_operand_p def_p;
8359 gimple *ustmt;
8361 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8363 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8365 basic_block bb;
8367 if (!is_gimple_debug (ustmt))
8368 continue;
8370 bb = gimple_bb (ustmt);
8372 if (!flow_bb_inside_loop_p (loop, bb))
8374 if (gimple_debug_bind_p (ustmt))
8376 if (dump_enabled_p ())
8377 dump_printf_loc (MSG_NOTE, vect_location,
8378 "killing debug use\n");
8380 gimple_debug_bind_reset_value (ustmt);
8381 update_stmt (ustmt);
8383 else
8384 gcc_unreachable ();
8390 /* Given loop represented by LOOP_VINFO, return true if computation of
8391 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8392 otherwise. */
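/* For instance, if the loop's counter runs through the full range of a
   32-bit unsigned type, NITERSM1 is 0xffffffff and NITERS, computed as
   NITERSM1 + 1 in the same type, wraps around to zero, so the answer
   here is false.  */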
8394 static bool
8395 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8397 /* Constant case. */
8398 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8400 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8401 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8403 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8404 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8405 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8406 return true;
8409 widest_int max;
8410 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8411 /* Check the upper bound of loop niters. */
8412 if (get_max_loop_iterations (loop, &max))
8414 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8415 signop sgn = TYPE_SIGN (type);
8416 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8417 if (max < type_max)
8418 return true;
8420 return false;
8423 /* Return a mask type with half the number of elements as OLD_TYPE,
8424 given that it should have mode NEW_MODE. */
8426 tree
8427 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8429 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8430 return build_truth_vector_type_for_mode (nunits, new_mode);
8433 /* Return a mask type with twice as many elements as OLD_TYPE,
8434 given that it should have mode NEW_MODE. */
8436 tree
8437 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8439 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8440 return build_truth_vector_type_for_mode (nunits, new_mode);
8443 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8444 contain a sequence of NVECTORS masks that each control a vector of type
8445 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8446 these vector masks with the vector version of SCALAR_MASK. */
8448 void
8449 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8450 unsigned int nvectors, tree vectype, tree scalar_mask)
8452 gcc_assert (nvectors != 0);
8453 if (masks->length () < nvectors)
8454 masks->safe_grow_cleared (nvectors, true);
8455 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8456 /* The number of scalars per iteration and the number of vectors are
8457 both compile-time constants. */
8458 unsigned int nscalars_per_iter
8459 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8460 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8462 if (scalar_mask)
8464 scalar_cond_masked_key cond (scalar_mask, nvectors);
8465 loop_vinfo->scalar_cond_masked_set.add (cond);
8468 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8470 rgm->max_nscalars_per_iter = nscalars_per_iter;
8471 rgm->type = truth_type_for (vectype);
8472 rgm->factor = 1;
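/* Worked example with illustrative numbers: for a loop with VF == 16,
   recording an rgroup of NVECTORS == 2 masks over an 8-element VECTYPE
   gives nscalars_per_iter == 2 * 8 / 16 == 1; the rgroup lives in
   MASKS[NVECTORS - 1] and its mask type is the truth type of VECTYPE.  */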
8476 /* Given a complete set of masks MASKS, extract mask number INDEX
8477 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8478 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8480 See the comment above vec_loop_masks for more details about the mask
8481 arrangement. */
8483 tree
8484 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8485 unsigned int nvectors, tree vectype, unsigned int index)
8487 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8488 tree mask_type = rgm->type;
8490 /* Populate the rgroup's mask array, if this is the first time we've
8491 used it. */
8492 if (rgm->controls.is_empty ())
8494 rgm->controls.safe_grow_cleared (nvectors, true);
8495 for (unsigned int i = 0; i < nvectors; ++i)
8497 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8498 /* Provide a dummy definition until the real one is available. */
8499 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8500 rgm->controls[i] = mask;
8504 tree mask = rgm->controls[index];
8505 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8506 TYPE_VECTOR_SUBPARTS (vectype)))
8508 /* A loop mask for data type X can be reused for data type Y
8509 if X has N times more elements than Y and if Y's elements
8510 are N times bigger than X's. In this case each sequence
8511 of N elements in the loop mask will be all-zero or all-one.
8512 We can then view-convert the mask so that each sequence of
8513 N elements is replaced by a single element. */
8514 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8515 TYPE_VECTOR_SUBPARTS (vectype)));
8516 gimple_seq seq = NULL;
8517 mask_type = truth_type_for (vectype);
8518 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8519 if (seq)
8520 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8522 return mask;
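/* Illustration of the reuse case above: a mask computed for sixteen
   1-byte elements can also control eight 2-byte elements.  Each pair of
   mask elements is known to be all-zero or all-one, so a single
   VIEW_CONVERT_EXPR reinterprets it as an 8-element mask of the wider
   element size.  */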
8525 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8526 lengths for controlling an operation on VECTYPE. The operation splits
8527 each element of VECTYPE into FACTOR separate subelements, measuring the
8528 length as a number of these subelements. */
8530 void
8531 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8532 unsigned int nvectors, tree vectype, unsigned int factor)
8534 gcc_assert (nvectors != 0);
8535 if (lens->length () < nvectors)
8536 lens->safe_grow_cleared (nvectors, true);
8537 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8539 /* The number of scalars per iteration, the bytes occupied per scalar and
8540 the number of vectors are all compile-time constants. */
8541 unsigned int nscalars_per_iter
8542 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8543 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8545 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8547 /* For now, we only support cases in which all loads and stores fall back
8548 to VnQI or none do. */
8549 gcc_assert (!rgl->max_nscalars_per_iter
8550 || (rgl->factor == 1 && factor == 1)
8551 || (rgl->max_nscalars_per_iter * rgl->factor
8552 == nscalars_per_iter * factor));
8553 rgl->max_nscalars_per_iter = nscalars_per_iter;
8554 rgl->type = vectype;
8555 rgl->factor = factor;
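/* Illustrative example: if VECTYPE is an 8 x 16-bit vector but the
   target measures lengths in byte-sized subelements, FACTOR == 2 and
   each scalar iteration is counted as nscalars_per_iter * 2
   subelements.  */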
8559 /* Given a complete set of lengths LENS, extract length number INDEX for an
8560 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8562 tree
8563 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8564 unsigned int nvectors, unsigned int index)
8566 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8568 /* Populate the rgroup's len array, if this is the first time we've
8569 used it. */
8570 if (rgl->controls.is_empty ())
8572 rgl->controls.safe_grow_cleared (nvectors, true);
8573 for (unsigned int i = 0; i < nvectors; ++i)
8575 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8576 gcc_assert (len_type != NULL_TREE);
8577 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8579 /* Provide a dummy definition until the real one is available. */
8580 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8581 rgl->controls[i] = len;
8585 return rgl->controls[index];
8588 /* Scale profiling counters by estimation for LOOP which is vectorized
8589 by factor VF. */
8591 static void
8592 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8594 edge preheader = loop_preheader_edge (loop);
8595 /* Reduce loop iterations by the vectorization factor. */
8596 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8597 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8599 if (freq_h.nonzero_p ())
8601 profile_probability p;
8603 /* Avoid dropping loop body profile counter to 0 because of zero count
8604 in loop's preheader. */
8605 if (!(freq_e == profile_count::zero ()))
8606 freq_e = freq_e.force_nonzero ();
8607 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8608 scale_loop_frequencies (loop, p);
8611 edge exit_e = single_exit (loop);
8612 exit_e->probability = profile_probability::always ()
8613 .apply_scale (1, new_est_niter + 1);
8615 edge exit_l = single_pred_edge (loop->latch);
8616 profile_probability prob = exit_l->probability;
8617 exit_l->probability = exit_e->probability.invert ();
8618 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8619 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
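/* Rough illustration: if the scalar loop was estimated to run 128 times
   and VF == 8, new_est_niter is about 16, the exit edge probability
   becomes roughly 1 / (16 + 1) and the body frequencies are scaled down
   correspondingly.  */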
8622 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
8623 latch edge values originally defined by it. */
8625 static void
8626 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8627 stmt_vec_info def_stmt_info)
8629 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8630 if (!def || TREE_CODE (def) != SSA_NAME)
8631 return;
8632 stmt_vec_info phi_info;
8633 imm_use_iterator iter;
8634 use_operand_p use_p;
8635 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8636 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8637 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8638 && (phi_info = loop_vinfo->lookup_stmt (phi))
8639 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8640 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8641 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8643 loop_p loop = gimple_bb (phi)->loop_father;
8644 edge e = loop_latch_edge (loop);
8645 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8647 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8648 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8649 gcc_assert (phi_defs.length () == latch_defs.length ());
8650 for (unsigned i = 0; i < phi_defs.length (); ++i)
8651 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8652 gimple_get_lhs (latch_defs[i]), e,
8653 gimple_phi_arg_location (phi, e->dest_idx));
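/* For instance, if a reduction PHI was vectorized into two vector PHIs,
   STMT_VINFO_VEC_STMTS of the PHI and of DEF_STMT_INFO have the same
   length and each vector PHI receives the corresponding vectorized
   latch definition as its backedge argument.  */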
8658 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8659 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8660 stmt_vec_info. */
8662 static void
8663 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8664 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8666 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8667 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8669 if (dump_enabled_p ())
8670 dump_printf_loc (MSG_NOTE, vect_location,
8671 "------>vectorizing statement: %G", stmt_info->stmt);
8673 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8674 vect_loop_kill_debug_uses (loop, stmt_info);
8676 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8677 && !STMT_VINFO_LIVE_P (stmt_info))
8678 return;
8680 if (STMT_VINFO_VECTYPE (stmt_info))
8682 poly_uint64 nunits
8683 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8684 if (!STMT_SLP_TYPE (stmt_info)
8685 && maybe_ne (nunits, vf)
8686 && dump_enabled_p ())
8687 /* For SLP, VF is set according to the unrolling factor, and not
8688 to the vector size, hence this print is not valid for SLP. */
8689 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8692 /* Pure SLP statements have already been vectorized. We still need
8693 to apply loop vectorization to hybrid SLP statements. */
8694 if (PURE_SLP_STMT (stmt_info))
8695 return;
8697 if (dump_enabled_p ())
8698 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8700 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8701 *seen_store = stmt_info;
8704 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8705 in the hash_map with their corresponding values. */
8707 static tree
8708 find_in_mapping (tree t, void *context)
8710 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8712 tree *value = mapping->get (t);
8713 return value ? *value : t;
8716 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8717 original loop that has now been vectorized.
8719 The inits of the data_references need to be advanced with the number of
8720 iterations of the main loop. This has been computed in vect_do_peeling and
8721 is stored in parameter ADVANCE. We first restore the data_references'
8722 initial offsets with the values recorded in ORIG_DRS_INIT.
8724 Since the loop_vec_info of this EPILOGUE was constructed for the original
8725 loop, its stmt_vec_infos all point to the original statements. These need
8726 to be updated to point to their corresponding copies as well as the SSA_NAMES
8727 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8729 The data_references' connections also need to be updated: their
8730 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8731 stmt_vec_infos and their statements need to point to their corresponding
8732 copies. If they are gather loads or scatter stores, their references need
8733 to be updated to point to their corresponding copies. Finally we set
8734 'base_misaligned' to false as we have already peeled for alignment in the
8735 prologue of the main loop. */
8737 static void
8738 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8740 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8741 auto_vec<gimple *> stmt_worklist;
8742 hash_map<tree,tree> mapping;
8743 gimple *orig_stmt, *new_stmt;
8744 gimple_stmt_iterator epilogue_gsi;
8745 gphi_iterator epilogue_phi_gsi;
8746 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8747 basic_block *epilogue_bbs = get_loop_body (epilogue);
8748 unsigned i;
8750 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8752 /* Advance the data_references with the number of iterations of the previous
8753 loop and its prologue. */
8754 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8757 /* The EPILOGUE loop is a copy of the original loop so they share the same
8758 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8759 point to the copied statements. We also create a mapping from each LHS in
8760 the original loop to the corresponding LHS in the EPILOGUE and create worklists
8761 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8762 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8764 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8765 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8767 new_stmt = epilogue_phi_gsi.phi ();
8769 gcc_assert (gimple_uid (new_stmt) > 0);
8770 stmt_vinfo
8771 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8773 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8774 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8776 mapping.put (gimple_phi_result (orig_stmt),
8777 gimple_phi_result (new_stmt));
8778 /* PHI nodes cannot have patterns or related statements. */
8779 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8780 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8783 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8784 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8786 new_stmt = gsi_stmt (epilogue_gsi);
8787 if (is_gimple_debug (new_stmt))
8788 continue;
8790 gcc_assert (gimple_uid (new_stmt) > 0);
8791 stmt_vinfo
8792 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8794 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8795 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8797 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8798 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8800 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8802 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8803 for (gimple_stmt_iterator gsi = gsi_start (seq);
8804 !gsi_end_p (gsi); gsi_next (&gsi))
8805 stmt_worklist.safe_push (gsi_stmt (gsi));
8808 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8809 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8811 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8812 stmt_worklist.safe_push (stmt);
8813 /* Set BB such that the assert in
8814 'get_initial_def_for_reduction' is able to determine that
8815 the BB of the related stmt is inside this loop. */
8816 gimple_set_bb (stmt,
8817 gimple_bb (new_stmt));
8818 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8819 gcc_assert (related_vinfo == NULL
8820 || related_vinfo == stmt_vinfo);
8825 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8826 using the original main loop and thus need to be updated to refer to the
8827 cloned variables used in the epilogue. */
8828 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8830 gimple *stmt = stmt_worklist[i];
8831 tree *new_op;
8833 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8835 tree op = gimple_op (stmt, j);
8836 if ((new_op = mapping.get(op)))
8837 gimple_set_op (stmt, j, *new_op);
8838 else
8840 /* PR92429: The last argument of simplify_replace_tree disables
8841 folding when replacing arguments. This is required as
8842 otherwise you might end up with different statements than the
8843 ones analyzed in vect_loop_analyze, leading to different
8844 vectorization. */
8845 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8846 &find_in_mapping, &mapping, false);
8847 gimple_set_op (stmt, j, op);
8852 struct data_reference *dr;
8853 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
8854 FOR_EACH_VEC_ELT (datarefs, i, dr)
8856 orig_stmt = DR_STMT (dr);
8857 gcc_assert (gimple_uid (orig_stmt) > 0);
8858 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8859 /* Data references for gather loads and scatter stores do not use the
8860 updated offset we set using ADVANCE. Instead we have to make sure the
8861 reference in the data references points to the corresponding copy of
8862 the original in the epilogue. */
8863 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8864 == VMAT_GATHER_SCATTER)
8866 DR_REF (dr)
8867 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8868 &find_in_mapping, &mapping);
8869 DR_BASE_ADDRESS (dr)
8870 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8871 &find_in_mapping, &mapping);
8873 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8874 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8875 /* The vector size of the epilogue is smaller than that of the main loop,
8876 so the alignment is either the same or lower. This means the dr will
8877 by definition be aligned. */
8878 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8881 epilogue_vinfo->shared->datarefs_copy.release ();
8882 epilogue_vinfo->shared->save_datarefs ();
8885 /* Function vect_transform_loop.
8887 The analysis phase has determined that the loop is vectorizable.
8888 Vectorize the loop - create vectorized stmts to replace the scalar
8889 stmts in the loop, and update the loop exit condition.
8890 Return the scalar epilogue loop, if any. */
8892 class loop *
8893 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8895 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8896 class loop *epilogue = NULL;
8897 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8898 int nbbs = loop->num_nodes;
8899 int i;
8900 tree niters_vector = NULL_TREE;
8901 tree step_vector = NULL_TREE;
8902 tree niters_vector_mult_vf = NULL_TREE;
8903 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8904 unsigned int lowest_vf = constant_lower_bound (vf);
8905 gimple *stmt;
8906 bool check_profitability = false;
8907 unsigned int th;
8909 DUMP_VECT_SCOPE ("vec_transform_loop");
8911 loop_vinfo->shared->check_datarefs ();
8913 /* Use the more conservative vectorization threshold. If the number
8914 of iterations is constant assume the cost check has been performed
8915 by our caller. If the threshold makes all loops profitable that
8916 run at least the (estimated) vectorization factor number of times
8917 checking is pointless, too. */
8918 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8919 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8921 if (dump_enabled_p ())
8922 dump_printf_loc (MSG_NOTE, vect_location,
8923 "Profitability threshold is %d loop iterations.\n",
8924 th);
8925 check_profitability = true;
8928 /* Make sure there exists a single-predecessor exit bb. Do this before
8929 versioning. */
8930 edge e = single_exit (loop);
8931 if (! single_pred_p (e->dest))
8933 split_loop_exit_edge (e, true);
8934 if (dump_enabled_p ())
8935 dump_printf (MSG_NOTE, "split exit edge\n");
8938 /* Version the loop first, if required, so the profitability check
8939 comes first. */
8941 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8943 class loop *sloop
8944 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8945 sloop->force_vectorize = false;
8946 check_profitability = false;
8949 /* Make sure there exists a single-predecessor exit bb also on the
8950 scalar loop copy. Do this after versioning but before peeling
8951 so the CFG structure is fine for both the scalar and if-converted loop
8952 to make slpeel_duplicate_current_defs_from_edges face matched
8953 loop closed PHI nodes on the exit. */
8954 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8956 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8957 if (! single_pred_p (e->dest))
8959 split_loop_exit_edge (e, true);
8960 if (dump_enabled_p ())
8961 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8965 tree niters = vect_build_loop_niters (loop_vinfo);
8966 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8967 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8968 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8969 tree advance;
8970 drs_init_vec orig_drs_init;
8972 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8973 &step_vector, &niters_vector_mult_vf, th,
8974 check_profitability, niters_no_overflow,
8975 &advance);
8977 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8978 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8979 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8980 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8982 if (niters_vector == NULL_TREE)
8984 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8985 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
8986 && known_eq (lowest_vf, vf))
8988 niters_vector
8989 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8990 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8991 step_vector = build_one_cst (TREE_TYPE (niters));
8993 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
8994 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8995 &step_vector, niters_no_overflow);
8996 else
8997 /* vect_do_peeling subtracted the number of peeled prologue
8998 iterations from LOOP_VINFO_NITERS. */
8999 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9000 &niters_vector, &step_vector,
9001 niters_no_overflow);
9004 /* 1) Make sure the loop header has exactly two entries
9005 2) Make sure we have a preheader basic block. */
9007 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9009 split_edge (loop_preheader_edge (loop));
9011 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
9012 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
9013 /* This will deal with any possible peeling. */
9014 vect_prepare_for_masked_peels (loop_vinfo);
9016 /* Schedule the SLP instances first, then handle loop vectorization
9017 below. */
9018 if (!loop_vinfo->slp_instances.is_empty ())
9020 DUMP_VECT_SCOPE ("scheduling SLP instances");
9021 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9024 /* FORNOW: the vectorizer supports only loops whose body consists
9025 of one basic block (header + empty latch). When the vectorizer
9026 supports more involved loop forms, the order in which the BBs are
9027 traversed needs to be reconsidered. */
9029 for (i = 0; i < nbbs; i++)
9031 basic_block bb = bbs[i];
9032 stmt_vec_info stmt_info;
9034 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9035 gsi_next (&si))
9037 gphi *phi = si.phi ();
9038 if (dump_enabled_p ())
9039 dump_printf_loc (MSG_NOTE, vect_location,
9040 "------>vectorizing phi: %G", phi);
9041 stmt_info = loop_vinfo->lookup_stmt (phi);
9042 if (!stmt_info)
9043 continue;
9045 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9046 vect_loop_kill_debug_uses (loop, stmt_info);
9048 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9049 && !STMT_VINFO_LIVE_P (stmt_info))
9050 continue;
9052 if (STMT_VINFO_VECTYPE (stmt_info)
9053 && (maybe_ne
9054 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9055 && dump_enabled_p ())
9056 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9058 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9059 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9060 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9061 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9062 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9063 && ! PURE_SLP_STMT (stmt_info))
9065 if (dump_enabled_p ())
9066 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9067 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9071 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9072 gsi_next (&si))
9074 gphi *phi = si.phi ();
9075 stmt_info = loop_vinfo->lookup_stmt (phi);
9076 if (!stmt_info)
9077 continue;
9079 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9080 && !STMT_VINFO_LIVE_P (stmt_info))
9081 continue;
9083 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9084 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9085 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9086 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9087 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9088 && ! PURE_SLP_STMT (stmt_info))
9089 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9092 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9093 !gsi_end_p (si);)
9095 stmt = gsi_stmt (si);
9096 /* During vectorization remove existing clobber stmts. */
9097 if (gimple_clobber_p (stmt))
9099 unlink_stmt_vdef (stmt);
9100 gsi_remove (&si, true);
9101 release_defs (stmt);
9103 else
9105 /* Ignore vector stmts created in the outer loop. */
9106 stmt_info = loop_vinfo->lookup_stmt (stmt);
9108 /* vector stmts created in the outer-loop during vectorization of
9109 stmts in an inner-loop may not have a stmt_info, and do not
9110 need to be vectorized. */
9111 stmt_vec_info seen_store = NULL;
9112 if (stmt_info)
9114 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9116 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9117 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9118 !gsi_end_p (subsi); gsi_next (&subsi))
9120 stmt_vec_info pat_stmt_info
9121 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9122 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9123 &si, &seen_store);
9125 stmt_vec_info pat_stmt_info
9126 = STMT_VINFO_RELATED_STMT (stmt_info);
9127 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9128 &seen_store);
9129 maybe_set_vectorized_backedge_value (loop_vinfo,
9130 pat_stmt_info);
9132 else
9134 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9135 &seen_store);
9136 maybe_set_vectorized_backedge_value (loop_vinfo,
9137 stmt_info);
9140 gsi_next (&si);
9141 if (seen_store)
9143 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9144 /* Interleaving: the vectorization of the
9145 interleaving chain was completed - free all
9146 the stores in the chain. */
9147 vect_remove_stores (loop_vinfo,
9148 DR_GROUP_FIRST_ELEMENT (seen_store));
9149 else
9150 /* Free the attached stmt_vec_info and remove the stmt. */
9151 loop_vinfo->remove_stmt (stmt_info);
9156 /* Stub out scalar statements that must not survive vectorization.
9157 Doing this here helps with grouped statements, or statements that
9158 are involved in patterns. */
9159 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9160 !gsi_end_p (gsi); gsi_next (&gsi))
9162 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9163 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9165 tree lhs = gimple_get_lhs (call);
9166 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9168 tree zero = build_zero_cst (TREE_TYPE (lhs));
9169 gimple *new_stmt = gimple_build_assign (lhs, zero);
9170 gsi_replace (&gsi, new_stmt, true);
9174 } /* BBs in loop */
9176 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9177 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9178 if (integer_onep (step_vector))
9179 niters_no_overflow = true;
9180 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9181 niters_vector_mult_vf, !niters_no_overflow);
9183 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9184 scale_profile_for_vect_loop (loop, assumed_vf);
9186 /* True if the final iteration might not handle a full vector's
9187 worth of scalar iterations. */
9188 bool final_iter_may_be_partial
9189 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9190 /* The minimum number of iterations performed by the epilogue. This
9191 is 1 when peeling for gaps because we always need a final scalar
9192 iteration. */
9193 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9194 /* +1 to convert latch counts to loop iteration counts,
9195 -min_epilogue_iters to remove iterations that cannot be performed
9196 by the vector code. */
9197 int bias_for_lowest = 1 - min_epilogue_iters;
9198 int bias_for_assumed = bias_for_lowest;
9199 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9200 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9202 /* When the amount of peeling is known at compile time, the first
9203 iteration will have exactly alignment_npeels active elements.
9204 In the worst case it will have at least one. */
9205 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9206 bias_for_lowest += lowest_vf - min_first_active;
9207 bias_for_assumed += assumed_vf - min_first_active;
9209 /* In these calculations the "- 1" converts loop iteration counts
9210 back to latch counts. */
9211 if (loop->any_upper_bound)
9212 loop->nb_iterations_upper_bound
9213 = (final_iter_may_be_partial
9214 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9215 lowest_vf) - 1
9216 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9217 lowest_vf) - 1);
9218 if (loop->any_likely_upper_bound)
9219 loop->nb_iterations_likely_upper_bound
9220 = (final_iter_may_be_partial
9221 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9222 + bias_for_lowest, lowest_vf) - 1
9223 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9224 + bias_for_lowest, lowest_vf) - 1);
9225 if (loop->any_estimate)
9226 loop->nb_iterations_estimate
9227 = (final_iter_may_be_partial
9228 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9229 assumed_vf) - 1
9230 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9231 assumed_vf) - 1);
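/* Worked example with illustrative numbers: with lowest_vf == 4, no
   peeling for gaps and no partial vectors, bias_for_lowest == 1, so a
   scalar latch bound of 11 (12 iterations) becomes
   (11 + 1) / 4 - 1 == 2 latch iterations of the vector loop.  */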
9233 if (dump_enabled_p ())
9235 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9237 dump_printf_loc (MSG_NOTE, vect_location,
9238 "LOOP VECTORIZED\n");
9239 if (loop->inner)
9240 dump_printf_loc (MSG_NOTE, vect_location,
9241 "OUTER LOOP VECTORIZED\n");
9242 dump_printf (MSG_NOTE, "\n");
9244 else
9245 dump_printf_loc (MSG_NOTE, vect_location,
9246 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9247 GET_MODE_NAME (loop_vinfo->vector_mode));
9250 /* Loops vectorized with a variable factor won't benefit from
9251 unrolling/peeling. */
9252 if (!vf.is_constant ())
9254 loop->unroll = 1;
9255 if (dump_enabled_p ())
9256 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9257 " variable-length vectorization factor\n");
9259 /* Free SLP instances here because otherwise stmt reference counting
9260 won't work. */
9261 slp_instance instance;
9262 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9263 vect_free_slp_instance (instance, true);
9264 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9265 /* Clear the safelen field since its value is invalid after vectorization:
9266 the vectorized loop can have loop-carried dependencies. */
9267 loop->safelen = 0;
9269 if (epilogue)
9271 update_epilogue_loop_vinfo (epilogue, advance);
9273 epilogue->simduid = loop->simduid;
9274 epilogue->force_vectorize = loop->force_vectorize;
9275 epilogue->dont_vectorize = false;
9278 return epilogue;
9281 /* The code below is trying to perform a simple optimization - revert
9282 if-conversion for masked stores, i.e. if the mask of a store is zero,
9283 do not perform the store and, if possible, skip the stored value producers too.
9284 For example,
9285 for (i=0; i<n; i++)
9286 if (c[i])
9288 p1[i] += 1;
9289 p2[i] = p3[i] +2;
9291 this transformation will produce the following semi-hammock:
9293 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9295 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9296 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9297 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9298 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9299 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9300 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9304 void
9305 optimize_mask_stores (class loop *loop)
9307 basic_block *bbs = get_loop_body (loop);
9308 unsigned nbbs = loop->num_nodes;
9309 unsigned i;
9310 basic_block bb;
9311 class loop *bb_loop;
9312 gimple_stmt_iterator gsi;
9313 gimple *stmt;
9314 auto_vec<gimple *> worklist;
9315 auto_purge_vect_location sentinel;
9317 vect_location = find_loop_location (loop);
9318 /* Pick up all masked stores in loop if any. */
9319 for (i = 0; i < nbbs; i++)
9321 bb = bbs[i];
9322 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9323 gsi_next (&gsi))
9325 stmt = gsi_stmt (gsi);
9326 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9327 worklist.safe_push (stmt);
9331 free (bbs);
9332 if (worklist.is_empty ())
9333 return;
9335 /* Loop has masked stores. */
9336 while (!worklist.is_empty ())
9338 gimple *last, *last_store;
9339 edge e, efalse;
9340 tree mask;
9341 basic_block store_bb, join_bb;
9342 gimple_stmt_iterator gsi_to;
9343 tree vdef, new_vdef;
9344 gphi *phi;
9345 tree vectype;
9346 tree zero;
9348 last = worklist.pop ();
9349 mask = gimple_call_arg (last, 2);
9350 bb = gimple_bb (last);
9351 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9352 to the same loop as if_bb. It could be different from LOOP when a
9353 two-level loop nest is vectorized and the mask_store belongs to the
9354 inner one. */
9355 e = split_block (bb, last);
9356 bb_loop = bb->loop_father;
9357 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9358 join_bb = e->dest;
9359 store_bb = create_empty_bb (bb);
9360 add_bb_to_loop (store_bb, bb_loop);
9361 e->flags = EDGE_TRUE_VALUE;
9362 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9363 /* Put STORE_BB to likely part. */
9364 efalse->probability = profile_probability::unlikely ();
9365 store_bb->count = efalse->count ();
9366 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9367 if (dom_info_available_p (CDI_DOMINATORS))
9368 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9369 if (dump_enabled_p ())
9370 dump_printf_loc (MSG_NOTE, vect_location,
9371 "Create new block %d to sink mask stores.",
9372 store_bb->index);
9373 /* Create vector comparison with boolean result. */
9374 vectype = TREE_TYPE (mask);
9375 zero = build_zero_cst (vectype);
9376 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9377 gsi = gsi_last_bb (bb);
9378 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9379 /* Create new PHI node for vdef of the last masked store:
9380 .MEM_2 = VDEF <.MEM_1>
9381 will be converted to
9382 .MEM.3 = VDEF <.MEM_1>
9383 and new PHI node will be created in join bb
9384 .MEM_2 = PHI <.MEM_1, .MEM_3>
9386 vdef = gimple_vdef (last);
9387 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9388 gimple_set_vdef (last, new_vdef);
9389 phi = create_phi_node (vdef, join_bb);
9390 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9392 /* Put all masked stores with the same mask to STORE_BB if possible. */
9393 while (true)
9395 gimple_stmt_iterator gsi_from;
9396 gimple *stmt1 = NULL;
9398 /* Move masked store to STORE_BB. */
9399 last_store = last;
9400 gsi = gsi_for_stmt (last);
9401 gsi_from = gsi;
9402 /* Shift GSI to the previous stmt for further traversal. */
9403 gsi_prev (&gsi);
9404 gsi_to = gsi_start_bb (store_bb);
9405 gsi_move_before (&gsi_from, &gsi_to);
9406 /* Set up GSI_TO to the non-empty block start. */
9407 gsi_to = gsi_start_bb (store_bb);
9408 if (dump_enabled_p ())
9409 dump_printf_loc (MSG_NOTE, vect_location,
9410 "Move stmt to created bb\n%G", last);
9411 /* Move all stored value producers if possible. */
9412 while (!gsi_end_p (gsi))
9414 tree lhs;
9415 imm_use_iterator imm_iter;
9416 use_operand_p use_p;
9417 bool res;
9419 /* Skip debug statements. */
9420 if (is_gimple_debug (gsi_stmt (gsi)))
9422 gsi_prev (&gsi);
9423 continue;
9425 stmt1 = gsi_stmt (gsi);
9426 /* Do not consider statements writing to memory or having a
9427 volatile operand. */
9428 if (gimple_vdef (stmt1)
9429 || gimple_has_volatile_ops (stmt1))
9430 break;
9431 gsi_from = gsi;
9432 gsi_prev (&gsi);
9433 lhs = gimple_get_lhs (stmt1);
9434 if (!lhs)
9435 break;
9437 /* LHS of vectorized stmt must be SSA_NAME. */
9438 if (TREE_CODE (lhs) != SSA_NAME)
9439 break;
9441 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9443 /* Remove dead scalar statement. */
9444 if (has_zero_uses (lhs))
9446 gsi_remove (&gsi_from, true);
9447 continue;
9451 /* Check that LHS does not have uses outside of STORE_BB. */
9452 res = true;
9453 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9455 gimple *use_stmt;
9456 use_stmt = USE_STMT (use_p);
9457 if (is_gimple_debug (use_stmt))
9458 continue;
9459 if (gimple_bb (use_stmt) != store_bb)
9461 res = false;
9462 break;
9465 if (!res)
9466 break;
9468 if (gimple_vuse (stmt1)
9469 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9470 break;
9472 /* Can move STMT1 to STORE_BB. */
9473 if (dump_enabled_p ())
9474 dump_printf_loc (MSG_NOTE, vect_location,
9475 "Move stmt to created bb\n%G", stmt1);
9476 gsi_move_before (&gsi_from, &gsi_to);
9477 /* Shift GSI_TO for further insertion. */
9478 gsi_prev (&gsi_to);
9480 /* Put other masked stores with the same mask to STORE_BB. */
9481 if (worklist.is_empty ()
9482 || gimple_call_arg (worklist.last (), 2) != mask
9483 || worklist.last () != stmt1)
9484 break;
9485 last = worklist.pop ();
9487 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9491 /* Decide whether it is possible to use a zero-based induction variable
9492 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9493 the value that the induction variable must be able to hold in order
9494 to ensure that the rgroups eventually have no active vector elements.
9495 Return -1 otherwise. */
9497 widest_int
9498 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9500 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9501 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9502 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9504 /* Calculate the value that the induction variable must be able
9505 to hit in order to ensure that we end the loop with an all-false mask.
9506 This involves adding the maximum number of inactive trailing scalar
9507 iterations. */
9508 widest_int iv_limit = -1;
9509 if (max_loop_iterations (loop, &iv_limit))
9511 if (niters_skip)
9513 /* Add the maximum number of skipped iterations to the
9514 maximum iteration count. */
9515 if (TREE_CODE (niters_skip) == INTEGER_CST)
9516 iv_limit += wi::to_widest (niters_skip);
9517 else
9518 iv_limit += max_vf - 1;
9520 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9521 /* Make a conservatively-correct assumption. */
9522 iv_limit += max_vf - 1;
9524 /* IV_LIMIT is the maximum number of latch iterations, which is also
9525 the maximum in-range IV value. Round this value down to the previous
9526 vector alignment boundary and then add an extra full iteration. */
9527 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9528 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9530 return iv_limit;
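/* Illustrative example: with a maximum latch count of 1000, no skipped
   iterations and a constant VF == max_vf == 16, iv_limit is rounded
   down to 992 and then 16 is added, giving 1008 as the value the IV
   must be able to hold.  */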
9533 /* For the given rgroup_controls RGC, check whether an induction variable
9534 would ever hit a value that produces a set of all-false masks or zero
9535 lengths before wrapping around. Return true if it's possible to wrap
9536 around before hitting the desirable value, otherwise return false. */
9538 bool
9539 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9541 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9543 if (iv_limit == -1)
9544 return true;
9546 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9547 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9548 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9550 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9551 return true;
9553 return false;
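/* Illustrative example: with iv_limit == 1008 and an rgroup needing one
   scalar per iteration with factor 1, nitems == 1 and
   wi::min_precision (1008, UNSIGNED) == 10 bits, which fits in a 32-bit
   compare type, so no wrapping is possible and we return false.  */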